]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
40d3138f459cb6922a7afed637ab6f67ba946fda
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/assign/list_of.hpp>
19
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 extern "C" {
23 #include "crush/hash.h"
24 }
25 #include "PG.h"
26 #include "OSDMap.h"
27 #include "PGBackend.h"
28
29 const char *ceph_osd_flag_name(unsigned flag)
30 {
31 switch (flag) {
32 case CEPH_OSD_FLAG_ACK: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY: return "retry";
36 case CEPH_OSD_FLAG_READ: return "read";
37 case CEPH_OSD_FLAG_WRITE: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP: return "pgop";
43 case CEPH_OSD_FLAG_EXEC: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
57 default: return "???";
58 }
59 }
60
61 string ceph_osd_flag_string(unsigned flags)
62 {
63 string s;
64 for (unsigned i=0; i<32; ++i) {
65 if (flags & (1u<<i)) {
66 if (s.length())
67 s += "+";
68 s += ceph_osd_flag_name(1u << i);
69 }
70 }
71 if (s.length())
72 return s;
73 return string("-");
74 }
75
76 const char * ceph_osd_op_flag_name(unsigned flag)
77 {
78 const char *name;
79
80 switch(flag) {
81 case CEPH_OSD_OP_FLAG_EXCL:
82 name = "excl";
83 break;
84 case CEPH_OSD_OP_FLAG_FAILOK:
85 name = "failok";
86 break;
87 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
88 name = "fadvise_random";
89 break;
90 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
91 name = "fadvise_sequential";
92 break;
93 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
94 name = "favise_willneed";
95 break;
96 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
97 name = "fadvise_dontneed";
98 break;
99 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
100 name = "fadvise_nocache";
101 break;
102 default:
103 name = "???";
104 };
105
106 return name;
107 }
108
109 string ceph_osd_op_flag_string(unsigned flags)
110 {
111 string s;
112 for (unsigned i=0; i<32; ++i) {
113 if (flags & (1u<<i)) {
114 if (s.length())
115 s += "+";
116 s += ceph_osd_op_flag_name(1u << i);
117 }
118 }
119 if (s.length())
120 return s;
121 return string("-");
122 }
123
124 string ceph_osd_alloc_hint_flag_string(unsigned flags)
125 {
126 string s;
127 for (unsigned i=0; i<32; ++i) {
128 if (flags & (1u<<i)) {
129 if (s.length())
130 s += "+";
131 s += ceph_osd_alloc_hint_flag_name(1u << i);
132 }
133 }
134 if (s.length())
135 return s;
136 return string("-");
137 }
138
139 void pg_shard_t::encode(bufferlist &bl) const
140 {
141 ENCODE_START(1, 1, bl);
142 ::encode(osd, bl);
143 ::encode(shard, bl);
144 ENCODE_FINISH(bl);
145 }
146 void pg_shard_t::decode(bufferlist::iterator &bl)
147 {
148 DECODE_START(1, bl);
149 ::decode(osd, bl);
150 ::decode(shard, bl);
151 DECODE_FINISH(bl);
152 }
153
154 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
155 {
156 if (rhs.is_undefined())
157 return lhs << "?";
158 if (rhs.shard == shard_id_t::NO_SHARD)
159 return lhs << rhs.osd;
160 return lhs << rhs.osd << '(' << (unsigned)(rhs.shard) << ')';
161 }
162
163 // -- osd_reqid_t --
164 void osd_reqid_t::dump(Formatter *f) const
165 {
166 f->dump_stream("name") << name;
167 f->dump_int("inc", inc);
168 f->dump_unsigned("tid", tid);
169 }
170
171 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
172 {
173 o.push_back(new osd_reqid_t);
174 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
175 }
176
177 // -- object_locator_t --
178
179 void object_locator_t::encode(bufferlist& bl) const
180 {
181 // verify that nobody's corrupted the locator
182 assert(hash == -1 || key.empty());
183 __u8 encode_compat = 3;
184 ENCODE_START(6, encode_compat, bl);
185 ::encode(pool, bl);
186 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
187 ::encode(preferred, bl);
188 ::encode(key, bl);
189 ::encode(nspace, bl);
190 ::encode(hash, bl);
191 if (hash != -1)
192 encode_compat = MAX(encode_compat, 6); // need to interpret the hash
193 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
194 }
195
196 void object_locator_t::decode(bufferlist::iterator& p)
197 {
198 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
199 if (struct_v < 2) {
200 int32_t op;
201 ::decode(op, p);
202 pool = op;
203 int16_t pref;
204 ::decode(pref, p);
205 } else {
206 ::decode(pool, p);
207 int32_t preferred;
208 ::decode(preferred, p);
209 }
210 ::decode(key, p);
211 if (struct_v >= 5)
212 ::decode(nspace, p);
213 if (struct_v >= 6)
214 ::decode(hash, p);
215 else
216 hash = -1;
217 DECODE_FINISH(p);
218 // verify that nobody's corrupted the locator
219 assert(hash == -1 || key.empty());
220 }
221
222 void object_locator_t::dump(Formatter *f) const
223 {
224 f->dump_int("pool", pool);
225 f->dump_string("key", key);
226 f->dump_string("namespace", nspace);
227 f->dump_int("hash", hash);
228 }
229
230 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
231 {
232 o.push_back(new object_locator_t);
233 o.push_back(new object_locator_t(123));
234 o.push_back(new object_locator_t(123, 876));
235 o.push_back(new object_locator_t(1, "n2"));
236 o.push_back(new object_locator_t(1234, "", "key"));
237 o.push_back(new object_locator_t(12, "n1", "key2"));
238 }
239
240 // -- request_redirect_t --
241 void request_redirect_t::encode(bufferlist& bl) const
242 {
243 ENCODE_START(1, 1, bl);
244 ::encode(redirect_locator, bl);
245 ::encode(redirect_object, bl);
246 ::encode(osd_instructions, bl);
247 ENCODE_FINISH(bl);
248 }
249
250 void request_redirect_t::decode(bufferlist::iterator& bl)
251 {
252 DECODE_START(1, bl);
253 ::decode(redirect_locator, bl);
254 ::decode(redirect_object, bl);
255 ::decode(osd_instructions, bl);
256 DECODE_FINISH(bl);
257 }
258
259 void request_redirect_t::dump(Formatter *f) const
260 {
261 f->dump_string("object", redirect_object);
262 f->open_object_section("locator");
263 redirect_locator.dump(f);
264 f->close_section(); // locator
265 }
266
267 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
268 {
269 object_locator_t loc(1, "redir_obj");
270 o.push_back(new request_redirect_t());
271 o.push_back(new request_redirect_t(loc, 0));
272 o.push_back(new request_redirect_t(loc, "redir_obj"));
273 o.push_back(new request_redirect_t(loc));
274 }
275
276 void objectstore_perf_stat_t::dump(Formatter *f) const
277 {
278 f->dump_unsigned("commit_latency_ms", os_commit_latency);
279 f->dump_unsigned("apply_latency_ms", os_apply_latency);
280 }
281
282 void objectstore_perf_stat_t::encode(bufferlist &bl) const
283 {
284 ENCODE_START(1, 1, bl);
285 ::encode(os_commit_latency, bl);
286 ::encode(os_apply_latency, bl);
287 ENCODE_FINISH(bl);
288 }
289
290 void objectstore_perf_stat_t::decode(bufferlist::iterator &bl)
291 {
292 DECODE_START(1, bl);
293 ::decode(os_commit_latency, bl);
294 ::decode(os_apply_latency, bl);
295 DECODE_FINISH(bl);
296 }
297
298 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
299 {
300 o.push_back(new objectstore_perf_stat_t());
301 o.push_back(new objectstore_perf_stat_t());
302 o.back()->os_commit_latency = 20;
303 o.back()->os_apply_latency = 30;
304 }
305
306 // -- osd_stat_t --
307 void osd_stat_t::dump(Formatter *f) const
308 {
309 f->dump_unsigned("up_from", up_from);
310 f->dump_unsigned("seq", seq);
311 f->dump_unsigned("kb", kb);
312 f->dump_unsigned("kb_used", kb_used);
313 f->dump_unsigned("kb_avail", kb_avail);
314 f->open_array_section("hb_peers");
315 for (auto p : hb_peers)
316 f->dump_int("osd", p);
317 f->close_section();
318 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
319 f->dump_int("num_snap_trimming", num_snap_trimming);
320 f->open_object_section("op_queue_age_hist");
321 op_queue_age_hist.dump(f);
322 f->close_section();
323 f->open_object_section("perf_stat");
324 os_perf_stat.dump(f);
325 f->close_section();
326 }
327
328 void osd_stat_t::encode(bufferlist &bl) const
329 {
330 ENCODE_START(6, 2, bl);
331 ::encode(kb, bl);
332 ::encode(kb_used, bl);
333 ::encode(kb_avail, bl);
334 ::encode(snap_trim_queue_len, bl);
335 ::encode(num_snap_trimming, bl);
336 ::encode(hb_peers, bl);
337 ::encode((uint32_t)0, bl);
338 ::encode(op_queue_age_hist, bl);
339 ::encode(os_perf_stat, bl);
340 ::encode(up_from, bl);
341 ::encode(seq, bl);
342 ENCODE_FINISH(bl);
343 }
344
345 void osd_stat_t::decode(bufferlist::iterator &bl)
346 {
347 DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, bl);
348 ::decode(kb, bl);
349 ::decode(kb_used, bl);
350 ::decode(kb_avail, bl);
351 ::decode(snap_trim_queue_len, bl);
352 ::decode(num_snap_trimming, bl);
353 ::decode(hb_peers, bl);
354 vector<int> num_hb_out;
355 ::decode(num_hb_out, bl);
356 if (struct_v >= 3)
357 ::decode(op_queue_age_hist, bl);
358 if (struct_v >= 4)
359 ::decode(os_perf_stat, bl);
360 if (struct_v >= 6) {
361 ::decode(up_from, bl);
362 ::decode(seq, bl);
363 }
364 DECODE_FINISH(bl);
365 }
366
367 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
368 {
369 o.push_back(new osd_stat_t);
370
371 o.push_back(new osd_stat_t);
372 o.back()->kb = 1;
373 o.back()->kb_used = 2;
374 o.back()->kb_avail = 3;
375 o.back()->hb_peers.push_back(7);
376 o.back()->snap_trim_queue_len = 8;
377 o.back()->num_snap_trimming = 99;
378 }
379
380 // -- pg_t --
381
382 int pg_t::print(char *o, int maxlen) const
383 {
384 if (preferred() >= 0)
385 return snprintf(o, maxlen, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
386 else
387 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
388 }
389
390 bool pg_t::parse(const char *s)
391 {
392 uint64_t ppool;
393 uint32_t pseed;
394 int32_t pref;
395 int r = sscanf(s, "%llu.%xp%d", (long long unsigned *)&ppool, &pseed, &pref);
396 if (r < 2)
397 return false;
398 m_pool = ppool;
399 m_seed = pseed;
400 if (r == 3)
401 m_preferred = pref;
402 else
403 m_preferred = -1;
404 return true;
405 }
406
407 bool spg_t::parse(const char *s)
408 {
409 pgid.set_preferred(-1);
410 shard = shard_id_t::NO_SHARD;
411 uint64_t ppool;
412 uint32_t pseed;
413 int32_t pref;
414 uint32_t pshard;
415 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
416 if (r < 2)
417 return false;
418 pgid.set_pool(ppool);
419 pgid.set_ps(pseed);
420
421 const char *p = strchr(s, 'p');
422 if (p) {
423 r = sscanf(p, "p%d", &pref);
424 if (r == 1) {
425 pgid.set_preferred(pref);
426 } else {
427 return false;
428 }
429 }
430
431 p = strchr(s, 's');
432 if (p) {
433 r = sscanf(p, "s%d", &pshard);
434 if (r == 1) {
435 shard = shard_id_t(pshard);
436 } else {
437 return false;
438 }
439 }
440 return true;
441 }
442
443 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
444 {
445 while (*suffix_backwords)
446 *--buf = *suffix_backwords++;
447
448 if (!is_no_shard()) {
449 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
450 *--buf = 's';
451 }
452
453 return pgid.calc_name(buf, "");
454 }
455
456 ostream& operator<<(ostream& out, const spg_t &pg)
457 {
458 char buf[spg_t::calc_name_buf_size];
459 buf[spg_t::calc_name_buf_size - 1] = '\0';
460 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
461 return out;
462 }
463
464 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
465 {
466 int old_bits = cbits(old_pg_num);
467 int old_mask = (1 << old_bits) - 1;
468 pg_t ret = *this;
469 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
470 return ret;
471 }
472
473 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
474 {
475 assert(m_seed < old_pg_num);
476 if (new_pg_num <= old_pg_num)
477 return false;
478
479 bool split = false;
480 if (true) {
481 unsigned old_bits = cbits(old_pg_num);
482 unsigned old_mask = (1 << old_bits) - 1;
483 for (unsigned n = 1; ; n++) {
484 unsigned next_bit = (n << (old_bits-1));
485 unsigned s = next_bit | m_seed;
486
487 if (s < old_pg_num || s == m_seed)
488 continue;
489 if (s >= new_pg_num)
490 break;
491 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
492 split = true;
493 if (children)
494 children->insert(pg_t(s, m_pool, m_preferred));
495 }
496 }
497 }
498 if (false) {
499 // brute force
500 int old_bits = cbits(old_pg_num);
501 int old_mask = (1 << old_bits) - 1;
502 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
503 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
504 if (o == m_seed) {
505 split = true;
506 children->insert(pg_t(x, m_pool, m_preferred));
507 }
508 }
509 }
510 return split;
511 }
512
513 unsigned pg_t::get_split_bits(unsigned pg_num) const {
514 if (pg_num == 1)
515 return 0;
516 assert(pg_num > 1);
517
518 // Find unique p such that pg_num \in [2^(p-1), 2^p)
519 unsigned p = cbits(pg_num);
520 assert(p); // silence coverity #751330
521
522 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
523 return p;
524 else
525 return p - 1;
526 }
527
528 pg_t pg_t::get_parent() const
529 {
530 unsigned bits = cbits(m_seed);
531 assert(bits);
532 pg_t retval = *this;
533 retval.m_seed &= ~((~0)<<(bits - 1));
534 return retval;
535 }
536
537 hobject_t pg_t::get_hobj_start() const
538 {
539 return hobject_t(object_t(), string(), CEPH_NOSNAP, m_seed, m_pool,
540 string());
541 }
542
543 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
544 {
545 // note: this assumes a bitwise sort; with the legacy nibblewise
546 // sort a PG did not always cover a single contiguous range of the
547 // (bit-reversed) hash range.
548 unsigned bits = get_split_bits(pg_num);
549 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
550 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
551 if (rev_end >= 0x100000000) {
552 assert(rev_end == 0x100000000);
553 return hobject_t::get_max();
554 } else {
555 return hobject_t(object_t(), string(), CEPH_NOSNAP,
556 hobject_t::_reverse_bits(rev_end), m_pool,
557 string());
558 }
559 }
560
561 void pg_t::dump(Formatter *f) const
562 {
563 f->dump_unsigned("pool", m_pool);
564 f->dump_unsigned("seed", m_seed);
565 f->dump_int("preferred_osd", m_preferred);
566 }
567
568 void pg_t::generate_test_instances(list<pg_t*>& o)
569 {
570 o.push_back(new pg_t);
571 o.push_back(new pg_t(1, 2, -1));
572 o.push_back(new pg_t(13123, 3, -1));
573 o.push_back(new pg_t(131223, 4, 23));
574 }
575
576 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
577 {
578 while (*suffix_backwords)
579 *--buf = *suffix_backwords++;
580
581 if (m_preferred >= 0)
582 *--buf ='p';
583
584 buf = ritoa<uint32_t, 16>(m_seed, buf);
585
586 *--buf = '.';
587
588 return ritoa<uint64_t, 10>(m_pool, buf);
589 }
590
591 ostream& operator<<(ostream& out, const pg_t &pg)
592 {
593 char buf[pg_t::calc_name_buf_size];
594 buf[pg_t::calc_name_buf_size - 1] = '\0';
595 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
596 return out;
597 }
598
599
600 // -- coll_t --
601
602 void coll_t::calc_str()
603 {
604 switch (type) {
605 case TYPE_META:
606 strcpy(_str_buff, "meta");
607 _str = _str_buff;
608 break;
609 case TYPE_PG:
610 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
611 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
612 break;
613 case TYPE_PG_TEMP:
614 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
615 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
616 break;
617 default:
618 assert(0 == "unknown collection type");
619 }
620 }
621
622 bool coll_t::parse(const std::string& s)
623 {
624 if (s == "meta") {
625 type = TYPE_META;
626 pgid = spg_t();
627 removal_seq = 0;
628 calc_str();
629 assert(s == _str);
630 return true;
631 }
632 if (s.find("_head") == s.length() - 5 &&
633 pgid.parse(s.substr(0, s.length() - 5))) {
634 type = TYPE_PG;
635 removal_seq = 0;
636 calc_str();
637 assert(s == _str);
638 return true;
639 }
640 if (s.find("_TEMP") == s.length() - 5 &&
641 pgid.parse(s.substr(0, s.length() - 5))) {
642 type = TYPE_PG_TEMP;
643 removal_seq = 0;
644 calc_str();
645 assert(s == _str);
646 return true;
647 }
648 return false;
649 }
650
651 void coll_t::encode(bufferlist& bl) const
652 {
653 // when changing this, remember to update encoded_size() too.
654 if (is_temp()) {
655 // can't express this as v2...
656 __u8 struct_v = 3;
657 ::encode(struct_v, bl);
658 ::encode(to_str(), bl);
659 } else {
660 __u8 struct_v = 2;
661 ::encode(struct_v, bl);
662 ::encode((__u8)type, bl);
663 ::encode(pgid, bl);
664 snapid_t snap = CEPH_NOSNAP;
665 ::encode(snap, bl);
666 }
667 }
668
669 size_t coll_t::encoded_size() const
670 {
671 size_t r = sizeof(__u8);
672 if (is_temp()) {
673 // v3
674 r += sizeof(__u32);
675 if (_str) {
676 r += strlen(_str);
677 }
678 } else {
679 // v2
680 // 1. type
681 r += sizeof(__u8);
682 // 2. pgid
683 // - encoding header
684 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
685 // - pg_t
686 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
687 // - shard_id_t
688 r += sizeof(int8_t);
689 // 3. snapid_t
690 r += sizeof(uint64_t);
691 }
692
693 return r;
694 }
695
696 void coll_t::decode(bufferlist::iterator& bl)
697 {
698 __u8 struct_v;
699 ::decode(struct_v, bl);
700 switch (struct_v) {
701 case 1:
702 {
703 snapid_t snap;
704 ::decode(pgid, bl);
705 ::decode(snap, bl);
706
707 // infer the type
708 if (pgid == spg_t() && snap == 0) {
709 type = TYPE_META;
710 } else {
711 type = TYPE_PG;
712 }
713 removal_seq = 0;
714 }
715 break;
716
717 case 2:
718 {
719 __u8 _type;
720 snapid_t snap;
721 ::decode(_type, bl);
722 ::decode(pgid, bl);
723 ::decode(snap, bl);
724 type = (type_t)_type;
725 removal_seq = 0;
726 }
727 break;
728
729 case 3:
730 {
731 string str;
732 ::decode(str, bl);
733 bool ok = parse(str);
734 if (!ok)
735 throw std::domain_error(std::string("unable to parse pg ") + str);
736 }
737 break;
738
739 default:
740 {
741 ostringstream oss;
742 oss << "coll_t::decode(): don't know how to decode version "
743 << struct_v;
744 throw std::domain_error(oss.str());
745 }
746 }
747 }
748
749 void coll_t::dump(Formatter *f) const
750 {
751 f->dump_unsigned("type_id", (unsigned)type);
752 if (type != TYPE_META)
753 f->dump_stream("pgid") << pgid;
754 f->dump_string("name", to_str());
755 }
756
757 void coll_t::generate_test_instances(list<coll_t*>& o)
758 {
759 o.push_back(new coll_t());
760 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
761 o.push_back(new coll_t(o.back()->get_temp()));
762 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
763 o.push_back(new coll_t(o.back()->get_temp()));
764 o.push_back(new coll_t());
765 }
766
767 // ---
768
769 std::string pg_vector_string(const vector<int32_t> &a)
770 {
771 ostringstream oss;
772 oss << "[";
773 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
774 if (i != a.begin())
775 oss << ",";
776 if (*i != CRUSH_ITEM_NONE)
777 oss << *i;
778 else
779 oss << "NONE";
780 }
781 oss << "]";
782 return oss.str();
783 }
784
785 std::string pg_state_string(int state)
786 {
787 ostringstream oss;
788 if (state & PG_STATE_STALE)
789 oss << "stale+";
790 if (state & PG_STATE_CREATING)
791 oss << "creating+";
792 if (state & PG_STATE_ACTIVE)
793 oss << "active+";
794 if (state & PG_STATE_ACTIVATING)
795 oss << "activating+";
796 if (state & PG_STATE_CLEAN)
797 oss << "clean+";
798 if (state & PG_STATE_RECOVERY_WAIT)
799 oss << "recovery_wait+";
800 if (state & PG_STATE_RECOVERY_TOOFULL)
801 oss << "recovery_toofull+";
802 if (state & PG_STATE_RECOVERING)
803 oss << "recovering+";
804 if (state & PG_STATE_DOWN)
805 oss << "down+";
806 if (state & PG_STATE_UNDERSIZED)
807 oss << "undersized+";
808 if (state & PG_STATE_DEGRADED)
809 oss << "degraded+";
810 if (state & PG_STATE_REMAPPED)
811 oss << "remapped+";
812 if (state & PG_STATE_SCRUBBING)
813 oss << "scrubbing+";
814 if (state & PG_STATE_DEEP_SCRUB)
815 oss << "deep+";
816 if (state & PG_STATE_INCONSISTENT)
817 oss << "inconsistent+";
818 if (state & PG_STATE_PEERING)
819 oss << "peering+";
820 if (state & PG_STATE_REPAIR)
821 oss << "repair+";
822 if ((state & PG_STATE_BACKFILL_WAIT) &&
823 !(state &PG_STATE_BACKFILL))
824 oss << "backfill_wait+";
825 if (state & PG_STATE_BACKFILL)
826 oss << "backfilling+";
827 if (state & PG_STATE_BACKFILL_TOOFULL)
828 oss << "backfill_toofull+";
829 if (state & PG_STATE_INCOMPLETE)
830 oss << "incomplete+";
831 if (state & PG_STATE_PEERED)
832 oss << "peered+";
833 if (state & PG_STATE_SNAPTRIM)
834 oss << "snaptrim+";
835 if (state & PG_STATE_SNAPTRIM_WAIT)
836 oss << "snaptrim_wait+";
837 string ret(oss.str());
838 if (ret.length() > 0)
839 ret.resize(ret.length() - 1);
840 else
841 ret = "unknown";
842 return ret;
843 }
844
845 int pg_string_state(const std::string& state)
846 {
847 int type;
848 if (state == "active")
849 type = PG_STATE_ACTIVE;
850 else if (state == "clean")
851 type = PG_STATE_CLEAN;
852 else if (state == "down")
853 type = PG_STATE_DOWN;
854 else if (state == "scrubbing")
855 type = PG_STATE_SCRUBBING;
856 else if (state == "degraded")
857 type = PG_STATE_DEGRADED;
858 else if (state == "inconsistent")
859 type = PG_STATE_INCONSISTENT;
860 else if (state == "peering")
861 type = PG_STATE_PEERING;
862 else if (state == "repair")
863 type = PG_STATE_REPAIR;
864 else if (state == "recovering")
865 type = PG_STATE_RECOVERING;
866 else if (state == "backfill_wait")
867 type = PG_STATE_BACKFILL_WAIT;
868 else if (state == "incomplete")
869 type = PG_STATE_INCOMPLETE;
870 else if (state == "stale")
871 type = PG_STATE_STALE;
872 else if (state == "remapped")
873 type = PG_STATE_REMAPPED;
874 else if (state == "deep_scrub")
875 type = PG_STATE_DEEP_SCRUB;
876 else if (state == "backfill")
877 type = PG_STATE_BACKFILL;
878 else if (state == "backfill_toofull")
879 type = PG_STATE_BACKFILL_TOOFULL;
880 else if (state == "recovery_wait")
881 type = PG_STATE_RECOVERY_WAIT;
882 else if (state == "recovery_toofull")
883 type = PG_STATE_RECOVERY_TOOFULL;
884 else if (state == "undersized")
885 type = PG_STATE_UNDERSIZED;
886 else if (state == "activating")
887 type = PG_STATE_ACTIVATING;
888 else if (state == "peered")
889 type = PG_STATE_PEERED;
890 else if (state == "snaptrim")
891 type = PG_STATE_SNAPTRIM;
892 else if (state == "snaptrim_wait")
893 type = PG_STATE_SNAPTRIM_WAIT;
894 else
895 type = -1;
896 return type;
897 }
898
899 // -- eversion_t --
900 string eversion_t::get_key_name() const
901 {
902 char key[32];
903 // Below is equivalent of sprintf("%010u.%020llu");
904 key[31] = 0;
905 ritoa<uint64_t, 10, 20>(version, key + 31);
906 key[10] = '.';
907 ritoa<uint32_t, 10, 10>(epoch, key + 10);
908 return string(key);
909 }
910
911
912 // -- pool_snap_info_t --
913 void pool_snap_info_t::dump(Formatter *f) const
914 {
915 f->dump_unsigned("snapid", snapid);
916 f->dump_stream("stamp") << stamp;
917 f->dump_string("name", name);
918 }
919
920 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
921 {
922 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
923 __u8 struct_v = 1;
924 ::encode(struct_v, bl);
925 ::encode(snapid, bl);
926 ::encode(stamp, bl);
927 ::encode(name, bl);
928 return;
929 }
930 ENCODE_START(2, 2, bl);
931 ::encode(snapid, bl);
932 ::encode(stamp, bl);
933 ::encode(name, bl);
934 ENCODE_FINISH(bl);
935 }
936
937 void pool_snap_info_t::decode(bufferlist::iterator& bl)
938 {
939 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
940 ::decode(snapid, bl);
941 ::decode(stamp, bl);
942 ::decode(name, bl);
943 DECODE_FINISH(bl);
944 }
945
946 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
947 {
948 o.push_back(new pool_snap_info_t);
949 o.push_back(new pool_snap_info_t);
950 o.back()->snapid = 1;
951 o.back()->stamp = utime_t(1, 2);
952 o.back()->name = "foo";
953 }
954
955 // -- pool_opts_t --
956
957 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
958 static opt_mapping_t opt_mapping = boost::assign::map_list_of
959 ("scrub_min_interval", pool_opts_t::opt_desc_t(
960 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
961 ("scrub_max_interval", pool_opts_t::opt_desc_t(
962 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
963 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
964 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
965 ("recovery_priority", pool_opts_t::opt_desc_t(
966 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
967 ("recovery_op_priority", pool_opts_t::opt_desc_t(
968 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
969 ("scrub_priority", pool_opts_t::opt_desc_t(
970 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
971 ("compression_mode", pool_opts_t::opt_desc_t(
972 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
973 ("compression_algorithm", pool_opts_t::opt_desc_t(
974 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
975 ("compression_required_ratio", pool_opts_t::opt_desc_t(
976 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
977 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
978 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
979 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
980 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
981 ("csum_type", pool_opts_t::opt_desc_t(
982 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
983 ("csum_max_block", pool_opts_t::opt_desc_t(
984 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
985 ("csum_min_block", pool_opts_t::opt_desc_t(
986 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT));
987
988 bool pool_opts_t::is_opt_name(const std::string& name) {
989 return opt_mapping.find(name) != opt_mapping.end();
990 }
991
992 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
993 opt_mapping_t::iterator i = opt_mapping.find(name);
994 assert(i != opt_mapping.end());
995 return i->second;
996 }
997
998 bool pool_opts_t::is_set(pool_opts_t::key_t key) const {
999 return opts.find(key) != opts.end();
1000 }
1001
1002 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
1003 opts_t::const_iterator i = opts.find(key);
1004 assert(i != opts.end());
1005 return i->second;
1006 }
1007
1008 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1009 return opts.erase(key) > 0;
1010 }
1011
1012 class pool_opts_dumper_t : public boost::static_visitor<>
1013 {
1014 public:
1015 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1016 name(name_.c_str()), f(f_) {}
1017
1018 void operator()(std::string s) const {
1019 f->dump_string(name, s);
1020 }
1021 void operator()(int i) const {
1022 f->dump_int(name, i);
1023 }
1024 void operator()(double d) const {
1025 f->dump_float(name, d);
1026 }
1027
1028 private:
1029 const char* name;
1030 Formatter* f;
1031 };
1032
1033 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1034 {
1035 const opt_desc_t& desc = get_opt_desc(name);
1036 opts_t::const_iterator i = opts.find(desc.key);
1037 if (i == opts.end()) {
1038 return;
1039 }
1040 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1041 }
1042
1043 void pool_opts_t::dump(Formatter* f) const
1044 {
1045 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1046 ++i) {
1047 const std::string& name = i->first;
1048 const opt_desc_t& desc = i->second;
1049 opts_t::const_iterator j = opts.find(desc.key);
1050 if (j == opts.end()) {
1051 continue;
1052 }
1053 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1054 }
1055 }
1056
1057 class pool_opts_encoder_t : public boost::static_visitor<>
1058 {
1059 public:
1060 explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}
1061
1062 void operator()(std::string s) const {
1063 ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1064 ::encode(s, bl);
1065 }
1066 void operator()(int i) const {
1067 ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1068 ::encode(i, bl);
1069 }
1070 void operator()(double d) const {
1071 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1072 ::encode(d, bl);
1073 }
1074
1075 private:
1076 bufferlist& bl;
1077 };
1078
1079 void pool_opts_t::encode(bufferlist& bl) const {
1080 ENCODE_START(1, 1, bl);
1081 uint32_t n = static_cast<uint32_t>(opts.size());
1082 ::encode(n, bl);
1083 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1084 ::encode(static_cast<int32_t>(i->first), bl);
1085 boost::apply_visitor(pool_opts_encoder_t(bl), i->second);
1086 }
1087 ENCODE_FINISH(bl);
1088 }
1089
1090 void pool_opts_t::decode(bufferlist::iterator& bl) {
1091 DECODE_START(1, bl);
1092 __u32 n;
1093 ::decode(n, bl);
1094 opts.clear();
1095 while (n--) {
1096 int32_t k, t;
1097 ::decode(k, bl);
1098 ::decode(t, bl);
1099 if (t == STR) {
1100 std::string s;
1101 ::decode(s, bl);
1102 opts[static_cast<key_t>(k)] = s;
1103 } else if (t == INT) {
1104 int i;
1105 ::decode(i, bl);
1106 opts[static_cast<key_t>(k)] = i;
1107 } else if (t == DOUBLE) {
1108 double d;
1109 ::decode(d, bl);
1110 opts[static_cast<key_t>(k)] = d;
1111 } else {
1112 assert(!"invalid type");
1113 }
1114 }
1115 DECODE_FINISH(bl);
1116 }
1117
1118 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1119 {
1120 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1121 ++i) {
1122 const std::string& name = i->first;
1123 const pool_opts_t::opt_desc_t& desc = i->second;
1124 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1125 if (j == opts.opts.end()) {
1126 continue;
1127 }
1128 out << " " << name << " " << j->second;
1129 }
1130 return out;
1131 }
1132
1133 // -- pg_pool_t --
1134
1135 void pg_pool_t::dump(Formatter *f) const
1136 {
1137 f->dump_unsigned("flags", get_flags());
1138 f->dump_string("flags_names", get_flags_string());
1139 f->dump_int("type", get_type());
1140 f->dump_int("size", get_size());
1141 f->dump_int("min_size", get_min_size());
1142 f->dump_int("crush_rule", get_crush_rule());
1143 f->dump_int("object_hash", get_object_hash());
1144 f->dump_unsigned("pg_num", get_pg_num());
1145 f->dump_unsigned("pg_placement_num", get_pgp_num());
1146 f->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1147 f->dump_stream("last_change") << get_last_change();
1148 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1149 f->dump_stream("last_force_op_resend_preluminous")
1150 << get_last_force_op_resend_preluminous();
1151 f->dump_unsigned("auid", get_auid());
1152 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1153 f->dump_unsigned("snap_seq", get_snap_seq());
1154 f->dump_unsigned("snap_epoch", get_snap_epoch());
1155 f->open_array_section("pool_snaps");
1156 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1157 f->open_object_section("pool_snap_info");
1158 p->second.dump(f);
1159 f->close_section();
1160 }
1161 f->close_section();
1162 f->dump_stream("removed_snaps") << removed_snaps;
1163 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1164 f->dump_unsigned("quota_max_objects", quota_max_objects);
1165 f->open_array_section("tiers");
1166 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1167 f->dump_unsigned("pool_id", *p);
1168 f->close_section();
1169 f->dump_int("tier_of", tier_of);
1170 f->dump_int("read_tier", read_tier);
1171 f->dump_int("write_tier", write_tier);
1172 f->dump_string("cache_mode", get_cache_mode_name());
1173 f->dump_unsigned("target_max_bytes", target_max_bytes);
1174 f->dump_unsigned("target_max_objects", target_max_objects);
1175 f->dump_unsigned("cache_target_dirty_ratio_micro",
1176 cache_target_dirty_ratio_micro);
1177 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1178 cache_target_dirty_high_ratio_micro);
1179 f->dump_unsigned("cache_target_full_ratio_micro",
1180 cache_target_full_ratio_micro);
1181 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1182 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1183 f->dump_string("erasure_code_profile", erasure_code_profile);
1184 f->open_object_section("hit_set_params");
1185 hit_set_params.dump(f);
1186 f->close_section(); // hit_set_params
1187 f->dump_unsigned("hit_set_period", hit_set_period);
1188 f->dump_unsigned("hit_set_count", hit_set_count);
1189 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1190 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1191 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1192 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1193 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1194 f->open_array_section("grade_table");
1195 for (unsigned i = 0; i < hit_set_count; ++i)
1196 f->dump_unsigned("value", get_grade(i));
1197 f->close_section();
1198 f->dump_unsigned("stripe_width", get_stripe_width());
1199 f->dump_unsigned("expected_num_objects", expected_num_objects);
1200 f->dump_bool("fast_read", fast_read);
1201 f->open_object_section("options");
1202 opts.dump(f);
1203 f->close_section(); // options
1204 }
1205
1206 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1207 for (size_t i = 0; i < from.size(); ++i) {
1208 if (from[i] != CRUSH_ITEM_NONE) {
1209 to->insert(
1210 pg_shard_t(
1211 from[i],
1212 ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1213 }
1214 }
1215 }
1216
1217 void pg_pool_t::calc_pg_masks()
1218 {
1219 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1220 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1221 }
1222
1223 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1224 {
1225 if (pg_num == pg_num_mask + 1)
1226 return pg_num; // power-of-2 split
1227 unsigned mask = pg_num_mask >> 1;
1228 if ((pgid.ps() & mask) < (pg_num & mask))
1229 return pg_num_mask + 1; // smaller bin size (already split)
1230 else
1231 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1232 }
1233
1234 /*
1235 * we have two snap modes:
1236 * - pool global snaps
1237 * - snap existence/non-existence defined by snaps[] and snap_seq
1238 * - user managed snaps
1239 * - removal governed by removed_snaps
1240 *
1241 * we know which mode we're using based on whether removed_snaps is empty.
1242 */
1243 bool pg_pool_t::is_pool_snaps_mode() const
1244 {
1245 return removed_snaps.empty() && get_snap_seq() > 0;
1246 }
1247
1248 bool pg_pool_t::is_unmanaged_snaps_mode() const
1249 {
1250 return removed_snaps.size() && get_snap_seq() > 0;
1251 }
1252
1253 bool pg_pool_t::is_removed_snap(snapid_t s) const
1254 {
1255 if (is_pool_snaps_mode())
1256 return s <= get_snap_seq() && snaps.count(s) == 0;
1257 else
1258 return removed_snaps.contains(s);
1259 }
1260
1261 /*
1262 * build set of known-removed sets from either pool snaps or
1263 * explicit removed_snaps set.
1264 */
1265 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1266 {
1267 if (is_pool_snaps_mode()) {
1268 rs.clear();
1269 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1270 if (snaps.count(s) == 0)
1271 rs.insert(s);
1272 } else {
1273 rs = removed_snaps;
1274 }
1275 }
1276
1277 snapid_t pg_pool_t::snap_exists(const char *s) const
1278 {
1279 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1280 p != snaps.end();
1281 ++p)
1282 if (p->second.name == s)
1283 return p->second.snapid;
1284 return 0;
1285 }
1286
1287 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1288 {
1289 assert(!is_unmanaged_snaps_mode());
1290 snapid_t s = get_snap_seq() + 1;
1291 snap_seq = s;
1292 snaps[s].snapid = s;
1293 snaps[s].name = n;
1294 snaps[s].stamp = stamp;
1295 }
1296
1297 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1298 {
1299 if (removed_snaps.empty()) {
1300 assert(!is_pool_snaps_mode());
1301 removed_snaps.insert(snapid_t(1));
1302 snap_seq = 1;
1303 }
1304 snapid = snap_seq = snap_seq + 1;
1305 }
1306
1307 void pg_pool_t::remove_snap(snapid_t s)
1308 {
1309 assert(snaps.count(s));
1310 snaps.erase(s);
1311 snap_seq = snap_seq + 1;
1312 }
1313
1314 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1315 {
1316 assert(is_unmanaged_snaps_mode());
1317 removed_snaps.insert(s);
1318 snap_seq = snap_seq + 1;
1319 removed_snaps.insert(get_snap_seq());
1320 }
1321
1322 SnapContext pg_pool_t::get_snap_context() const
1323 {
1324 vector<snapid_t> s(snaps.size());
1325 unsigned i = 0;
1326 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1327 p != snaps.rend();
1328 ++p)
1329 s[i++] = p->first;
1330 return SnapContext(get_snap_seq(), s);
1331 }
1332
1333 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1334 {
1335 if (ns.empty())
1336 return ceph_str_hash(object_hash, key.data(), key.length());
1337 int nsl = ns.length();
1338 int len = key.length() + nsl + 1;
1339 char buf[len];
1340 memcpy(&buf[0], ns.data(), nsl);
1341 buf[nsl] = '\037';
1342 memcpy(&buf[nsl+1], key.data(), key.length());
1343 return ceph_str_hash(object_hash, &buf[0], len);
1344 }
1345
1346 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1347 {
1348 return ceph_stable_mod(v, pg_num, pg_num_mask);
1349 }
1350
1351 /*
1352 * map a raw pg (with full precision ps) into an actual pg, for storage
1353 */
1354 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1355 {
1356 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1357 return pg;
1358 }
1359
1360 /*
1361 * map raw pg (full precision ps) into a placement seed. include
1362 * pool id in that value so that different pools don't use the same
1363 * seeds.
1364 */
1365 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1366 {
1367 if (flags & FLAG_HASHPSPOOL) {
1368 // Hash the pool id so that pool PGs do not overlap.
1369 return
1370 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1371 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1372 pg.pool());
1373 } else {
1374 // Legacy behavior; add ps and pool together. This is not a great
1375 // idea because the PGs from each pool will essentially overlap on
1376 // top of each other: 0.5 == 1.4 == 2.3 == ...
1377 return
1378 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1379 pg.pool();
1380 }
1381 }
1382
1383 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1384 {
1385 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1386 if (pg_num == pg_num_mask + 1) {
1387 r &= ~pg_num_mask;
1388 } else {
1389 unsigned smaller_mask = pg_num_mask >> 1;
1390 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1391 r &= ~pg_num_mask;
1392 } else {
1393 r &= ~smaller_mask;
1394 }
1395 }
1396 r |= pg.ps();
1397 return r;
1398 }
1399
1400 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
1401 {
1402 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1403 // this encoding matches the old struct ceph_pg_pool
1404 __u8 struct_v = 2;
1405 ::encode(struct_v, bl);
1406 ::encode(type, bl);
1407 ::encode(size, bl);
1408 ::encode(crush_rule, bl);
1409 ::encode(object_hash, bl);
1410 ::encode(pg_num, bl);
1411 ::encode(pgp_num, bl);
1412 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1413 ::encode(lpg_num, bl);
1414 ::encode(lpgp_num, bl);
1415 ::encode(last_change, bl);
1416 ::encode(snap_seq, bl);
1417 ::encode(snap_epoch, bl);
1418
1419 __u32 n = snaps.size();
1420 ::encode(n, bl);
1421 n = removed_snaps.num_intervals();
1422 ::encode(n, bl);
1423
1424 ::encode(auid, bl);
1425
1426 ::encode_nohead(snaps, bl, features);
1427 ::encode_nohead(removed_snaps, bl);
1428 return;
1429 }
1430
1431 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1432 __u8 struct_v = 4;
1433 ::encode(struct_v, bl);
1434 ::encode(type, bl);
1435 ::encode(size, bl);
1436 ::encode(crush_rule, bl);
1437 ::encode(object_hash, bl);
1438 ::encode(pg_num, bl);
1439 ::encode(pgp_num, bl);
1440 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1441 ::encode(lpg_num, bl);
1442 ::encode(lpgp_num, bl);
1443 ::encode(last_change, bl);
1444 ::encode(snap_seq, bl);
1445 ::encode(snap_epoch, bl);
1446 ::encode(snaps, bl, features);
1447 ::encode(removed_snaps, bl);
1448 ::encode(auid, bl);
1449 ::encode(flags, bl);
1450 ::encode(crash_replay_interval, bl);
1451 return;
1452 }
1453
1454 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1455 // we simply added last_force_op_resend here, which is a fully
1456 // backward compatible change. however, encoding the same map
1457 // differently between monitors triggers scrub noise (even though
1458 // they are decodable without the feature), so let's be pendantic
1459 // about it.
1460 ENCODE_START(14, 5, bl);
1461 ::encode(type, bl);
1462 ::encode(size, bl);
1463 ::encode(crush_rule, bl);
1464 ::encode(object_hash, bl);
1465 ::encode(pg_num, bl);
1466 ::encode(pgp_num, bl);
1467 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1468 ::encode(lpg_num, bl);
1469 ::encode(lpgp_num, bl);
1470 ::encode(last_change, bl);
1471 ::encode(snap_seq, bl);
1472 ::encode(snap_epoch, bl);
1473 ::encode(snaps, bl, features);
1474 ::encode(removed_snaps, bl);
1475 ::encode(auid, bl);
1476 ::encode(flags, bl);
1477 ::encode(crash_replay_interval, bl);
1478 ::encode(min_size, bl);
1479 ::encode(quota_max_bytes, bl);
1480 ::encode(quota_max_objects, bl);
1481 ::encode(tiers, bl);
1482 ::encode(tier_of, bl);
1483 __u8 c = cache_mode;
1484 ::encode(c, bl);
1485 ::encode(read_tier, bl);
1486 ::encode(write_tier, bl);
1487 ::encode(properties, bl);
1488 ::encode(hit_set_params, bl);
1489 ::encode(hit_set_period, bl);
1490 ::encode(hit_set_count, bl);
1491 ::encode(stripe_width, bl);
1492 ::encode(target_max_bytes, bl);
1493 ::encode(target_max_objects, bl);
1494 ::encode(cache_target_dirty_ratio_micro, bl);
1495 ::encode(cache_target_full_ratio_micro, bl);
1496 ::encode(cache_min_flush_age, bl);
1497 ::encode(cache_min_evict_age, bl);
1498 ::encode(erasure_code_profile, bl);
1499 ENCODE_FINISH(bl);
1500 return;
1501 }
1502
1503 uint8_t v = 25;
1504 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1505 // this was the first post-hammer thing we added; if it's missing, encode
1506 // like hammer.
1507 v = 21;
1508 }
1509 if ((features &
1510 (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) !=
1511 (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) {
1512 v = 24;
1513 }
1514
1515 ENCODE_START(v, 5, bl);
1516 ::encode(type, bl);
1517 ::encode(size, bl);
1518 ::encode(crush_rule, bl);
1519 ::encode(object_hash, bl);
1520 ::encode(pg_num, bl);
1521 ::encode(pgp_num, bl);
1522 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1523 ::encode(lpg_num, bl);
1524 ::encode(lpgp_num, bl);
1525 ::encode(last_change, bl);
1526 ::encode(snap_seq, bl);
1527 ::encode(snap_epoch, bl);
1528 ::encode(snaps, bl, features);
1529 ::encode(removed_snaps, bl);
1530 ::encode(auid, bl);
1531 ::encode(flags, bl);
1532 ::encode(crash_replay_interval, bl);
1533 ::encode(min_size, bl);
1534 ::encode(quota_max_bytes, bl);
1535 ::encode(quota_max_objects, bl);
1536 ::encode(tiers, bl);
1537 ::encode(tier_of, bl);
1538 __u8 c = cache_mode;
1539 ::encode(c, bl);
1540 ::encode(read_tier, bl);
1541 ::encode(write_tier, bl);
1542 ::encode(properties, bl);
1543 ::encode(hit_set_params, bl);
1544 ::encode(hit_set_period, bl);
1545 ::encode(hit_set_count, bl);
1546 ::encode(stripe_width, bl);
1547 ::encode(target_max_bytes, bl);
1548 ::encode(target_max_objects, bl);
1549 ::encode(cache_target_dirty_ratio_micro, bl);
1550 ::encode(cache_target_full_ratio_micro, bl);
1551 ::encode(cache_min_flush_age, bl);
1552 ::encode(cache_min_evict_age, bl);
1553 ::encode(erasure_code_profile, bl);
1554 ::encode(last_force_op_resend_preluminous, bl);
1555 ::encode(min_read_recency_for_promote, bl);
1556 ::encode(expected_num_objects, bl);
1557 if (v >= 19) {
1558 ::encode(cache_target_dirty_high_ratio_micro, bl);
1559 }
1560 if (v >= 20) {
1561 ::encode(min_write_recency_for_promote, bl);
1562 }
1563 if (v >= 21) {
1564 ::encode(use_gmt_hitset, bl);
1565 }
1566 if (v >= 22) {
1567 ::encode(fast_read, bl);
1568 }
1569 if (v >= 23) {
1570 ::encode(hit_set_grade_decay_rate, bl);
1571 ::encode(hit_set_search_last_n, bl);
1572 }
1573 if (v >= 24) {
1574 ::encode(opts, bl);
1575 }
1576 if (v >= 25) {
1577 ::encode(last_force_op_resend, bl);
1578 }
1579 ENCODE_FINISH(bl);
1580 }
1581
1582 void pg_pool_t::decode(bufferlist::iterator& bl)
1583 {
1584 DECODE_START_LEGACY_COMPAT_LEN(25, 5, 5, bl);
1585 ::decode(type, bl);
1586 ::decode(size, bl);
1587 ::decode(crush_rule, bl);
1588 ::decode(object_hash, bl);
1589 ::decode(pg_num, bl);
1590 ::decode(pgp_num, bl);
1591 {
1592 __u32 lpg_num, lpgp_num;
1593 ::decode(lpg_num, bl);
1594 ::decode(lpgp_num, bl);
1595 }
1596 ::decode(last_change, bl);
1597 ::decode(snap_seq, bl);
1598 ::decode(snap_epoch, bl);
1599
1600 if (struct_v >= 3) {
1601 ::decode(snaps, bl);
1602 ::decode(removed_snaps, bl);
1603 ::decode(auid, bl);
1604 } else {
1605 __u32 n, m;
1606 ::decode(n, bl);
1607 ::decode(m, bl);
1608 ::decode(auid, bl);
1609 ::decode_nohead(n, snaps, bl);
1610 ::decode_nohead(m, removed_snaps, bl);
1611 }
1612
1613 if (struct_v >= 4) {
1614 ::decode(flags, bl);
1615 ::decode(crash_replay_interval, bl);
1616 } else {
1617 flags = 0;
1618
1619 // if this looks like the 'data' pool, set the
1620 // crash_replay_interval appropriately. unfortunately, we can't
1621 // be precise here. this should be good enough to preserve replay
1622 // on the data pool for the majority of cluster upgrades, though.
1623 if (crush_rule == 0 && auid == 0)
1624 crash_replay_interval = 60;
1625 else
1626 crash_replay_interval = 0;
1627 }
1628 if (struct_v >= 7) {
1629 ::decode(min_size, bl);
1630 } else {
1631 min_size = size - size/2;
1632 }
1633 if (struct_v >= 8) {
1634 ::decode(quota_max_bytes, bl);
1635 ::decode(quota_max_objects, bl);
1636 }
1637 if (struct_v >= 9) {
1638 ::decode(tiers, bl);
1639 ::decode(tier_of, bl);
1640 __u8 v;
1641 ::decode(v, bl);
1642 cache_mode = (cache_mode_t)v;
1643 ::decode(read_tier, bl);
1644 ::decode(write_tier, bl);
1645 }
1646 if (struct_v >= 10) {
1647 ::decode(properties, bl);
1648 }
1649 if (struct_v >= 11) {
1650 ::decode(hit_set_params, bl);
1651 ::decode(hit_set_period, bl);
1652 ::decode(hit_set_count, bl);
1653 } else {
1654 pg_pool_t def;
1655 hit_set_period = def.hit_set_period;
1656 hit_set_count = def.hit_set_count;
1657 }
1658 if (struct_v >= 12) {
1659 ::decode(stripe_width, bl);
1660 } else {
1661 set_stripe_width(0);
1662 }
1663 if (struct_v >= 13) {
1664 ::decode(target_max_bytes, bl);
1665 ::decode(target_max_objects, bl);
1666 ::decode(cache_target_dirty_ratio_micro, bl);
1667 ::decode(cache_target_full_ratio_micro, bl);
1668 ::decode(cache_min_flush_age, bl);
1669 ::decode(cache_min_evict_age, bl);
1670 } else {
1671 target_max_bytes = 0;
1672 target_max_objects = 0;
1673 cache_target_dirty_ratio_micro = 0;
1674 cache_target_full_ratio_micro = 0;
1675 cache_min_flush_age = 0;
1676 cache_min_evict_age = 0;
1677 }
1678 if (struct_v >= 14) {
1679 ::decode(erasure_code_profile, bl);
1680 }
1681 if (struct_v >= 15) {
1682 ::decode(last_force_op_resend_preluminous, bl);
1683 } else {
1684 last_force_op_resend_preluminous = 0;
1685 }
1686 if (struct_v >= 16) {
1687 ::decode(min_read_recency_for_promote, bl);
1688 } else {
1689 min_read_recency_for_promote = 1;
1690 }
1691 if (struct_v >= 17) {
1692 ::decode(expected_num_objects, bl);
1693 } else {
1694 expected_num_objects = 0;
1695 }
1696 if (struct_v >= 19) {
1697 ::decode(cache_target_dirty_high_ratio_micro, bl);
1698 } else {
1699 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
1700 }
1701 if (struct_v >= 20) {
1702 ::decode(min_write_recency_for_promote, bl);
1703 } else {
1704 min_write_recency_for_promote = 1;
1705 }
1706 if (struct_v >= 21) {
1707 ::decode(use_gmt_hitset, bl);
1708 } else {
1709 use_gmt_hitset = false;
1710 }
1711 if (struct_v >= 22) {
1712 ::decode(fast_read, bl);
1713 } else {
1714 fast_read = false;
1715 }
1716 if (struct_v >= 23) {
1717 ::decode(hit_set_grade_decay_rate, bl);
1718 ::decode(hit_set_search_last_n, bl);
1719 } else {
1720 hit_set_grade_decay_rate = 0;
1721 hit_set_search_last_n = 1;
1722 }
1723 if (struct_v >= 24) {
1724 ::decode(opts, bl);
1725 }
1726 if (struct_v >= 25) {
1727 ::decode(last_force_op_resend, bl);
1728 } else {
1729 last_force_op_resend = last_force_op_resend_preluminous;
1730 }
1731 DECODE_FINISH(bl);
1732 calc_pg_masks();
1733 calc_grade_table();
1734 }
1735
1736 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
1737 {
1738 pg_pool_t a;
1739 o.push_back(new pg_pool_t(a));
1740
1741 a.type = TYPE_REPLICATED;
1742 a.size = 2;
1743 a.crush_rule = 3;
1744 a.object_hash = 4;
1745 a.pg_num = 6;
1746 a.pgp_num = 5;
1747 a.last_change = 9;
1748 a.last_force_op_resend = 123823;
1749 a.last_force_op_resend_preluminous = 123824;
1750 a.snap_seq = 10;
1751 a.snap_epoch = 11;
1752 a.auid = 12;
1753 a.crash_replay_interval = 13;
1754 a.quota_max_bytes = 473;
1755 a.quota_max_objects = 474;
1756 o.push_back(new pg_pool_t(a));
1757
1758 a.snaps[3].name = "asdf";
1759 a.snaps[3].snapid = 3;
1760 a.snaps[3].stamp = utime_t(123, 4);
1761 a.snaps[6].name = "qwer";
1762 a.snaps[6].snapid = 6;
1763 a.snaps[6].stamp = utime_t(23423, 4);
1764 o.push_back(new pg_pool_t(a));
1765
1766 a.removed_snaps.insert(2); // not quite valid to combine with snaps!
1767 a.quota_max_bytes = 2473;
1768 a.quota_max_objects = 4374;
1769 a.tiers.insert(0);
1770 a.tiers.insert(1);
1771 a.tier_of = 2;
1772 a.cache_mode = CACHEMODE_WRITEBACK;
1773 a.read_tier = 1;
1774 a.write_tier = 1;
1775 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
1776 a.hit_set_period = 3600;
1777 a.hit_set_count = 8;
1778 a.min_read_recency_for_promote = 1;
1779 a.min_write_recency_for_promote = 1;
1780 a.hit_set_grade_decay_rate = 50;
1781 a.hit_set_search_last_n = 1;
1782 a.calc_grade_table();
1783 a.set_stripe_width(12345);
1784 a.target_max_bytes = 1238132132;
1785 a.target_max_objects = 1232132;
1786 a.cache_target_dirty_ratio_micro = 187232;
1787 a.cache_target_dirty_high_ratio_micro = 309856;
1788 a.cache_target_full_ratio_micro = 987222;
1789 a.cache_min_flush_age = 231;
1790 a.cache_min_evict_age = 2321;
1791 a.erasure_code_profile = "profile in osdmap";
1792 a.expected_num_objects = 123456;
1793 a.fast_read = false;
1794 o.push_back(new pg_pool_t(a));
1795 }
1796
1797 ostream& operator<<(ostream& out, const pg_pool_t& p)
1798 {
1799 out << p.get_type_name()
1800 << " size " << p.get_size()
1801 << " min_size " << p.get_min_size()
1802 << " crush_rule " << p.get_crush_rule()
1803 << " object_hash " << p.get_object_hash_name()
1804 << " pg_num " << p.get_pg_num()
1805 << " pgp_num " << p.get_pgp_num()
1806 << " last_change " << p.get_last_change();
1807 if (p.get_last_force_op_resend() ||
1808 p.get_last_force_op_resend_preluminous())
1809 out << " lfor " << p.get_last_force_op_resend() << "/"
1810 << p.get_last_force_op_resend_preluminous();
1811 if (p.get_auid())
1812 out << " owner " << p.get_auid();
1813 if (p.flags)
1814 out << " flags " << p.get_flags_string();
1815 if (p.crash_replay_interval)
1816 out << " crash_replay_interval " << p.crash_replay_interval;
1817 if (p.quota_max_bytes)
1818 out << " max_bytes " << p.quota_max_bytes;
1819 if (p.quota_max_objects)
1820 out << " max_objects " << p.quota_max_objects;
1821 if (!p.tiers.empty())
1822 out << " tiers " << p.tiers;
1823 if (p.is_tier())
1824 out << " tier_of " << p.tier_of;
1825 if (p.has_read_tier())
1826 out << " read_tier " << p.read_tier;
1827 if (p.has_write_tier())
1828 out << " write_tier " << p.write_tier;
1829 if (p.cache_mode)
1830 out << " cache_mode " << p.get_cache_mode_name();
1831 if (p.target_max_bytes)
1832 out << " target_bytes " << p.target_max_bytes;
1833 if (p.target_max_objects)
1834 out << " target_objects " << p.target_max_objects;
1835 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
1836 out << " hit_set " << p.hit_set_params
1837 << " " << p.hit_set_period << "s"
1838 << " x" << p.hit_set_count << " decay_rate "
1839 << p.hit_set_grade_decay_rate
1840 << " search_last_n " << p.hit_set_search_last_n;
1841 }
1842 if (p.min_read_recency_for_promote)
1843 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
1844 if (p.min_write_recency_for_promote)
1845 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
1846 out << " stripe_width " << p.get_stripe_width();
1847 if (p.expected_num_objects)
1848 out << " expected_num_objects " << p.expected_num_objects;
1849 if (p.fast_read)
1850 out << " fast_read " << p.fast_read;
1851 out << p.opts;
1852 return out;
1853 }
1854
1855
1856 // -- object_stat_sum_t --
1857
1858 void object_stat_sum_t::dump(Formatter *f) const
1859 {
1860 f->dump_int("num_bytes", num_bytes);
1861 f->dump_int("num_objects", num_objects);
1862 f->dump_int("num_object_clones", num_object_clones);
1863 f->dump_int("num_object_copies", num_object_copies);
1864 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
1865 f->dump_int("num_objects_missing", num_objects_missing);
1866 f->dump_int("num_objects_degraded", num_objects_degraded);
1867 f->dump_int("num_objects_misplaced", num_objects_misplaced);
1868 f->dump_int("num_objects_unfound", num_objects_unfound);
1869 f->dump_int("num_objects_dirty", num_objects_dirty);
1870 f->dump_int("num_whiteouts", num_whiteouts);
1871 f->dump_int("num_read", num_rd);
1872 f->dump_int("num_read_kb", num_rd_kb);
1873 f->dump_int("num_write", num_wr);
1874 f->dump_int("num_write_kb", num_wr_kb);
1875 f->dump_int("num_scrub_errors", num_scrub_errors);
1876 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
1877 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
1878 f->dump_int("num_objects_recovered", num_objects_recovered);
1879 f->dump_int("num_bytes_recovered", num_bytes_recovered);
1880 f->dump_int("num_keys_recovered", num_keys_recovered);
1881 f->dump_int("num_objects_omap", num_objects_omap);
1882 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
1883 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
1884 f->dump_int("num_flush", num_flush);
1885 f->dump_int("num_flush_kb", num_flush_kb);
1886 f->dump_int("num_evict", num_evict);
1887 f->dump_int("num_evict_kb", num_evict_kb);
1888 f->dump_int("num_promote", num_promote);
1889 f->dump_int("num_flush_mode_high", num_flush_mode_high);
1890 f->dump_int("num_flush_mode_low", num_flush_mode_low);
1891 f->dump_int("num_evict_mode_some", num_evict_mode_some);
1892 f->dump_int("num_evict_mode_full", num_evict_mode_full);
1893 f->dump_int("num_objects_pinned", num_objects_pinned);
1894 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
1895 }
1896
1897 void object_stat_sum_t::encode(bufferlist& bl) const
1898 {
1899 ENCODE_START(16, 14, bl);
1900 #if defined(CEPH_LITTLE_ENDIAN)
1901 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
1902 #else
1903 ::encode(num_bytes, bl);
1904 ::encode(num_objects, bl);
1905 ::encode(num_object_clones, bl);
1906 ::encode(num_object_copies, bl);
1907 ::encode(num_objects_missing_on_primary, bl);
1908 ::encode(num_objects_degraded, bl);
1909 ::encode(num_objects_unfound, bl);
1910 ::encode(num_rd, bl);
1911 ::encode(num_rd_kb, bl);
1912 ::encode(num_wr, bl);
1913 ::encode(num_wr_kb, bl);
1914 ::encode(num_scrub_errors, bl);
1915 ::encode(num_objects_recovered, bl);
1916 ::encode(num_bytes_recovered, bl);
1917 ::encode(num_keys_recovered, bl);
1918 ::encode(num_shallow_scrub_errors, bl);
1919 ::encode(num_deep_scrub_errors, bl);
1920 ::encode(num_objects_dirty, bl);
1921 ::encode(num_whiteouts, bl);
1922 ::encode(num_objects_omap, bl);
1923 ::encode(num_objects_hit_set_archive, bl);
1924 ::encode(num_objects_misplaced, bl);
1925 ::encode(num_bytes_hit_set_archive, bl);
1926 ::encode(num_flush, bl);
1927 ::encode(num_flush_kb, bl);
1928 ::encode(num_evict, bl);
1929 ::encode(num_evict_kb, bl);
1930 ::encode(num_promote, bl);
1931 ::encode(num_flush_mode_high, bl);
1932 ::encode(num_flush_mode_low, bl);
1933 ::encode(num_evict_mode_some, bl);
1934 ::encode(num_evict_mode_full, bl);
1935 ::encode(num_objects_pinned, bl);
1936 ::encode(num_objects_missing, bl);
1937 ::encode(num_legacy_snapsets, bl);
1938 #endif
1939 ENCODE_FINISH(bl);
1940 }
1941
1942 void object_stat_sum_t::decode(bufferlist::iterator& bl)
1943 {
1944 bool decode_finish = false;
1945 DECODE_START(16, bl);
1946 #if defined(CEPH_LITTLE_ENDIAN)
1947 if (struct_v >= 16) {
1948 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
1949 decode_finish = true;
1950 }
1951 #endif
1952 if (!decode_finish) {
1953 ::decode(num_bytes, bl);
1954 ::decode(num_objects, bl);
1955 ::decode(num_object_clones, bl);
1956 ::decode(num_object_copies, bl);
1957 ::decode(num_objects_missing_on_primary, bl);
1958 ::decode(num_objects_degraded, bl);
1959 ::decode(num_objects_unfound, bl);
1960 ::decode(num_rd, bl);
1961 ::decode(num_rd_kb, bl);
1962 ::decode(num_wr, bl);
1963 ::decode(num_wr_kb, bl);
1964 ::decode(num_scrub_errors, bl);
1965 ::decode(num_objects_recovered, bl);
1966 ::decode(num_bytes_recovered, bl);
1967 ::decode(num_keys_recovered, bl);
1968 ::decode(num_shallow_scrub_errors, bl);
1969 ::decode(num_deep_scrub_errors, bl);
1970 ::decode(num_objects_dirty, bl);
1971 ::decode(num_whiteouts, bl);
1972 ::decode(num_objects_omap, bl);
1973 ::decode(num_objects_hit_set_archive, bl);
1974 ::decode(num_objects_misplaced, bl);
1975 ::decode(num_bytes_hit_set_archive, bl);
1976 ::decode(num_flush, bl);
1977 ::decode(num_flush_kb, bl);
1978 ::decode(num_evict, bl);
1979 ::decode(num_evict_kb, bl);
1980 ::decode(num_promote, bl);
1981 ::decode(num_flush_mode_high, bl);
1982 ::decode(num_flush_mode_low, bl);
1983 ::decode(num_evict_mode_some, bl);
1984 ::decode(num_evict_mode_full, bl);
1985 ::decode(num_objects_pinned, bl);
1986 ::decode(num_objects_missing, bl);
1987 if (struct_v >= 16) {
1988 ::decode(num_legacy_snapsets, bl);
1989 } else {
1990 num_legacy_snapsets = num_object_clones; // upper bound
1991 }
1992 }
1993 DECODE_FINISH(bl);
1994 }
1995
1996 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
1997 {
1998 object_stat_sum_t a;
1999
2000 a.num_bytes = 1;
2001 a.num_objects = 3;
2002 a.num_object_clones = 4;
2003 a.num_object_copies = 5;
2004 a.num_objects_missing_on_primary = 6;
2005 a.num_objects_missing = 123;
2006 a.num_objects_degraded = 7;
2007 a.num_objects_unfound = 8;
2008 a.num_rd = 9; a.num_rd_kb = 10;
2009 a.num_wr = 11; a.num_wr_kb = 12;
2010 a.num_objects_recovered = 14;
2011 a.num_bytes_recovered = 15;
2012 a.num_keys_recovered = 16;
2013 a.num_deep_scrub_errors = 17;
2014 a.num_shallow_scrub_errors = 18;
2015 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2016 a.num_objects_dirty = 21;
2017 a.num_whiteouts = 22;
2018 a.num_objects_misplaced = 1232;
2019 a.num_objects_hit_set_archive = 2;
2020 a.num_bytes_hit_set_archive = 27;
2021 a.num_flush = 5;
2022 a.num_flush_kb = 6;
2023 a.num_evict = 7;
2024 a.num_evict_kb = 8;
2025 a.num_promote = 9;
2026 a.num_flush_mode_high = 0;
2027 a.num_flush_mode_low = 1;
2028 a.num_evict_mode_some = 1;
2029 a.num_evict_mode_full = 0;
2030 a.num_objects_pinned = 20;
2031 o.push_back(new object_stat_sum_t(a));
2032 }
2033
2034 void object_stat_sum_t::add(const object_stat_sum_t& o)
2035 {
2036 num_bytes += o.num_bytes;
2037 num_objects += o.num_objects;
2038 num_object_clones += o.num_object_clones;
2039 num_object_copies += o.num_object_copies;
2040 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2041 num_objects_missing += o.num_objects_missing;
2042 num_objects_degraded += o.num_objects_degraded;
2043 num_objects_misplaced += o.num_objects_misplaced;
2044 num_rd += o.num_rd;
2045 num_rd_kb += o.num_rd_kb;
2046 num_wr += o.num_wr;
2047 num_wr_kb += o.num_wr_kb;
2048 num_objects_unfound += o.num_objects_unfound;
2049 num_scrub_errors += o.num_scrub_errors;
2050 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2051 num_deep_scrub_errors += o.num_deep_scrub_errors;
2052 num_objects_recovered += o.num_objects_recovered;
2053 num_bytes_recovered += o.num_bytes_recovered;
2054 num_keys_recovered += o.num_keys_recovered;
2055 num_objects_dirty += o.num_objects_dirty;
2056 num_whiteouts += o.num_whiteouts;
2057 num_objects_omap += o.num_objects_omap;
2058 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2059 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2060 num_flush += o.num_flush;
2061 num_flush_kb += o.num_flush_kb;
2062 num_evict += o.num_evict;
2063 num_evict_kb += o.num_evict_kb;
2064 num_promote += o.num_promote;
2065 num_flush_mode_high += o.num_flush_mode_high;
2066 num_flush_mode_low += o.num_flush_mode_low;
2067 num_evict_mode_some += o.num_evict_mode_some;
2068 num_evict_mode_full += o.num_evict_mode_full;
2069 num_objects_pinned += o.num_objects_pinned;
2070 num_legacy_snapsets += o.num_legacy_snapsets;
2071 }
2072
2073 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2074 {
2075 num_bytes -= o.num_bytes;
2076 num_objects -= o.num_objects;
2077 num_object_clones -= o.num_object_clones;
2078 num_object_copies -= o.num_object_copies;
2079 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2080 num_objects_missing -= o.num_objects_missing;
2081 num_objects_degraded -= o.num_objects_degraded;
2082 num_objects_misplaced -= o.num_objects_misplaced;
2083 num_rd -= o.num_rd;
2084 num_rd_kb -= o.num_rd_kb;
2085 num_wr -= o.num_wr;
2086 num_wr_kb -= o.num_wr_kb;
2087 num_objects_unfound -= o.num_objects_unfound;
2088 num_scrub_errors -= o.num_scrub_errors;
2089 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2090 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2091 num_objects_recovered -= o.num_objects_recovered;
2092 num_bytes_recovered -= o.num_bytes_recovered;
2093 num_keys_recovered -= o.num_keys_recovered;
2094 num_objects_dirty -= o.num_objects_dirty;
2095 num_whiteouts -= o.num_whiteouts;
2096 num_objects_omap -= o.num_objects_omap;
2097 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2098 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2099 num_flush -= o.num_flush;
2100 num_flush_kb -= o.num_flush_kb;
2101 num_evict -= o.num_evict;
2102 num_evict_kb -= o.num_evict_kb;
2103 num_promote -= o.num_promote;
2104 num_flush_mode_high -= o.num_flush_mode_high;
2105 num_flush_mode_low -= o.num_flush_mode_low;
2106 num_evict_mode_some -= o.num_evict_mode_some;
2107 num_evict_mode_full -= o.num_evict_mode_full;
2108 num_objects_pinned -= o.num_objects_pinned;
2109 num_legacy_snapsets -= o.num_legacy_snapsets;
2110 }
2111
2112 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2113 {
2114 return
2115 l.num_bytes == r.num_bytes &&
2116 l.num_objects == r.num_objects &&
2117 l.num_object_clones == r.num_object_clones &&
2118 l.num_object_copies == r.num_object_copies &&
2119 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2120 l.num_objects_missing == r.num_objects_missing &&
2121 l.num_objects_degraded == r.num_objects_degraded &&
2122 l.num_objects_misplaced == r.num_objects_misplaced &&
2123 l.num_objects_unfound == r.num_objects_unfound &&
2124 l.num_rd == r.num_rd &&
2125 l.num_rd_kb == r.num_rd_kb &&
2126 l.num_wr == r.num_wr &&
2127 l.num_wr_kb == r.num_wr_kb &&
2128 l.num_scrub_errors == r.num_scrub_errors &&
2129 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2130 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2131 l.num_objects_recovered == r.num_objects_recovered &&
2132 l.num_bytes_recovered == r.num_bytes_recovered &&
2133 l.num_keys_recovered == r.num_keys_recovered &&
2134 l.num_objects_dirty == r.num_objects_dirty &&
2135 l.num_whiteouts == r.num_whiteouts &&
2136 l.num_objects_omap == r.num_objects_omap &&
2137 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2138 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2139 l.num_flush == r.num_flush &&
2140 l.num_flush_kb == r.num_flush_kb &&
2141 l.num_evict == r.num_evict &&
2142 l.num_evict_kb == r.num_evict_kb &&
2143 l.num_promote == r.num_promote &&
2144 l.num_flush_mode_high == r.num_flush_mode_high &&
2145 l.num_flush_mode_low == r.num_flush_mode_low &&
2146 l.num_evict_mode_some == r.num_evict_mode_some &&
2147 l.num_evict_mode_full == r.num_evict_mode_full &&
2148 l.num_objects_pinned == r.num_objects_pinned &&
2149 l.num_legacy_snapsets == r.num_legacy_snapsets;
2150 }
2151
2152 // -- object_stat_collection_t --
2153
2154 void object_stat_collection_t::dump(Formatter *f) const
2155 {
2156 f->open_object_section("stat_sum");
2157 sum.dump(f);
2158 f->close_section();
2159 }
2160
2161 void object_stat_collection_t::encode(bufferlist& bl) const
2162 {
2163 ENCODE_START(2, 2, bl);
2164 ::encode(sum, bl);
2165 ::encode((__u32)0, bl);
2166 ENCODE_FINISH(bl);
2167 }
2168
2169 void object_stat_collection_t::decode(bufferlist::iterator& bl)
2170 {
2171 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2172 ::decode(sum, bl);
2173 {
2174 map<string,object_stat_sum_t> cat_sum;
2175 ::decode(cat_sum, bl);
2176 }
2177 DECODE_FINISH(bl);
2178 }
2179
2180 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2181 {
2182 object_stat_collection_t a;
2183 o.push_back(new object_stat_collection_t(a));
2184 list<object_stat_sum_t*> l;
2185 object_stat_sum_t::generate_test_instances(l);
2186 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2187 a.add(**p);
2188 o.push_back(new object_stat_collection_t(a));
2189 }
2190 }
2191
2192
2193 // -- pg_stat_t --
2194
2195 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2196 {
2197 if (primary && osd == acting_primary) {
2198 return true;
2199 } else if (!primary) {
2200 for(vector<int32_t>::const_iterator it = acting.begin();
2201 it != acting.end(); ++it)
2202 {
2203 if (*it == osd)
2204 return true;
2205 }
2206 }
2207 return false;
2208 }
2209
2210 void pg_stat_t::dump(Formatter *f) const
2211 {
2212 f->dump_stream("version") << version;
2213 f->dump_stream("reported_seq") << reported_seq;
2214 f->dump_stream("reported_epoch") << reported_epoch;
2215 f->dump_string("state", pg_state_string(state));
2216 f->dump_stream("last_fresh") << last_fresh;
2217 f->dump_stream("last_change") << last_change;
2218 f->dump_stream("last_active") << last_active;
2219 f->dump_stream("last_peered") << last_peered;
2220 f->dump_stream("last_clean") << last_clean;
2221 f->dump_stream("last_became_active") << last_became_active;
2222 f->dump_stream("last_became_peered") << last_became_peered;
2223 f->dump_stream("last_unstale") << last_unstale;
2224 f->dump_stream("last_undegraded") << last_undegraded;
2225 f->dump_stream("last_fullsized") << last_fullsized;
2226 f->dump_unsigned("mapping_epoch", mapping_epoch);
2227 f->dump_stream("log_start") << log_start;
2228 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2229 f->dump_unsigned("created", created);
2230 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2231 f->dump_stream("parent") << parent;
2232 f->dump_unsigned("parent_split_bits", parent_split_bits);
2233 f->dump_stream("last_scrub") << last_scrub;
2234 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2235 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2236 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2237 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2238 f->dump_int("log_size", log_size);
2239 f->dump_int("ondisk_log_size", ondisk_log_size);
2240 f->dump_bool("stats_invalid", stats_invalid);
2241 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2242 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2243 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2244 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2245 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2246 stats.dump(f);
2247 f->open_array_section("up");
2248 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2249 f->dump_int("osd", *p);
2250 f->close_section();
2251 f->open_array_section("acting");
2252 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2253 f->dump_int("osd", *p);
2254 f->close_section();
2255 f->open_array_section("blocked_by");
2256 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2257 p != blocked_by.end(); ++p)
2258 f->dump_int("osd", *p);
2259 f->close_section();
2260 f->dump_int("up_primary", up_primary);
2261 f->dump_int("acting_primary", acting_primary);
2262 }
2263
2264 void pg_stat_t::dump_brief(Formatter *f) const
2265 {
2266 f->dump_string("state", pg_state_string(state));
2267 f->open_array_section("up");
2268 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2269 f->dump_int("osd", *p);
2270 f->close_section();
2271 f->open_array_section("acting");
2272 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2273 f->dump_int("osd", *p);
2274 f->close_section();
2275 f->dump_int("up_primary", up_primary);
2276 f->dump_int("acting_primary", acting_primary);
2277 }
2278
2279 void pg_stat_t::encode(bufferlist &bl) const
2280 {
2281 ENCODE_START(22, 22, bl);
2282 ::encode(version, bl);
2283 ::encode(reported_seq, bl);
2284 ::encode(reported_epoch, bl);
2285 ::encode(state, bl);
2286 ::encode(log_start, bl);
2287 ::encode(ondisk_log_start, bl);
2288 ::encode(created, bl);
2289 ::encode(last_epoch_clean, bl);
2290 ::encode(parent, bl);
2291 ::encode(parent_split_bits, bl);
2292 ::encode(last_scrub, bl);
2293 ::encode(last_scrub_stamp, bl);
2294 ::encode(stats, bl);
2295 ::encode(log_size, bl);
2296 ::encode(ondisk_log_size, bl);
2297 ::encode(up, bl);
2298 ::encode(acting, bl);
2299 ::encode(last_fresh, bl);
2300 ::encode(last_change, bl);
2301 ::encode(last_active, bl);
2302 ::encode(last_clean, bl);
2303 ::encode(last_unstale, bl);
2304 ::encode(mapping_epoch, bl);
2305 ::encode(last_deep_scrub, bl);
2306 ::encode(last_deep_scrub_stamp, bl);
2307 ::encode(stats_invalid, bl);
2308 ::encode(last_clean_scrub_stamp, bl);
2309 ::encode(last_became_active, bl);
2310 ::encode(dirty_stats_invalid, bl);
2311 ::encode(up_primary, bl);
2312 ::encode(acting_primary, bl);
2313 ::encode(omap_stats_invalid, bl);
2314 ::encode(hitset_stats_invalid, bl);
2315 ::encode(blocked_by, bl);
2316 ::encode(last_undegraded, bl);
2317 ::encode(last_fullsized, bl);
2318 ::encode(hitset_bytes_stats_invalid, bl);
2319 ::encode(last_peered, bl);
2320 ::encode(last_became_peered, bl);
2321 ::encode(pin_stats_invalid, bl);
2322 ENCODE_FINISH(bl);
2323 }
2324
2325 void pg_stat_t::decode(bufferlist::iterator &bl)
2326 {
2327 bool tmp;
2328 DECODE_START(22, bl);
2329 ::decode(version, bl);
2330 ::decode(reported_seq, bl);
2331 ::decode(reported_epoch, bl);
2332 ::decode(state, bl);
2333 ::decode(log_start, bl);
2334 ::decode(ondisk_log_start, bl);
2335 ::decode(created, bl);
2336 ::decode(last_epoch_clean, bl);
2337 ::decode(parent, bl);
2338 ::decode(parent_split_bits, bl);
2339 ::decode(last_scrub, bl);
2340 ::decode(last_scrub_stamp, bl);
2341 ::decode(stats, bl);
2342 ::decode(log_size, bl);
2343 ::decode(ondisk_log_size, bl);
2344 ::decode(up, bl);
2345 ::decode(acting, bl);
2346 ::decode(last_fresh, bl);
2347 ::decode(last_change, bl);
2348 ::decode(last_active, bl);
2349 ::decode(last_clean, bl);
2350 ::decode(last_unstale, bl);
2351 ::decode(mapping_epoch, bl);
2352 ::decode(last_deep_scrub, bl);
2353 ::decode(last_deep_scrub_stamp, bl);
2354 ::decode(tmp, bl);
2355 stats_invalid = tmp;
2356 ::decode(last_clean_scrub_stamp, bl);
2357 ::decode(last_became_active, bl);
2358 ::decode(tmp, bl);
2359 dirty_stats_invalid = tmp;
2360 ::decode(up_primary, bl);
2361 ::decode(acting_primary, bl);
2362 ::decode(tmp, bl);
2363 omap_stats_invalid = tmp;
2364 ::decode(tmp, bl);
2365 hitset_stats_invalid = tmp;
2366 ::decode(blocked_by, bl);
2367 ::decode(last_undegraded, bl);
2368 ::decode(last_fullsized, bl);
2369 ::decode(tmp, bl);
2370 hitset_bytes_stats_invalid = tmp;
2371 ::decode(last_peered, bl);
2372 ::decode(last_became_peered, bl);
2373 ::decode(tmp, bl);
2374 pin_stats_invalid = tmp;
2375 DECODE_FINISH(bl);
2376 }
2377
2378 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2379 {
2380 pg_stat_t a;
2381 o.push_back(new pg_stat_t(a));
2382
2383 a.version = eversion_t(1, 3);
2384 a.reported_epoch = 1;
2385 a.reported_seq = 2;
2386 a.state = 123;
2387 a.mapping_epoch = 998;
2388 a.last_fresh = utime_t(1002, 1);
2389 a.last_change = utime_t(1002, 2);
2390 a.last_active = utime_t(1002, 3);
2391 a.last_clean = utime_t(1002, 4);
2392 a.last_unstale = utime_t(1002, 5);
2393 a.last_undegraded = utime_t(1002, 7);
2394 a.last_fullsized = utime_t(1002, 8);
2395 a.log_start = eversion_t(1, 4);
2396 a.ondisk_log_start = eversion_t(1, 5);
2397 a.created = 6;
2398 a.last_epoch_clean = 7;
2399 a.parent = pg_t(1, 2, 3);
2400 a.parent_split_bits = 12;
2401 a.last_scrub = eversion_t(9, 10);
2402 a.last_scrub_stamp = utime_t(11, 12);
2403 a.last_deep_scrub = eversion_t(13, 14);
2404 a.last_deep_scrub_stamp = utime_t(15, 16);
2405 a.last_clean_scrub_stamp = utime_t(17, 18);
2406 list<object_stat_collection_t*> l;
2407 object_stat_collection_t::generate_test_instances(l);
2408 a.stats = *l.back();
2409 a.log_size = 99;
2410 a.ondisk_log_size = 88;
2411 a.up.push_back(123);
2412 a.up_primary = 123;
2413 a.acting.push_back(456);
2414 a.acting_primary = 456;
2415 o.push_back(new pg_stat_t(a));
2416
2417 a.up.push_back(124);
2418 a.up_primary = 124;
2419 a.acting.push_back(124);
2420 a.acting_primary = 124;
2421 a.blocked_by.push_back(155);
2422 a.blocked_by.push_back(156);
2423 o.push_back(new pg_stat_t(a));
2424 }
2425
2426 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
2427 {
2428 return
2429 l.version == r.version &&
2430 l.reported_seq == r.reported_seq &&
2431 l.reported_epoch == r.reported_epoch &&
2432 l.state == r.state &&
2433 l.last_fresh == r.last_fresh &&
2434 l.last_change == r.last_change &&
2435 l.last_active == r.last_active &&
2436 l.last_peered == r.last_peered &&
2437 l.last_clean == r.last_clean &&
2438 l.last_unstale == r.last_unstale &&
2439 l.last_undegraded == r.last_undegraded &&
2440 l.last_fullsized == r.last_fullsized &&
2441 l.log_start == r.log_start &&
2442 l.ondisk_log_start == r.ondisk_log_start &&
2443 l.created == r.created &&
2444 l.last_epoch_clean == r.last_epoch_clean &&
2445 l.parent == r.parent &&
2446 l.parent_split_bits == r.parent_split_bits &&
2447 l.last_scrub == r.last_scrub &&
2448 l.last_deep_scrub == r.last_deep_scrub &&
2449 l.last_scrub_stamp == r.last_scrub_stamp &&
2450 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2451 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2452 l.stats == r.stats &&
2453 l.stats_invalid == r.stats_invalid &&
2454 l.log_size == r.log_size &&
2455 l.ondisk_log_size == r.ondisk_log_size &&
2456 l.up == r.up &&
2457 l.acting == r.acting &&
2458 l.mapping_epoch == r.mapping_epoch &&
2459 l.blocked_by == r.blocked_by &&
2460 l.last_became_active == r.last_became_active &&
2461 l.last_became_peered == r.last_became_peered &&
2462 l.dirty_stats_invalid == r.dirty_stats_invalid &&
2463 l.omap_stats_invalid == r.omap_stats_invalid &&
2464 l.hitset_stats_invalid == r.hitset_stats_invalid &&
2465 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
2466 l.up_primary == r.up_primary &&
2467 l.acting_primary == r.acting_primary &&
2468 l.pin_stats_invalid == r.pin_stats_invalid;
2469 }
2470
2471 // -- pool_stat_t --
2472
2473 void pool_stat_t::dump(Formatter *f) const
2474 {
2475 stats.dump(f);
2476 f->dump_int("log_size", log_size);
2477 f->dump_int("ondisk_log_size", ondisk_log_size);
2478 f->dump_int("up", up);
2479 f->dump_int("acting", acting);
2480 }
2481
2482 void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
2483 {
2484 if ((features & CEPH_FEATURE_OSDENC) == 0) {
2485 __u8 v = 4;
2486 ::encode(v, bl);
2487 ::encode(stats, bl);
2488 ::encode(log_size, bl);
2489 ::encode(ondisk_log_size, bl);
2490 return;
2491 }
2492
2493 ENCODE_START(6, 5, bl);
2494 ::encode(stats, bl);
2495 ::encode(log_size, bl);
2496 ::encode(ondisk_log_size, bl);
2497 ::encode(up, bl);
2498 ::encode(acting, bl);
2499 ENCODE_FINISH(bl);
2500 }
2501
2502 void pool_stat_t::decode(bufferlist::iterator &bl)
2503 {
2504 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2505 if (struct_v >= 4) {
2506 ::decode(stats, bl);
2507 ::decode(log_size, bl);
2508 ::decode(ondisk_log_size, bl);
2509 if (struct_v >= 6) {
2510 ::decode(up, bl);
2511 ::decode(acting, bl);
2512 } else {
2513 up = 0;
2514 acting = 0;
2515 }
2516 } else {
2517 ::decode(stats.sum.num_bytes, bl);
2518 uint64_t num_kb;
2519 ::decode(num_kb, bl);
2520 ::decode(stats.sum.num_objects, bl);
2521 ::decode(stats.sum.num_object_clones, bl);
2522 ::decode(stats.sum.num_object_copies, bl);
2523 ::decode(stats.sum.num_objects_missing_on_primary, bl);
2524 ::decode(stats.sum.num_objects_degraded, bl);
2525 ::decode(log_size, bl);
2526 ::decode(ondisk_log_size, bl);
2527 if (struct_v >= 2) {
2528 ::decode(stats.sum.num_rd, bl);
2529 ::decode(stats.sum.num_rd_kb, bl);
2530 ::decode(stats.sum.num_wr, bl);
2531 ::decode(stats.sum.num_wr_kb, bl);
2532 }
2533 if (struct_v >= 3) {
2534 ::decode(stats.sum.num_objects_unfound, bl);
2535 }
2536 }
2537 DECODE_FINISH(bl);
2538 }
2539
2540 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
2541 {
2542 pool_stat_t a;
2543 o.push_back(new pool_stat_t(a));
2544
2545 list<object_stat_collection_t*> l;
2546 object_stat_collection_t::generate_test_instances(l);
2547 a.stats = *l.back();
2548 a.log_size = 123;
2549 a.ondisk_log_size = 456;
2550 a.acting = 3;
2551 a.up = 4;
2552 o.push_back(new pool_stat_t(a));
2553 }
2554
2555
2556 // -- pg_history_t --
2557
2558 void pg_history_t::encode(bufferlist &bl) const
2559 {
2560 ENCODE_START(9, 4, bl);
2561 ::encode(epoch_created, bl);
2562 ::encode(last_epoch_started, bl);
2563 ::encode(last_epoch_clean, bl);
2564 ::encode(last_epoch_split, bl);
2565 ::encode(same_interval_since, bl);
2566 ::encode(same_up_since, bl);
2567 ::encode(same_primary_since, bl);
2568 ::encode(last_scrub, bl);
2569 ::encode(last_scrub_stamp, bl);
2570 ::encode(last_deep_scrub, bl);
2571 ::encode(last_deep_scrub_stamp, bl);
2572 ::encode(last_clean_scrub_stamp, bl);
2573 ::encode(last_epoch_marked_full, bl);
2574 ::encode(last_interval_started, bl);
2575 ::encode(last_interval_clean, bl);
2576 ::encode(epoch_pool_created, bl);
2577 ENCODE_FINISH(bl);
2578 }
2579
2580 void pg_history_t::decode(bufferlist::iterator &bl)
2581 {
2582 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
2583 ::decode(epoch_created, bl);
2584 ::decode(last_epoch_started, bl);
2585 if (struct_v >= 3)
2586 ::decode(last_epoch_clean, bl);
2587 else
2588 last_epoch_clean = last_epoch_started; // careful, it's a lie!
2589 ::decode(last_epoch_split, bl);
2590 ::decode(same_interval_since, bl);
2591 ::decode(same_up_since, bl);
2592 ::decode(same_primary_since, bl);
2593 if (struct_v >= 2) {
2594 ::decode(last_scrub, bl);
2595 ::decode(last_scrub_stamp, bl);
2596 }
2597 if (struct_v >= 5) {
2598 ::decode(last_deep_scrub, bl);
2599 ::decode(last_deep_scrub_stamp, bl);
2600 }
2601 if (struct_v >= 6) {
2602 ::decode(last_clean_scrub_stamp, bl);
2603 }
2604 if (struct_v >= 7) {
2605 ::decode(last_epoch_marked_full, bl);
2606 }
2607 if (struct_v >= 8) {
2608 ::decode(last_interval_started, bl);
2609 ::decode(last_interval_clean, bl);
2610 } else {
2611 if (last_epoch_started >= same_interval_since) {
2612 last_interval_started = same_interval_since;
2613 } else {
2614 last_interval_started = last_epoch_started; // best guess
2615 }
2616 if (last_epoch_clean >= same_interval_since) {
2617 last_interval_clean = same_interval_since;
2618 } else {
2619 last_interval_clean = last_epoch_clean; // best guess
2620 }
2621 }
2622 if (struct_v >= 9) {
2623 ::decode(epoch_pool_created, bl);
2624 } else {
2625 epoch_pool_created = epoch_created;
2626 }
2627 DECODE_FINISH(bl);
2628 }
2629
2630 void pg_history_t::dump(Formatter *f) const
2631 {
2632 f->dump_int("epoch_created", epoch_created);
2633 f->dump_int("epoch_pool_created", epoch_pool_created);
2634 f->dump_int("last_epoch_started", last_epoch_started);
2635 f->dump_int("last_interval_started", last_interval_started);
2636 f->dump_int("last_epoch_clean", last_epoch_clean);
2637 f->dump_int("last_interval_clean", last_interval_clean);
2638 f->dump_int("last_epoch_split", last_epoch_split);
2639 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
2640 f->dump_int("same_up_since", same_up_since);
2641 f->dump_int("same_interval_since", same_interval_since);
2642 f->dump_int("same_primary_since", same_primary_since);
2643 f->dump_stream("last_scrub") << last_scrub;
2644 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2645 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2646 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2647 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2648 }
2649
2650 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
2651 {
2652 o.push_back(new pg_history_t);
2653 o.push_back(new pg_history_t);
2654 o.back()->epoch_created = 1;
2655 o.back()->epoch_pool_created = 1;
2656 o.back()->last_epoch_started = 2;
2657 o.back()->last_interval_started = 2;
2658 o.back()->last_epoch_clean = 3;
2659 o.back()->last_interval_clean = 2;
2660 o.back()->last_epoch_split = 4;
2661 o.back()->same_up_since = 5;
2662 o.back()->same_interval_since = 6;
2663 o.back()->same_primary_since = 7;
2664 o.back()->last_scrub = eversion_t(8, 9);
2665 o.back()->last_scrub_stamp = utime_t(10, 11);
2666 o.back()->last_deep_scrub = eversion_t(12, 13);
2667 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
2668 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
2669 o.back()->last_epoch_marked_full = 18;
2670 }
2671
2672
2673 // -- pg_info_t --
2674
2675 void pg_info_t::encode(bufferlist &bl) const
2676 {
2677 ENCODE_START(32, 26, bl);
2678 ::encode(pgid.pgid, bl);
2679 ::encode(last_update, bl);
2680 ::encode(last_complete, bl);
2681 ::encode(log_tail, bl);
2682 if (last_backfill_bitwise && !last_backfill.is_max()) {
2683 ::encode(hobject_t(), bl);
2684 } else {
2685 ::encode(last_backfill, bl);
2686 }
2687 ::encode(stats, bl);
2688 history.encode(bl);
2689 ::encode(purged_snaps, bl);
2690 ::encode(last_epoch_started, bl);
2691 ::encode(last_user_version, bl);
2692 ::encode(hit_set, bl);
2693 ::encode(pgid.shard, bl);
2694 ::encode(last_backfill, bl);
2695 ::encode(last_backfill_bitwise, bl);
2696 ::encode(last_interval_started, bl);
2697 ENCODE_FINISH(bl);
2698 }
2699
2700 void pg_info_t::decode(bufferlist::iterator &bl)
2701 {
2702 DECODE_START(32, bl);
2703 ::decode(pgid.pgid, bl);
2704 ::decode(last_update, bl);
2705 ::decode(last_complete, bl);
2706 ::decode(log_tail, bl);
2707 {
2708 hobject_t old_last_backfill;
2709 ::decode(old_last_backfill, bl);
2710 }
2711 ::decode(stats, bl);
2712 history.decode(bl);
2713 ::decode(purged_snaps, bl);
2714 ::decode(last_epoch_started, bl);
2715 ::decode(last_user_version, bl);
2716 ::decode(hit_set, bl);
2717 ::decode(pgid.shard, bl);
2718 ::decode(last_backfill, bl);
2719 ::decode(last_backfill_bitwise, bl);
2720 if (struct_v >= 32) {
2721 ::decode(last_interval_started, bl);
2722 } else {
2723 last_interval_started = last_epoch_started;
2724 }
2725 DECODE_FINISH(bl);
2726 }
2727
2728 // -- pg_info_t --
2729
2730 void pg_info_t::dump(Formatter *f) const
2731 {
2732 f->dump_stream("pgid") << pgid;
2733 f->dump_stream("last_update") << last_update;
2734 f->dump_stream("last_complete") << last_complete;
2735 f->dump_stream("log_tail") << log_tail;
2736 f->dump_int("last_user_version", last_user_version);
2737 f->dump_stream("last_backfill") << last_backfill;
2738 f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
2739 f->open_array_section("purged_snaps");
2740 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
2741 i != purged_snaps.end();
2742 ++i) {
2743 f->open_object_section("purged_snap_interval");
2744 f->dump_stream("start") << i.get_start();
2745 f->dump_stream("length") << i.get_len();
2746 f->close_section();
2747 }
2748 f->close_section();
2749 f->open_object_section("history");
2750 history.dump(f);
2751 f->close_section();
2752 f->open_object_section("stats");
2753 stats.dump(f);
2754 f->close_section();
2755
2756 f->dump_int("empty", is_empty());
2757 f->dump_int("dne", dne());
2758 f->dump_int("incomplete", is_incomplete());
2759 f->dump_int("last_epoch_started", last_epoch_started);
2760
2761 f->open_object_section("hit_set_history");
2762 hit_set.dump(f);
2763 f->close_section();
2764 }
2765
2766 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
2767 {
2768 o.push_back(new pg_info_t);
2769 o.push_back(new pg_info_t);
2770 list<pg_history_t*> h;
2771 pg_history_t::generate_test_instances(h);
2772 o.back()->history = *h.back();
2773 o.back()->pgid = spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD);
2774 o.back()->last_update = eversion_t(3, 4);
2775 o.back()->last_complete = eversion_t(5, 6);
2776 o.back()->last_user_version = 2;
2777 o.back()->log_tail = eversion_t(7, 8);
2778 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2779 o.back()->last_backfill_bitwise = true;
2780 {
2781 list<pg_stat_t*> s;
2782 pg_stat_t::generate_test_instances(s);
2783 o.back()->stats = *s.back();
2784 }
2785 {
2786 list<pg_hit_set_history_t*> s;
2787 pg_hit_set_history_t::generate_test_instances(s);
2788 o.back()->hit_set = *s.back();
2789 }
2790 }
2791
2792 // -- pg_notify_t --
2793 void pg_notify_t::encode(bufferlist &bl) const
2794 {
2795 ENCODE_START(2, 2, bl);
2796 ::encode(query_epoch, bl);
2797 ::encode(epoch_sent, bl);
2798 ::encode(info, bl);
2799 ::encode(to, bl);
2800 ::encode(from, bl);
2801 ENCODE_FINISH(bl);
2802 }
2803
2804 void pg_notify_t::decode(bufferlist::iterator &bl)
2805 {
2806 DECODE_START(2, bl);
2807 ::decode(query_epoch, bl);
2808 ::decode(epoch_sent, bl);
2809 ::decode(info, bl);
2810 ::decode(to, bl);
2811 ::decode(from, bl);
2812 DECODE_FINISH(bl);
2813 }
2814
2815 void pg_notify_t::dump(Formatter *f) const
2816 {
2817 f->dump_int("from", from);
2818 f->dump_int("to", to);
2819 f->dump_unsigned("query_epoch", query_epoch);
2820 f->dump_unsigned("epoch_sent", epoch_sent);
2821 {
2822 f->open_object_section("info");
2823 info.dump(f);
2824 f->close_section();
2825 }
2826 }
2827
2828 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
2829 {
2830 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
2831 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
2832 }
2833
2834 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
2835 {
2836 lhs << "(query:" << notify.query_epoch
2837 << " sent:" << notify.epoch_sent
2838 << " " << notify.info;
2839 if (notify.from != shard_id_t::NO_SHARD ||
2840 notify.to != shard_id_t::NO_SHARD)
2841 lhs << " " << (unsigned)notify.from
2842 << "->" << (unsigned)notify.to;
2843 return lhs << ")";
2844 }
2845
2846 // -- pg_interval_t --
2847
2848 void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
2849 {
2850 ENCODE_START(4, 2, bl);
2851 ::encode(first, bl);
2852 ::encode(last, bl);
2853 ::encode(up, bl);
2854 ::encode(acting, bl);
2855 ::encode(maybe_went_rw, bl);
2856 ::encode(primary, bl);
2857 ::encode(up_primary, bl);
2858 ENCODE_FINISH(bl);
2859 }
2860
2861 void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
2862 {
2863 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
2864 ::decode(first, bl);
2865 ::decode(last, bl);
2866 ::decode(up, bl);
2867 ::decode(acting, bl);
2868 ::decode(maybe_went_rw, bl);
2869 if (struct_v >= 3) {
2870 ::decode(primary, bl);
2871 } else {
2872 if (acting.size())
2873 primary = acting[0];
2874 }
2875 if (struct_v >= 4) {
2876 ::decode(up_primary, bl);
2877 } else {
2878 if (up.size())
2879 up_primary = up[0];
2880 }
2881 DECODE_FINISH(bl);
2882 }
2883
2884 void PastIntervals::pg_interval_t::dump(Formatter *f) const
2885 {
2886 f->dump_unsigned("first", first);
2887 f->dump_unsigned("last", last);
2888 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
2889 f->open_array_section("up");
2890 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
2891 f->dump_int("osd", *p);
2892 f->close_section();
2893 f->open_array_section("acting");
2894 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2895 f->dump_int("osd", *p);
2896 f->close_section();
2897 f->dump_int("primary", primary);
2898 f->dump_int("up_primary", up_primary);
2899 }
2900
2901 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
2902 {
2903 o.push_back(new pg_interval_t);
2904 o.push_back(new pg_interval_t);
2905 o.back()->up.push_back(1);
2906 o.back()->acting.push_back(2);
2907 o.back()->acting.push_back(3);
2908 o.back()->first = 4;
2909 o.back()->last = 5;
2910 o.back()->maybe_went_rw = true;
2911 }
2912
2913 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
2914
2915 class pi_simple_rep : public PastIntervals::interval_rep {
2916 map<epoch_t, PastIntervals::pg_interval_t> interval_map;
2917
2918 pi_simple_rep(
2919 bool ec_pool,
2920 std::list<PastIntervals::pg_interval_t> &&intervals) {
2921 for (auto &&i: intervals)
2922 add_interval(ec_pool, i);
2923 }
2924
2925 public:
2926 pi_simple_rep() = default;
2927 pi_simple_rep(const pi_simple_rep &) = default;
2928 pi_simple_rep(pi_simple_rep &&) = default;
2929 pi_simple_rep &operator=(pi_simple_rep &&) = default;
2930 pi_simple_rep &operator=(const pi_simple_rep &) = default;
2931
2932 size_t size() const override { return interval_map.size(); }
2933 bool empty() const override { return interval_map.empty(); }
2934 void clear() override { interval_map.clear(); }
2935 pair<epoch_t, epoch_t> get_bounds() const override {
2936 auto iter = interval_map.begin();
2937 if (iter != interval_map.end()) {
2938 auto riter = interval_map.rbegin();
2939 return make_pair(
2940 iter->second.first,
2941 riter->second.last + 1);
2942 } else {
2943 return make_pair(0, 0);
2944 }
2945 }
2946 set<pg_shard_t> get_all_participants(
2947 bool ec_pool) const override {
2948 set<pg_shard_t> all_participants;
2949
2950 // We need to decide who might have unfound objects that we need
2951 auto p = interval_map.rbegin();
2952 auto end = interval_map.rend();
2953 for (; p != end; ++p) {
2954 const PastIntervals::pg_interval_t &interval(p->second);
2955 // If nothing changed, we don't care about this interval.
2956 if (!interval.maybe_went_rw)
2957 continue;
2958
2959 int i = 0;
2960 std::vector<int>::const_iterator a = interval.acting.begin();
2961 std::vector<int>::const_iterator a_end = interval.acting.end();
2962 for (; a != a_end; ++a, ++i) {
2963 pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
2964 if (*a != CRUSH_ITEM_NONE)
2965 all_participants.insert(shard);
2966 }
2967 }
2968 return all_participants;
2969 }
2970 void add_interval(
2971 bool ec_pool,
2972 const PastIntervals::pg_interval_t &interval) override {
2973 interval_map[interval.first] = interval;
2974 }
2975 unique_ptr<PastIntervals::interval_rep> clone() const override {
2976 return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
2977 }
2978 ostream &print(ostream &out) const override {
2979 return out << interval_map;
2980 }
2981 void encode(bufferlist &bl) const override {
2982 ::encode(interval_map, bl);
2983 }
2984 void decode(bufferlist::iterator &bl) override {
2985 ::decode(interval_map, bl);
2986 }
2987 void dump(Formatter *f) const override {
2988 f->open_array_section("PastIntervals::compat_rep");
2989 for (auto &&i: interval_map) {
2990 f->open_object_section("pg_interval_t");
2991 f->dump_int("epoch", i.first);
2992 f->open_object_section("interval");
2993 i.second.dump(f);
2994 f->close_section();
2995 f->close_section();
2996 }
2997 f->close_section();
2998 }
2999 bool is_classic() const override {
3000 return true;
3001 }
3002 static void generate_test_instances(list<pi_simple_rep*> &o) {
3003 using ival = PastIntervals::pg_interval_t;
3004 using ivallst = std::list<ival>;
3005 o.push_back(
3006 new pi_simple_rep(
3007 true, ivallst
3008 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3009 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3010 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3011 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3012 }));
3013 o.push_back(
3014 new pi_simple_rep(
3015 false, ivallst
3016 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3017 , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3018 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3019 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3020 }));
3021 o.push_back(
3022 new pi_simple_rep(
3023 true, ivallst
3024 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3025 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3026 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3027 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3028 }));
3029 return;
3030 }
3031 void iterate_mayberw_back_to(
3032 bool ec_pool,
3033 epoch_t les,
3034 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3035 for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
3036 if (!i->second.maybe_went_rw)
3037 continue;
3038 if (i->second.last < les)
3039 break;
3040 set<pg_shard_t> actingset;
3041 for (unsigned j = 0; j < i->second.acting.size(); ++j) {
3042 if (i->second.acting[j] == CRUSH_ITEM_NONE)
3043 continue;
3044 actingset.insert(
3045 pg_shard_t(
3046 i->second.acting[j],
3047 ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
3048 }
3049 f(i->second.first, actingset);
3050 }
3051 }
3052
3053 bool has_full_intervals() const override { return true; }
3054 void iterate_all_intervals(
3055 std::function<void(const PastIntervals::pg_interval_t &)> &&f
3056 ) const override {
3057 for (auto &&i: interval_map) {
3058 f(i.second);
3059 }
3060 }
3061 virtual ~pi_simple_rep() override {}
3062 };
3063
3064 /**
3065 * pi_compact_rep
3066 *
3067 * PastIntervals only needs to be able to answer two questions:
3068 * 1) Where should the primary look for unfound objects?
3069 * 2) List a set of subsets of the OSDs such that contacting at least
3070 * one from each subset guarrantees we speak to at least one witness
3071 * of any completed write.
3072 *
3073 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3074 * we don't need to keep any where maybe_went_rw would be false. We also
3075 * needn't keep two intervals where the actingset in one is a subset
3076 * of the other (only need to keep the smaller of the two sets). In order
3077 * to accurately trim the set of intervals as last_epoch_started changes
3078 * without rebuilding the set from scratch, we'll retain the larger set
3079 * if it is in an older interval.
3080 */
3081 struct compact_interval_t {
3082 epoch_t first;
3083 epoch_t last;
3084 set<pg_shard_t> acting;
3085 bool supersedes(const compact_interval_t &other) {
3086 for (auto &&i: acting) {
3087 if (!other.acting.count(i))
3088 return false;
3089 }
3090 return true;
3091 }
3092 void dump(Formatter *f) const {
3093 f->open_object_section("compact_interval_t");
3094 f->dump_stream("first") << first;
3095 f->dump_stream("last") << last;
3096 f->dump_stream("acting") << acting;
3097 f->close_section();
3098 }
3099 void encode(bufferlist &bl) const {
3100 ENCODE_START(1, 1, bl);
3101 ::encode(first, bl);
3102 ::encode(last, bl);
3103 ::encode(acting, bl);
3104 ENCODE_FINISH(bl);
3105 }
3106 void decode(bufferlist::iterator &bl) {
3107 DECODE_START(1, bl);
3108 ::decode(first, bl);
3109 ::decode(last, bl);
3110 ::decode(acting, bl);
3111 DECODE_FINISH(bl);
3112 }
3113 static void generate_test_instances(list<compact_interval_t*> & o) {
3114 /* Not going to be used, we'll generate pi_compact_rep directly */
3115 }
3116 };
3117 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3118 {
3119 return o << "([" << rhs.first << "," << rhs.last
3120 << "] acting " << rhs.acting << ")";
3121 }
3122 WRITE_CLASS_ENCODER(compact_interval_t)
3123
3124 class pi_compact_rep : public PastIntervals::interval_rep {
3125 epoch_t first = 0;
3126 epoch_t last = 0; // inclusive
3127 set<pg_shard_t> all_participants;
3128 list<compact_interval_t> intervals;
3129 pi_compact_rep(
3130 bool ec_pool,
3131 std::list<PastIntervals::pg_interval_t> &&intervals) {
3132 for (auto &&i: intervals)
3133 add_interval(ec_pool, i);
3134 }
3135 public:
3136 pi_compact_rep() = default;
3137 pi_compact_rep(const pi_compact_rep &) = default;
3138 pi_compact_rep(pi_compact_rep &&) = default;
3139 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3140 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3141
3142 size_t size() const override { return intervals.size(); }
3143 bool empty() const override {
3144 return first > last || (first == 0 && last == 0);
3145 }
3146 void clear() override {
3147 *this = pi_compact_rep();
3148 }
3149 pair<epoch_t, epoch_t> get_bounds() const override {
3150 return make_pair(first, last + 1);
3151 }
3152 set<pg_shard_t> get_all_participants(
3153 bool ec_pool) const override {
3154 return all_participants;
3155 }
3156 void add_interval(
3157 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3158 if (first == 0)
3159 first = interval.first;
3160 assert(interval.last > last);
3161 last = interval.last;
3162 set<pg_shard_t> acting;
3163 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3164 if (interval.acting[i] == CRUSH_ITEM_NONE)
3165 continue;
3166 acting.insert(
3167 pg_shard_t(
3168 interval.acting[i],
3169 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3170 }
3171 all_participants.insert(acting.begin(), acting.end());
3172 if (!interval.maybe_went_rw)
3173 return;
3174 intervals.push_back(
3175 compact_interval_t{interval.first, interval.last, acting});
3176 auto plast = intervals.end();
3177 --plast;
3178 for (auto cur = intervals.begin(); cur != plast; ) {
3179 if (plast->supersedes(*cur)) {
3180 intervals.erase(cur++);
3181 } else {
3182 ++cur;
3183 }
3184 }
3185 }
3186 unique_ptr<PastIntervals::interval_rep> clone() const override {
3187 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3188 }
3189 ostream &print(ostream &out) const override {
3190 return out << "([" << first << "," << last
3191 << "] intervals=" << intervals << ")";
3192 }
3193 void encode(bufferlist &bl) const override {
3194 ENCODE_START(1, 1, bl);
3195 ::encode(first, bl);
3196 ::encode(last, bl);
3197 ::encode(all_participants, bl);
3198 ::encode(intervals, bl);
3199 ENCODE_FINISH(bl);
3200 }
3201 void decode(bufferlist::iterator &bl) override {
3202 DECODE_START(1, bl);
3203 ::decode(first, bl);
3204 ::decode(last, bl);
3205 ::decode(all_participants, bl);
3206 ::decode(intervals, bl);
3207 DECODE_FINISH(bl);
3208 }
3209 void dump(Formatter *f) const override {
3210 f->open_object_section("PastIntervals::compact_rep");
3211 f->dump_stream("first") << first;
3212 f->dump_stream("last") << last;
3213 f->open_array_section("all_participants");
3214 for (auto& i : all_participants) {
3215 f->dump_object("pg_shard", i);
3216 }
3217 f->close_section();
3218 f->open_array_section("intervals");
3219 for (auto &&i: intervals) {
3220 i.dump(f);
3221 }
3222 f->close_section();
3223 f->close_section();
3224 }
3225 bool is_classic() const override {
3226 return false;
3227 }
3228 static void generate_test_instances(list<pi_compact_rep*> &o) {
3229 using ival = PastIntervals::pg_interval_t;
3230 using ivallst = std::list<ival>;
3231 o.push_back(
3232 new pi_compact_rep(
3233 true, ivallst
3234 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3235 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3236 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3237 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3238 }));
3239 o.push_back(
3240 new pi_compact_rep(
3241 false, ivallst
3242 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3243 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3244 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3245 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3246 }));
3247 o.push_back(
3248 new pi_compact_rep(
3249 true, ivallst
3250 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3251 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3252 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3253 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3254 }));
3255 }
3256 void iterate_mayberw_back_to(
3257 bool ec_pool,
3258 epoch_t les,
3259 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3260 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3261 if (i->last < les)
3262 break;
3263 f(i->first, i->acting);
3264 }
3265 }
3266 virtual ~pi_compact_rep() override {}
3267 };
3268 WRITE_CLASS_ENCODER(pi_compact_rep)
3269
3270 PastIntervals::PastIntervals(const PastIntervals &rhs)
3271 : past_intervals(rhs.past_intervals ?
3272 rhs.past_intervals->clone() :
3273 nullptr) {}
3274
3275 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3276 {
3277 PastIntervals other(rhs);
3278 swap(other);
3279 return *this;
3280 }
3281
3282 ostream& operator<<(ostream& out, const PastIntervals &i)
3283 {
3284 if (i.past_intervals) {
3285 return i.past_intervals->print(out);
3286 } else {
3287 return out << "(empty)";
3288 }
3289 }
3290
3291 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3292 {
3293 return out << "PriorSet("
3294 << "ec_pool: " << i.ec_pool
3295 << ", probe: " << i.probe
3296 << ", down: " << i.down
3297 << ", blocked_by: " << i.blocked_by
3298 << ", pg_down: " << i.pg_down
3299 << ")";
3300 }
3301
3302 void PastIntervals::decode(bufferlist::iterator &bl)
3303 {
3304 DECODE_START(1, bl);
3305 __u8 type = 0;
3306 ::decode(type, bl);
3307 switch (type) {
3308 case 0:
3309 break;
3310 case 1:
3311 past_intervals.reset(new pi_simple_rep);
3312 past_intervals->decode(bl);
3313 break;
3314 case 2:
3315 past_intervals.reset(new pi_compact_rep);
3316 past_intervals->decode(bl);
3317 break;
3318 }
3319 DECODE_FINISH(bl);
3320 }
3321
3322 void PastIntervals::decode_classic(bufferlist::iterator &bl)
3323 {
3324 past_intervals.reset(new pi_simple_rep);
3325 past_intervals->decode(bl);
3326 }
3327
3328 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3329 {
3330 {
3331 list<pi_simple_rep *> simple;
3332 pi_simple_rep::generate_test_instances(simple);
3333 for (auto &&i: simple) {
3334 // takes ownership of contents
3335 o.push_back(new PastIntervals(i));
3336 }
3337 }
3338 {
3339 list<pi_compact_rep *> compact;
3340 pi_compact_rep::generate_test_instances(compact);
3341 for (auto &&i: compact) {
3342 // takes ownership of contents
3343 o.push_back(new PastIntervals(i));
3344 }
3345 }
3346 return;
3347 }
3348
3349 void PastIntervals::update_type(bool ec_pool, bool compact)
3350 {
3351 if (!compact) {
3352 if (!past_intervals) {
3353 past_intervals.reset(new pi_simple_rep);
3354 } else {
3355 // we never convert from compact back to classic
3356 assert(is_classic());
3357 }
3358 } else {
3359 if (!past_intervals) {
3360 past_intervals.reset(new pi_compact_rep);
3361 } else if (is_classic()) {
3362 auto old = std::move(past_intervals);
3363 past_intervals.reset(new pi_compact_rep);
3364 assert(old->has_full_intervals());
3365 old->iterate_all_intervals([&](const pg_interval_t &i) {
3366 past_intervals->add_interval(ec_pool, i);
3367 });
3368 }
3369 }
3370 }
3371
3372 void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
3373 {
3374 update_type(ec_pool, osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
3375 }
3376
3377 bool PastIntervals::is_new_interval(
3378 int old_acting_primary,
3379 int new_acting_primary,
3380 const vector<int> &old_acting,
3381 const vector<int> &new_acting,
3382 int old_up_primary,
3383 int new_up_primary,
3384 const vector<int> &old_up,
3385 const vector<int> &new_up,
3386 int old_size,
3387 int new_size,
3388 int old_min_size,
3389 int new_min_size,
3390 unsigned old_pg_num,
3391 unsigned new_pg_num,
3392 bool old_sort_bitwise,
3393 bool new_sort_bitwise,
3394 pg_t pgid) {
3395 return old_acting_primary != new_acting_primary ||
3396 new_acting != old_acting ||
3397 old_up_primary != new_up_primary ||
3398 new_up != old_up ||
3399 old_min_size != new_min_size ||
3400 old_size != new_size ||
3401 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3402 old_sort_bitwise != new_sort_bitwise;
3403 }
3404
3405 bool PastIntervals::is_new_interval(
3406 int old_acting_primary,
3407 int new_acting_primary,
3408 const vector<int> &old_acting,
3409 const vector<int> &new_acting,
3410 int old_up_primary,
3411 int new_up_primary,
3412 const vector<int> &old_up,
3413 const vector<int> &new_up,
3414 OSDMapRef osdmap,
3415 OSDMapRef lastmap,
3416 pg_t pgid) {
3417 return !(lastmap->get_pools().count(pgid.pool())) ||
3418 is_new_interval(old_acting_primary,
3419 new_acting_primary,
3420 old_acting,
3421 new_acting,
3422 old_up_primary,
3423 new_up_primary,
3424 old_up,
3425 new_up,
3426 lastmap->get_pools().find(pgid.pool())->second.size,
3427 osdmap->get_pools().find(pgid.pool())->second.size,
3428 lastmap->get_pools().find(pgid.pool())->second.min_size,
3429 osdmap->get_pools().find(pgid.pool())->second.min_size,
3430 lastmap->get_pg_num(pgid.pool()),
3431 osdmap->get_pg_num(pgid.pool()),
3432 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3433 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3434 pgid);
3435 }
3436
3437 bool PastIntervals::check_new_interval(
3438 int old_acting_primary,
3439 int new_acting_primary,
3440 const vector<int> &old_acting,
3441 const vector<int> &new_acting,
3442 int old_up_primary,
3443 int new_up_primary,
3444 const vector<int> &old_up,
3445 const vector<int> &new_up,
3446 epoch_t same_interval_since,
3447 epoch_t last_epoch_clean,
3448 OSDMapRef osdmap,
3449 OSDMapRef lastmap,
3450 pg_t pgid,
3451 IsPGRecoverablePredicate *could_have_gone_active,
3452 PastIntervals *past_intervals,
3453 std::ostream *out)
3454 {
3455 /*
3456 * We have to be careful to gracefully deal with situations like
3457 * so. Say we have a power outage or something that takes out both
3458 * OSDs, but the monitor doesn't mark them down in the same epoch.
3459 * The history may look like
3460 *
3461 * 1: A B
3462 * 2: B
3463 * 3: let's say B dies for good, too (say, from the power spike)
3464 * 4: A
3465 *
3466 * which makes it look like B may have applied updates to the PG
3467 * that we need in order to proceed. This sucks...
3468 *
3469 * To minimize the risk of this happening, we CANNOT go active if
3470 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3471 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3472 * Then, we have something like
3473 *
3474 * 1: A B
3475 * 2: B up_thru[B]=0
3476 * 3:
3477 * 4: A
3478 *
3479 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3480 *
3481 * or,
3482 *
3483 * 1: A B
3484 * 2: B up_thru[B]=0
3485 * 3: B up_thru[B]=2
3486 * 4:
3487 * 5: A
3488 *
3489 * -> we must wait for B, bc it was alive through 2, and could have
3490 * written to the pg.
3491 *
3492 * If B is really dead, then an administrator will need to manually
3493 * intervene by marking the OSD as "lost."
3494 */
3495
3496 // remember past interval
3497 // NOTE: a change in the up set primary triggers an interval
3498 // change, even though the interval members in the pg_interval_t
3499 // do not change.
3500 assert(past_intervals);
3501 assert(past_intervals->past_intervals);
3502 if (is_new_interval(
3503 old_acting_primary,
3504 new_acting_primary,
3505 old_acting,
3506 new_acting,
3507 old_up_primary,
3508 new_up_primary,
3509 old_up,
3510 new_up,
3511 osdmap,
3512 lastmap,
3513 pgid)) {
3514 pg_interval_t i;
3515 i.first = same_interval_since;
3516 i.last = osdmap->get_epoch() - 1;
3517 assert(i.first <= i.last);
3518 i.acting = old_acting;
3519 i.up = old_up;
3520 i.primary = old_acting_primary;
3521 i.up_primary = old_up_primary;
3522
3523 unsigned num_acting = 0;
3524 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
3525 ++p)
3526 if (*p != CRUSH_ITEM_NONE)
3527 ++num_acting;
3528
3529 assert(lastmap->get_pools().count(pgid.pool()));
3530 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
3531 set<pg_shard_t> old_acting_shards;
3532 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
3533
3534 if (num_acting &&
3535 i.primary != -1 &&
3536 num_acting >= old_pg_pool.min_size &&
3537 (*could_have_gone_active)(old_acting_shards)) {
3538 if (out)
3539 *out << __func__ << " " << i
3540 << ": not rw,"
3541 << " up_thru " << lastmap->get_up_thru(i.primary)
3542 << " up_from " << lastmap->get_up_from(i.primary)
3543 << " last_epoch_clean " << last_epoch_clean
3544 << std::endl;
3545 if (lastmap->get_up_thru(i.primary) >= i.first &&
3546 lastmap->get_up_from(i.primary) <= i.first) {
3547 i.maybe_went_rw = true;
3548 if (out)
3549 *out << __func__ << " " << i
3550 << " : primary up " << lastmap->get_up_from(i.primary)
3551 << "-" << lastmap->get_up_thru(i.primary)
3552 << " includes interval"
3553 << std::endl;
3554 } else if (last_epoch_clean >= i.first &&
3555 last_epoch_clean <= i.last) {
3556 // If the last_epoch_clean is included in this interval, then
3557 // the pg must have been rw (for recovery to have completed).
3558 // This is important because we won't know the _real_
3559 // first_epoch because we stop at last_epoch_clean, and we
3560 // don't want the oldest interval to randomly have
3561 // maybe_went_rw false depending on the relative up_thru vs
3562 // last_epoch_clean timing.
3563 i.maybe_went_rw = true;
3564 if (out)
3565 *out << __func__ << " " << i
3566 << " : includes last_epoch_clean " << last_epoch_clean
3567 << " and presumed to have been rw"
3568 << std::endl;
3569 } else {
3570 i.maybe_went_rw = false;
3571 if (out)
3572 *out << __func__ << " " << i
3573 << " : primary up " << lastmap->get_up_from(i.primary)
3574 << "-" << lastmap->get_up_thru(i.primary)
3575 << " does not include interval"
3576 << std::endl;
3577 }
3578 } else {
3579 i.maybe_went_rw = false;
3580 if (out)
3581 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
3582 }
3583 past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
3584 return true;
3585 } else {
3586 return false;
3587 }
3588 }
3589
3590
3591 // true if the given map affects the prior set
3592 bool PastIntervals::PriorSet::affected_by_map(
3593 const OSDMap &osdmap,
3594 const DoutPrefixProvider *dpp) const
3595 {
3596 for (set<pg_shard_t>::iterator p = probe.begin();
3597 p != probe.end();
3598 ++p) {
3599 int o = p->osd;
3600
3601 // did someone in the prior set go down?
3602 if (osdmap.is_down(o) && down.count(o) == 0) {
3603 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
3604 return true;
3605 }
3606
3607 // did a down osd in cur get (re)marked as lost?
3608 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3609 if (r != blocked_by.end()) {
3610 if (!osdmap.exists(o)) {
3611 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3612 return true;
3613 }
3614 if (osdmap.get_info(o).lost_at != r->second) {
3615 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3616 return true;
3617 }
3618 }
3619 }
3620
3621 // did someone in the prior down set go up?
3622 for (set<int>::const_iterator p = down.begin();
3623 p != down.end();
3624 ++p) {
3625 int o = *p;
3626
3627 if (osdmap.is_up(o)) {
3628 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
3629 return true;
3630 }
3631
3632 // did someone in the prior set get lost or destroyed?
3633 if (!osdmap.exists(o)) {
3634 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3635 return true;
3636 }
3637 // did a down osd in down get (re)marked as lost?
3638 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3639 if (r != blocked_by.end()) {
3640 if (osdmap.get_info(o).lost_at != r->second) {
3641 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3642 return true;
3643 }
3644 }
3645 }
3646
3647 return false;
3648 }
3649
3650 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
3651 {
3652 out << "interval(" << i.first << "-" << i.last
3653 << " up " << i.up << "(" << i.up_primary << ")"
3654 << " acting " << i.acting << "(" << i.primary << ")";
3655 if (i.maybe_went_rw)
3656 out << " maybe_went_rw";
3657 out << ")";
3658 return out;
3659 }
3660
3661
3662
3663 // -- pg_query_t --
3664
3665 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
3666 ENCODE_START(3, 3, bl);
3667 ::encode(type, bl);
3668 ::encode(since, bl);
3669 history.encode(bl);
3670 ::encode(epoch_sent, bl);
3671 ::encode(to, bl);
3672 ::encode(from, bl);
3673 ENCODE_FINISH(bl);
3674 }
3675
3676 void pg_query_t::decode(bufferlist::iterator &bl) {
3677 DECODE_START(3, bl);
3678 ::decode(type, bl);
3679 ::decode(since, bl);
3680 history.decode(bl);
3681 ::decode(epoch_sent, bl);
3682 ::decode(to, bl);
3683 ::decode(from, bl);
3684 DECODE_FINISH(bl);
3685 }
3686
3687 void pg_query_t::dump(Formatter *f) const
3688 {
3689 f->dump_int("from", from);
3690 f->dump_int("to", to);
3691 f->dump_string("type", get_type_name());
3692 f->dump_stream("since") << since;
3693 f->dump_stream("epoch_sent") << epoch_sent;
3694 f->open_object_section("history");
3695 history.dump(f);
3696 f->close_section();
3697 }
3698 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
3699 {
3700 o.push_back(new pg_query_t());
3701 list<pg_history_t*> h;
3702 pg_history_t::generate_test_instances(h);
3703 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
3704 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
3705 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
3706 eversion_t(4, 5), *h.back(), 4));
3707 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
3708 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
3709 *h.back(), 5));
3710 }
3711
3712 // -- ObjectModDesc --
3713 void ObjectModDesc::visit(Visitor *visitor) const
3714 {
3715 bufferlist::iterator bp = bl.begin();
3716 try {
3717 while (!bp.end()) {
3718 DECODE_START(max_required_version, bp);
3719 uint8_t code;
3720 ::decode(code, bp);
3721 switch (code) {
3722 case APPEND: {
3723 uint64_t size;
3724 ::decode(size, bp);
3725 visitor->append(size);
3726 break;
3727 }
3728 case SETATTRS: {
3729 map<string, boost::optional<bufferlist> > attrs;
3730 ::decode(attrs, bp);
3731 visitor->setattrs(attrs);
3732 break;
3733 }
3734 case DELETE: {
3735 version_t old_version;
3736 ::decode(old_version, bp);
3737 visitor->rmobject(old_version);
3738 break;
3739 }
3740 case CREATE: {
3741 visitor->create();
3742 break;
3743 }
3744 case UPDATE_SNAPS: {
3745 set<snapid_t> snaps;
3746 ::decode(snaps, bp);
3747 visitor->update_snaps(snaps);
3748 break;
3749 }
3750 case TRY_DELETE: {
3751 version_t old_version;
3752 ::decode(old_version, bp);
3753 visitor->try_rmobject(old_version);
3754 break;
3755 }
3756 case ROLLBACK_EXTENTS: {
3757 vector<pair<uint64_t, uint64_t> > extents;
3758 version_t gen;
3759 ::decode(gen, bp);
3760 ::decode(extents, bp);
3761 visitor->rollback_extents(gen,extents);
3762 break;
3763 }
3764 default:
3765 assert(0 == "Invalid rollback code");
3766 }
3767 DECODE_FINISH(bp);
3768 }
3769 } catch (...) {
3770 assert(0 == "Invalid encoding");
3771 }
3772 }
3773
3774 struct DumpVisitor : public ObjectModDesc::Visitor {
3775 Formatter *f;
3776 explicit DumpVisitor(Formatter *f) : f(f) {}
3777 void append(uint64_t old_size) override {
3778 f->open_object_section("op");
3779 f->dump_string("code", "APPEND");
3780 f->dump_unsigned("old_size", old_size);
3781 f->close_section();
3782 }
3783 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
3784 f->open_object_section("op");
3785 f->dump_string("code", "SETATTRS");
3786 f->open_array_section("attrs");
3787 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
3788 i != attrs.end();
3789 ++i) {
3790 f->dump_string("attr_name", i->first);
3791 }
3792 f->close_section();
3793 f->close_section();
3794 }
3795 void rmobject(version_t old_version) override {
3796 f->open_object_section("op");
3797 f->dump_string("code", "RMOBJECT");
3798 f->dump_unsigned("old_version", old_version);
3799 f->close_section();
3800 }
3801 void try_rmobject(version_t old_version) override {
3802 f->open_object_section("op");
3803 f->dump_string("code", "TRY_RMOBJECT");
3804 f->dump_unsigned("old_version", old_version);
3805 f->close_section();
3806 }
3807 void create() override {
3808 f->open_object_section("op");
3809 f->dump_string("code", "CREATE");
3810 f->close_section();
3811 }
3812 void update_snaps(const set<snapid_t> &snaps) override {
3813 f->open_object_section("op");
3814 f->dump_string("code", "UPDATE_SNAPS");
3815 f->dump_stream("snaps") << snaps;
3816 f->close_section();
3817 }
3818 void rollback_extents(
3819 version_t gen,
3820 const vector<pair<uint64_t, uint64_t> > &extents) override {
3821 f->open_object_section("op");
3822 f->dump_string("code", "ROLLBACK_EXTENTS");
3823 f->dump_unsigned("gen", gen);
3824 f->dump_stream("snaps") << extents;
3825 f->close_section();
3826 }
3827 };
3828
3829 void ObjectModDesc::dump(Formatter *f) const
3830 {
3831 f->open_object_section("object_mod_desc");
3832 f->dump_bool("can_local_rollback", can_local_rollback);
3833 f->dump_bool("rollback_info_completed", rollback_info_completed);
3834 {
3835 f->open_array_section("ops");
3836 DumpVisitor vis(f);
3837 visit(&vis);
3838 f->close_section();
3839 }
3840 f->close_section();
3841 }
3842
3843 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
3844 {
3845 map<string, boost::optional<bufferlist> > attrs;
3846 attrs[OI_ATTR];
3847 attrs[SS_ATTR];
3848 attrs["asdf"];
3849 o.push_back(new ObjectModDesc());
3850 o.back()->append(100);
3851 o.back()->setattrs(attrs);
3852 o.push_back(new ObjectModDesc());
3853 o.back()->rmobject(1001);
3854 o.push_back(new ObjectModDesc());
3855 o.back()->create();
3856 o.back()->setattrs(attrs);
3857 o.push_back(new ObjectModDesc());
3858 o.back()->create();
3859 o.back()->setattrs(attrs);
3860 o.back()->mark_unrollbackable();
3861 o.back()->append(1000);
3862 }
3863
3864 void ObjectModDesc::encode(bufferlist &_bl) const
3865 {
3866 ENCODE_START(max_required_version, max_required_version, _bl);
3867 ::encode(can_local_rollback, _bl);
3868 ::encode(rollback_info_completed, _bl);
3869 ::encode(bl, _bl);
3870 ENCODE_FINISH(_bl);
3871 }
3872 void ObjectModDesc::decode(bufferlist::iterator &_bl)
3873 {
3874 DECODE_START(2, _bl);
3875 max_required_version = struct_v;
3876 ::decode(can_local_rollback, _bl);
3877 ::decode(rollback_info_completed, _bl);
3878 ::decode(bl, _bl);
3879 // ensure bl does not pin a larger buffer in memory
3880 bl.rebuild();
3881 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3882 DECODE_FINISH(_bl);
3883 }
3884
3885 // -- pg_log_entry_t --
3886
3887 string pg_log_entry_t::get_key_name() const
3888 {
3889 return version.get_key_name();
3890 }
3891
3892 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
3893 {
3894 bufferlist ebl(sizeof(*this)*2);
3895 encode(ebl);
3896 __u32 crc = ebl.crc32c(0);
3897 ::encode(ebl, bl);
3898 ::encode(crc, bl);
3899 }
3900
3901 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
3902 {
3903 bufferlist bl;
3904 ::decode(bl, p);
3905 __u32 crc;
3906 ::decode(crc, p);
3907 if (crc != bl.crc32c(0))
3908 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
3909 bufferlist::iterator q = bl.begin();
3910 decode(q);
3911 }
3912
3913 void pg_log_entry_t::encode(bufferlist &bl) const
3914 {
3915 ENCODE_START(11, 4, bl);
3916 ::encode(op, bl);
3917 ::encode(soid, bl);
3918 ::encode(version, bl);
3919
3920 /**
3921 * Added with reverting_to:
3922 * Previous code used prior_version to encode
3923 * what we now call reverting_to. This will
3924 * allow older code to decode reverting_to
3925 * into prior_version as expected.
3926 */
3927 if (op == LOST_REVERT)
3928 ::encode(reverting_to, bl);
3929 else
3930 ::encode(prior_version, bl);
3931
3932 ::encode(reqid, bl);
3933 ::encode(mtime, bl);
3934 if (op == LOST_REVERT)
3935 ::encode(prior_version, bl);
3936 ::encode(snaps, bl);
3937 ::encode(user_version, bl);
3938 ::encode(mod_desc, bl);
3939 ::encode(extra_reqids, bl);
3940 if (op == ERROR)
3941 ::encode(return_code, bl);
3942 ENCODE_FINISH(bl);
3943 }
3944
3945 void pg_log_entry_t::decode(bufferlist::iterator &bl)
3946 {
3947 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl);
3948 ::decode(op, bl);
3949 if (struct_v < 2) {
3950 sobject_t old_soid;
3951 ::decode(old_soid, bl);
3952 soid.oid = old_soid.oid;
3953 soid.snap = old_soid.snap;
3954 invalid_hash = true;
3955 } else {
3956 ::decode(soid, bl);
3957 }
3958 if (struct_v < 3)
3959 invalid_hash = true;
3960 ::decode(version, bl);
3961
3962 if (struct_v >= 6 && op == LOST_REVERT)
3963 ::decode(reverting_to, bl);
3964 else
3965 ::decode(prior_version, bl);
3966
3967 ::decode(reqid, bl);
3968
3969 ::decode(mtime, bl);
3970 if (struct_v < 5)
3971 invalid_pool = true;
3972
3973 if (op == LOST_REVERT) {
3974 if (struct_v >= 6) {
3975 ::decode(prior_version, bl);
3976 } else {
3977 reverting_to = prior_version;
3978 }
3979 }
3980 if (struct_v >= 7 || // for v >= 7, this is for all ops.
3981 op == CLONE) { // for v < 7, it's only present for CLONE.
3982 ::decode(snaps, bl);
3983 // ensure snaps does not pin a larger buffer in memory
3984 snaps.rebuild();
3985 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3986 }
3987
3988 if (struct_v >= 8)
3989 ::decode(user_version, bl);
3990 else
3991 user_version = version.version;
3992
3993 if (struct_v >= 9)
3994 ::decode(mod_desc, bl);
3995 else
3996 mod_desc.mark_unrollbackable();
3997 if (struct_v >= 10)
3998 ::decode(extra_reqids, bl);
3999 if (struct_v >= 11 && op == ERROR)
4000 ::decode(return_code, bl);
4001 DECODE_FINISH(bl);
4002 }
4003
4004 void pg_log_entry_t::dump(Formatter *f) const
4005 {
4006 f->dump_string("op", get_op_name());
4007 f->dump_stream("object") << soid;
4008 f->dump_stream("version") << version;
4009 f->dump_stream("prior_version") << prior_version;
4010 f->dump_stream("reqid") << reqid;
4011 f->open_array_section("extra_reqids");
4012 for (auto p = extra_reqids.begin();
4013 p != extra_reqids.end();
4014 ++p) {
4015 f->open_object_section("extra_reqid");
4016 f->dump_stream("reqid") << p->first;
4017 f->dump_stream("user_version") << p->second;
4018 f->close_section();
4019 }
4020 f->close_section();
4021 f->dump_stream("mtime") << mtime;
4022 f->dump_int("return_code", return_code);
4023 if (snaps.length() > 0) {
4024 vector<snapid_t> v;
4025 bufferlist c = snaps;
4026 bufferlist::iterator p = c.begin();
4027 try {
4028 ::decode(v, p);
4029 } catch (...) {
4030 v.clear();
4031 }
4032 f->open_object_section("snaps");
4033 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4034 f->dump_unsigned("snap", *p);
4035 f->close_section();
4036 }
4037 {
4038 f->open_object_section("mod_desc");
4039 mod_desc.dump(f);
4040 f->close_section();
4041 }
4042 }
4043
4044 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4045 {
4046 o.push_back(new pg_log_entry_t());
4047 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4048 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4049 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4050 utime_t(8,9), 0));
4051 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4052 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4053 utime_t(8,9), -ENOENT));
4054 }
4055
4056 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4057 {
4058 out << e.version << " (" << e.prior_version << ") "
4059 << std::left << std::setw(8) << e.get_op_name() << ' '
4060 << e.soid << " by " << e.reqid << " " << e.mtime
4061 << " " << e.return_code;
4062 if (e.snaps.length()) {
4063 vector<snapid_t> snaps;
4064 bufferlist c = e.snaps;
4065 bufferlist::iterator p = c.begin();
4066 try {
4067 ::decode(snaps, p);
4068 } catch (...) {
4069 snaps.clear();
4070 }
4071 out << " snaps " << snaps;
4072 }
4073 return out;
4074 }
4075
4076
4077 // -- pg_log_t --
4078
4079 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4080 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4081 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4082 const string &hit_set_namespace, const pg_log_t &in,
4083 pg_log_t &out, pg_log_t &reject)
4084 {
4085 out = in;
4086 out.log.clear();
4087 reject.log.clear();
4088
4089 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4090 i != in.log.end(); ++i) {
4091
4092 // Reject pg log entries for temporary objects
4093 if (i->soid.is_temp()) {
4094 reject.log.push_back(*i);
4095 continue;
4096 }
4097
4098 if (i->soid.nspace != hit_set_namespace) {
4099 object_t oid = i->soid.oid;
4100 object_locator_t loc(i->soid);
4101 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4102 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4103
4104 if (import_pgid.pgid == pgid) {
4105 out.log.push_back(*i);
4106 } else {
4107 reject.log.push_back(*i);
4108 }
4109 } else {
4110 out.log.push_back(*i);
4111 }
4112 }
4113 }
4114
4115 void pg_log_t::encode(bufferlist& bl) const
4116 {
4117 ENCODE_START(6, 3, bl);
4118 ::encode(head, bl);
4119 ::encode(tail, bl);
4120 ::encode(log, bl);
4121 ::encode(can_rollback_to, bl);
4122 ::encode(rollback_info_trimmed_to, bl);
4123 ENCODE_FINISH(bl);
4124 }
4125
4126 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
4127 {
4128 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
4129 ::decode(head, bl);
4130 ::decode(tail, bl);
4131 if (struct_v < 2) {
4132 bool backlog;
4133 ::decode(backlog, bl);
4134 }
4135 ::decode(log, bl);
4136 if (struct_v >= 5)
4137 ::decode(can_rollback_to, bl);
4138
4139 if (struct_v >= 6)
4140 ::decode(rollback_info_trimmed_to, bl);
4141 else
4142 rollback_info_trimmed_to = tail;
4143 DECODE_FINISH(bl);
4144
4145 // handle hobject_t format change
4146 if (struct_v < 4) {
4147 for (list<pg_log_entry_t>::iterator i = log.begin();
4148 i != log.end();
4149 ++i) {
4150 if (!i->soid.is_max() && i->soid.pool == -1)
4151 i->soid.pool = pool;
4152 }
4153 }
4154 }
4155
4156 void pg_log_t::dump(Formatter *f) const
4157 {
4158 f->dump_stream("head") << head;
4159 f->dump_stream("tail") << tail;
4160 f->open_array_section("log");
4161 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4162 f->open_object_section("entry");
4163 p->dump(f);
4164 f->close_section();
4165 }
4166 f->close_section();
4167 }
4168
4169 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4170 {
4171 o.push_back(new pg_log_t);
4172
4173 // this is nonsensical:
4174 o.push_back(new pg_log_t);
4175 o.back()->head = eversion_t(1,2);
4176 o.back()->tail = eversion_t(3,4);
4177 list<pg_log_entry_t*> e;
4178 pg_log_entry_t::generate_test_instances(e);
4179 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4180 o.back()->log.push_back(**p);
4181 }
4182
4183 void pg_log_t::copy_after(const pg_log_t &other, eversion_t v)
4184 {
4185 can_rollback_to = other.can_rollback_to;
4186 head = other.head;
4187 tail = other.tail;
4188 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4189 i != other.log.rend();
4190 ++i) {
4191 assert(i->version > other.tail);
4192 if (i->version <= v) {
4193 // make tail accurate.
4194 tail = i->version;
4195 break;
4196 }
4197 log.push_front(*i);
4198 }
4199 }
4200
4201 void pg_log_t::copy_range(const pg_log_t &other, eversion_t from, eversion_t to)
4202 {
4203 can_rollback_to = other.can_rollback_to;
4204 list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4205 assert(i != other.log.rend());
4206 while (i->version > to) {
4207 ++i;
4208 assert(i != other.log.rend());
4209 }
4210 assert(i->version == to);
4211 head = to;
4212 for ( ; i != other.log.rend(); ++i) {
4213 if (i->version <= from) {
4214 tail = i->version;
4215 break;
4216 }
4217 log.push_front(*i);
4218 }
4219 }
4220
4221 void pg_log_t::copy_up_to(const pg_log_t &other, int max)
4222 {
4223 can_rollback_to = other.can_rollback_to;
4224 int n = 0;
4225 head = other.head;
4226 tail = other.tail;
4227 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4228 i != other.log.rend();
4229 ++i) {
4230 if (n++ >= max) {
4231 tail = i->version;
4232 break;
4233 }
4234 log.push_front(*i);
4235 }
4236 }
4237
4238 ostream& pg_log_t::print(ostream& out) const
4239 {
4240 out << *this << std::endl;
4241 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4242 p != log.end();
4243 ++p)
4244 out << *p << std::endl;
4245 return out;
4246 }
4247
4248 // -- pg_missing_t --
4249
4250 ostream& operator<<(ostream& out, const pg_missing_item& i)
4251 {
4252 out << i.need;
4253 if (i.have != eversion_t())
4254 out << "(" << i.have << ")";
4255 return out;
4256 }
4257
4258 // -- object_copy_cursor_t --
4259
// Append the cursor to bl as a versioned structure (ENCODE_START(1, 1, ...)).
// The five fields are written in the order below, which is the order
// object_copy_cursor_t::decode() reads them back.
4260 void object_copy_cursor_t::encode(bufferlist& bl) const
4261 {
4262 ENCODE_START(1, 1, bl);
4263 ::encode(attr_complete, bl);
4264 ::encode(data_offset, bl);
4265 ::encode(data_complete, bl);
4266 ::encode(omap_offset, bl);
4267 ::encode(omap_complete, bl);
4268 ENCODE_FINISH(bl);
4269 }
4270
// Read the cursor fields from bl in the same order encode() wrote them.
// No struct_v checks are made: every field is decoded unconditionally.
4271 void object_copy_cursor_t::decode(bufferlist::iterator &bl)
4272 {
4273 DECODE_START(1, bl);
4274 ::decode(attr_complete, bl);
4275 ::decode(data_offset, bl);
4276 ::decode(data_complete, bl);
4277 ::decode(omap_offset, bl);
4278 ::decode(omap_complete, bl);
4279 DECODE_FINISH(bl);
4280 }
4281
4282 void object_copy_cursor_t::dump(Formatter *f) const
4283 {
4284 f->dump_unsigned("attr_complete", (int)attr_complete);
4285 f->dump_unsigned("data_offset", data_offset);
4286 f->dump_unsigned("data_complete", (int)data_complete);
4287 f->dump_string("omap_offset", omap_offset);
4288 f->dump_unsigned("omap_complete", (int)omap_complete);
4289 }
4290
4291 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4292 {
4293 o.push_back(new object_copy_cursor_t);
4294 o.push_back(new object_copy_cursor_t);
4295 o.back()->attr_complete = true;
4296 o.back()->data_offset = 123;
4297 o.push_back(new object_copy_cursor_t);
4298 o.back()->attr_complete = true;
4299 o.back()->data_complete = true;
4300 o.back()->omap_offset = "foo";
4301 o.push_back(new object_copy_cursor_t);
4302 o.back()->attr_complete = true;
4303 o.back()->data_complete = true;
4304 o.back()->omap_complete = true;
4305 }
4306
4307 // -- object_copy_data_t --
4308
// Append this copy payload to bl as a versioned structure
// (ENCODE_START(7, 5, ...)). Fields are written in the order below, which
// matches the "current" branch of object_copy_data_t::decode().
// The features argument is accepted but not consulted anywhere in this body.
4309 void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
4310 {
4311 ENCODE_START(7, 5, bl);
4312 ::encode(size, bl);
4313 ::encode(mtime, bl);
4314 ::encode(attrs, bl);
4315 ::encode(data, bl);
4316 ::encode(omap_data, bl);
4317 ::encode(cursor, bl);
4318 ::encode(omap_header, bl);
4319 ::encode(snaps, bl);
4320 ::encode(snap_seq, bl);
4321 ::encode(flags, bl);
4322 ::encode(data_digest, bl);
4323 ::encode(omap_digest, bl);
4324 ::encode(reqids, bl);
4325 ::encode(truncate_seq, bl);
4326 ::encode(truncate_size, bl);
4327 ENCODE_FINISH(bl);
4328 }
4329
// Decode a copy payload from bl. Two layouts are handled: the "old" one
// (struct_v < 5) and the "current" one (struct_v >= 5). Fields that a given
// version does not carry are either reset explicitly (snaps, snap_seq) or
// simply not assigned by this function (see the notes below).
4330 void object_copy_data_t::decode(bufferlist::iterator& bl)
4331 {
4332 DECODE_START(7, bl);
4333 if (struct_v < 5) {
4334 // old
4335 ::decode(size, bl);
4336 ::decode(mtime, bl);
4337 {
// the old layout carries a category string here; it is read and discarded
4338 string category;
4339 ::decode(category, bl); // no longer used
4340 }
4341 ::decode(attrs, bl);
4342 ::decode(data, bl);
4343 {
// the old layout stores omap as a decoded map; it is re-encoded into
// omap_data, and omap_data is left empty when the map is empty
4344 map<string,bufferlist> omap;
4345 ::decode(omap, bl);
4346 omap_data.clear();
4347 if (!omap.empty())
4348 ::encode(omap, omap_data);
4349 }
4350 ::decode(cursor, bl);
4351 if (struct_v >= 2)
4352 ::decode(omap_header, bl);
4353 if (struct_v >= 3) {
4354 ::decode(snaps, bl);
4355 ::decode(snap_seq, bl);
4356 } else {
4357 snaps.clear();
4358 snap_seq = 0;
4359 }
// for struct_v < 4 the flags and digests are not assigned here
4360 if (struct_v >= 4) {
4361 ::decode(flags, bl);
4362 ::decode(data_digest, bl);
4363 ::decode(omap_digest, bl);
4364 }
4365 } else {
4366 // current
4367 ::decode(size, bl);
4368 ::decode(mtime, bl);
4369 ::decode(attrs, bl);
4370 ::decode(data, bl);
4371 ::decode(omap_data, bl);
4372 ::decode(cursor, bl);
4373 ::decode(omap_header, bl);
4374 ::decode(snaps, bl);
4375 ::decode(snap_seq, bl);
// always true in this branch, since struct_v >= 5 here
4376 if (struct_v >= 4) {
4377 ::decode(flags, bl);
4378 ::decode(data_digest, bl);
4379 ::decode(omap_digest, bl);
4380 }
// reqids exist from v6 on; for older encodings reqids is not assigned here
4381 if (struct_v >= 6) {
4382 ::decode(reqids, bl);
4383 }
// truncate_seq/truncate_size exist from v7 on; otherwise not assigned here
4384 if (struct_v >= 7) {
4385 ::decode(truncate_seq, bl);
4386 ::decode(truncate_size, bl);
4387 }
4388 }
4389 DECODE_FINISH(bl);
4390 }
4391
4392 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
4393 {
4394 o.push_back(new object_copy_data_t());
4395
4396 list<object_copy_cursor_t*> cursors;
4397 object_copy_cursor_t::generate_test_instances(cursors);
4398 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
4399 o.back()->cursor = **(ci++);
4400
4401 o.push_back(new object_copy_data_t());
4402 o.back()->cursor = **(ci++);
4403
4404 o.push_back(new object_copy_data_t());
4405 o.back()->size = 1234;
4406 o.back()->mtime.set_from_double(1234);
4407 bufferptr bp("there", 5);
4408 bufferlist bl;
4409 bl.push_back(bp);
4410 o.back()->attrs["hello"] = bl;
4411 bufferptr bp2("not", 3);
4412 bufferlist bl2;
4413 bl2.push_back(bp2);
4414 map<string,bufferlist> omap;
4415 omap["why"] = bl2;
4416 ::encode(omap, o.back()->omap_data);
4417 bufferptr databp("iamsomedatatocontain", 20);
4418 o.back()->data.push_back(databp);
4419 o.back()->omap_header.append("this is an omap header");
4420 o.back()->snaps.push_back(123);
4421 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
4422 }
4423
4424 void object_copy_data_t::dump(Formatter *f) const
4425 {
4426 f->open_object_section("cursor");
4427 cursor.dump(f);
4428 f->close_section(); // cursor
4429 f->dump_int("size", size);
4430 f->dump_stream("mtime") << mtime;
4431 /* we should really print out the attrs here, but bufferlist
4432 const-correctness prevents that */
4433 f->dump_int("attrs_size", attrs.size());
4434 f->dump_int("flags", flags);
4435 f->dump_unsigned("data_digest", data_digest);
4436 f->dump_unsigned("omap_digest", omap_digest);
4437 f->dump_int("omap_data_length", omap_data.length());
4438 f->dump_int("omap_header_length", omap_header.length());
4439 f->dump_int("data_length", data.length());
4440 f->open_array_section("snaps");
4441 for (vector<snapid_t>::const_iterator p = snaps.begin();
4442 p != snaps.end(); ++p)
4443 f->dump_unsigned("snap", *p);
4444 f->close_section();
4445 f->open_array_section("reqids");
4446 for (auto p = reqids.begin();
4447 p != reqids.end();
4448 ++p) {
4449 f->open_object_section("extra_reqid");
4450 f->dump_stream("reqid") << p->first;
4451 f->dump_stream("user_version") << p->second;
4452 f->close_section();
4453 }
4454 f->close_section();
4455 }
4456
4457 // -- pg_create_t --
4458
// Append created, parent and split_bits (in that order) to bl as a
// versioned structure (ENCODE_START(1, 1, ...)).
4459 void pg_create_t::encode(bufferlist &bl) const
4460 {
4461 ENCODE_START(1, 1, bl);
4462 ::encode(created, bl);
4463 ::encode(parent, bl);
4464 ::encode(split_bits, bl);
4465 ENCODE_FINISH(bl);
4466 }
4467
// Read created, parent and split_bits from bl, in the order encode() wrote
// them. All three fields are decoded unconditionally.
4468 void pg_create_t::decode(bufferlist::iterator &bl)
4469 {
4470 DECODE_START(1, bl);
4471 ::decode(created, bl);
4472 ::decode(parent, bl);
4473 ::decode(split_bits, bl);
4474 DECODE_FINISH(bl);
4475 }
4476
// Emit the three fields to the formatter: "created" as an unsigned value,
// "parent" through its stream operator, and "split_bits" as an int.
4477 void pg_create_t::dump(Formatter *f) const
4478 {
4479 f->dump_unsigned("created", created);
4480 f->dump_stream("parent") << parent;
4481 f->dump_int("split_bits", split_bits);
4482 }
4483
4484 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
4485 {
4486 o.push_back(new pg_create_t);
4487 o.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
4488 }
4489
4490
4491 // -- pg_hit_set_info_t --
4492
// Append begin, end, version and using_gmt (in that order) to bl as a
// versioned structure (ENCODE_START(2, 1, ...)). using_gmt is the field that
// pg_hit_set_info_t::decode() only reads for struct_v >= 2.
4493 void pg_hit_set_info_t::encode(bufferlist& bl) const
4494 {
4495 ENCODE_START(2, 1, bl);
4496 ::encode(begin, bl);
4497 ::encode(end, bl);
4498 ::encode(version, bl);
4499 ::encode(using_gmt, bl);
4500 ENCODE_FINISH(bl);
4501 }
4502
4503 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
4504 {
4505 DECODE_START(2, p);
4506 ::decode(begin, p);
4507 ::decode(end, p);
4508 ::decode(version, p);
4509 if (struct_v >= 2) {
4510 ::decode(using_gmt, p);
4511 } else {
4512 using_gmt = false;
4513 }
4514 DECODE_FINISH(p);
4515 }
4516
// Emit begin, end, version and using_gmt to the formatter. All four go
// through dump_stream, i.e. they are rendered with their stream operators.
4517 void pg_hit_set_info_t::dump(Formatter *f) const
4518 {
4519 f->dump_stream("begin") << begin;
4520 f->dump_stream("end") << end;
4521 f->dump_stream("version") << version;
4522 f->dump_stream("using_gmt") << using_gmt;
4523 }
4524
4525 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
4526 {
4527 ls.push_back(new pg_hit_set_info_t);
4528 ls.push_back(new pg_hit_set_info_t);
4529 ls.back()->begin = utime_t(1, 2);
4530 ls.back()->end = utime_t(3, 4);
4531 }
4532
4533
4534 // -- pg_hit_set_history_t --
4535
4536 void pg_hit_set_history_t::encode(bufferlist& bl) const
4537 {
4538 ENCODE_START(1, 1, bl);
4539 ::encode(current_last_update, bl);
4540 {
4541 utime_t dummy_stamp;
4542 ::encode(dummy_stamp, bl);
4543 }
4544 {
4545 pg_hit_set_info_t dummy_info;
4546 ::encode(dummy_info, bl);
4547 }
4548 ::encode(history, bl);
4549 ENCODE_FINISH(bl);
4550 }
4551
4552 void pg_hit_set_history_t::decode(bufferlist::iterator& p)
4553 {
4554 DECODE_START(1, p);
4555 ::decode(current_last_update, p);
4556 {
4557 utime_t dummy_stamp;
4558 ::decode(dummy_stamp, p);
4559 }
4560 {
4561 pg_hit_set_info_t dummy_info;
4562 ::decode(dummy_info, p);
4563 }
4564 ::decode(history, p);
4565 DECODE_FINISH(p);
4566 }
4567
4568 void pg_hit_set_history_t::dump(Formatter *f) const
4569 {
4570 f->dump_stream("current_last_update") << current_last_update;
4571 f->open_array_section("history");
4572 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
4573 p != history.end(); ++p) {
4574 f->open_object_section("info");
4575 p->dump(f);
4576 f->close_section();
4577 }
4578 f->close_section();
4579 }
4580
4581 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
4582 {
4583 ls.push_back(new pg_hit_set_history_t);
4584 ls.push_back(new pg_hit_set_history_t);
4585 ls.back()->current_last_update = eversion_t(1, 2);
4586 ls.back()->history.push_back(pg_hit_set_info_t());
4587 }
4588
4589 // -- osd_peer_stat_t --
4590
// Append stamp, the only field written here, to bl as a versioned structure
// (ENCODE_START(1, 1, ...)).
4591 void osd_peer_stat_t::encode(bufferlist& bl) const
4592 {
4593 ENCODE_START(1, 1, bl);
4594 ::encode(stamp, bl);
4595 ENCODE_FINISH(bl);
4596 }
4597
// Read stamp back from bl; it is the only field decoded.
4598 void osd_peer_stat_t::decode(bufferlist::iterator& bl)
4599 {
4600 DECODE_START(1, bl);
4601 ::decode(stamp, bl);
4602 DECODE_FINISH(bl);
4603 }
4604
// Emit stamp to the formatter under the key "stamp", via its stream operator.
4605 void osd_peer_stat_t::dump(Formatter *f) const
4606 {
4607 f->dump_stream("stamp") << stamp;
4608 }
4609
4610 void osd_peer_stat_t::generate_test_instances(list<osd_peer_stat_t*>& o)
4611 {
4612 o.push_back(new osd_peer_stat_t);
4613 o.push_back(new osd_peer_stat_t);
4614 o.back()->stamp = utime_t(1, 2);
4615 }
4616
4617 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
4618 {
4619 return out << "stat(" << stat.stamp << ")";
4620 }
4621
4622
4623 // -- OSDSuperblock --
4624
4625 void OSDSuperblock::encode(bufferlist &bl) const
4626 {
4627 ENCODE_START(8, 5, bl);
4628 ::encode(cluster_fsid, bl);
4629 ::encode(whoami, bl);
4630 ::encode(current_epoch, bl);
4631 ::encode(oldest_map, bl);
4632 ::encode(newest_map, bl);
4633 ::encode(weight, bl);
4634 compat_features.encode(bl);
4635 ::encode(clean_thru, bl);
4636 ::encode(mounted, bl);
4637 ::encode(osd_fsid, bl);
4638 ::encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
4639 ::encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
4640 ENCODE_FINISH(bl);
4641 }
4642
// Decode a superblock from bl, accepting older struct versions. Several
// values present in the encoding are read into locals and discarded (magic,
// last_map_marked_full, pool_last_map_marked_full).
4643 void OSDSuperblock::decode(bufferlist::iterator &bl)
4644 {
4645 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
// encodings older than v3 start with a string that is read and dropped
4646 if (struct_v < 3) {
4647 string magic;
4648 ::decode(magic, bl);
4649 }
4650 ::decode(cluster_fsid, bl);
4651 ::decode(whoami, bl);
4652 ::decode(current_epoch, bl);
4653 ::decode(oldest_map, bl);
4654 ::decode(newest_map, bl);
4655 ::decode(weight, bl);
// before v2 there is no compat set in the encoding; the base incompat
// feature is inserted instead
4656 if (struct_v >= 2) {
4657 compat_features.decode(bl);
4658 } else { //upgrade it!
4659 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4660 }
4661 ::decode(clean_thru, bl);
4662 ::decode(mounted, bl);
// osd_fsid is only present from v4 on; otherwise it is not assigned here
4663 if (struct_v >= 4)
4664 ::decode(osd_fsid, bl);
// the two blocks below consume values from the buffer without storing them
4665 if (struct_v >= 6) {
4666 epoch_t last_map_marked_full;
4667 ::decode(last_map_marked_full, bl);
4668 }
4669 if (struct_v >= 7) {
4670 map<int64_t,epoch_t> pool_last_map_marked_full;
4671 ::decode(pool_last_map_marked_full, bl);
4672 }
4673 DECODE_FINISH(bl);
4674 }
4675
// Emit the superblock fields to the formatter. compat_features is dumped
// inside a "compat" object section, and the mounted field is emitted under
// the key "last_epoch_mounted".
4676 void OSDSuperblock::dump(Formatter *f) const
4677 {
4678 f->dump_stream("cluster_fsid") << cluster_fsid;
4679 f->dump_stream("osd_fsid") << osd_fsid;
4680 f->dump_int("whoami", whoami);
4681 f->dump_int("current_epoch", current_epoch);
4682 f->dump_int("oldest_map", oldest_map);
4683 f->dump_int("newest_map", newest_map);
4684 f->dump_float("weight", weight);
4685 f->open_object_section("compat");
4686 compat_features.dump(f);
4687 f->close_section();
4688 f->dump_int("clean_thru", clean_thru);
4689 f->dump_int("last_epoch_mounted", mounted);
4690 }
4691
4692 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
4693 {
4694 OSDSuperblock z;
4695 o.push_back(new OSDSuperblock(z));
4696 memset(&z.cluster_fsid, 1, sizeof(z.cluster_fsid));
4697 memset(&z.osd_fsid, 2, sizeof(z.osd_fsid));
4698 z.whoami = 3;
4699 z.current_epoch = 4;
4700 z.oldest_map = 5;
4701 z.newest_map = 9;
4702 z.mounted = 8;
4703 z.clean_thru = 7;
4704 o.push_back(new OSDSuperblock(z));
4705 o.push_back(new OSDSuperblock(z));
4706 }
4707
4708 // -- SnapSet --
4709
// Append the snap set to bl as a versioned structure (ENCODE_START(3, 2, ...)).
// clone_snaps is written last; SnapSet::decode() only reads it for
// struct_v >= 3.
4710 void SnapSet::encode(bufferlist& bl) const
4711 {
4712 ENCODE_START(3, 2, bl);
4713 ::encode(seq, bl);
4714 ::encode(head_exists, bl);
4715 ::encode(snaps, bl);
4716 ::encode(clones, bl);
4717 ::encode(clone_overlap, bl);
4718 ::encode(clone_size, bl);
4719 ::encode(clone_snaps, bl);
4720 ENCODE_FINISH(bl);
4721 }
4722
4723 void SnapSet::decode(bufferlist::iterator& bl)
4724 {
4725 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
4726 ::decode(seq, bl);
4727 ::decode(head_exists, bl);
4728 ::decode(snaps, bl);
4729 ::decode(clones, bl);
4730 ::decode(clone_overlap, bl);
4731 ::decode(clone_size, bl);
4732 if (struct_v >= 3) {
4733 ::decode(clone_snaps, bl);
4734 } else {
4735 clone_snaps.clear();
4736 }
4737 DECODE_FINISH(bl);
4738 }
4739
4740 void SnapSet::dump(Formatter *f) const
4741 {
4742 SnapContext sc(seq, snaps);
4743 f->open_object_section("snap_context");
4744 sc.dump(f);
4745 f->close_section();
4746 f->dump_int("head_exists", head_exists);
4747 f->open_array_section("clones");
4748 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
4749 f->open_object_section("clone");
4750 f->dump_unsigned("snap", *p);
4751 f->dump_unsigned("size", clone_size.find(*p)->second);
4752 f->dump_stream("overlap") << clone_overlap.find(*p)->second;
4753 auto q = clone_snaps.find(*p);
4754 if (q != clone_snaps.end()) {
4755 f->open_array_section("snaps");
4756 for (auto s : q->second) {
4757 f->dump_unsigned("snap", s);
4758 }
4759 f->close_section();
4760 }
4761 f->close_section();
4762 }
4763 f->close_section();
4764 }
4765
4766 void SnapSet::generate_test_instances(list<SnapSet*>& o)
4767 {
4768 o.push_back(new SnapSet);
4769 o.push_back(new SnapSet);
4770 o.back()->head_exists = true;
4771 o.back()->seq = 123;
4772 o.back()->snaps.push_back(123);
4773 o.back()->snaps.push_back(12);
4774 o.push_back(new SnapSet);
4775 o.back()->head_exists = true;
4776 o.back()->seq = 123;
4777 o.back()->snaps.push_back(123);
4778 o.back()->snaps.push_back(12);
4779 o.back()->clones.push_back(12);
4780 o.back()->clone_size[12] = 12345;
4781 o.back()->clone_overlap[12];
4782 o.back()->clone_snaps[12] = {12, 10, 8};
4783 }
4784
4785 ostream& operator<<(ostream& out, const SnapSet& cs)
4786 {
4787 if (cs.is_legacy()) {
4788 out << cs.seq << "=" << cs.snaps << ":"
4789 << cs.clones
4790 << (cs.head_exists ? "+head":"");
4791 if (!cs.clone_snaps.empty()) {
4792 out << "+stray_clone_snaps=" << cs.clone_snaps;
4793 }
4794 return out;
4795 } else {
4796 return out << cs.seq << "=" << cs.snaps << ":"
4797 << cs.clone_snaps;
4798 }
4799 }
4800
4801 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
4802 {
4803 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4804 // correct: it will not include snaps that still logically exist
4805 // but for which there was no clone that is defined. For all
4806 // practical purposes this doesn't matter, since we only use that
4807 // information to clone on the OSD, and we have already moved
4808 // forward past that part of the object history.
4809
4810 seq = ss.seq;
4811 set<snapid_t> _snaps;
4812 set<snapid_t> _clones;
4813 head_exists = false;
4814 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
4815 p != ss.clones.end();
4816 ++p) {
4817 if (p->cloneid == librados::SNAP_HEAD) {
4818 head_exists = true;
4819 } else {
4820 _clones.insert(p->cloneid);
4821 _snaps.insert(p->snaps.begin(), p->snaps.end());
4822 clone_size[p->cloneid] = p->size;
4823 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
4824 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
4825 p->overlap.begin(); q != p->overlap.end(); ++q)
4826 clone_overlap[p->cloneid].insert(q->first, q->second);
4827 if (!legacy) {
4828 // p->snaps is ascending; clone_snaps is descending
4829 vector<snapid_t>& v = clone_snaps[p->cloneid];
4830 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
4831 v.push_back(*q);
4832 }
4833 }
4834 }
4835 }
4836
4837 // ascending
4838 clones.clear();
4839 clones.reserve(_clones.size());
4840 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
4841 clones.push_back(*p);
4842
4843 // descending
4844 snaps.clear();
4845 snaps.reserve(_snaps.size());
4846 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
4847 p != _snaps.rend(); ++p)
4848 snaps.push_back(*p);
4849 }
4850
4851 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
4852 {
4853 assert(clone_size.count(clone));
4854 uint64_t size = clone_size.find(clone)->second;
4855 assert(clone_overlap.count(clone));
4856 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
4857 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
4858 i != overlap.end();
4859 ++i) {
4860 assert(size >= i.get_len());
4861 size -= i.get_len();
4862 }
4863 return size;
4864 }
4865
4866 void SnapSet::filter(const pg_pool_t &pinfo)
4867 {
4868 vector<snapid_t> oldsnaps;
4869 oldsnaps.swap(snaps);
4870 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
4871 i != oldsnaps.end();
4872 ++i) {
4873 if (!pinfo.is_removed_snap(*i))
4874 snaps.push_back(*i);
4875 }
4876 }
4877
4878 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
4879 {
4880 SnapSet ss = *this;
4881 ss.filter(pinfo);
4882 return ss;
4883 }
4884
4885 // -- watch_info_t --
4886
// Append cookie, timeout_seconds and addr (in that order) to bl as a
// versioned structure (ENCODE_START(4, 3, ...)). features is forwarded only
// to the encoding of addr.
4887 void watch_info_t::encode(bufferlist& bl, uint64_t features) const
4888 {
4889 ENCODE_START(4, 3, bl);
4890 ::encode(cookie, bl);
4891 ::encode(timeout_seconds, bl);
4892 ::encode(addr, bl, features);
4893 ENCODE_FINISH(bl);
4894 }
4895
// Decode a watch_info_t from bl, accepting older struct versions.
4896 void watch_info_t::decode(bufferlist::iterator& bl)
4897 {
4898 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
4899 ::decode(cookie, bl);
// encodings older than v2 carry a uint64_t here; it is read and discarded
4900 if (struct_v < 2) {
4901 uint64_t ver;
4902 ::decode(ver, bl);
4903 }
4904 ::decode(timeout_seconds, bl);
// addr is only present from v4 on; for older encodings it is not assigned here
4905 if (struct_v >= 4) {
4906 ::decode(addr, bl);
4907 }
4908 DECODE_FINISH(bl);
4909 }
4910
// Emit cookie and timeout_seconds as unsigned values, and addr as a nested
// "addr" object produced by addr's own dump().
4911 void watch_info_t::dump(Formatter *f) const
4912 {
4913 f->dump_unsigned("cookie", cookie);
4914 f->dump_unsigned("timeout_seconds", timeout_seconds);
4915 f->open_object_section("addr");
4916 addr.dump(f);
4917 f->close_section();
4918 }
4919
4920 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
4921 {
4922 o.push_back(new watch_info_t);
4923 o.push_back(new watch_info_t);
4924 o.back()->cookie = 123;
4925 o.back()->timeout_seconds = 99;
4926 entity_addr_t ea;
4927 ea.set_type(entity_addr_t::TYPE_LEGACY);
4928 ea.set_nonce(1);
4929 ea.set_family(AF_INET);
4930 ea.set_in4_quad(0, 127);
4931 ea.set_in4_quad(1, 0);
4932 ea.set_in4_quad(2, 1);
4933 ea.set_in4_quad(3, 2);
4934 ea.set_port(2);
4935 o.back()->addr = ea;
4936 }
4937
4938 // -- object_manifest_t --
4939
4940 void object_manifest_t::encode(bufferlist& bl) const
4941 {
4942 ENCODE_START(1, 1, bl);
4943 ::encode(type, bl);
4944 switch (type) {
4945 case TYPE_NONE: break;
4946 case TYPE_REDIRECT:
4947 ::encode(redirect_target, bl);
4948 break;
4949 default:
4950 ceph_abort();
4951 }
4952 ENCODE_FINISH(bl);
4953 }
4954
4955 void object_manifest_t::decode(bufferlist::iterator& bl)
4956 {
4957 DECODE_START(1, bl);
4958 ::decode(type, bl);
4959 switch (type) {
4960 case TYPE_NONE: break;
4961 case TYPE_REDIRECT:
4962 ::decode(redirect_target, bl);
4963 break;
4964 default:
4965 ceph_abort();
4966 }
4967 DECODE_FINISH(bl);
4968 }
4969
// Emit the manifest type as an unsigned value and redirect_target as a
// nested object. redirect_target is dumped regardless of the type.
4970 void object_manifest_t::dump(Formatter *f) const
4971 {
4972 f->dump_unsigned("type", type);
4973 f->open_object_section("redirect_target");
4974 redirect_target.dump(f);
4975 f->close_section();
4976 }
4977
4978 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
4979 {
4980 o.push_back(new object_manifest_t());
4981 o.back()->type = TYPE_REDIRECT;
4982 }
4983
4984 ostream& operator<<(ostream& out, const object_manifest_t& om)
4985 {
4986 return out << "type:" << om.type << " redirect_target:" << om.redirect_target;
4987 }
4988
4989 // -- object_info_t --
4990
4991 void object_info_t::copy_user_bits(const object_info_t& other)
4992 {
4993 // these bits are copied from head->clone.
4994 size = other.size;
4995 mtime = other.mtime;
4996 local_mtime = other.local_mtime;
4997 last_reqid = other.last_reqid;
4998 truncate_seq = other.truncate_seq;
4999 truncate_size = other.truncate_size;
5000 flags = other.flags;
5001 user_version = other.user_version;
5002 data_digest = other.data_digest;
5003 omap_digest = other.omap_digest;
5004 }
5005
5006 ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
5007 const object_locator_t &loc) {
5008 ps_t ps;
5009 if (loc.key.length())
5010 // Hack, we don't have the osd map, so we don't really know the hash...
5011 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, loc.key.c_str(),
5012 loc.key.length());
5013 else
5014 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid.name.c_str(),
5015 oid.name.length());
5016 return ps;
5017 }
5018
5019 void object_info_t::encode(bufferlist& bl, uint64_t features) const
5020 {
5021 object_locator_t myoloc(soid);
5022 map<entity_name_t, watch_info_t> old_watchers;
5023 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
5024 watchers.begin();
5025 i != watchers.end();
5026 ++i) {
5027 old_watchers.insert(make_pair(i->first.second, i->second));
5028 }
5029 ENCODE_START(17, 8, bl);
5030 ::encode(soid, bl);
5031 ::encode(myoloc, bl); //Retained for compatibility
5032 ::encode((__u32)0, bl); // was category, no longer used
5033 ::encode(version, bl);
5034 ::encode(prior_version, bl);
5035 ::encode(last_reqid, bl);
5036 ::encode(size, bl);
5037 ::encode(mtime, bl);
5038 if (soid.snap == CEPH_NOSNAP)
5039 ::encode(osd_reqid_t(), bl); // used to be wrlock_by
5040 else
5041 ::encode(legacy_snaps, bl);
5042 ::encode(truncate_seq, bl);
5043 ::encode(truncate_size, bl);
5044 ::encode(is_lost(), bl);
5045 ::encode(old_watchers, bl, features);
5046 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5047 * When we can, switch this out for simply putting the version_t on disk. */
5048 eversion_t user_eversion(0, user_version);
5049 ::encode(user_eversion, bl);
5050 ::encode(test_flag(FLAG_USES_TMAP), bl);
5051 ::encode(watchers, bl, features);
5052 __u32 _flags = flags;
5053 ::encode(_flags, bl);
5054 ::encode(local_mtime, bl);
5055 ::encode(data_digest, bl);
5056 ::encode(omap_digest, bl);
5057 ::encode(expected_object_size, bl);
5058 ::encode(expected_write_size, bl);
5059 ::encode(alloc_hint_flags, bl);
5060 if (has_manifest()) {
5061 ::encode(manifest, bl);
5062 }
5063 ENCODE_FINISH(bl);
5064 }
5065
// Decode an object_info_t from bl, accepting older struct versions. Several
// legacy values are read only to advance the iterator (category, wrlock_by)
// or are used solely as a fallback for old encodings (myoloc, old_watchers).
5066 void object_info_t::decode(bufferlist::iterator& bl)
5067 {
5068 object_locator_t myoloc;
5069 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5070 map<entity_name_t, watch_info_t> old_watchers;
5071 ::decode(soid, bl);
5072 ::decode(myoloc, bl);
5073 {
5074 string category;
5075 ::decode(category, bl); // no longer used
5076 }
5077 ::decode(version, bl);
5078 ::decode(prior_version, bl);
5079 ::decode(last_reqid, bl);
5080 ::decode(size, bl);
5081 ::decode(mtime, bl);
// this slot holds an osd_reqid_t (discarded) for CEPH_NOSNAP objects and
// legacy_snaps otherwise; encode() makes the same distinction
5082 if (soid.snap == CEPH_NOSNAP) {
5083 osd_reqid_t wrlock_by;
5084 ::decode(wrlock_by, bl);
5085 } else {
5086 ::decode(legacy_snaps, bl);
5087 }
5088 ::decode(truncate_seq, bl);
5089 ::decode(truncate_size, bl);
5090
5091 // if this is struct_v >= 13, we will overwrite this
5092 // below since this field is just here for backwards
5093 // compatibility
5094 __u8 lo;
5095 ::decode(lo, bl);
5096 flags = (flag_t)lo;
5097
5098 ::decode(old_watchers, bl);
// only the version part of the decoded eversion_t is kept
5099 eversion_t user_eversion;
5100 ::decode(user_eversion, bl);
5101 user_version = user_eversion.version;
5102
// before v9 there is no uses_tmap bool in the encoding and the flag is set
// unconditionally
5103 if (struct_v >= 9) {
5104 bool uses_tmap = false;
5105 ::decode(uses_tmap, bl);
5106 if (uses_tmap)
5107 set_flag(FLAG_USES_TMAP);
5108 } else {
5109 set_flag(FLAG_USES_TMAP);
5110 }
// before v10 the pool is taken from the separately decoded locator
5111 if (struct_v < 10)
5112 soid.pool = myoloc.pool;
// before v11 watchers is rebuilt from old_watchers, keyed by
// (cookie, entity name)
5113 if (struct_v >= 11) {
5114 ::decode(watchers, bl);
5115 } else {
5116 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5117 i != old_watchers.end();
5118 ++i) {
5119 watchers.insert(
5120 make_pair(
5121 make_pair(i->second.cookie, i->first), i->second));
5122 }
5123 }
// from v13 on the full 32-bit flags replace the value derived from lo above
5124 if (struct_v >= 13) {
5125 __u32 _flags;
5126 ::decode(_flags, bl);
5127 flags = (flag_t)_flags;
5128 }
5129 if (struct_v >= 14) {
5130 ::decode(local_mtime, bl);
5131 } else {
5132 local_mtime = utime_t();
5133 }
// before v15 there are no digests: both are set to -1 and the two digest
// flags are cleared
5134 if (struct_v >= 15) {
5135 ::decode(data_digest, bl);
5136 ::decode(omap_digest, bl);
5137 } else {
5138 data_digest = omap_digest = -1;
5139 clear_flag(FLAG_DATA_DIGEST);
5140 clear_flag(FLAG_OMAP_DIGEST);
5141 }
5142 if (struct_v >= 16) {
5143 ::decode(expected_object_size, bl);
5144 ::decode(expected_write_size, bl);
5145 ::decode(alloc_hint_flags, bl);
5146 } else {
5147 expected_object_size = 0;
5148 expected_write_size = 0;
5149 alloc_hint_flags = 0;
5150 }
// the manifest is decoded only when has_manifest() is true at this point
// NOTE(review): presumably has_manifest() tests a bit in the flags decoded
// above - confirm against the class definition
5151 if (struct_v >= 17) {
5152 if (has_manifest()) {
5153 ::decode(manifest, bl);
5154 }
5155 }
5156 DECODE_FINISH(bl);
5157 }
5158
5159 void object_info_t::dump(Formatter *f) const
5160 {
5161 f->open_object_section("oid");
5162 soid.dump(f);
5163 f->close_section();
5164 f->dump_stream("version") << version;
5165 f->dump_stream("prior_version") << prior_version;
5166 f->dump_stream("last_reqid") << last_reqid;
5167 f->dump_unsigned("user_version", user_version);
5168 f->dump_unsigned("size", size);
5169 f->dump_stream("mtime") << mtime;
5170 f->dump_stream("local_mtime") << local_mtime;
5171 f->dump_unsigned("lost", (int)is_lost());
5172 f->dump_unsigned("flags", (int)flags);
5173 f->open_array_section("legacy_snaps");
5174 for (auto s : legacy_snaps) {
5175 f->dump_unsigned("snap", s);
5176 }
5177 f->close_section();
5178 f->dump_unsigned("truncate_seq", truncate_seq);
5179 f->dump_unsigned("truncate_size", truncate_size);
5180 f->dump_unsigned("data_digest", data_digest);
5181 f->dump_unsigned("omap_digest", omap_digest);
5182 f->dump_unsigned("expected_object_size", expected_object_size);
5183 f->dump_unsigned("expected_write_size", expected_write_size);
5184 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5185 f->dump_object("manifest", manifest);
5186 f->open_object_section("watchers");
5187 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5188 watchers.begin(); p != watchers.end(); ++p) {
5189 stringstream ss;
5190 ss << p->first.second;
5191 f->open_object_section(ss.str().c_str());
5192 p->second.dump(f);
5193 f->close_section();
5194 }
5195 f->close_section();
5196 }
5197
// Produce sample instances for object_info_t. Only a single
// default-constructed instance is generated; the "fixme" below marks that
// populated samples have not been written.
5198 void object_info_t::generate_test_instances(list<object_info_t*>& o)
5199 {
5200 o.push_back(new object_info_t());
5201
5202 // fixme
5203 }
5204
5205
5206 ostream& operator<<(ostream& out, const object_info_t& oi)
5207 {
5208 out << oi.soid << "(" << oi.version
5209 << " " << oi.last_reqid;
5210 if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
5211 out << " " << oi.legacy_snaps;
5212 if (oi.flags)
5213 out << " " << oi.get_flag_string();
5214 out << " s " << oi.size;
5215 out << " uv " << oi.user_version;
5216 if (oi.is_data_digest())
5217 out << " dd " << std::hex << oi.data_digest << std::dec;
5218 if (oi.is_omap_digest())
5219 out << " od " << std::hex << oi.omap_digest << std::dec;
5220 out << " alloc_hint [" << oi.expected_object_size
5221 << " " << oi.expected_write_size
5222 << " " << oi.alloc_hint_flags << "]";
5223 if (oi.has_manifest())
5224 out << " " << oi.manifest;
5225
5226 out << ")";
5227 return out;
5228 }
5229
5230 // -- ObjectRecovery --
// Append the recovery progress to bl as a versioned structure
// (ENCODE_START(1, 1, ...)). The five fields are written in the order below,
// which is the order ObjectRecoveryProgress::decode() reads them back.
5231 void ObjectRecoveryProgress::encode(bufferlist &bl) const
5232 {
5233 ENCODE_START(1, 1, bl);
5234 ::encode(first, bl);
5235 ::encode(data_complete, bl);
5236 ::encode(data_recovered_to, bl);
5237 ::encode(omap_recovered_to, bl);
5238 ::encode(omap_complete, bl);
5239 ENCODE_FINISH(bl);
5240 }
5241
// Read the recovery progress fields from bl in the order encode() wrote
// them. All five fields are decoded unconditionally.
5242 void ObjectRecoveryProgress::decode(bufferlist::iterator &bl)
5243 {
5244 DECODE_START(1, bl);
5245 ::decode(first, bl);
5246 ::decode(data_complete, bl);
5247 ::decode(data_recovered_to, bl);
5248 ::decode(omap_recovered_to, bl);
5249 ::decode(omap_complete, bl);
5250 DECODE_FINISH(bl);
5251 }
5252
// Stream insertion for ObjectRecoveryProgress; delegates to prog.print(out)
// and returns whatever stream reference print() returns.
5253 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
5254 {
5255 return prog.print(out);
5256 }
5257
5258 void ObjectRecoveryProgress::generate_test_instances(
5259 list<ObjectRecoveryProgress*>& o)
5260 {
5261 o.push_back(new ObjectRecoveryProgress);
5262 o.back()->first = false;
5263 o.back()->data_complete = true;
5264 o.back()->omap_complete = true;
5265 o.back()->data_recovered_to = 100;
5266
5267 o.push_back(new ObjectRecoveryProgress);
5268 o.back()->first = true;
5269 o.back()->data_complete = false;
5270 o.back()->omap_complete = false;
5271 o.back()->data_recovered_to = 0;
5272 }
5273
5274 ostream &ObjectRecoveryProgress::print(ostream &out) const
5275 {
5276 return out << "ObjectRecoveryProgress("
5277 << ( first ? "" : "!" ) << "first, "
5278 << "data_recovered_to:" << data_recovered_to
5279 << ", data_complete:" << ( data_complete ? "true" : "false" )
5280 << ", omap_recovered_to:" << omap_recovered_to
5281 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5282 << ")";
5283 }
5284
5285 void ObjectRecoveryProgress::dump(Formatter *f) const
5286 {
5287 f->dump_int("first?", first);
5288 f->dump_int("data_complete?", data_complete);
5289 f->dump_unsigned("data_recovered_to", data_recovered_to);
5290 f->dump_int("omap_complete?", omap_complete);
5291 f->dump_string("omap_recovered_to", omap_recovered_to);
5292 }
5293
5294 void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
5295 {
5296 ENCODE_START(2, 1, bl);
5297 ::encode(soid, bl);
5298 ::encode(version, bl);
5299 ::encode(size, bl);
5300 ::encode(oi, bl, features);
5301 ::encode(ss, bl);
5302 ::encode(copy_subset, bl);
5303 ::encode(clone_subset, bl);
5304 ENCODE_FINISH(bl);
5305 }
5306
5307 void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
5308 int64_t pool)
5309 {
5310 DECODE_START(2, bl);
5311 ::decode(soid, bl);
5312 ::decode(version, bl);
5313 ::decode(size, bl);
5314 ::decode(oi, bl);
5315 ::decode(ss, bl);
5316 ::decode(copy_subset, bl);
5317 ::decode(clone_subset, bl);
5318 DECODE_FINISH(bl);
5319
5320 if (struct_v < 2) {
5321 if (!soid.is_max() && soid.pool == -1)
5322 soid.pool = pool;
5323 map<hobject_t, interval_set<uint64_t>> tmp;
5324 tmp.swap(clone_subset);
5325 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5326 i != tmp.end();
5327 ++i) {
5328 hobject_t first(i->first);
5329 if (!first.is_max() && first.pool == -1)
5330 first.pool = pool;
5331 clone_subset[first].swap(i->second);
5332 }
5333 }
5334 }
5335
5336 void ObjectRecoveryInfo::generate_test_instances(
5337 list<ObjectRecoveryInfo*>& o)
5338 {
5339 o.push_back(new ObjectRecoveryInfo);
5340 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5341 o.back()->version = eversion_t(0,0);
5342 o.back()->size = 100;
5343 }
5344
5345
5346 void ObjectRecoveryInfo::dump(Formatter *f) const
5347 {
5348 f->dump_stream("object") << soid;
5349 f->dump_stream("at_version") << version;
5350 f->dump_stream("size") << size;
5351 {
5352 f->open_object_section("object_info");
5353 oi.dump(f);
5354 f->close_section();
5355 }
5356 {
5357 f->open_object_section("snapset");
5358 ss.dump(f);
5359 f->close_section();
5360 }
5361 f->dump_stream("copy_subset") << copy_subset;
5362 f->dump_stream("clone_subset") << clone_subset;
5363 }
5364
5365 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5366 {
5367 return inf.print(out);
5368 }
5369
5370 ostream &ObjectRecoveryInfo::print(ostream &out) const
5371 {
5372 return out << "ObjectRecoveryInfo("
5373 << soid << "@" << version
5374 << ", size: " << size
5375 << ", copy_subset: " << copy_subset
5376 << ", clone_subset: " << clone_subset
5377 << ", snapset: " << ss
5378 << ")";
5379 }
5380
5381 // -- PushReplyOp --
5382 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
5383 {
5384 o.push_back(new PushReplyOp);
5385 o.push_back(new PushReplyOp);
5386 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5387 o.push_back(new PushReplyOp);
5388 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5389 }
5390
5391 void PushReplyOp::encode(bufferlist &bl) const
5392 {
5393 ENCODE_START(1, 1, bl);
5394 ::encode(soid, bl);
5395 ENCODE_FINISH(bl);
5396 }
5397
5398 void PushReplyOp::decode(bufferlist::iterator &bl)
5399 {
5400 DECODE_START(1, bl);
5401 ::decode(soid, bl);
5402 DECODE_FINISH(bl);
5403 }
5404
5405 void PushReplyOp::dump(Formatter *f) const
5406 {
5407 f->dump_stream("soid") << soid;
5408 }
5409
5410 ostream &PushReplyOp::print(ostream &out) const
5411 {
5412 return out
5413 << "PushReplyOp(" << soid
5414 << ")";
5415 }
5416
5417 ostream& operator<<(ostream& out, const PushReplyOp &op)
5418 {
5419 return op.print(out);
5420 }
5421
5422 uint64_t PushReplyOp::cost(CephContext *cct) const
5423 {
5424
5425 return cct->_conf->osd_push_per_object_cost +
5426 cct->_conf->osd_recovery_max_chunk;
5427 }
5428
5429 // -- PullOp --
5430 void PullOp::generate_test_instances(list<PullOp*> &o)
5431 {
5432 o.push_back(new PullOp);
5433 o.push_back(new PullOp);
5434 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5435 o.back()->recovery_info.version = eversion_t(3, 10);
5436 o.push_back(new PullOp);
5437 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5438 o.back()->recovery_info.version = eversion_t(0, 0);
5439 }
5440
5441 void PullOp::encode(bufferlist &bl, uint64_t features) const
5442 {
5443 ENCODE_START(1, 1, bl);
5444 ::encode(soid, bl);
5445 ::encode(recovery_info, bl, features);
5446 ::encode(recovery_progress, bl);
5447 ENCODE_FINISH(bl);
5448 }
5449
5450 void PullOp::decode(bufferlist::iterator &bl)
5451 {
5452 DECODE_START(1, bl);
5453 ::decode(soid, bl);
5454 ::decode(recovery_info, bl);
5455 ::decode(recovery_progress, bl);
5456 DECODE_FINISH(bl);
5457 }
5458
5459 void PullOp::dump(Formatter *f) const
5460 {
5461 f->dump_stream("soid") << soid;
5462 {
5463 f->open_object_section("recovery_info");
5464 recovery_info.dump(f);
5465 f->close_section();
5466 }
5467 {
5468 f->open_object_section("recovery_progress");
5469 recovery_progress.dump(f);
5470 f->close_section();
5471 }
5472 }
5473
5474 ostream &PullOp::print(ostream &out) const
5475 {
5476 return out
5477 << "PullOp(" << soid
5478 << ", recovery_info: " << recovery_info
5479 << ", recovery_progress: " << recovery_progress
5480 << ")";
5481 }
5482
5483 ostream& operator<<(ostream& out, const PullOp &op)
5484 {
5485 return op.print(out);
5486 }
5487
5488 uint64_t PullOp::cost(CephContext *cct) const
5489 {
5490 return cct->_conf->osd_push_per_object_cost +
5491 cct->_conf->osd_recovery_max_chunk;
5492 }
5493
5494 // -- PushOp --
5495 void PushOp::generate_test_instances(list<PushOp*> &o)
5496 {
5497 o.push_back(new PushOp);
5498 o.push_back(new PushOp);
5499 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5500 o.back()->version = eversion_t(3, 10);
5501 o.push_back(new PushOp);
5502 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5503 o.back()->version = eversion_t(0, 0);
5504 }
5505
5506 void PushOp::encode(bufferlist &bl, uint64_t features) const
5507 {
5508 ENCODE_START(1, 1, bl);
5509 ::encode(soid, bl);
5510 ::encode(version, bl);
5511 ::encode(data, bl);
5512 ::encode(data_included, bl);
5513 ::encode(omap_header, bl);
5514 ::encode(omap_entries, bl);
5515 ::encode(attrset, bl);
5516 ::encode(recovery_info, bl, features);
5517 ::encode(after_progress, bl);
5518 ::encode(before_progress, bl);
5519 ENCODE_FINISH(bl);
5520 }
5521
5522 void PushOp::decode(bufferlist::iterator &bl)
5523 {
5524 DECODE_START(1, bl);
5525 ::decode(soid, bl);
5526 ::decode(version, bl);
5527 ::decode(data, bl);
5528 ::decode(data_included, bl);
5529 ::decode(omap_header, bl);
5530 ::decode(omap_entries, bl);
5531 ::decode(attrset, bl);
5532 ::decode(recovery_info, bl);
5533 ::decode(after_progress, bl);
5534 ::decode(before_progress, bl);
5535 DECODE_FINISH(bl);
5536 }
5537
5538 void PushOp::dump(Formatter *f) const
5539 {
5540 f->dump_stream("soid") << soid;
5541 f->dump_stream("version") << version;
5542 f->dump_int("data_len", data.length());
5543 f->dump_stream("data_included") << data_included;
5544 f->dump_int("omap_header_len", omap_header.length());
5545 f->dump_int("omap_entries_len", omap_entries.size());
5546 f->dump_int("attrset_len", attrset.size());
5547 {
5548 f->open_object_section("recovery_info");
5549 recovery_info.dump(f);
5550 f->close_section();
5551 }
5552 {
5553 f->open_object_section("after_progress");
5554 after_progress.dump(f);
5555 f->close_section();
5556 }
5557 {
5558 f->open_object_section("before_progress");
5559 before_progress.dump(f);
5560 f->close_section();
5561 }
5562 }
5563
5564 ostream &PushOp::print(ostream &out) const
5565 {
5566 return out
5567 << "PushOp(" << soid
5568 << ", version: " << version
5569 << ", data_included: " << data_included
5570 << ", data_size: " << data.length()
5571 << ", omap_header_size: " << omap_header.length()
5572 << ", omap_entries_size: " << omap_entries.size()
5573 << ", attrset_size: " << attrset.size()
5574 << ", recovery_info: " << recovery_info
5575 << ", after_progress: " << after_progress
5576 << ", before_progress: " << before_progress
5577 << ")";
5578 }
5579
5580 ostream& operator<<(ostream& out, const PushOp &op)
5581 {
5582 return op.print(out);
5583 }
5584
5585 uint64_t PushOp::cost(CephContext *cct) const
5586 {
5587 uint64_t cost = data_included.size();
5588 for (map<string, bufferlist>::const_iterator i =
5589 omap_entries.begin();
5590 i != omap_entries.end();
5591 ++i) {
5592 cost += i->second.length();
5593 }
5594 cost += cct->_conf->osd_push_per_object_cost;
5595 return cost;
5596 }
5597
5598 // -- ScrubMap --
5599
5600 void ScrubMap::merge_incr(const ScrubMap &l)
5601 {
5602 assert(valid_through == l.incr_since);
5603 valid_through = l.valid_through;
5604
5605 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
5606 p != l.objects.end();
5607 ++p){
5608 if (p->second.negative) {
5609 map<hobject_t,object>::iterator q = objects.find(p->first);
5610 if (q != objects.end()) {
5611 objects.erase(q);
5612 }
5613 } else {
5614 objects[p->first] = p->second;
5615 }
5616 }
5617 }
5618
5619 void ScrubMap::encode(bufferlist& bl) const
5620 {
5621 ENCODE_START(3, 2, bl);
5622 ::encode(objects, bl);
5623 ::encode((__u32)0, bl); // used to be attrs; now deprecated
5624 bufferlist old_logbl; // not used
5625 ::encode(old_logbl, bl);
5626 ::encode(valid_through, bl);
5627 ::encode(incr_since, bl);
5628 ENCODE_FINISH(bl);
5629 }
5630
5631 void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
5632 {
5633 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5634 ::decode(objects, bl);
5635 {
5636 map<string,string> attrs; // deprecated
5637 ::decode(attrs, bl);
5638 }
5639 bufferlist old_logbl; // not used
5640 ::decode(old_logbl, bl);
5641 ::decode(valid_through, bl);
5642 ::decode(incr_since, bl);
5643 DECODE_FINISH(bl);
5644
5645 // handle hobject_t upgrade
5646 if (struct_v < 3) {
5647 map<hobject_t, object> tmp;
5648 tmp.swap(objects);
5649 for (map<hobject_t, object>::iterator i = tmp.begin();
5650 i != tmp.end();
5651 ++i) {
5652 hobject_t first(i->first);
5653 if (!first.is_max() && first.pool == -1)
5654 first.pool = pool;
5655 objects[first] = i->second;
5656 }
5657 }
5658 }
5659
5660 void ScrubMap::dump(Formatter *f) const
5661 {
5662 f->dump_stream("valid_through") << valid_through;
5663 f->dump_stream("incremental_since") << incr_since;
5664 f->open_array_section("objects");
5665 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
5666 f->open_object_section("object");
5667 f->dump_string("name", p->first.oid.name);
5668 f->dump_unsigned("hash", p->first.get_hash());
5669 f->dump_string("key", p->first.get_key());
5670 f->dump_int("snapid", p->first.snap);
5671 p->second.dump(f);
5672 f->close_section();
5673 }
5674 f->close_section();
5675 }
5676
5677 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
5678 {
5679 o.push_back(new ScrubMap);
5680 o.push_back(new ScrubMap);
5681 o.back()->valid_through = eversion_t(1, 2);
5682 o.back()->incr_since = eversion_t(3, 4);
5683 list<object*> obj;
5684 object::generate_test_instances(obj);
5685 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
5686 obj.pop_back();
5687 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
5688 }
5689
5690 // -- ScrubMap::object --
5691
5692 void ScrubMap::object::encode(bufferlist& bl) const
5693 {
5694 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
5695 ENCODE_START(8, 7, bl);
5696 ::encode(size, bl);
5697 ::encode(negative, bl);
5698 ::encode(attrs, bl);
5699 ::encode(digest, bl);
5700 ::encode(digest_present, bl);
5701 ::encode((uint32_t)0, bl); // obsolete nlinks
5702 ::encode((uint32_t)0, bl); // snapcolls
5703 ::encode(omap_digest, bl);
5704 ::encode(omap_digest_present, bl);
5705 ::encode(compat_read_error, bl);
5706 ::encode(stat_error, bl);
5707 ::encode(read_error, bl);
5708 ::encode(ec_hash_mismatch, bl);
5709 ::encode(ec_size_mismatch, bl);
5710 ENCODE_FINISH(bl);
5711 }
5712
5713 void ScrubMap::object::decode(bufferlist::iterator& bl)
5714 {
5715 DECODE_START(8, bl);
5716 ::decode(size, bl);
5717 bool tmp, compat_read_error = false;
5718 ::decode(tmp, bl);
5719 negative = tmp;
5720 ::decode(attrs, bl);
5721 ::decode(digest, bl);
5722 ::decode(tmp, bl);
5723 digest_present = tmp;
5724 {
5725 uint32_t nlinks;
5726 ::decode(nlinks, bl);
5727 set<snapid_t> snapcolls;
5728 ::decode(snapcolls, bl);
5729 }
5730 ::decode(omap_digest, bl);
5731 ::decode(tmp, bl);
5732 omap_digest_present = tmp;
5733 ::decode(compat_read_error, bl);
5734 ::decode(tmp, bl);
5735 stat_error = tmp;
5736 if (struct_v >= 8) {
5737 ::decode(tmp, bl);
5738 read_error = tmp;
5739 ::decode(tmp, bl);
5740 ec_hash_mismatch = tmp;
5741 ::decode(tmp, bl);
5742 ec_size_mismatch = tmp;
5743 }
5744 // If older encoder found a read_error, set read_error
5745 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
5746 read_error = true;
5747 DECODE_FINISH(bl);
5748 }
5749
5750 void ScrubMap::object::dump(Formatter *f) const
5751 {
5752 f->dump_int("size", size);
5753 f->dump_int("negative", negative);
5754 f->open_array_section("attrs");
5755 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
5756 f->open_object_section("attr");
5757 f->dump_string("name", p->first);
5758 f->dump_int("length", p->second.length());
5759 f->close_section();
5760 }
5761 f->close_section();
5762 }
5763
5764 void ScrubMap::object::generate_test_instances(list<object*>& o)
5765 {
5766 o.push_back(new object);
5767 o.push_back(new object);
5768 o.back()->negative = true;
5769 o.push_back(new object);
5770 o.back()->size = 123;
5771 o.back()->attrs["foo"] = buffer::copy("foo", 3);
5772 o.back()->attrs["bar"] = buffer::copy("barval", 6);
5773 }
5774
5775 // -- OSDOp --
5776
5777 ostream& operator<<(ostream& out, const OSDOp& op)
5778 {
5779 out << ceph_osd_op_name(op.op.op);
5780 if (ceph_osd_op_type_data(op.op.op)) {
5781 // data extent
5782 switch (op.op.op) {
5783 case CEPH_OSD_OP_ASSERT_VER:
5784 out << " v" << op.op.assert_ver.ver;
5785 break;
5786 case CEPH_OSD_OP_TRUNCATE:
5787 out << " " << op.op.extent.offset;
5788 break;
5789 case CEPH_OSD_OP_MASKTRUNC:
5790 case CEPH_OSD_OP_TRIMTRUNC:
5791 out << " " << op.op.extent.truncate_seq << "@"
5792 << (int64_t)op.op.extent.truncate_size;
5793 break;
5794 case CEPH_OSD_OP_ROLLBACK:
5795 out << " " << snapid_t(op.op.snap.snapid);
5796 break;
5797 case CEPH_OSD_OP_WATCH:
5798 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
5799 << " cookie " << op.op.watch.cookie;
5800 if (op.op.watch.gen)
5801 out << " gen " << op.op.watch.gen;
5802 break;
5803 case CEPH_OSD_OP_NOTIFY:
5804 case CEPH_OSD_OP_NOTIFY_ACK:
5805 out << " cookie " << op.op.notify.cookie;
5806 break;
5807 case CEPH_OSD_OP_COPY_GET:
5808 out << " max " << op.op.copy_get.max;
5809 break;
5810 case CEPH_OSD_OP_COPY_FROM:
5811 out << " ver " << op.op.copy_from.src_version;
5812 break;
5813 case CEPH_OSD_OP_SETALLOCHINT:
5814 out << " object_size " << op.op.alloc_hint.expected_object_size
5815 << " write_size " << op.op.alloc_hint.expected_write_size;
5816 break;
5817 case CEPH_OSD_OP_READ:
5818 case CEPH_OSD_OP_SPARSE_READ:
5819 case CEPH_OSD_OP_SYNC_READ:
5820 case CEPH_OSD_OP_WRITE:
5821 case CEPH_OSD_OP_WRITEFULL:
5822 case CEPH_OSD_OP_ZERO:
5823 case CEPH_OSD_OP_APPEND:
5824 case CEPH_OSD_OP_MAPEXT:
5825 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
5826 if (op.op.extent.truncate_seq)
5827 out << " [" << op.op.extent.truncate_seq << "@"
5828 << (int64_t)op.op.extent.truncate_size << "]";
5829 if (op.op.flags)
5830 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
5831 default:
5832 // don't show any arg info
5833 break;
5834 }
5835 } else if (ceph_osd_op_type_attr(op.op.op)) {
5836 // xattr name
5837 if (op.op.xattr.name_len && op.indata.length()) {
5838 out << " ";
5839 op.indata.write(0, op.op.xattr.name_len, out);
5840 }
5841 if (op.op.xattr.value_len)
5842 out << " (" << op.op.xattr.value_len << ")";
5843 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
5844 out << " op " << (int)op.op.xattr.cmp_op
5845 << " mode " << (int)op.op.xattr.cmp_mode;
5846 } else if (ceph_osd_op_type_exec(op.op.op)) {
5847 // class.method
5848 if (op.op.cls.class_len && op.indata.length()) {
5849 out << " ";
5850 op.indata.write(0, op.op.cls.class_len, out);
5851 out << ".";
5852 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
5853 }
5854 } else if (ceph_osd_op_type_pg(op.op.op)) {
5855 switch (op.op.op) {
5856 case CEPH_OSD_OP_PGLS:
5857 case CEPH_OSD_OP_PGLS_FILTER:
5858 case CEPH_OSD_OP_PGNLS:
5859 case CEPH_OSD_OP_PGNLS_FILTER:
5860 out << " start_epoch " << op.op.pgls.start_epoch;
5861 break;
5862 case CEPH_OSD_OP_PG_HITSET_LS:
5863 break;
5864 case CEPH_OSD_OP_PG_HITSET_GET:
5865 out << " " << utime_t(op.op.hit_set_get.stamp);
5866 break;
5867 case CEPH_OSD_OP_SCRUBLS:
5868 break;
5869 }
5870 }
5871 return out;
5872 }
5873
5874
5875 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
5876 {
5877 bufferlist::iterator datap = in.begin();
5878 for (unsigned i = 0; i < ops.size(); i++) {
5879 if (ops[i].op.payload_len) {
5880 datap.copy(ops[i].op.payload_len, ops[i].indata);
5881 }
5882 }
5883 }
5884
5885 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
5886 {
5887 for (unsigned i = 0; i < ops.size(); i++) {
5888 if (ops[i].indata.length()) {
5889 ops[i].op.payload_len = ops[i].indata.length();
5890 out.append(ops[i].indata);
5891 }
5892 }
5893 }
5894
5895 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
5896 {
5897 bufferlist::iterator datap = in.begin();
5898 for (unsigned i = 0; i < ops.size(); i++) {
5899 if (ops[i].op.payload_len) {
5900 datap.copy(ops[i].op.payload_len, ops[i].outdata);
5901 }
5902 }
5903 }
5904
5905 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
5906 {
5907 for (unsigned i = 0; i < ops.size(); i++) {
5908 if (ops[i].outdata.length()) {
5909 ops[i].op.payload_len = ops[i].outdata.length();
5910 out.append(ops[i].outdata);
5911 }
5912 }
5913 }
5914
5915 bool store_statfs_t::operator==(const store_statfs_t& other) const
5916 {
5917 return total == other.total
5918 && available == other.available
5919 && allocated == other.allocated
5920 && stored == other.stored
5921 && compressed == other.compressed
5922 && compressed_allocated == other.compressed_allocated
5923 && compressed_original == other.compressed_original;
5924 }
5925
5926 void store_statfs_t::dump(Formatter *f) const
5927 {
5928 f->dump_int("total", total);
5929 f->dump_int("available", available);
5930 f->dump_int("allocated", allocated);
5931 f->dump_int("stored", stored);
5932 f->dump_int("compressed", compressed);
5933 f->dump_int("compressed_allocated", compressed_allocated);
5934 f->dump_int("compressed_original", compressed_original);
5935 }
5936
5937 ostream& operator<<(ostream& out, const store_statfs_t &s)
5938 {
5939 out << std::hex
5940 << "store_statfs(0x" << s.available
5941 << "/0x" << s.total
5942 << ", stored 0x" << s.stored
5943 << "/0x" << s.allocated
5944 << ", compress 0x" << s.compressed
5945 << "/0x" << s.compressed_allocated
5946 << "/0x" << s.compressed_original
5947 << std::dec
5948 << ")";
5949 return out;
5950 }