1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
20
21 #include <sstream>
22 #include <stdio.h>
23 #include <memory>
24 #include <boost/scoped_ptr.hpp>
25 #include <boost/optional/optional_io.hpp>
26 #include <boost/variant.hpp>
27
28 #include "include/rados/rados_types.hpp"
29 #include "include/mempool.h"
30
31 #include "msg/msg_types.h"
32 #include "include/types.h"
33 #include "include/utime.h"
34 #include "include/CompatSet.h"
35 #include "common/histogram.h"
36 #include "include/interval_set.h"
37 #include "include/inline_memory.h"
38 #include "common/Formatter.h"
39 #include "common/bloom_filter.hpp"
40 #include "common/hobject.h"
41 #include "common/snap_types.h"
42 #include "HitSet.h"
43 #include "Watch.h"
44 #include "include/cmp.h"
45 #include "librados/ListObjectImpl.h"
46 #include "compressor/Compressor.h"
47 #include <atomic>
48
49 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
50
51 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
52 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
53 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
54 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
55 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
56 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
57 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
58 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
59 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
60 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
61 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
62 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
63 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
64 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
65 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
66 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
67
68
69 /// min recovery priority for MBackfillReserve
70 #define OSD_RECOVERY_PRIORITY_MIN 0
71
72 /// base backfill priority for MBackfillReserve
73 #define OSD_BACKFILL_PRIORITY_BASE 100
74
75 /// base backfill priority for MBackfillReserve (degraded PG)
76 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
77
78 /// base recovery priority for MBackfillReserve
79 #define OSD_RECOVERY_PRIORITY_BASE 180
80
81 /// base backfill priority for MBackfillReserve (inactive PG)
82 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
83
84 /// max manually/automatically set recovery priority for MBackfillReserve
85 #define OSD_RECOVERY_PRIORITY_MAX 254
86
87 /// max recovery priority for MBackfillReserve, only when forced manually
88 #define OSD_RECOVERY_PRIORITY_FORCED 255
89
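// Editorial example (a sketch, not part of the upstream API): these bases
// combine with a pg-specific boost and are clamped to the allowed range
// before being sent in an MBackfillReserve. The boost parameter here is
// hypothetical; the real computation lives in the PG code.
inline int example_backfill_priority(int boost, bool degraded) {
  int p = (degraded ? OSD_BACKFILL_DEGRADED_PRIORITY_BASE
                    : OSD_BACKFILL_PRIORITY_BASE) + boost;
  if (p > OSD_RECOVERY_PRIORITY_MAX)
    p = OSD_RECOVERY_PRIORITY_MAX;  // cap below the forced-only 255
  if (p < OSD_RECOVERY_PRIORITY_MIN)
    p = OSD_RECOVERY_PRIORITY_MIN;
  return p;
}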
90
91 typedef hobject_t collection_list_handle_t;
92
93 /// convert a single CEPH_OSD_FLAG_* to a string
94 const char *ceph_osd_flag_name(unsigned flag);
95 /// convert a single CEPH_OSD_OP_FLAG_* to a string
96 const char *ceph_osd_op_flag_name(unsigned flag);
97
98 /// convert CEPH_OSD_FLAG_* op flags to a string
99 string ceph_osd_flag_string(unsigned flags);
100 /// convert CEPH_OSD_OP_FLAG_* op flags to a string
101 string ceph_osd_op_flag_string(unsigned flags);
102 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
103 string ceph_osd_alloc_hint_flag_string(unsigned flags);
104
105
106 /**
107 * osd request identifier
108 *
109 * caller name + incarnation# + tid to uniquely identify this request.
110 */
111 struct osd_reqid_t {
112 entity_name_t name; // who
113 ceph_tid_t tid;
114 int32_t inc; // incarnation
115
116 osd_reqid_t()
117 : tid(0), inc(0)
118 {}
119 osd_reqid_t(const osd_reqid_t& other)
120 : name(other.name), tid(other.tid), inc(other.inc)
121 {}
122 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
123 : name(a), tid(t), inc(i)
124 {}
125
126 DENC(osd_reqid_t, v, p) {
127 DENC_START(2, 2, p);
128 denc(v.name, p);
129 denc(v.tid, p);
130 denc(v.inc, p);
131 DENC_FINISH(p);
132 }
133 void dump(Formatter *f) const;
134 static void generate_test_instances(list<osd_reqid_t*>& o);
135 };
136 WRITE_CLASS_DENC(osd_reqid_t)
137
138
139
140 struct pg_shard_t {
141 int32_t osd;
142 shard_id_t shard;
143 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
144 explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
145 pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
146 bool is_undefined() const {
147 return osd == -1;
148 }
149 void encode(bufferlist &bl) const;
150 void decode(bufferlist::iterator &bl);
151 void dump(Formatter *f) const {
152 f->dump_unsigned("osd", osd);
153 if (shard != shard_id_t::NO_SHARD) {
154 f->dump_unsigned("shard", shard);
155 }
156 }
157 };
158 WRITE_CLASS_ENCODER(pg_shard_t)
159 WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
160 WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
161 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
162
163 class IsPGRecoverablePredicate {
164 public:
165 /**
166 * have encodes the shards available
167 */
168 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
169 virtual ~IsPGRecoverablePredicate() {}
170 };
171
172 class IsPGReadablePredicate {
173 public:
174 /**
175 * have encodes the shards available
176 */
177 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
178 virtual ~IsPGReadablePredicate() {}
179 };
180
181 inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
182 return out << r.name << "." << r.inc << ":" << r.tid;
183 }
184
185 inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
186 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
187 }
188 inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
189 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
190 }
191 inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
192 return (l.name < r.name) || (l.name == r.name && (l.inc < r.inc ||
193 (l.inc == r.inc && l.tid < r.tid)));
194 }
195 inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
196 return (l.name < r.name) || (l.name == r.name && (l.inc < r.inc ||
197 (l.inc == r.inc && l.tid <= r.tid)));
198 }
199 inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
200 inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
201
202 namespace std {
203 template<> struct hash<osd_reqid_t> {
204 size_t operator()(const osd_reqid_t &r) const {
205 static hash<uint64_t> H;
206 return H(r.name.num() ^ r.tid ^ r.inc);
207 }
208 };
209 } // namespace std
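// Editorial example (a sketch): a request id orders by name, then
// incarnation, then tid, and prints as "name.inc:tid" via operator<< above.
inline string example_reqid_string() {
  osd_reqid_t r(entity_name_t::CLIENT(4121), 8, 42);  // hypothetical ids
  std::ostringstream ss;
  ss << r;  // yields "client.4121.8:42"
  return ss.str();
}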
210
211
212 // -----
213
214 // a locator constrains the placement of an object. mainly, which pool
215 // does it go in.
216 struct object_locator_t {
217 // You specify either the hash or the key -- not both
218 int64_t pool; ///< pool id
219 string key; ///< key string (if non-empty)
220 string nspace; ///< namespace
221 int64_t hash; ///< hash position (if >= 0)
222
223 explicit object_locator_t()
224 : pool(-1), hash(-1) {}
225 explicit object_locator_t(int64_t po)
226 : pool(po), hash(-1) {}
227 explicit object_locator_t(int64_t po, int64_t ps)
228 : pool(po), hash(ps) {}
229 explicit object_locator_t(int64_t po, string ns)
230 : pool(po), nspace(ns), hash(-1) {}
231 explicit object_locator_t(int64_t po, string ns, int64_t ps)
232 : pool(po), nspace(ns), hash(ps) {}
233 explicit object_locator_t(int64_t po, string ns, string s)
234 : pool(po), key(s), nspace(ns), hash(-1) {}
235 explicit object_locator_t(const hobject_t& soid)
236 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
237
238 int64_t get_pool() const {
239 return pool;
240 }
241
242 void clear() {
243 pool = -1;
244 key = "";
245 nspace = "";
246 hash = -1;
247 }
248
249 bool empty() const {
250 return pool == -1;
251 }
252
253 void encode(bufferlist& bl) const;
254 void decode(bufferlist::iterator& p);
255 void dump(Formatter *f) const;
256 static void generate_test_instances(list<object_locator_t*>& o);
257 };
258 WRITE_CLASS_ENCODER(object_locator_t)
259
260 inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
261 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
262 }
263 inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
264 return !(l == r);
265 }
266
267 inline ostream& operator<<(ostream& out, const object_locator_t& loc)
268 {
269 out << "@" << loc.pool;
270 if (loc.nspace.length())
271 out << ";" << loc.nspace;
272 if (loc.key.length())
273 out << ":" << loc.key;
274 return out;
275 }
276
277 struct request_redirect_t {
278 private:
279 object_locator_t redirect_locator; ///< this is authoritative
280 string redirect_object; ///< If non-empty, the request goes to this object name
281 bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
282
283 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
284 public:
285
286 request_redirect_t() {}
287 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
288 redirect_locator(orig) { redirect_locator.pool = rpool; }
289 explicit request_redirect_t(const object_locator_t& rloc) :
290 redirect_locator(rloc) {}
291 explicit request_redirect_t(const object_locator_t& orig,
292 const string& robj) :
293 redirect_locator(orig), redirect_object(robj) {}
294
295 void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
296 const bufferlist& get_instructions() { return osd_instructions; }
297
298 bool empty() const { return redirect_locator.empty() &&
299 redirect_object.empty(); }
300
301 void combine_with_locator(object_locator_t& orig, string& obj) const {
302 orig = redirect_locator;
303 if (!redirect_object.empty())
304 obj = redirect_object;
305 }
306
307 void encode(bufferlist& bl) const;
308 void decode(bufferlist::iterator& bl);
309 void dump(Formatter *f) const;
310 static void generate_test_instances(list<request_redirect_t*>& o);
311 };
312 WRITE_CLASS_ENCODER(request_redirect_t)
313
314 inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
315 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
316 return out;
317 }
318
319 // Internal OSD op flags - set by the OSD based on the op types
320 enum {
321 CEPH_OSD_RMW_FLAG_READ = (1 << 1),
322 CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
323 CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
324 CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
325 CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
326 CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
327 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
328 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
329 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
330 CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
331 };
332
333
334 // pg stuff
335
336 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
337
338 // placement seed (a hash value)
339 typedef uint32_t ps_t;
340
341 // old (v1) pg_t encoding (wrap old struct ceph_pg)
342 struct old_pg_t {
343 ceph_pg v;
344 void encode(bufferlist& bl) const {
345 ::encode_raw(v, bl);
346 }
347 void decode(bufferlist::iterator& bl) {
348 ::decode_raw(v, bl);
349 }
350 };
351 WRITE_CLASS_ENCODER(old_pg_t)
352
353 // placement group id
354 struct pg_t {
355 uint64_t m_pool;
356 uint32_t m_seed;
357 int32_t m_preferred;
358
359 pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
360 pg_t(ps_t seed, uint64_t pool, int pref=-1) :
361 m_pool(pool), m_seed(seed), m_preferred(pref) {}
362 // cppcheck-suppress noExplicitConstructor
363 pg_t(const ceph_pg& cpg) :
364 m_pool(cpg.pool), m_seed(cpg.ps), m_preferred((__s16)cpg.preferred) {}
365
366 // cppcheck-suppress noExplicitConstructor
367 pg_t(const old_pg_t& opg) {
368 *this = opg.v;
369 }
370
371 old_pg_t get_old_pg() const {
372 old_pg_t o;
373 assert(m_pool < 0xffffffffull);
374 o.v.pool = m_pool;
375 o.v.ps = m_seed;
376 o.v.preferred = (__s16)m_preferred;
377 return o;
378 }
379
380 ps_t ps() const {
381 return m_seed;
382 }
383 uint64_t pool() const {
384 return m_pool;
385 }
386 int32_t preferred() const {
387 return m_preferred;
388 }
389
390 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
391 char *calc_name(char *buf, const char *suffix_backwords) const;
392
393 void set_ps(ps_t p) {
394 m_seed = p;
395 }
396 void set_pool(uint64_t p) {
397 m_pool = p;
398 }
399 void set_preferred(int32_t osd) {
400 m_preferred = osd;
401 }
402
403 pg_t get_parent() const;
404 pg_t get_ancestor(unsigned old_pg_num) const;
405
406 int print(char *o, int maxlen) const;
407 bool parse(const char *s);
408
409 bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;
410
411 /**
412 * Returns b such that for all object o:
413 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
414 */
415 unsigned get_split_bits(unsigned pg_num) const;
416
417 bool contains(int bits, const ghobject_t& oid) {
418 return oid.match(bits, ps());
419 }
420 bool contains(int bits, const hobject_t& oid) {
421 return oid.match(bits, ps());
422 }
423
424 hobject_t get_hobj_start() const;
425 hobject_t get_hobj_end(unsigned pg_num) const;
426
427 void encode(bufferlist& bl) const {
428 __u8 v = 1;
429 ::encode(v, bl);
430 ::encode(m_pool, bl);
431 ::encode(m_seed, bl);
432 ::encode(m_preferred, bl);
433 }
434 void decode(bufferlist::iterator& bl) {
435 __u8 v;
436 ::decode(v, bl);
437 ::decode(m_pool, bl);
438 ::decode(m_seed, bl);
439 ::decode(m_preferred, bl);
440 }
441 void decode_old(bufferlist::iterator& bl) {
442 old_pg_t opg;
443 ::decode(opg, bl);
444 *this = opg;
445 }
446 void dump(Formatter *f) const;
447 static void generate_test_instances(list<pg_t*>& o);
448 };
449 WRITE_CLASS_ENCODER(pg_t)
450
451 inline bool operator<(const pg_t& l, const pg_t& r) {
452 return l.pool() < r.pool() ||
453 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
454 (l.preferred() == r.preferred() && (l.ps() < r.ps()))));
455 }
456 inline bool operator<=(const pg_t& l, const pg_t& r) {
457 return l.pool() < r.pool() ||
458 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
459 (l.preferred() == r.preferred() && (l.ps() <= r.ps()))));
460 }
461 inline bool operator==(const pg_t& l, const pg_t& r) {
462 return l.pool() == r.pool() &&
463 l.preferred() == r.preferred() &&
464 l.ps() == r.ps();
465 }
466 inline bool operator!=(const pg_t& l, const pg_t& r) {
467 return l.pool() != r.pool() ||
468 l.preferred() != r.preferred() ||
469 l.ps() != r.ps();
470 }
471 inline bool operator>(const pg_t& l, const pg_t& r) {
472 return l.pool() > r.pool() ||
473 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
474 (l.preferred() == r.preferred() && (l.ps() > r.ps()))));
475 }
476 inline bool operator>=(const pg_t& l, const pg_t& r) {
477 return l.pool() > r.pool() ||
478 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
479 (l.preferred() == r.preferred() && (l.ps() >= r.ps()))));
480 }
481
482 ostream& operator<<(ostream& out, const pg_t &pg);
483
484 namespace std {
485 template<> struct hash< pg_t >
486 {
487 size_t operator()( const pg_t& x ) const
488 {
489 static hash<uint32_t> H;
490 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
491 }
492 };
493 } // namespace std
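// Editorial example (a sketch): when a pool's pg_num doubles from 8 to 16,
// the pg with seed 3 splits and gains the child with seed 3 + 8 = 11, so
// is_split() returns true and reports that child. Pool 1 is hypothetical.
inline bool example_pg_split(set<pg_t> *children) {
  pg_t parent(3, 1);  // seed 3 in pool 1
  return parent.is_split(8, 16, children);
}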
494
495 struct spg_t {
496 pg_t pgid;
497 shard_id_t shard;
498 spg_t() : shard(shard_id_t::NO_SHARD) {}
499 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
500 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
501 unsigned get_split_bits(unsigned pg_num) const {
502 return pgid.get_split_bits(pg_num);
503 }
504 spg_t get_parent() const {
505 return spg_t(pgid.get_parent(), shard);
506 }
507 ps_t ps() const {
508 return pgid.ps();
509 }
510 uint64_t pool() const {
511 return pgid.pool();
512 }
513 int32_t preferred() const {
514 return pgid.preferred();
515 }
516
517 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
518 char *calc_name(char *buf, const char *suffix_backwords) const;
519
520 bool parse(const char *s);
521 bool parse(const std::string& s) {
522 return parse(s.c_str());
523 }
524 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
525 set<spg_t> *pchildren) const {
526 set<pg_t> _children;
527 set<pg_t> *children = pchildren ? &_children : NULL;
528 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
529 if (pchildren && is_split) {
530 for (set<pg_t>::iterator i = _children.begin();
531 i != _children.end();
532 ++i) {
533 pchildren->insert(spg_t(*i, shard));
534 }
535 }
536 return is_split;
537 }
538 bool is_no_shard() const {
539 return shard == shard_id_t::NO_SHARD;
540 }
541
542 ghobject_t make_pgmeta_oid() const {
543 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
544 }
545
546 void encode(bufferlist &bl) const {
547 ENCODE_START(1, 1, bl);
548 ::encode(pgid, bl);
549 ::encode(shard, bl);
550 ENCODE_FINISH(bl);
551 }
552 void decode(bufferlist::iterator &bl) {
553 DECODE_START(1, bl);
554 ::decode(pgid, bl);
555 ::decode(shard, bl);
556 DECODE_FINISH(bl);
557 }
558
559 ghobject_t make_temp_ghobject(const string& name) const {
560 return ghobject_t(
561 hobject_t(object_t(name), "", CEPH_NOSNAP,
562 pgid.ps(),
563 hobject_t::POOL_TEMP_START - pgid.pool(), ""),
564 ghobject_t::NO_GEN,
565 shard);
566 }
567
568 unsigned hash_to_shard(unsigned num_shards) const {
569 return ps() % num_shards;
570 }
571 };
572 WRITE_CLASS_ENCODER(spg_t)
573 WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
574 WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
575
576 namespace std {
577 template<> struct hash< spg_t >
578 {
579 size_t operator()( const spg_t& x ) const
580 {
581 static hash<uint32_t> H;
582 return H(hash<pg_t>()(x.pgid) ^ x.shard);
583 }
584 };
585 } // namespace std
586
587 ostream& operator<<(ostream& out, const spg_t &pg);
588
589 // ----------------------
590
591 class coll_t {
592 enum type_t {
593 TYPE_META = 0,
594 TYPE_LEGACY_TEMP = 1, /* no longer used */
595 TYPE_PG = 2,
596 TYPE_PG_TEMP = 3,
597 };
598 type_t type;
599 spg_t pgid;
600 uint64_t removal_seq; // note: deprecated, not encoded
601
602 char _str_buff[spg_t::calc_name_buf_size];
603 char *_str;
604
605 void calc_str();
606
607 coll_t(type_t t, spg_t p, uint64_t r)
608 : type(t), pgid(p), removal_seq(r) {
609 calc_str();
610 }
611
612 public:
613 coll_t() : type(TYPE_META), removal_seq(0)
614 {
615 calc_str();
616 }
617
618 coll_t(const coll_t& other)
619 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
620 calc_str();
621 }
622
623 explicit coll_t(spg_t pgid)
624 : type(TYPE_PG), pgid(pgid), removal_seq(0)
625 {
626 calc_str();
627 }
628
629 coll_t& operator=(const coll_t& rhs)
630 {
631 this->type = rhs.type;
632 this->pgid = rhs.pgid;
633 this->removal_seq = rhs.removal_seq;
634 this->calc_str();
635 return *this;
636 }
637
638 // named constructors
639 static coll_t meta() {
640 return coll_t();
641 }
642 static coll_t pg(spg_t p) {
643 return coll_t(p);
644 }
645
646 const std::string to_str() const {
647 return string(_str);
648 }
649 const char *c_str() const {
650 return _str;
651 }
652
653 bool parse(const std::string& s);
654
655 bool operator<(const coll_t &rhs) const {
656 return type < rhs.type ||
657 (type == rhs.type && pgid < rhs.pgid);
658 }
659
660 bool is_meta() const {
661 return type == TYPE_META;
662 }
663 bool is_pg_prefix(spg_t *pgid_) const {
664 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
665 *pgid_ = pgid;
666 return true;
667 }
668 return false;
669 }
670 bool is_pg() const {
671 return type == TYPE_PG;
672 }
673 bool is_pg(spg_t *pgid_) const {
674 if (type == TYPE_PG) {
675 *pgid_ = pgid;
676 return true;
677 }
678 return false;
679 }
680 bool is_temp() const {
681 return type == TYPE_PG_TEMP;
682 }
683 bool is_temp(spg_t *pgid_) const {
684 if (type == TYPE_PG_TEMP) {
685 *pgid_ = pgid;
686 return true;
687 }
688 return false;
689 }
690
691 void encode(bufferlist& bl) const;
692 void decode(bufferlist::iterator& bl);
693 size_t encoded_size() const;
694
695 inline bool operator==(const coll_t& rhs) const {
696 // pgid is meaningless for TYPE_META, so compare it only for pg types
697 if (type != rhs.type)
698 return false;
699 if (type == TYPE_META)
700 return true;
701 return pgid == rhs.pgid;
702 }
703 inline bool operator!=(const coll_t& rhs) const {
704 return !(*this == rhs);
705 }
706
707 // get a TEMP collection that corresponds to the current collection,
708 // which we presume is a pg collection.
709 coll_t get_temp() const {
710 assert(type == TYPE_PG);
711 return coll_t(TYPE_PG_TEMP, pgid, 0);
712 }
713
714 ghobject_t get_min_hobj() const {
715 ghobject_t o;
716 switch (type) {
717 case TYPE_PG:
718 o.hobj.pool = pgid.pool();
719 o.set_shard(pgid.shard);
720 break;
721 case TYPE_META:
722 o.hobj.pool = -1;
723 break;
724 default:
725 break;
726 }
727 return o;
728 }
729
730 unsigned hash_to_shard(unsigned num_shards) const {
731 if (type == TYPE_PG)
732 return pgid.hash_to_shard(num_shards);
733 return 0; // whatever.
734 }
735
736 void dump(Formatter *f) const;
737 static void generate_test_instances(list<coll_t*>& o);
738 };
739
740 WRITE_CLASS_ENCODER(coll_t)
741
742 inline ostream& operator<<(ostream& out, const coll_t& c) {
743 out << c.to_str();
744 return out;
745 }
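// Editorial note (a sketch of the observed naming; calc_str() in
// osd_types.cc is authoritative):
//   coll_t::meta()                prints as "meta"
//   coll_t(spg_t(pg_t(3, 1)))     prints as "1.3_head"
//   the matching get_temp()       prints as "1.3_TEMP"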
746
747 namespace std {
748 template<> struct hash<coll_t> {
749 size_t operator()(const coll_t &c) const {
750 size_t h = 0;
751 string str(c.to_str());
752 std::string::const_iterator end(str.end());
753 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
754 h += *s;
755 h += (h << 10);
756 h ^= (h >> 6);
757 }
758 h += (h << 3);
759 h ^= (h >> 11);
760 h += (h << 15);
761 return h;
762 }
763 };
764 } // namespace std
765
766 inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
767 {
768 out << pg_t(ol.ol_pgid);
769 int su = ol.ol_stripe_unit;
770 if (su)
771 out << ".su=" << su;
772 return out;
773 }
774
775
776
777 // compound rados version type
778 /* WARNING: if you add a member to eversion_t, make sure the encode/decode
779 * functions still work. On little-endian machines the struct is copied raw,
780 * so there must be no padding on either 32-bit or 64-bit machines.
781 */
782 class eversion_t {
783 public:
784 version_t version;
785 epoch_t epoch;
786 __u32 __pad;
787 eversion_t() : version(0), epoch(0), __pad(0) {}
788 eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
789
790 // cppcheck-suppress noExplicitConstructor
791 eversion_t(const ceph_eversion& ce) :
792 version(ce.version),
793 epoch(ce.epoch),
794 __pad(0) { }
795
796 explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }
797
798 static eversion_t max() {
799 eversion_t max;
800 max.version -= 1;
801 max.epoch -= 1;
802 return max;
803 }
804
805 operator ceph_eversion() {
806 ceph_eversion c;
807 c.epoch = epoch;
808 c.version = version;
809 return c;
810 }
811
812 string get_key_name() const;
813
814 void encode(bufferlist &bl) const {
815 #if defined(CEPH_LITTLE_ENDIAN)
816 bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
817 #else
818 ::encode(version, bl);
819 ::encode(epoch, bl);
820 #endif
821 }
822 void decode(bufferlist::iterator &bl) {
823 #if defined(CEPH_LITTLE_ENDIAN)
824 bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
825 #else
826 ::decode(version, bl);
827 ::decode(epoch, bl);
828 #endif
829 }
830 void decode(bufferlist& bl) {
831 bufferlist::iterator p = bl.begin();
832 decode(p);
833 }
834 };
835 WRITE_CLASS_ENCODER(eversion_t)
836
837 inline bool operator==(const eversion_t& l, const eversion_t& r) {
838 return (l.epoch == r.epoch) && (l.version == r.version);
839 }
840 inline bool operator!=(const eversion_t& l, const eversion_t& r) {
841 return (l.epoch != r.epoch) || (l.version != r.version);
842 }
843 inline bool operator<(const eversion_t& l, const eversion_t& r) {
844 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
845 }
846 inline bool operator<=(const eversion_t& l, const eversion_t& r) {
847 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
848 }
849 inline bool operator>(const eversion_t& l, const eversion_t& r) {
850 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
851 }
852 inline bool operator>=(const eversion_t& l, const eversion_t& r) {
853 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
854 }
855 inline ostream& operator<<(ostream& out, const eversion_t& e) {
856 return out << e.epoch << "'" << e.version;
857 }
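// Editorial example (a sketch): versions order by epoch first, then by
// version within an epoch; eversion_t(5, 2) prints as "5'2".
inline bool example_eversion_order() {
  return eversion_t(5, 2) < eversion_t(5, 10) &&  // same epoch: version wins
         eversion_t(5, 10) < eversion_t(6, 0);    // else: epoch wins
}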
858
859 /**
860 * objectstore_perf_stat_t
861 *
862 * current perf information about the osd
863 */
864 struct objectstore_perf_stat_t {
865 // latencies are in ms, stored as integers, since double add/sub are not associative
866 uint32_t os_commit_latency;
867 uint32_t os_apply_latency;
868
869 objectstore_perf_stat_t() :
870 os_commit_latency(0), os_apply_latency(0) {}
871
872 bool operator==(const objectstore_perf_stat_t &r) const {
873 return os_commit_latency == r.os_commit_latency &&
874 os_apply_latency == r.os_apply_latency;
875 }
876
877 void add(const objectstore_perf_stat_t &o) {
878 os_commit_latency += o.os_commit_latency;
879 os_apply_latency += o.os_apply_latency;
880 }
881 void sub(const objectstore_perf_stat_t &o) {
882 os_commit_latency -= o.os_commit_latency;
883 os_apply_latency -= o.os_apply_latency;
884 }
885 void dump(Formatter *f) const;
886 void encode(bufferlist &bl) const;
887 void decode(bufferlist::iterator &bl);
888 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
889 };
890 WRITE_CLASS_ENCODER(objectstore_perf_stat_t)
891
892 /** osd_stat
893 * aggregate stats for an osd
894 */
895 struct osd_stat_t {
896 int64_t kb, kb_used, kb_avail;
897 vector<int> hb_peers;
898 int32_t snap_trim_queue_len, num_snap_trimming;
899
900 pow2_hist_t op_queue_age_hist;
901
902 objectstore_perf_stat_t os_perf_stat;
903
904 epoch_t up_from = 0;
905 uint64_t seq = 0;
906
907 osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
908 snap_trim_queue_len(0), num_snap_trimming(0) {}
909
910 void add(const osd_stat_t& o) {
911 kb += o.kb;
912 kb_used += o.kb_used;
913 kb_avail += o.kb_avail;
914 snap_trim_queue_len += o.snap_trim_queue_len;
915 num_snap_trimming += o.num_snap_trimming;
916 op_queue_age_hist.add(o.op_queue_age_hist);
917 os_perf_stat.add(o.os_perf_stat);
918 }
919 void sub(const osd_stat_t& o) {
920 kb -= o.kb;
921 kb_used -= o.kb_used;
922 kb_avail -= o.kb_avail;
923 snap_trim_queue_len -= o.snap_trim_queue_len;
924 num_snap_trimming -= o.num_snap_trimming;
925 op_queue_age_hist.sub(o.op_queue_age_hist);
926 os_perf_stat.sub(o.os_perf_stat);
927 }
928
929 void dump(Formatter *f) const;
930 void encode(bufferlist &bl) const;
931 void decode(bufferlist::iterator &bl);
932 static void generate_test_instances(std::list<osd_stat_t*>& o);
933 };
934 WRITE_CLASS_ENCODER(osd_stat_t)
935
936 inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
937 return l.kb == r.kb &&
938 l.kb_used == r.kb_used &&
939 l.kb_avail == r.kb_avail &&
940 l.snap_trim_queue_len == r.snap_trim_queue_len &&
941 l.num_snap_trimming == r.num_snap_trimming &&
942 l.hb_peers == r.hb_peers &&
943 l.op_queue_age_hist == r.op_queue_age_hist &&
944 l.os_perf_stat == r.os_perf_stat;
945 }
946 inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
947 return !(l == r);
948 }
949
950
951
952 inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
953 return out << "osd_stat(" << kb_t(s.kb_used) << " used, "
954 << kb_t(s.kb_avail) << " avail, "
955 << kb_t(s.kb) << " total, "
956 << "peers " << s.hb_peers
957 << " op hist " << s.op_queue_age_hist.h
958 << ")";
959 }
960
961
962 /*
963 * pg states
964 */
965 #define PG_STATE_CREATING (1<<0) // creating
966 #define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
967 #define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
968 #define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
969 //#define PG_STATE_REPLAY (1<<5) // crashed, waiting for replay
970 //#define PG_STATE_STRAY (1<<6) // i must notify the primary i exist.
971 //#define PG_STATE_SPLITTING (1<<7) // i am splitting
972 #define PG_STATE_SCRUBBING (1<<8) // scrubbing
973 //#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
974 #define PG_STATE_DEGRADED (1<<10) // pg contains objects with reduced redundancy
975 #define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be)
976 #define PG_STATE_PEERING (1<<12) // pg is (re)peering
977 #define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub
978 #define PG_STATE_RECOVERING (1<<14) // pg is recovering/migrating objects
979 #define PG_STATE_BACKFILL_WAIT (1<<15) // [active] reserving backfill
980 #define PG_STATE_INCOMPLETE (1<<16) // incomplete content, peering failed.
981 #define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
982 #define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
983 #define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
984 #define PG_STATE_BACKFILL (1<<20) // [active] backfilling pg content
985 #define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
986 #define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
987 #define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
988 #define PG_STATE_ACTIVATING (1<<24) // pg is peered but not yet active
989 #define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover
990 #define PG_STATE_SNAPTRIM (1<<26) // trimming snaps
991 #define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
992 #define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
993 #define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps
994 #define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
995 #define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
996
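// Editorial example (a sketch): the state word is a bitmask, and
// pg_state_string() joins the set bits with '+', e.g.
// pg_state_string(PG_STATE_ACTIVE|PG_STATE_CLEAN) yields "active+clean".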
997 std::string pg_state_string(int state);
998 std::string pg_vector_string(const vector<int32_t> &a);
999 int pg_string_state(const std::string& state);
1000
1001
1002 /*
1003 * pool_snap_info_t
1004 *
1005 * attributes for a single pool snapshot.
1006 */
1007 struct pool_snap_info_t {
1008 snapid_t snapid;
1009 utime_t stamp;
1010 string name;
1011
1012 void dump(Formatter *f) const;
1013 void encode(bufferlist& bl, uint64_t features) const;
1014 void decode(bufferlist::iterator& bl);
1015 static void generate_test_instances(list<pool_snap_info_t*>& o);
1016 };
1017 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1018
1019 inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
1020 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1021 }
1022
1023
1024 /*
1025 * pool_opts_t
1026 *
1027 * pool options.
1028 */
1029
1030 class pool_opts_t {
1031 public:
1032 enum key_t {
1033 SCRUB_MIN_INTERVAL,
1034 SCRUB_MAX_INTERVAL,
1035 DEEP_SCRUB_INTERVAL,
1036 RECOVERY_PRIORITY,
1037 RECOVERY_OP_PRIORITY,
1038 SCRUB_PRIORITY,
1039 COMPRESSION_MODE,
1040 COMPRESSION_ALGORITHM,
1041 COMPRESSION_REQUIRED_RATIO,
1042 COMPRESSION_MAX_BLOB_SIZE,
1043 COMPRESSION_MIN_BLOB_SIZE,
1044 CSUM_TYPE,
1045 CSUM_MAX_BLOCK,
1046 CSUM_MIN_BLOCK,
1047 };
1048
1049 enum type_t {
1050 STR,
1051 INT,
1052 DOUBLE,
1053 };
1054
1055 struct opt_desc_t {
1056 key_t key;
1057 type_t type;
1058
1059 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1060
1061 bool operator==(const opt_desc_t& rhs) const {
1062 return key == rhs.key && type == rhs.type;
1063 }
1064 };
1065
1066 typedef boost::variant<std::string,int,double> value_t;
1067
1068 static bool is_opt_name(const std::string& name);
1069 static opt_desc_t get_opt_desc(const std::string& name);
1070
1071 pool_opts_t() : opts() {}
1072
1073 bool is_set(key_t key) const;
1074
1075 template<typename T>
1076 void set(key_t key, const T &val) {
1077 value_t value = val;
1078 opts[key] = value;
1079 }
1080
1081 template<typename T>
1082 bool get(key_t key, T *val) const {
1083 opts_t::const_iterator i = opts.find(key);
1084 if (i == opts.end()) {
1085 return false;
1086 }
1087 *val = boost::get<T>(i->second);
1088 return true;
1089 }
1090
1091 const value_t& get(key_t key) const;
1092
1093 bool unset(key_t key);
1094
1095 void dump(const std::string& name, Formatter *f) const;
1096
1097 void dump(Formatter *f) const;
1098 void encode(bufferlist &bl) const;
1099 void decode(bufferlist::iterator &bl);
1100
1101 private:
1102 typedef std::map<key_t, value_t> opts_t;
1103 opts_t opts;
1104
1105 friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
1106 };
1107 WRITE_CLASS_ENCODER(pool_opts_t)
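// Editorial example (a sketch): options are stored as a boost::variant,
// so get<T>() must be called with the type matching the option's declared
// type_t (INT here); a mismatched T makes boost::get throw bad_get.
inline bool example_get_recovery_priority(const pool_opts_t& opts,
                                          int *priority) {
  return opts.get(pool_opts_t::RECOVERY_PRIORITY, priority);
}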
1108
1109 /*
1110 * pg_pool
1111 */
1112 struct pg_pool_t {
1113 static const char *APPLICATION_NAME_CEPHFS;
1114 static const char *APPLICATION_NAME_RBD;
1115 static const char *APPLICATION_NAME_RGW;
1116
1117 enum {
1118 TYPE_REPLICATED = 1, // replication
1119 //TYPE_RAID4 = 2, // raid4 (never implemented)
1120 TYPE_ERASURE = 3, // erasure-coded
1121 };
1122 static const char *get_type_name(int t) {
1123 switch (t) {
1124 case TYPE_REPLICATED: return "replicated";
1125 //case TYPE_RAID4: return "raid4";
1126 case TYPE_ERASURE: return "erasure";
1127 default: return "???";
1128 }
1129 }
1130 const char *get_type_name() const {
1131 return get_type_name(type);
1132 }
1133
1134 enum {
1135 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1136 FLAG_FULL = 1<<1, // pool is full
1137 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1138 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1139 FLAG_NODELETE = 1<<4, // pool can't be deleted
1140 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1141 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1142 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1143 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1144 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
1145 };
1146
1147 static const char *get_flag_name(int f) {
1148 switch (f) {
1149 case FLAG_HASHPSPOOL: return "hashpspool";
1150 case FLAG_FULL: return "full";
1151 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1152 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1153 case FLAG_NODELETE: return "nodelete";
1154 case FLAG_NOPGCHANGE: return "nopgchange";
1155 case FLAG_NOSIZECHANGE: return "nosizechange";
1156 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1157 case FLAG_NOSCRUB: return "noscrub";
1158 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
1159 default: return "???";
1160 }
1161 }
1162 static string get_flags_string(uint64_t f) {
1163 string s;
1164 for (unsigned n=0; f && n<64; ++n) {
1165 if (f & (1ull << n)) {
1166 if (s.length())
1167 s += ",";
1168 s += get_flag_name(1ull << n);
1169 }
1170 }
1171 return s;
1172 }
1173 string get_flags_string() const {
1174 return get_flags_string(flags);
1175 }
1176 static uint64_t get_flag_by_name(const string& name) {
1177 if (name == "hashpspool")
1178 return FLAG_HASHPSPOOL;
1179 if (name == "full")
1180 return FLAG_FULL;
1181 if (name == "ec_overwrites")
1182 return FLAG_EC_OVERWRITES;
1183 if (name == "incomplete_clones")
1184 return FLAG_INCOMPLETE_CLONES;
1185 if (name == "nodelete")
1186 return FLAG_NODELETE;
1187 if (name == "nopgchange")
1188 return FLAG_NOPGCHANGE;
1189 if (name == "nosizechange")
1190 return FLAG_NOSIZECHANGE;
1191 if (name == "write_fadvise_dontneed")
1192 return FLAG_WRITE_FADVISE_DONTNEED;
1193 if (name == "noscrub")
1194 return FLAG_NOSCRUB;
1195 if (name == "nodeep-scrub")
1196 return FLAG_NODEEP_SCRUB;
1197 return 0;
1198 }
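  // Editorial example (a sketch): flags combine as a bitmask, so
  // get_flags_string(FLAG_NOSCRUB|FLAG_NODEEP_SCRUB) yields
  // "noscrub,nodeep-scrub", and get_flag_by_name() is the inverse for a
  // single name.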
1199
1200 /// converts the acting/up vector to a set of pg shards
1201 void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
1202
1203 typedef enum {
1204 CACHEMODE_NONE = 0, ///< no caching
1205 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1206 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1207 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1208 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache, flush later
1209 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache, flush later
1210 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1211 } cache_mode_t;
1212 static const char *get_cache_mode_name(cache_mode_t m) {
1213 switch (m) {
1214 case CACHEMODE_NONE: return "none";
1215 case CACHEMODE_WRITEBACK: return "writeback";
1216 case CACHEMODE_FORWARD: return "forward";
1217 case CACHEMODE_READONLY: return "readonly";
1218 case CACHEMODE_READFORWARD: return "readforward";
1219 case CACHEMODE_READPROXY: return "readproxy";
1220 case CACHEMODE_PROXY: return "proxy";
1221 default: return "unknown";
1222 }
1223 }
1224 static cache_mode_t get_cache_mode_from_str(const string& s) {
1225 if (s == "none")
1226 return CACHEMODE_NONE;
1227 if (s == "writeback")
1228 return CACHEMODE_WRITEBACK;
1229 if (s == "forward")
1230 return CACHEMODE_FORWARD;
1231 if (s == "readonly")
1232 return CACHEMODE_READONLY;
1233 if (s == "readforward")
1234 return CACHEMODE_READFORWARD;
1235 if (s == "readproxy")
1236 return CACHEMODE_READPROXY;
1237 if (s == "proxy")
1238 return CACHEMODE_PROXY;
1239 return (cache_mode_t)-1;
1240 }
1241 const char *get_cache_mode_name() const {
1242 return get_cache_mode_name(cache_mode);
1243 }
1244 bool cache_mode_requires_hit_set() const {
1245 switch (cache_mode) {
1246 case CACHEMODE_NONE:
1247 case CACHEMODE_FORWARD:
1248 case CACHEMODE_READONLY:
1249 case CACHEMODE_PROXY:
1250 return false;
1251 case CACHEMODE_WRITEBACK:
1252 case CACHEMODE_READFORWARD:
1253 case CACHEMODE_READPROXY:
1254 return true;
1255 default:
1256 assert(0 == "implement me");
1257 }
1258 }
1259
1260 uint64_t flags; ///< FLAG_*
1261 __u8 type; ///< TYPE_*
1262 __u8 size, min_size; ///< number of osds in each pg
1263 __u8 crush_rule; ///< crush placement rule
1264 __u8 object_hash; ///< hash mapping object name to ps
1265 private:
1266 __u32 pg_num, pgp_num; ///< number of pgs
1267
1268
1269 public:
1270 map<string,string> properties; ///< OBSOLETE
1271 string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1272 epoch_t last_change; ///< most recent epoch changed, excluding snapshot changes
1273 epoch_t last_force_op_resend; ///< last epoch that forced clients to resend
1274 /// last epoch that forced clients to resend (pre-luminous clients only)
1275 epoch_t last_force_op_resend_preluminous;
1276 snapid_t snap_seq; ///< seq for per-pool snapshot
1277 epoch_t snap_epoch; ///< osdmap epoch of last snap
1278 uint64_t auid; ///< who owns the pg
1279 __u32 crash_replay_interval; ///< seconds to allow clients to replay ACKed but unCOMMITted requests
1280
1281 uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
1282 uint64_t quota_max_objects; ///< maximum number of objects for this pool
1283
1284 /*
1285 * Pool snaps (global to this pool). These define a SnapContext for
1286 * the pool, unless the client manually specifies an alternate
1287 * context.
1288 */
1289 map<snapid_t, pool_snap_info_t> snaps;
1290 /*
1291 * Alternatively, if we are defining non-pool snaps (e.g. via the
1292 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1293 * used). Snaps and removed_snaps are to be used exclusive of each
1294 * other!
1295 */
1296 interval_set<snapid_t> removed_snaps;
1297
1298 unsigned pg_num_mask, pgp_num_mask;
1299
1300 set<uint64_t> tiers; ///< pools that are tiers of us
1301 int64_t tier_of; ///< pool for which we are a tier
1302 // Note that write wins for read+write ops
1303 int64_t read_tier; ///< pool/tier for objecter to direct reads to
1304 int64_t write_tier; ///< pool/tier for objecter to direct writes to
1305 cache_mode_t cache_mode; ///< cache pool mode
1306
1307 bool is_tier() const { return tier_of >= 0; }
1308 bool has_tiers() const { return !tiers.empty(); }
1309 void clear_tier() {
1310 tier_of = -1;
1311 clear_read_tier();
1312 clear_write_tier();
1313 clear_tier_tunables();
1314 }
1315 bool has_read_tier() const { return read_tier >= 0; }
1316 void clear_read_tier() { read_tier = -1; }
1317 bool has_write_tier() const { return write_tier >= 0; }
1318 void clear_write_tier() { write_tier = -1; }
1319 void clear_tier_tunables() {
1320 if (cache_mode != CACHEMODE_NONE)
1321 flags |= FLAG_INCOMPLETE_CLONES;
1322 cache_mode = CACHEMODE_NONE;
1323
1324 target_max_bytes = 0;
1325 target_max_objects = 0;
1326 cache_target_dirty_ratio_micro = 0;
1327 cache_target_dirty_high_ratio_micro = 0;
1328 cache_target_full_ratio_micro = 0;
1329 hit_set_params = HitSet::Params();
1330 hit_set_period = 0;
1331 hit_set_count = 0;
1332 hit_set_grade_decay_rate = 0;
1333 hit_set_search_last_n = 0;
1334 grade_table.resize(0);
1335 }
1336
1337 uint64_t target_max_bytes; ///< tiering: target max pool size
1338 uint64_t target_max_objects; ///< tiering: target max pool size
1339
1340 uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
1341 uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush at high speed
1342 uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest
1343
1344 uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush
1345 uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict
1346
1347 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1348 uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
1349 uint32_t hit_set_count; ///< number of periods to retain
1350 bool use_gmt_hitset; ///< use gmt to name the hitset archive object
1351 uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
1352 uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
1353 uint32_t hit_set_grade_decay_rate; ///< the current hit_set has the highest priority for
1354 ///< object temperature counts; each older hit_set's
1355 ///< priority decays by this rate relative to the next
1356 uint32_t hit_set_search_last_n; ///< accumulate at most N hit_sets for temperature
1357
1358 uint32_t stripe_width; ///< erasure coded stripe size in bytes
1359
1360 uint64_t expected_num_objects; ///< expected number of objects in this pool; 0 indicates
1361 ///< the user did not specify an expected value
1362 bool fast_read; ///< whether fast read is enabled on the pool
1363
1364 pool_opts_t opts; ///< options
1365
1366 /// application -> key/value metadata
1367 map<string, std::map<string, string>> application_metadata;
1368
1369 private:
1370 vector<uint32_t> grade_table;
1371
1372 public:
1373 uint32_t get_grade(unsigned i) const {
1374 if (grade_table.size() <= i)
1375 return 0;
1376 return grade_table[i];
1377 }
1378 void calc_grade_table() {
1379 unsigned v = 1000000;
1380 grade_table.resize(hit_set_count);
1381 for (unsigned i = 0; i < hit_set_count; i++) {
1382 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1383 grade_table[i] = v;
1384 }
1385 }
1386
1387 pg_pool_t()
1388 : flags(0), type(0), size(0), min_size(0),
1389 crush_rule(0), object_hash(0),
1390 pg_num(0), pgp_num(0),
1391 last_change(0),
1392 last_force_op_resend(0),
1393 last_force_op_resend_preluminous(0),
1394 snap_seq(0), snap_epoch(0),
1395 auid(0),
1396 crash_replay_interval(0),
1397 quota_max_bytes(0), quota_max_objects(0),
1398 pg_num_mask(0), pgp_num_mask(0),
1399 tier_of(-1), read_tier(-1), write_tier(-1),
1400 cache_mode(CACHEMODE_NONE),
1401 target_max_bytes(0), target_max_objects(0),
1402 cache_target_dirty_ratio_micro(0),
1403 cache_target_dirty_high_ratio_micro(0),
1404 cache_target_full_ratio_micro(0),
1405 cache_min_flush_age(0),
1406 cache_min_evict_age(0),
1407 hit_set_params(),
1408 hit_set_period(0),
1409 hit_set_count(0),
1410 use_gmt_hitset(true),
1411 min_read_recency_for_promote(0),
1412 min_write_recency_for_promote(0),
1413 hit_set_grade_decay_rate(0),
1414 hit_set_search_last_n(0),
1415 stripe_width(0),
1416 expected_num_objects(0),
1417 fast_read(false),
1418 opts()
1419 { }
1420
1421 void dump(Formatter *f) const;
1422
1423 uint64_t get_flags() const { return flags; }
1424 bool has_flag(uint64_t f) const { return flags & f; }
1425 void set_flag(uint64_t f) { flags |= f; }
1426 void unset_flag(uint64_t f) { flags &= ~f; }
1427
1428 bool ec_pool() const {
1429 return type == TYPE_ERASURE;
1430 }
1431 bool require_rollback() const {
1432 return ec_pool();
1433 }
1434
1435 /// true if incomplete clones may be present
1436 bool allow_incomplete_clones() const {
1437 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1438 }
1439
1440 unsigned get_type() const { return type; }
1441 unsigned get_size() const { return size; }
1442 unsigned get_min_size() const { return min_size; }
1443 int get_crush_rule() const { return crush_rule; }
1444 int get_object_hash() const { return object_hash; }
1445 const char *get_object_hash_name() const {
1446 return ceph_str_hash_name(get_object_hash());
1447 }
1448 epoch_t get_last_change() const { return last_change; }
1449 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
1450 epoch_t get_last_force_op_resend_preluminous() const {
1451 return last_force_op_resend_preluminous;
1452 }
1453 epoch_t get_snap_epoch() const { return snap_epoch; }
1454 snapid_t get_snap_seq() const { return snap_seq; }
1455 uint64_t get_auid() const { return auid; }
1456 unsigned get_crash_replay_interval() const { return crash_replay_interval; }
1457
1458 void set_snap_seq(snapid_t s) { snap_seq = s; }
1459 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1460
1461 void set_stripe_width(uint32_t s) { stripe_width = s; }
1462 uint32_t get_stripe_width() const { return stripe_width; }
1463
1464 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1465 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1466
1467 bool supports_omap() const {
1468 return !(get_type() == TYPE_ERASURE);
1469 }
1470
1471 bool requires_aligned_append() const {
1472 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1473 }
1474 uint64_t required_alignment() const { return stripe_width; }
1475
1476 bool allows_ecoverwrites() const {
1477 return has_flag(FLAG_EC_OVERWRITES);
1478 }
1479
1480 bool can_shift_osds() const {
1481 switch (get_type()) {
1482 case TYPE_REPLICATED:
1483 return true;
1484 case TYPE_ERASURE:
1485 return false;
1486 default:
1487 assert(0 == "unhandled pool type");
1488 }
1489 }
1490
1491 unsigned get_pg_num() const { return pg_num; }
1492 unsigned get_pgp_num() const { return pgp_num; }
1493
1494 unsigned get_pg_num_mask() const { return pg_num_mask; }
1495 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1496
1497 // if pg_num is not a power of two, pgs are not equally sized.
1498 // return, for a given pg, the fraction (denominator) of the total
1499 // pool size that it represents.
1500 unsigned get_pg_num_divisor(pg_t pgid) const;
1501
1502 void set_pg_num(int p) {
1503 pg_num = p;
1504 calc_pg_masks();
1505 }
1506 void set_pgp_num(int p) {
1507 pgp_num = p;
1508 calc_pg_masks();
1509 }
1510
1511 void set_quota_max_bytes(uint64_t m) {
1512 quota_max_bytes = m;
1513 }
1514 uint64_t get_quota_max_bytes() {
1515 return quota_max_bytes;
1516 }
1517
1518 void set_quota_max_objects(uint64_t m) {
1519 quota_max_objects = m;
1520 }
1521 uint64_t get_quota_max_objects() {
1522 return quota_max_objects;
1523 }
1524
1525 void set_last_force_op_resend(uint64_t t) {
1526 last_force_op_resend = t;
1527 last_force_op_resend_preluminous = t;
1528 }
1529
1530 void calc_pg_masks();
1531
1532 /*
1533 * we have two snap modes:
1534 * - pool global snaps
1535 * - snap existence/non-existence defined by snaps[] and snap_seq
1536 * - user managed snaps
1537 * - removal governed by removed_snaps
1538 *
1539 * we know which mode we're using based on whether removed_snaps is empty.
1540 * If nothing has been created, both functions report false.
1541 */
1542 bool is_pool_snaps_mode() const;
1543 bool is_unmanaged_snaps_mode() const;
1544 bool is_removed_snap(snapid_t s) const;
1545
1546 /*
1547 * build set of known-removed sets from either pool snaps or
1548 * explicit removed_snaps set.
1549 */
1550 void build_removed_snaps(interval_set<snapid_t>& rs) const;
1551 snapid_t snap_exists(const char *s) const;
1552 void add_snap(const char *n, utime_t stamp);
1553 void add_unmanaged_snap(uint64_t& snapid);
1554 void remove_snap(snapid_t s);
1555 void remove_unmanaged_snap(snapid_t s);
1556
1557 SnapContext get_snap_context() const;
1558
1559 /// hash an object name+namespace key to a hash position
1560 uint32_t hash_key(const string& key, const string& ns) const;
1561
1562 /// round a hash position down to a pg num
1563 uint32_t raw_hash_to_pg(uint32_t v) const;
1564
1565 /*
1566 * map a raw pg (with full precision ps) into an actual pg, for storage
1567 */
1568 pg_t raw_pg_to_pg(pg_t pg) const;
1569
1570 /*
1571 * map raw pg (full precision ps) into a placement seed. include
1572 * pool id in that value so that different pools don't use the same
1573 * seeds.
1574 */
1575 ps_t raw_pg_to_pps(pg_t pg) const;
1576
1577 /// choose a random hash position within a pg
1578 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1579
1580 void encode(bufferlist& bl, uint64_t features) const;
1581 void decode(bufferlist::iterator& bl);
1582
1583 static void generate_test_instances(list<pg_pool_t*>& o);
1584 };
1585 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1586
1587 ostream& operator<<(ostream& out, const pg_pool_t& p);
1588
1589
1590 /**
1591 * a summation of object stats
1592 *
1593 * This is just a container for object stats; we don't know what for.
1594 *
1595 * If you add members to object_stat_sum_t, make sure there is no padding
1596 * among the members.
1597 * You should also update the padding_check function.
1598
1599 */
1600 struct object_stat_sum_t {
1601 /**************************************************************************
1602 * WARNING: be sure to update operator==, floor, and split when
1603 * adding/removing fields!
1604 **************************************************************************/
1605 int64_t num_bytes; // in bytes
1606 int64_t num_objects;
1607 int64_t num_object_clones;
1608 int64_t num_object_copies; // num_objects * num_replicas
1609 int64_t num_objects_missing_on_primary;
1610 int64_t num_objects_degraded;
1611 int64_t num_objects_unfound;
1612 int64_t num_rd;
1613 int64_t num_rd_kb;
1614 int64_t num_wr;
1615 int64_t num_wr_kb;
1616 int64_t num_scrub_errors; // total deep and shallow scrub errors
1617 int64_t num_objects_recovered;
1618 int64_t num_bytes_recovered;
1619 int64_t num_keys_recovered;
1620 int64_t num_shallow_scrub_errors;
1621 int64_t num_deep_scrub_errors;
1622 int64_t num_objects_dirty;
1623 int64_t num_whiteouts;
1624 int64_t num_objects_omap;
1625 int64_t num_objects_hit_set_archive;
1626 int64_t num_objects_misplaced;
1627 int64_t num_bytes_hit_set_archive;
1628 int64_t num_flush;
1629 int64_t num_flush_kb;
1630 int64_t num_evict;
1631 int64_t num_evict_kb;
1632 int64_t num_promote;
1633 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1634 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1635 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1636 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1637 int64_t num_objects_pinned;
1638 int64_t num_objects_missing;
1639 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
1640
1641 object_stat_sum_t()
1642 : num_bytes(0),
1643 num_objects(0), num_object_clones(0), num_object_copies(0),
1644 num_objects_missing_on_primary(0), num_objects_degraded(0),
1645 num_objects_unfound(0),
1646 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1647 num_scrub_errors(0),
1648 num_objects_recovered(0),
1649 num_bytes_recovered(0),
1650 num_keys_recovered(0),
1651 num_shallow_scrub_errors(0),
1652 num_deep_scrub_errors(0),
1653 num_objects_dirty(0),
1654 num_whiteouts(0),
1655 num_objects_omap(0),
1656 num_objects_hit_set_archive(0),
1657 num_objects_misplaced(0),
1658 num_bytes_hit_set_archive(0),
1659 num_flush(0),
1660 num_flush_kb(0),
1661 num_evict(0),
1662 num_evict_kb(0),
1663 num_promote(0),
1664 num_flush_mode_high(0), num_flush_mode_low(0),
1665 num_evict_mode_some(0), num_evict_mode_full(0),
1666 num_objects_pinned(0),
1667 num_objects_missing(0),
1668 num_legacy_snapsets(0)
1669 {}
1670
1671 void floor(int64_t f) {
1672 #define FLOOR(x) if (x < f) x = f
1673 FLOOR(num_bytes);
1674 FLOOR(num_objects);
1675 FLOOR(num_object_clones);
1676 FLOOR(num_object_copies);
1677 FLOOR(num_objects_missing_on_primary);
1678 FLOOR(num_objects_missing);
1679 FLOOR(num_objects_degraded);
1680 FLOOR(num_objects_misplaced);
1681 FLOOR(num_objects_unfound);
1682 FLOOR(num_rd);
1683 FLOOR(num_rd_kb);
1684 FLOOR(num_wr);
1685 FLOOR(num_wr_kb);
1686 FLOOR(num_scrub_errors);
1687 FLOOR(num_shallow_scrub_errors);
1688 FLOOR(num_deep_scrub_errors);
1689 FLOOR(num_objects_recovered);
1690 FLOOR(num_bytes_recovered);
1691 FLOOR(num_keys_recovered);
1692 FLOOR(num_objects_dirty);
1693 FLOOR(num_whiteouts);
1694 FLOOR(num_objects_omap);
1695 FLOOR(num_objects_hit_set_archive);
1696 FLOOR(num_bytes_hit_set_archive);
1697 FLOOR(num_flush);
1698 FLOOR(num_flush_kb);
1699 FLOOR(num_evict);
1700 FLOOR(num_evict_kb);
1701 FLOOR(num_promote);
1702 FLOOR(num_flush_mode_high);
1703 FLOOR(num_flush_mode_low);
1704 FLOOR(num_evict_mode_some);
1705 FLOOR(num_evict_mode_full);
1706 FLOOR(num_objects_pinned);
1707 FLOOR(num_legacy_snapsets);
1708 #undef FLOOR
1709 }
1710
1711 void split(vector<object_stat_sum_t> &out) const {
1712 #define SPLIT(PARAM) \
1713 for (unsigned i = 0; i < out.size(); ++i) { \
1714 out[i].PARAM = PARAM / out.size(); \
1715 if (i < (PARAM % out.size())) { \
1716 out[i].PARAM++; \
1717 } \
1718 }
1719 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1720 for (unsigned i = 0; i < out.size(); ++i) { \
1721 if (PARAM) \
1722 out[i].PARAM = 1 + PARAM / out.size(); \
1723 else \
1724 out[i].PARAM = 0; \
1725 }
1726
1727 SPLIT(num_bytes);
1728 SPLIT(num_objects);
1729 SPLIT(num_object_clones);
1730 SPLIT(num_object_copies);
1731 SPLIT(num_objects_missing_on_primary);
1732 SPLIT(num_objects_missing);
1733 SPLIT(num_objects_degraded);
1734 SPLIT(num_objects_misplaced);
1735 SPLIT(num_objects_unfound);
1736 SPLIT(num_rd);
1737 SPLIT(num_rd_kb);
1738 SPLIT(num_wr);
1739 SPLIT(num_wr_kb);
1740 SPLIT(num_scrub_errors);
1741 SPLIT(num_shallow_scrub_errors);
1742 SPLIT(num_deep_scrub_errors);
1743 SPLIT(num_objects_recovered);
1744 SPLIT(num_bytes_recovered);
1745 SPLIT(num_keys_recovered);
1746 SPLIT(num_objects_dirty);
1747 SPLIT(num_whiteouts);
1748 SPLIT(num_objects_omap);
1749 SPLIT(num_objects_hit_set_archive);
1750 SPLIT(num_bytes_hit_set_archive);
1751 SPLIT(num_flush);
1752 SPLIT(num_flush_kb);
1753 SPLIT(num_evict);
1754 SPLIT(num_evict_kb);
1755 SPLIT(num_promote);
1756 SPLIT(num_flush_mode_high);
1757 SPLIT(num_flush_mode_low);
1758 SPLIT(num_evict_mode_some);
1759 SPLIT(num_evict_mode_full);
1760 SPLIT(num_objects_pinned);
1761 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1762 #undef SPLIT
1763 #undef SPLIT_PRESERVE_NONZERO
1764 }
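  // Editorial note (a sketch): SPLIT gives each child PARAM / out.size()
  // and hands the remainder, one each, to the first PARAM % out.size()
  // children; e.g. 10 objects over 4 children split as {3, 3, 2, 2}.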
1765
1766 void clear() {
1767 memset(this, 0, sizeof(*this));
1768 }
1769
1770 void calc_copies(int nrep) {
1771 num_object_copies = nrep * num_objects;
1772 }
1773
1774 bool is_zero() const {
1775 return mem_is_zero((char*)this, sizeof(*this));
1776 }
1777
1778 void add(const object_stat_sum_t& o);
1779 void sub(const object_stat_sum_t& o);
1780
1781 void dump(Formatter *f) const;
1782 void padding_check() {
1783 static_assert(
1784 sizeof(object_stat_sum_t) ==
1785 sizeof(num_bytes) +
1786 sizeof(num_objects) +
1787 sizeof(num_object_clones) +
1788 sizeof(num_object_copies) +
1789 sizeof(num_objects_missing_on_primary) +
1790 sizeof(num_objects_degraded) +
1791 sizeof(num_objects_unfound) +
1792 sizeof(num_rd) +
1793 sizeof(num_rd_kb) +
1794 sizeof(num_wr) +
1795 sizeof(num_wr_kb) +
1796 sizeof(num_scrub_errors) +
1797 sizeof(num_objects_recovered) +
1798 sizeof(num_bytes_recovered) +
1799 sizeof(num_keys_recovered) +
1800 sizeof(num_shallow_scrub_errors) +
1801 sizeof(num_deep_scrub_errors) +
1802 sizeof(num_objects_dirty) +
1803 sizeof(num_whiteouts) +
1804 sizeof(num_objects_omap) +
1805 sizeof(num_objects_hit_set_archive) +
1806 sizeof(num_objects_misplaced) +
1807 sizeof(num_bytes_hit_set_archive) +
1808 sizeof(num_flush) +
1809 sizeof(num_flush_kb) +
1810 sizeof(num_evict) +
1811 sizeof(num_evict_kb) +
1812 sizeof(num_promote) +
1813 sizeof(num_flush_mode_high) +
1814 sizeof(num_flush_mode_low) +
1815 sizeof(num_evict_mode_some) +
1816 sizeof(num_evict_mode_full) +
1817 sizeof(num_objects_pinned) +
1818 sizeof(num_objects_missing) +
1819 sizeof(num_legacy_snapsets)
1820 ,
1821 "object_stat_sum_t have padding");
1822 }
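// Why the padding check above matters (a sketch, not from the original
// source): clear() and is_zero() treat the object as a flat byte array
// via memset()/mem_is_zero().  If the compiler inserted padding, a sum
// whose counters were all zeroed individually (e.g. by sub()) could
// still hold stale nonzero padding bytes and fail is_zero().  On a
// typical 64-bit ABI a hypothetical
//
//   struct padded_sum { int32_t a; int64_t b; };
//
// would carry 4 padding bytes after 'a'; keeping every member an
// int64_t, and asserting that the member sizes add up to the struct
// size, rules that out here.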
1823 void encode(bufferlist& bl) const;
1824 void decode(bufferlist::iterator& bl);
1825 static void generate_test_instances(list<object_stat_sum_t*>& o);
1826 };
1827 WRITE_CLASS_ENCODER(object_stat_sum_t)
1828
1829 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1830
1831 /**
1832 * a collection of object stat sums
1833 *
1834 * This is a collection of stat sums over different categories.
1835 */
1836 struct object_stat_collection_t {
1837 /**************************************************************************
1838 * WARNING: be sure to update the operator== when adding/removing fields! *
1839 **************************************************************************/
1840 object_stat_sum_t sum;
1841
1842 void calc_copies(int nrep) {
1843 sum.calc_copies(nrep);
1844 }
1845
1846 void dump(Formatter *f) const;
1847 void encode(bufferlist& bl) const;
1848 void decode(bufferlist::iterator& bl);
1849 static void generate_test_instances(list<object_stat_collection_t*>& o);
1850
1851 bool is_zero() const {
1852 return sum.is_zero();
1853 }
1854
1855 void clear() {
1856 sum.clear();
1857 }
1858
1859 void floor(int64_t f) {
1860 sum.floor(f);
1861 }
1862
1863 void add(const object_stat_sum_t& o) {
1864 sum.add(o);
1865 }
1866
1867 void add(const object_stat_collection_t& o) {
1868 sum.add(o.sum);
1869 }
1870 void sub(const object_stat_collection_t& o) {
1871 sum.sub(o.sum);
1872 }
1873 };
1874 WRITE_CLASS_ENCODER(object_stat_collection_t)
1875
1876 inline bool operator==(const object_stat_collection_t& l,
1877 const object_stat_collection_t& r) {
1878 return l.sum == r.sum;
1879 }
1880
1881
1882 /** pg_stat
1883 * aggregate stats for a single PG.
1884 */
1885 struct pg_stat_t {
1886 /**************************************************************************
1887 * WARNING: be sure to update the operator== when adding/removing fields! *
1888 **************************************************************************/
1889 eversion_t version;
1890 version_t reported_seq; // sequence number
1891 epoch_t reported_epoch; // epoch of this report
1892 __u32 state;
1893 utime_t last_fresh; // last reported
1894 utime_t last_change; // new state != previous state
1895 utime_t last_active; // state & PG_STATE_ACTIVE
1896 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1897 utime_t last_clean; // state & PG_STATE_CLEAN
1898 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
1899 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
1900 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
1901
1902 eversion_t log_start; // (log_start,version]
1903 eversion_t ondisk_log_start; // there may be more on disk
1904
1905 epoch_t created;
1906 epoch_t last_epoch_clean;
1907 pg_t parent;
1908 __u32 parent_split_bits;
1909
1910 eversion_t last_scrub;
1911 eversion_t last_deep_scrub;
1912 utime_t last_scrub_stamp;
1913 utime_t last_deep_scrub_stamp;
1914 utime_t last_clean_scrub_stamp;
1915
1916 object_stat_collection_t stats;
1917
1918 int64_t log_size;
1919 int64_t ondisk_log_size; // >= active_log_size
1920
1921 vector<int32_t> up, acting;
1922 epoch_t mapping_epoch;
1923
1924 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
1925
1926 utime_t last_became_active;
1927 utime_t last_became_peered;
1928
1929 /// up, acting primaries
1930 int32_t up_primary;
1931 int32_t acting_primary;
1932
1933 bool stats_invalid:1;
1934 /// true if num_objects_dirty is not accurate (because it was not
1935 /// maintained starting from pool creation)
1936 bool dirty_stats_invalid:1;
1937 bool omap_stats_invalid:1;
1938 bool hitset_stats_invalid:1;
1939 bool hitset_bytes_stats_invalid:1;
1940 bool pin_stats_invalid:1;
1941
1942 pg_stat_t()
1943 : reported_seq(0),
1944 reported_epoch(0),
1945 state(0),
1946 created(0), last_epoch_clean(0),
1947 parent_split_bits(0),
1948 log_size(0), ondisk_log_size(0),
1949 mapping_epoch(0),
1950 up_primary(-1),
1951 acting_primary(-1),
1952 stats_invalid(false),
1953 dirty_stats_invalid(false),
1954 omap_stats_invalid(false),
1955 hitset_stats_invalid(false),
1956 hitset_bytes_stats_invalid(false),
1957 pin_stats_invalid(false)
1958 { }
1959
1960 epoch_t get_effective_last_epoch_clean() const {
1961 if (state & PG_STATE_CLEAN) {
1962 // we are clean as of this report, and should thus take the
1963 // reported epoch
1964 return reported_epoch;
1965 } else {
1966 return last_epoch_clean;
1967 }
1968 }
1969
1970 pair<epoch_t, version_t> get_version_pair() const {
1971 return make_pair(reported_epoch, reported_seq);
1972 }
1973
1974 void floor(int64_t f) {
1975 stats.floor(f);
1976 if (log_size < f)
1977 log_size = f;
1978 if (ondisk_log_size < f)
1979 ondisk_log_size = f;
1980 }
1981
1982 void add(const pg_stat_t& o) {
1983 stats.add(o.stats);
1984 log_size += o.log_size;
1985 ondisk_log_size += o.ondisk_log_size;
1986 }
1987 void sub(const pg_stat_t& o) {
1988 stats.sub(o.stats);
1989 log_size -= o.log_size;
1990 ondisk_log_size -= o.ondisk_log_size;
1991 }
1992
1993 bool is_acting_osd(int32_t osd, bool primary) const;
1994 void dump(Formatter *f) const;
1995 void dump_brief(Formatter *f) const;
1996 void encode(bufferlist &bl) const;
1997 void decode(bufferlist::iterator &bl);
1998 static void generate_test_instances(list<pg_stat_t*>& o);
1999 };
2000 WRITE_CLASS_ENCODER(pg_stat_t)
2001
2002 bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2003
2004 /*
2005 * summation over an entire pool
2006 */
2007 struct pool_stat_t {
2008 object_stat_collection_t stats;
2009 int64_t log_size;
2010 int64_t ondisk_log_size; // >= active_log_size
2011 int32_t up; ///< number of up replicas or shards
2012 int32_t acting; ///< number of acting replicas or shards
2013
2014 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
2015 { }
2016
2017 void floor(int64_t f) {
2018 stats.floor(f);
2019 if (log_size < f)
2020 log_size = f;
2021 if (ondisk_log_size < f)
2022 ondisk_log_size = f;
2023 if (up < f)
2024 up = f;
2025 if (acting < f)
2026 acting = f;
2027 }
2028
2029 void add(const pg_stat_t& o) {
2030 stats.add(o.stats);
2031 log_size += o.log_size;
2032 ondisk_log_size += o.ondisk_log_size;
2033 up += o.up.size();
2034 acting += o.acting.size();
2035 }
2036 void sub(const pg_stat_t& o) {
2037 stats.sub(o.stats);
2038 log_size -= o.log_size;
2039 ondisk_log_size -= o.ondisk_log_size;
2040 up -= o.up.size();
2041 acting -= o.acting.size();
2042 }
2043
2044 bool is_zero() const {
2045 return (stats.is_zero() &&
2046 log_size == 0 &&
2047 ondisk_log_size == 0 &&
2048 up == 0 &&
2049 acting == 0);
2050 }
2051
2052 void dump(Formatter *f) const;
2053 void encode(bufferlist &bl, uint64_t features) const;
2054 void decode(bufferlist::iterator &bl);
2055 static void generate_test_instances(list<pool_stat_t*>& o);
2056 };
2057 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
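// An illustrative update (container name invented): a pool's summary is
// maintained incrementally as a PG's stats change, by retiring the old
// report and applying the new one:
//
//   pool_stat_t &ps = pool_stats[poolid];
//   ps.sub(old_pg_stat); // also shrinks the up/acting shard counts
//   ps.add(new_pg_stat); // ...and restores them from the new report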
2058
2059
2060 // -----------------------------------------
2061
2062 /**
2063 * pg_hit_set_info_t - information about a single recorded HitSet
2064 *
1665 * Track basic metadata about a HitSet, like the number of insertions
2066 * and the time range it covers.
2067 */
2068 struct pg_hit_set_info_t {
2069 utime_t begin, end; ///< time interval
2070 eversion_t version; ///< version this HitSet object was written
2071 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2072
2073 friend bool operator==(const pg_hit_set_info_t& l,
2074 const pg_hit_set_info_t& r) {
2075 return
2076 l.begin == r.begin &&
2077 l.end == r.end &&
2078 l.version == r.version &&
2079 l.using_gmt == r.using_gmt;
2080 }
2081
2082 explicit pg_hit_set_info_t(bool using_gmt = true)
2083 : using_gmt(using_gmt) {}
2084
2085 void encode(bufferlist &bl) const;
2086 void decode(bufferlist::iterator &bl);
2087 void dump(Formatter *f) const;
2088 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2089 };
2090 WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2091
2092 /**
2093 * pg_hit_set_history_t - information about a history of hitsets
2094 *
2095 * Include information about the currently accumulating hit set as well
2096 * as archived/historical ones.
2097 */
2098 struct pg_hit_set_history_t {
2099 eversion_t current_last_update; ///< last version inserted into current set
2100 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2101
2102 friend bool operator==(const pg_hit_set_history_t& l,
2103 const pg_hit_set_history_t& r) {
2104 return
2105 l.current_last_update == r.current_last_update &&
2106 l.history == r.history;
2107 }
2108
2109 void encode(bufferlist &bl) const;
2110 void decode(bufferlist::iterator &bl);
2111 void dump(Formatter *f) const;
2112 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2113 };
2114 WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2115
2116
2117 // -----------------------------------------
2118
2119 /**
2120 * pg_history_t - information about recent pg peering/mapping history
2121 *
2122 * This is aggressively shared between OSDs to bound the amount of past
2123 * history they need to worry about.
2124 */
2125 struct pg_history_t {
2126 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2127 epoch_t epoch_pool_created; // epoch in which *pool* was created
2128 // (note: may be pg creation epoch for
2129 // pre-luminous clusters)
2130 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2131 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2132 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2133 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
2134 epoch_t last_epoch_split; // as parent or child
2135 epoch_t last_epoch_marked_full; // pool or cluster
2136
2137 /**
2138 * In the event of a map discontinuity, same_*_since may reflect the first
2139 * map the osd has seen in the new map sequence rather than the actual start
2140 * of the interval. This is ok since a discontinuity at epoch e means there
2141 * must have been a clean interval between e and now and that we cannot be
2142 * in the active set during the interval containing e.
2143 */
2144 epoch_t same_up_since; // same up set since
2145 epoch_t same_interval_since; // same acting AND up set since
2146 epoch_t same_primary_since; // same primary at least back through this epoch.
2147
2148 eversion_t last_scrub;
2149 eversion_t last_deep_scrub;
2150 utime_t last_scrub_stamp;
2151 utime_t last_deep_scrub_stamp;
2152 utime_t last_clean_scrub_stamp;
2153
2154 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2155 return
2156 l.epoch_created == r.epoch_created &&
2157 l.epoch_pool_created == r.epoch_pool_created &&
2158 l.last_epoch_started == r.last_epoch_started &&
2159 l.last_interval_started == r.last_interval_started &&
2160 l.last_epoch_clean == r.last_epoch_clean &&
2161 l.last_interval_clean == r.last_interval_clean &&
2162 l.last_epoch_split == r.last_epoch_split &&
2163 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2164 l.same_up_since == r.same_up_since &&
2165 l.same_interval_since == r.same_interval_since &&
2166 l.same_primary_since == r.same_primary_since &&
2167 l.last_scrub == r.last_scrub &&
2168 l.last_deep_scrub == r.last_deep_scrub &&
2169 l.last_scrub_stamp == r.last_scrub_stamp &&
2170 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2171 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2172 }
2173
2174 pg_history_t()
2175 : epoch_created(0),
2176 epoch_pool_created(0),
2177 last_epoch_started(0),
2178 last_interval_started(0),
2179 last_epoch_clean(0),
2180 last_interval_clean(0),
2181 last_epoch_split(0),
2182 last_epoch_marked_full(0),
2183 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2184
2185 bool merge(const pg_history_t &other) {
2186 // Here, we only update the fields which cannot be calculated from the OSDmap.
2187 bool modified = false;
2188 if (epoch_created < other.epoch_created) {
2189 epoch_created = other.epoch_created;
2190 modified = true;
2191 }
2192 if (epoch_pool_created < other.epoch_pool_created) {
2193 // FIXME: for jewel compat only; this should either be 0 or always the
2194 // same value across all pg instances.
2195 epoch_pool_created = other.epoch_pool_created;
2196 modified = true;
2197 }
2198 if (last_epoch_started < other.last_epoch_started) {
2199 last_epoch_started = other.last_epoch_started;
2200 modified = true;
2201 }
2202 if (last_interval_started < other.last_interval_started) {
2203 last_interval_started = other.last_interval_started;
2204 modified = true;
2205 }
2206 if (last_epoch_clean < other.last_epoch_clean) {
2207 last_epoch_clean = other.last_epoch_clean;
2208 modified = true;
2209 }
2210 if (last_interval_clean < other.last_interval_clean) {
2211 last_interval_clean = other.last_interval_clean;
2212 modified = true;
2213 }
2214 if (last_epoch_split < other.last_epoch_split) {
2215 last_epoch_split = other.last_epoch_split;
2216 modified = true;
2217 }
2218 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2219 last_epoch_marked_full = other.last_epoch_marked_full;
2220 modified = true;
2221 }
2222 if (other.last_scrub > last_scrub) {
2223 last_scrub = other.last_scrub;
2224 modified = true;
2225 }
2226 if (other.last_scrub_stamp > last_scrub_stamp) {
2227 last_scrub_stamp = other.last_scrub_stamp;
2228 modified = true;
2229 }
2230 if (other.last_deep_scrub > last_deep_scrub) {
2231 last_deep_scrub = other.last_deep_scrub;
2232 modified = true;
2233 }
2234 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2235 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2236 modified = true;
2237 }
2238 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2239 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2240 modified = true;
2241 }
2242 return modified;
2243 }
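// An illustrative merge (epochs invented): if our last_epoch_started is
// 35 and a peer reports 40, merge() advances ours to 40 and returns
// true.  Every field only ever moves forward (a per-field max), so
// repeated info exchanges between peers converge regardless of order.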
2244
2245 void encode(bufferlist& bl) const;
2246 void decode(bufferlist::iterator& p);
2247 void dump(Formatter *f) const;
2248 static void generate_test_instances(list<pg_history_t*>& o);
2249 };
2250 WRITE_CLASS_ENCODER(pg_history_t)
2251
2252 inline ostream& operator<<(ostream& out, const pg_history_t& h) {
2253 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2254 << " lis/c " << h.last_interval_started
2255 << "/" << h.last_interval_clean
2256 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2257 << "/" << h.last_epoch_marked_full
2258 << " " << h.same_up_since
2259 << "/" << h.same_interval_since
2260 << "/" << h.same_primary_since;
2261 }
2262
2263
2264 /**
2265 * pg_info_t - summary of PG statistics.
2266 *
2267 * some notes:
2268 * - last_complete implies we have all objects that existed as of that
2269 * stamp, OR a newer object, OR have already applied a later delete.
2270 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2271 * otherwise, we have no idea what the pg is supposed to contain.
2272 */
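// A concrete reading of the notes above (versions invented for
// illustration): with log_tail = 4'10 and last_update = 6'30, any store
// whose last_complete >= 4'10 can be brought current by replaying the
// log entries in (4'10, 6'30]; a store whose last_complete < 4'10 falls
// outside the log's coverage and must be backfilled instead.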
2273 struct pg_info_t {
2274 spg_t pgid;
2275 eversion_t last_update; ///< last object version applied to store.
2276 eversion_t last_complete; ///< last version pg was complete through.
2277 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2278 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2279
2280 version_t last_user_version; ///< last user object version applied to store
2281
2282 eversion_t log_tail; ///< oldest log entry.
2283
2284 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2285 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2286
2287 interval_set<snapid_t> purged_snaps;
2288
2289 pg_stat_t stats;
2290
2291 pg_history_t history;
2292 pg_hit_set_history_t hit_set;
2293
2294 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2295 return
2296 l.pgid == r.pgid &&
2297 l.last_update == r.last_update &&
2298 l.last_complete == r.last_complete &&
2299 l.last_epoch_started == r.last_epoch_started &&
2300 l.last_interval_started == r.last_interval_started &&
2301 l.last_user_version == r.last_user_version &&
2302 l.log_tail == r.log_tail &&
2303 l.last_backfill == r.last_backfill &&
2304 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2305 l.purged_snaps == r.purged_snaps &&
2306 l.stats == r.stats &&
2307 l.history == r.history &&
2308 l.hit_set == r.hit_set;
2309 }
2310
2311 pg_info_t()
2312 : last_epoch_started(0),
2313 last_interval_started(0),
2314 last_user_version(0),
2315 last_backfill(hobject_t::get_max()),
2316 last_backfill_bitwise(false)
2317 { }
2318 // cppcheck-suppress noExplicitConstructor
2319 pg_info_t(spg_t p)
2320 : pgid(p),
2321 last_epoch_started(0),
2322 last_interval_started(0),
2323 last_user_version(0),
2324 last_backfill(hobject_t::get_max()),
2325 last_backfill_bitwise(false)
2326 { }
2327
2328 void set_last_backfill(hobject_t pos) {
2329 last_backfill = pos;
2330 last_backfill_bitwise = true;
2331 }
2332
2333 bool is_empty() const { return last_update.version == 0; }
2334 bool dne() const { return history.epoch_created == 0; }
2335
2336 bool is_incomplete() const { return !last_backfill.is_max(); }
2337
2338 void encode(bufferlist& bl) const;
2339 void decode(bufferlist::iterator& p);
2340 void dump(Formatter *f) const;
2341 bool overlaps_with(const pg_info_t &oinfo) const {
2342 return last_update > oinfo.log_tail ?
2343 oinfo.last_update >= log_tail :
2344 last_update >= oinfo.log_tail;
2345 }
2346 static void generate_test_instances(list<pg_info_t*>& o);
2347 };
2348 WRITE_CLASS_ENCODER(pg_info_t)
2349
2350 inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2351 {
2352 out << pgi.pgid << "(";
2353 if (pgi.dne())
2354 out << " DNE";
2355 if (pgi.is_empty())
2356 out << " empty";
2357 else {
2358 out << " v " << pgi.last_update;
2359 if (pgi.last_complete != pgi.last_update)
2360 out << " lc " << pgi.last_complete;
2361 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2362 }
2363 if (pgi.is_incomplete())
2364 out << " lb " << pgi.last_backfill
2365 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2366 //out << " c " << pgi.epoch_created;
2367 out << " local-lis/les=" << pgi.last_interval_started
2368 << "/" << pgi.last_epoch_started;
2369 out << " n=" << pgi.stats.stats.sum.num_objects;
2370 out << " " << pgi.history
2371 << ")";
2372 return out;
2373 }
2374
2375 /**
2376 * pg_fast_info_t - common pg_info_t fields
2377 *
2378 * These are the fields of pg_info_t (and children) that are updated for
2379 * most IO operations.
2380 *
2381 * ** WARNING **
2382 * Because we rely on these fields to be applied to the normal
2383 * info struct, adding a new field here that is not also new in info
2384 * means that we must set an incompat OSD feature bit!
2385 */
2386 struct pg_fast_info_t {
2387 eversion_t last_update;
2388 eversion_t last_complete;
2389 version_t last_user_version;
2390 struct { // pg_stat_t stats
2391 eversion_t version;
2392 version_t reported_seq;
2393 utime_t last_fresh;
2394 utime_t last_active;
2395 utime_t last_peered;
2396 utime_t last_clean;
2397 utime_t last_unstale;
2398 utime_t last_undegraded;
2399 utime_t last_fullsized;
2400 int64_t log_size; // (also ondisk_log_size, which has the same value)
2401 struct { // object_stat_collection_t stats;
2402 struct { // object_stat_sum_t sum
2403 int64_t num_bytes; // in bytes
2404 int64_t num_objects;
2405 int64_t num_object_copies;
2406 int64_t num_rd;
2407 int64_t num_rd_kb;
2408 int64_t num_wr;
2409 int64_t num_wr_kb;
2410 int64_t num_objects_dirty;
2411 } sum;
2412 } stats;
2413 } stats;
2414
2415 void populate_from(const pg_info_t& info) {
2416 last_update = info.last_update;
2417 last_complete = info.last_complete;
2418 last_user_version = info.last_user_version;
2419 stats.version = info.stats.version;
2420 stats.reported_seq = info.stats.reported_seq;
2421 stats.last_fresh = info.stats.last_fresh;
2422 stats.last_active = info.stats.last_active;
2423 stats.last_peered = info.stats.last_peered;
2424 stats.last_clean = info.stats.last_clean;
2425 stats.last_unstale = info.stats.last_unstale;
2426 stats.last_undegraded = info.stats.last_undegraded;
2427 stats.last_fullsized = info.stats.last_fullsized;
2428 stats.log_size = info.stats.log_size;
2429 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2430 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2431 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2432 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2433 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2434 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2435 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2436 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2437 }
2438
2439 bool try_apply_to(pg_info_t* info) {
2440 if (last_update <= info->last_update)
2441 return false;
2442 info->last_update = last_update;
2443 info->last_complete = last_complete;
2444 info->last_user_version = last_user_version;
2445 info->stats.version = stats.version;
2446 info->stats.reported_seq = stats.reported_seq;
2447 info->stats.last_fresh = stats.last_fresh;
2448 info->stats.last_active = stats.last_active;
2449 info->stats.last_peered = stats.last_peered;
2450 info->stats.last_clean = stats.last_clean;
2451 info->stats.last_unstale = stats.last_unstale;
2452 info->stats.last_undegraded = stats.last_undegraded;
2453 info->stats.last_fullsized = stats.last_fullsized;
2454 info->stats.log_size = stats.log_size;
2455 info->stats.ondisk_log_size = stats.log_size;
2456 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2457 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2458 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2459 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2460 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2461 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2462 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2463 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2464 return true;
2465 }
2466
2467 void encode(bufferlist& bl) const {
2468 ENCODE_START(1, 1, bl);
2469 ::encode(last_update, bl);
2470 ::encode(last_complete, bl);
2471 ::encode(last_user_version, bl);
2472 ::encode(stats.version, bl);
2473 ::encode(stats.reported_seq, bl);
2474 ::encode(stats.last_fresh, bl);
2475 ::encode(stats.last_active, bl);
2476 ::encode(stats.last_peered, bl);
2477 ::encode(stats.last_clean, bl);
2478 ::encode(stats.last_unstale, bl);
2479 ::encode(stats.last_undegraded, bl);
2480 ::encode(stats.last_fullsized, bl);
2481 ::encode(stats.log_size, bl);
2482 ::encode(stats.stats.sum.num_bytes, bl);
2483 ::encode(stats.stats.sum.num_objects, bl);
2484 ::encode(stats.stats.sum.num_object_copies, bl);
2485 ::encode(stats.stats.sum.num_rd, bl);
2486 ::encode(stats.stats.sum.num_rd_kb, bl);
2487 ::encode(stats.stats.sum.num_wr, bl);
2488 ::encode(stats.stats.sum.num_wr_kb, bl);
2489 ::encode(stats.stats.sum.num_objects_dirty, bl);
2490 ENCODE_FINISH(bl);
2491 }
2492 void decode(bufferlist::iterator& p) {
2493 DECODE_START(1, p);
2494 ::decode(last_update, p);
2495 ::decode(last_complete, p);
2496 ::decode(last_user_version, p);
2497 ::decode(stats.version, p);
2498 ::decode(stats.reported_seq, p);
2499 ::decode(stats.last_fresh, p);
2500 ::decode(stats.last_active, p);
2501 ::decode(stats.last_peered, p);
2502 ::decode(stats.last_clean, p);
2503 ::decode(stats.last_unstale, p);
2504 ::decode(stats.last_undegraded, p);
2505 ::decode(stats.last_fullsized, p);
2506 ::decode(stats.log_size, p);
2507 ::decode(stats.stats.sum.num_bytes, p);
2508 ::decode(stats.stats.sum.num_objects, p);
2509 ::decode(stats.stats.sum.num_object_copies, p);
2510 ::decode(stats.stats.sum.num_rd, p);
2511 ::decode(stats.stats.sum.num_rd_kb, p);
2512 ::decode(stats.stats.sum.num_wr, p);
2513 ::decode(stats.stats.sum.num_wr_kb, p);
2514 ::decode(stats.stats.sum.num_objects_dirty, p);
2515 DECODE_FINISH(p);
2516 }
2517 };
2518 WRITE_CLASS_ENCODER(pg_fast_info_t)
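// A minimal usage sketch (not from this header; how the OSD wires this
// in lives elsewhere): on the common write path only the small
// fixed-layout pg_fast_info_t needs to be persisted, and it is folded
// back into the full pg_info_t on load:
//
//   pg_fast_info_t fast;
//   fast.populate_from(info); // capture the frequently-updated fields
//   bufferlist bl;
//   fast.encode(bl); // persist alongside the less-frequent full info
//   ...
//   // after decoding both structures on startup:
//   if (fast.try_apply_to(&info)) {
//     // fast was newer than info; info now reflects the last fast write
//   }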
2519
2520
2521 struct pg_notify_t {
2522 epoch_t query_epoch;
2523 epoch_t epoch_sent;
2524 pg_info_t info;
2525 shard_id_t to;
2526 shard_id_t from;
2527 pg_notify_t() :
2528 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2529 from(shard_id_t::NO_SHARD) {}
2530 pg_notify_t(
2531 shard_id_t to,
2532 shard_id_t from,
2533 epoch_t query_epoch,
2534 epoch_t epoch_sent,
2535 const pg_info_t &info)
2536 : query_epoch(query_epoch),
2537 epoch_sent(epoch_sent),
2538 info(info), to(to), from(from) {
2539 assert(from == info.pgid.shard);
2540 }
2541 void encode(bufferlist &bl) const;
2542 void decode(bufferlist::iterator &p);
2543 void dump(Formatter *f) const;
2544 static void generate_test_instances(list<pg_notify_t*> &o);
2545 };
2546 WRITE_CLASS_ENCODER(pg_notify_t)
2547 ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2548
2549
2550 class OSDMap;
2551 /**
2552 * PastIntervals -- information needed to determine the PriorSet and
2553 * the might_have_unfound set
2554 */
2555 class PastIntervals {
2556 public:
2557 struct pg_interval_t {
2558 vector<int32_t> up, acting;
2559 epoch_t first, last;
2560 bool maybe_went_rw;
2561 int32_t primary;
2562 int32_t up_primary;
2563
2564 pg_interval_t()
2565 : first(0), last(0),
2566 maybe_went_rw(false),
2567 primary(-1),
2568 up_primary(-1)
2569 {}
2570
2571 pg_interval_t(
2572 vector<int32_t> &&up,
2573 vector<int32_t> &&acting,
2574 epoch_t first,
2575 epoch_t last,
2576 bool maybe_went_rw,
2577 int32_t primary,
2578 int32_t up_primary)
2579 : up(up), acting(acting), first(first), last(last),
2580 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
2581 {}
2582
2583 void encode(bufferlist& bl) const;
2584 void decode(bufferlist::iterator& bl);
2585 void dump(Formatter *f) const;
2586 static void generate_test_instances(list<pg_interval_t*>& o);
2587 };
2588
2589 PastIntervals() = default;
2590 PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
2591 update_type_from_map(ec_pool, osdmap);
2592 }
2593 PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
2594 update_type(ec_pool, compact);
2595 }
2596 PastIntervals(PastIntervals &&rhs) = default;
2597 PastIntervals &operator=(PastIntervals &&rhs) = default;
2598
2599 PastIntervals(const PastIntervals &rhs);
2600 PastIntervals &operator=(const PastIntervals &rhs);
2601
2602 class interval_rep {
2603 public:
2604 virtual size_t size() const = 0;
2605 virtual bool empty() const = 0;
2606 virtual void clear() = 0;
2607 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
2608 virtual set<pg_shard_t> get_all_participants(
2609 bool ec_pool) const = 0;
2610 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
2611 virtual unique_ptr<interval_rep> clone() const = 0;
2612 virtual ostream &print(ostream &out) const = 0;
2613 virtual void encode(bufferlist &bl) const = 0;
2614 virtual void decode(bufferlist::iterator &bl) = 0;
2615 virtual void dump(Formatter *f) const = 0;
2616 virtual bool is_classic() const = 0;
2617 virtual void iterate_mayberw_back_to(
2618 bool ec_pool,
2619 epoch_t les,
2620 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
2621
2622 virtual bool has_full_intervals() const { return false; }
2623 virtual void iterate_all_intervals(
2624 std::function<void(const pg_interval_t &)> &&f) const {
2625 assert(!has_full_intervals());
2626 assert(0 == "not valid for this implementation");
2627 }
2628
2629 virtual ~interval_rep() {}
2630 };
2631 friend class pi_simple_rep;
2632 friend class pi_compact_rep;
2633 private:
2634
2635 unique_ptr<interval_rep> past_intervals;
2636
2637 PastIntervals(interval_rep *rep) : past_intervals(rep) {}
2638
2639 public:
2640 void add_interval(bool ec_pool, const pg_interval_t &interval) {
2641 assert(past_intervals);
2642 return past_intervals->add_interval(ec_pool, interval);
2643 }
2644
2645 bool is_classic() const {
2646 assert(past_intervals);
2647 return past_intervals->is_classic();
2648 }
2649
2650 void encode(bufferlist &bl) const {
2651 ENCODE_START(1, 1, bl);
2652 if (past_intervals) {
2653 __u8 type = is_classic() ? 1 : 2;
2654 ::encode(type, bl);
2655 past_intervals->encode(bl);
2656 } else {
2657 ::encode((__u8)0, bl);
2658 }
2659 ENCODE_FINISH(bl);
2660 }
2661 void encode_classic(bufferlist &bl) const {
2662 if (past_intervals) {
2663 assert(past_intervals->is_classic());
2664 past_intervals->encode(bl);
2665 } else {
2666 // it's a map<>
2667 ::encode((uint32_t)0, bl);
2668 }
2669 }
2670
2671 void decode(bufferlist::iterator &bl);
2672 void decode_classic(bufferlist::iterator &bl);
2673
2674 void dump(Formatter *f) const {
2675 assert(past_intervals);
2676 past_intervals->dump(f);
2677 }
2678 static void generate_test_instances(list<PastIntervals *> & o);
2679
2680 /**
2681 * Determines whether there is an interval change
2682 */
2683 static bool is_new_interval(
2684 int old_acting_primary,
2685 int new_acting_primary,
2686 const vector<int> &old_acting,
2687 const vector<int> &new_acting,
2688 int old_up_primary,
2689 int new_up_primary,
2690 const vector<int> &old_up,
2691 const vector<int> &new_up,
2692 int old_size,
2693 int new_size,
2694 int old_min_size,
2695 int new_min_size,
2696 unsigned old_pg_num,
2697 unsigned new_pg_num,
2698 bool old_sort_bitwise,
2699 bool new_sort_bitwise,
2700 bool old_recovery_deletes,
2701 bool new_recovery_deletes,
2702 pg_t pgid
2703 );
2704
2705 /**
2706 * Determines whether there is an interval change
2707 */
2708 static bool is_new_interval(
2709 int old_acting_primary, ///< [in] primary as of lastmap
2710 int new_acting_primary, ///< [in] primary as of lastmap
2711 const vector<int> &old_acting, ///< [in] acting as of lastmap
2712 const vector<int> &new_acting, ///< [in] acting as of osdmap
2713 int old_up_primary, ///< [in] up primary of lastmap
2714 int new_up_primary, ///< [in] up primary of osdmap
2715 const vector<int> &old_up, ///< [in] up as of lastmap
2716 const vector<int> &new_up, ///< [in] up as of osdmap
2717 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2718 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2719 pg_t pgid ///< [in] pgid for pg
2720 );
2721
2722 /**
2723 * Integrates a new map into *past_intervals, returns true
2724 * if an interval was closed out.
2725 */
2726 static bool check_new_interval(
2727 int old_acting_primary, ///< [in] primary as of lastmap
2728 int new_acting_primary, ///< [in] primary as of osdmap
2729 const vector<int> &old_acting, ///< [in] acting as of lastmap
2730 const vector<int> &new_acting, ///< [in] acting as of osdmap
2731 int old_up_primary, ///< [in] up primary of lastmap
2732 int new_up_primary, ///< [in] up primary of osdmap
2733 const vector<int> &old_up, ///< [in] up as of lastmap
2734 const vector<int> &new_up, ///< [in] up as of osdmap
2735 epoch_t same_interval_since, ///< [in] as of osdmap
2736 epoch_t last_epoch_clean, ///< [in] current
2737 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2738 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2739 pg_t pgid, ///< [in] pgid for pg
2740 IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate for whether the pg can go active
2741 PastIntervals *past_intervals, ///< [out] intervals
2742 ostream *out = 0 ///< [out] debug ostream
2743 );
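// An illustrative call (variable names invented): as a PG advances from
// lastmap to osdmap it checks whether the interval changed and, if so,
// records the interval that just closed; the return value says whether
// one was added:
//
//   bool closed = PastIntervals::check_new_interval(
//     old_acting_primary, new_acting_primary,
//     old_acting, new_acting,
//     old_up_primary, new_up_primary,
//     old_up, new_up,
//     info.history.same_interval_since,
//     info.history.last_epoch_clean,
//     osdmap, lastmap, pgid,
//     recoverable.get(), // IsPGRecoverablePredicate
//     &past_intervals,
//     &dout_stream); // optional debug output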
2744
2745 friend ostream& operator<<(ostream& out, const PastIntervals &i);
2746
2747 template <typename F>
2748 void iterate_mayberw_back_to(
2749 bool ec_pool,
2750 epoch_t les,
2751 F &&f) const {
2752 assert(past_intervals);
2753 past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
2754 }
2755 void clear() {
2756 assert(past_intervals);
2757 past_intervals->clear();
2758 }
2759
2760 /**
2761 * Returns a value which indicates the amount
2762 * of state contained.
2763 */
2764 size_t size() const {
2765 assert(past_intervals);
2766 return past_intervals->size();
2767 }
2768
2769 bool empty() const {
2770 assert(past_intervals);
2771 return past_intervals->empty();
2772 }
2773
2774 void swap(PastIntervals &other) {
2775 using std::swap;
2776 swap(other.past_intervals, past_intervals);
2777 }
2778
2779 /**
2780 * Return all shards which have been in the acting set back to the
2781 * latest epoch to which we have trimmed except for pg_whoami
2782 */
2783 set<pg_shard_t> get_might_have_unfound(
2784 pg_shard_t pg_whoami,
2785 bool ec_pool) const {
2786 assert(past_intervals);
2787 auto ret = past_intervals->get_all_participants(ec_pool);
2788 ret.erase(pg_whoami);
2789 return ret;
2790 }
2791
2792 /**
2793 * Return all shards which we might want to talk to for peering
2794 */
2795 set<pg_shard_t> get_all_probe(
2796 bool ec_pool) const {
2797 assert(past_intervals);
2798 return past_intervals->get_all_participants(ec_pool);
2799 }
2800
2801 /* Return the set of epochs [start, end) represented by the
2802 * past_interval set.
2803 */
2804 pair<epoch_t, epoch_t> get_bounds() const {
2805 assert(past_intervals);
2806 return past_intervals->get_bounds();
2807 }
2808
2809 enum osd_state_t {
2810 UP,
2811 DOWN,
2812 DNE,
2813 LOST
2814 };
2815 struct PriorSet {
2816 bool ec_pool = false;
2817 set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
2818 set<int> down; /// down osds that would normally be in @a probe and might be interesting.
2819 map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
2820
2821 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
2822 unique_ptr<IsPGRecoverablePredicate> pcontdec;
2823
2824 PriorSet() = default;
2825 PriorSet(PriorSet &&) = default;
2826 PriorSet &operator=(PriorSet &&) = default;
2827
2828 PriorSet &operator=(const PriorSet &) = delete;
2829 PriorSet(const PriorSet &) = delete;
2830
2831 bool operator==(const PriorSet &rhs) const {
2832 return (ec_pool == rhs.ec_pool) &&
2833 (probe == rhs.probe) &&
2834 (down == rhs.down) &&
2835 (blocked_by == rhs.blocked_by) &&
2836 (pg_down == rhs.pg_down);
2837 }
2838
2839 bool affected_by_map(
2840 const OSDMap &osdmap,
2841 const DoutPrefixProvider *dpp) const;
2842
2843 // For verifying tests
2844 PriorSet(
2845 bool ec_pool,
2846 set<pg_shard_t> probe,
2847 set<int> down,
2848 map<int, epoch_t> blocked_by,
2849 bool pg_down,
2850 IsPGRecoverablePredicate *pcontdec)
2851 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
2852 pg_down(pg_down), pcontdec(pcontdec) {}
2853
2854 private:
2855 template <typename F>
2856 PriorSet(
2857 const PastIntervals &past_intervals,
2858 bool ec_pool,
2859 epoch_t last_epoch_started,
2860 IsPGRecoverablePredicate *c,
2861 F f,
2862 const vector<int> &up,
2863 const vector<int> &acting,
2864 const DoutPrefixProvider *dpp);
2865
2866 friend class PastIntervals;
2867 };
2868
2869 void update_type(bool ec_pool, bool compact);
2870 void update_type_from_map(bool ec_pool, const OSDMap &osdmap);
2871
2872 template <typename... Args>
2873 PriorSet get_prior_set(Args&&... args) const {
2874 return PriorSet(*this, std::forward<Args>(args)...);
2875 }
2876 };
2877 WRITE_CLASS_ENCODER(PastIntervals)
2878
2879 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
2880 ostream& operator<<(ostream& out, const PastIntervals &i);
2881 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
2882
2883 template <typename F>
2884 PastIntervals::PriorSet::PriorSet(
2885 const PastIntervals &past_intervals,
2886 bool ec_pool,
2887 epoch_t last_epoch_started,
2888 IsPGRecoverablePredicate *c,
2889 F f,
2890 const vector<int> &up,
2891 const vector<int> &acting,
2892 const DoutPrefixProvider *dpp)
2893 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
2894 {
2895 /*
2896 * We have to be careful to gracefully deal with situations like
2897 * so. Say we have a power outage or something that takes out both
2898 * OSDs, but the monitor doesn't mark them down in the same epoch.
2899 * The history may look like
2900 *
2901 * 1: A B
2902 * 2: B
2903 * 3: let's say B dies for good, too (say, from the power spike)
2904 * 4: A
2905 *
2906 * which makes it look like B may have applied updates to the PG
2907 * that we need in order to proceed. This sucks...
2908 *
2909 * To minimize the risk of this happening, we CANNOT go active if
2910 * _any_ OSDs in the prior set are down until we send an MOSDAlive
2911 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
2912 * Then, we have something like
2913 *
2914 * 1: A B
2915 * 2: B up_thru[B]=0
2916 * 3:
2917 * 4: A
2918 *
2919 * -> we can ignore B, bc it couldn't have gone active (alive_thru
2920 * still 0).
2921 *
2922 * or,
2923 *
2924 * 1: A B
2925 * 2: B up_thru[B]=0
2926 * 3: B up_thru[B]=2
2927 * 4:
2928 * 5: A
2929 *
2930 * -> we must wait for B, bc it was alive through 2, and could have
2931 * written to the pg.
2932 *
2933 * If B is really dead, then an administrator will need to manually
2934 * intervene by marking the OSD as "lost."
2935 */
2936
2937 // Include current acting and up nodes... not because they may
2938 // contain old data (this interval hasn't gone active, obviously),
2939 // but because we want their pg_info to inform choose_acting(), and
2940 // so that we know what they do/do not have explicitly before
2941 // sending them any new info/logs/whatever.
2942 for (unsigned i = 0; i < acting.size(); i++) {
2943 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2944 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
2945 }
2946 // It may be possible to exclude the up nodes, but let's keep them in
2947 // there for now.
2948 for (unsigned i = 0; i < up.size(); i++) {
2949 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2950 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
2951 }
2952
2953 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
2954 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
2955 for (auto &&i: all_probe) {
2956 switch (f(0, i.osd, nullptr)) {
2957 case UP: {
2958 probe.insert(i);
2959 break;
2960 }
2961 case DNE:
2962 case LOST:
2963 case DOWN: {
2964 down.insert(i.osd);
2965 break;
2966 }
2967 }
2968 }
2969
2970 past_intervals.iterate_mayberw_back_to(
2971 ec_pool,
2972 last_epoch_started,
2973 [&](epoch_t start, const set<pg_shard_t> &acting) {
2974 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
2975 << ", acting: " << acting << dendl;
2976
2977 // look at candidate osds during this interval. each falls into
2978 // one of three categories: up, down (but potentially
2979 // interesting), or lost (down, but we won't wait for it).
2980 set<pg_shard_t> up_now;
2981 map<int, epoch_t> candidate_blocked_by;
2982 // any candidates down now (that might have useful data)
2983 bool any_down_now = false;
2984
2985 // consider ACTING osds
2986 for (auto &&so: acting) {
2987 epoch_t lost_at = 0;
2988 switch (f(start, so.osd, &lost_at)) {
2989 case UP: {
2990 // include past acting osds if they are up.
2991 up_now.insert(so);
2992 break;
2993 }
2994 case DNE: {
2995 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
2996 << " no longer exists" << dendl;
2997 break;
2998 }
2999 case LOST: {
3000 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3001 << " is down, but lost_at " << lost_at << dendl;
3002 up_now.insert(so);
3003 break;
3004 }
3005 case DOWN: {
3006 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3007 << " is down" << dendl;
3008 candidate_blocked_by[so.osd] = lost_at;
3009 any_down_now = true;
3010 break;
3011 }
3012 }
3013 }
3014
3015 // if not enough osds survived this interval, and we may have gone rw,
3016 // then we need to wait for one of those osds to recover to
3017 // ensure that we haven't lost any information.
3018 if (!(*pcontdec)(up_now) && any_down_now) {
3019 // fixme: how do we identify a "clean" shutdown anyway?
3020 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3021 << " insufficient up; including down osds" << dendl;
3022 assert(!candidate_blocked_by.empty());
3023 pg_down = true;
3024 blocked_by.insert(
3025 candidate_blocked_by.begin(),
3026 candidate_blocked_by.end());
3027 }
3028 });
3029
3030 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3031 << " down " << down
3032 << " blocked_by " << blocked_by
3033 << (pg_down ? " pg_down":"")
3034 << dendl;
3035 }
3036
3037 /**
3038 * pg_query_t - used to ask a peer for information about a pg.
3039 *
3040 * note: if version=0 and type=LOG, then we just provide our full log.
3041 */
3042 struct pg_query_t {
3043 enum {
3044 INFO = 0,
3045 LOG = 1,
3046 MISSING = 4,
3047 FULLLOG = 5,
3048 };
3049 const char *get_type_name() const {
3050 switch (type) {
3051 case INFO: return "info";
3052 case LOG: return "log";
3053 case MISSING: return "missing";
3054 case FULLLOG: return "fulllog";
3055 default: return "???";
3056 }
3057 }
3058
3059 __s32 type;
3060 eversion_t since;
3061 pg_history_t history;
3062 epoch_t epoch_sent;
3063 shard_id_t to;
3064 shard_id_t from;
3065
3066 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3067 from(shard_id_t::NO_SHARD) {}
3068 pg_query_t(
3069 int t,
3070 shard_id_t to,
3071 shard_id_t from,
3072 const pg_history_t& h,
3073 epoch_t epoch_sent)
3074 : type(t),
3075 history(h),
3076 epoch_sent(epoch_sent),
3077 to(to), from(from) {
3078 assert(t != LOG);
3079 }
3080 pg_query_t(
3081 int t,
3082 shard_id_t to,
3083 shard_id_t from,
3084 eversion_t s,
3085 const pg_history_t& h,
3086 epoch_t epoch_sent)
3087 : type(t), since(s), history(h),
3088 epoch_sent(epoch_sent), to(to), from(from) {
3089 assert(t == LOG);
3090 }
3091
3092 void encode(bufferlist &bl, uint64_t features) const;
3093 void decode(bufferlist::iterator &bl);
3094
3095 void dump(Formatter *f) const;
3096 static void generate_test_instances(list<pg_query_t*>& o);
3097 };
3098 WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3099
3100 inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3101 out << "query(" << q.get_type_name() << " " << q.since;
3102 if (q.type == pg_query_t::LOG)
3103 out << " " << q.history;
3104 out << ")";
3105 return out;
3106 }
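// Illustrative construction (epochs and shards invented): a primary
// asking a peer for its info, then for its log since a known version;
// replicated pools pass shard_id_t::NO_SHARD for both shards:
//
//   pg_query_t q_info(pg_query_t::INFO, to_shard, from_shard,
//                     history, osdmap_epoch);
//   pg_query_t q_log(pg_query_t::LOG, to_shard, from_shard,
//                    eversion_t(8, 100), history, osdmap_epoch);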
3107
3108 class PGBackend;
3109 class ObjectModDesc {
3110 bool can_local_rollback;
3111 bool rollback_info_completed;
3112
3113 // version required to decode, reflected in encode/decode version
3114 __u8 max_required_version = 1;
3115 public:
3116 class Visitor {
3117 public:
3118 virtual void append(uint64_t old_offset) {}
3119 virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
3120 virtual void rmobject(version_t old_version) {}
3121 /**
3122 * Used to support the unfound_lost_delete log event: if the stashed
3123 * version exists, we unstash it, otherwise, we do nothing. This way
3124 * each replica rolls back to whatever state it had prior to the attempt
3125 * at mark unfound lost delete
3126 */
3127 virtual void try_rmobject(version_t old_version) {
3128 rmobject(old_version);
3129 }
3130 virtual void create() {}
3131 virtual void update_snaps(const set<snapid_t> &old_snaps) {}
3132 virtual void rollback_extents(
3133 version_t gen,
3134 const vector<pair<uint64_t, uint64_t> > &extents) {}
3135 virtual ~Visitor() {}
3136 };
3137 void visit(Visitor *visitor) const;
3138 mutable bufferlist bl;
3139 enum ModID {
3140 APPEND = 1,
3141 SETATTRS = 2,
3142 DELETE = 3,
3143 CREATE = 4,
3144 UPDATE_SNAPS = 5,
3145 TRY_DELETE = 6,
3146 ROLLBACK_EXTENTS = 7
3147 };
3148 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3149 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3150 }
3151 void claim(ObjectModDesc &other) {
3152 bl.clear();
3153 bl.claim(other.bl);
3154 can_local_rollback = other.can_local_rollback;
3155 rollback_info_completed = other.rollback_info_completed;
3156 }
3157 void claim_append(ObjectModDesc &other) {
3158 if (!can_local_rollback || rollback_info_completed)
3159 return;
3160 if (!other.can_local_rollback) {
3161 mark_unrollbackable();
3162 return;
3163 }
3164 bl.claim_append(other.bl);
3165 rollback_info_completed = other.rollback_info_completed;
3166 }
3167 void swap(ObjectModDesc &other) {
3168 bl.swap(other.bl);
3169
3170 using std::swap;
3171 swap(other.can_local_rollback, can_local_rollback);
3172 swap(other.rollback_info_completed, rollback_info_completed);
3173 swap(other.max_required_version, max_required_version);
3174 }
3175 void append_id(ModID id) {
3176 uint8_t _id(id);
3177 ::encode(_id, bl);
3178 }
3179 void append(uint64_t old_size) {
3180 if (!can_local_rollback || rollback_info_completed)
3181 return;
3182 ENCODE_START(1, 1, bl);
3183 append_id(APPEND);
3184 ::encode(old_size, bl);
3185 ENCODE_FINISH(bl);
3186 }
3187 void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
3188 if (!can_local_rollback || rollback_info_completed)
3189 return;
3190 ENCODE_START(1, 1, bl);
3191 append_id(SETATTRS);
3192 ::encode(old_attrs, bl);
3193 ENCODE_FINISH(bl);
3194 }
3195 bool rmobject(version_t deletion_version) {
3196 if (!can_local_rollback || rollback_info_completed)
3197 return false;
3198 ENCODE_START(1, 1, bl);
3199 append_id(DELETE);
3200 ::encode(deletion_version, bl);
3201 ENCODE_FINISH(bl);
3202 rollback_info_completed = true;
3203 return true;
3204 }
3205 bool try_rmobject(version_t deletion_version) {
3206 if (!can_local_rollback || rollback_info_completed)
3207 return false;
3208 ENCODE_START(1, 1, bl);
3209 append_id(TRY_DELETE);
3210 ::encode(deletion_version, bl);
3211 ENCODE_FINISH(bl);
3212 rollback_info_completed = true;
3213 return true;
3214 }
3215 void create() {
3216 if (!can_local_rollback || rollback_info_completed)
3217 return;
3218 rollback_info_completed = true;
3219 ENCODE_START(1, 1, bl);
3220 append_id(CREATE);
3221 ENCODE_FINISH(bl);
3222 }
3223 void update_snaps(const set<snapid_t> &old_snaps) {
3224 if (!can_local_rollback || rollback_info_completed)
3225 return;
3226 ENCODE_START(1, 1, bl);
3227 append_id(UPDATE_SNAPS);
3228 ::encode(old_snaps, bl);
3229 ENCODE_FINISH(bl);
3230 }
3231 void rollback_extents(
3232 version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
3233 assert(can_local_rollback);
3234 assert(!rollback_info_completed);
3235 if (max_required_version < 2)
3236 max_required_version = 2;
3237 ENCODE_START(2, 2, bl);
3238 append_id(ROLLBACK_EXTENTS);
3239 ::encode(gen, bl);
3240 ::encode(extents, bl);
3241 ENCODE_FINISH(bl);
3242 }
3243
3244 // cannot be rolled back
3245 void mark_unrollbackable() {
3246 can_local_rollback = false;
3247 bl.clear();
3248 }
3249 bool can_rollback() const {
3250 return can_local_rollback;
3251 }
3252 bool empty() const {
3253 return can_local_rollback && (bl.length() == 0);
3254 }
3255
3256 bool requires_kraken() const {
3257 return max_required_version >= 2;
3258 }
3259
3260 /**
3261 * Create fresh copy of bl bytes to avoid keeping large buffers around
3262 * in the case that bl contains ptrs which point into a much larger
3263 * message buffer
3264 */
3265 void trim_bl() const {
3266 if (bl.length() > 0)
3267 bl.rebuild();
3268 }
3269 void encode(bufferlist &bl) const;
3270 void decode(bufferlist::iterator &bl);
3271 void dump(Formatter *f) const;
3272 static void generate_test_instances(list<ObjectModDesc*>& o);
3273 };
3274 WRITE_CLASS_ENCODER(ObjectModDesc)
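// An illustrative round trip (attr contents invented): record rollback
// state for an op that appended to an object and replaced an xattr,
// then replay it through a Visitor.  Events are visited in the order
// they were recorded; a boost::none attr value records that the attr
// did not previously exist.
//
//   ObjectModDesc desc;
//   desc.append(old_size); // remember the pre-append size
//   map<string, boost::optional<bufferlist> > old_attrs;
//   desc.setattrs(old_attrs); // remember the prior xattr values
//
//   struct Rollbacker : public ObjectModDesc::Visitor {
//     void append(uint64_t old_size) override { /* truncate back */ }
//     void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
//       /* restore the recorded values */
//     }
//   } v;
//   desc.visit(&v);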
3275
3276
3277 /**
3278 * pg_log_entry_t - single entry/event in pg log
3279 *
3280 */
3281 struct pg_log_entry_t {
3282 enum {
3283 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3284 CLONE = 2, // cloned object from head
3285 DELETE = 3, // deleted object
3286 BACKLOG = 4, // event invented by generate_backlog [deprecated]
3287 LOST_REVERT = 5, // lost new version, revert to an older version.
3288 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3289 LOST_MARK = 7, // lost new version, now EIO
3290 PROMOTE = 8, // promoted object from another tier
3291 CLEAN = 9, // mark an object clean
3292 ERROR = 10, // write that returned an error
3293 };
3294 static const char *get_op_name(int op) {
3295 switch (op) {
3296 case MODIFY:
3297 return "modify";
3298 case PROMOTE:
3299 return "promote";
3300 case CLONE:
3301 return "clone";
3302 case DELETE:
3303 return "delete";
3304 case BACKLOG:
3305 return "backlog";
3306 case LOST_REVERT:
3307 return "l_revert";
3308 case LOST_DELETE:
3309 return "l_delete";
3310 case LOST_MARK:
3311 return "l_mark";
3312 case CLEAN:
3313 return "clean";
3314 case ERROR:
3315 return "error";
3316 default:
3317 return "unknown";
3318 }
3319 }
3320 const char *get_op_name() const {
3321 return get_op_name(op);
3322 }
3323
3324 // describes state for a locally-rollbackable entry
3325 ObjectModDesc mod_desc;
3326 bufferlist snaps; // only for clone entries
3327 hobject_t soid;
3328 osd_reqid_t reqid; // caller+tid to uniquely identify request
3329 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
3330 eversion_t version, prior_version, reverting_to;
3331 version_t user_version; // the user version for this entry
3332 utime_t mtime; // this is the _user_ mtime, mind you
3333 int32_t return_code; // only stored for ERRORs for dup detection
3334
3335 __s32 op;
3336 bool invalid_hash; // only when decoding sobject_t based entries
3337 bool invalid_pool; // only when decoding pool-less hobject based entries
3338
3339 pg_log_entry_t()
3340 : user_version(0), return_code(0), op(0),
3341 invalid_hash(false), invalid_pool(false) {
3342 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3343 }
3344 pg_log_entry_t(int _op, const hobject_t& _soid,
3345 const eversion_t& v, const eversion_t& pv,
3346 version_t uv,
3347 const osd_reqid_t& rid, const utime_t& mt,
3348 int return_code)
3349 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3350 mtime(mt), return_code(return_code), op(_op),
3351 invalid_hash(false), invalid_pool(false) {
3352 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3353 }
3354
3355 bool is_clone() const { return op == CLONE; }
3356 bool is_modify() const { return op == MODIFY; }
3357 bool is_promote() const { return op == PROMOTE; }
3358 bool is_clean() const { return op == CLEAN; }
3359 bool is_backlog() const { return op == BACKLOG; }
3360 bool is_lost_revert() const { return op == LOST_REVERT; }
3361 bool is_lost_delete() const { return op == LOST_DELETE; }
3362 bool is_lost_mark() const { return op == LOST_MARK; }
3363 bool is_error() const { return op == ERROR; }
3364
3365 bool is_update() const {
3366 return
3367 is_clone() || is_modify() || is_promote() || is_clean() ||
3368 is_backlog() || is_lost_revert() || is_lost_mark();
3369 }
3370 bool is_delete() const {
3371 return op == DELETE || op == LOST_DELETE;
3372 }
3373
3374 bool can_rollback() const {
3375 return mod_desc.can_rollback();
3376 }
3377
3378 void mark_unrollbackable() {
3379 mod_desc.mark_unrollbackable();
3380 }
3381
3382 bool requires_kraken() const {
3383 return mod_desc.requires_kraken();
3384 }
3385
3386 // Errors are only used for dup detection, whereas
3387 // the index by objects is used by recovery, copy_get,
3388 // and other facilities that don't expect or need to
3389 // be aware of error entries.
3390 bool object_is_indexed() const {
3391 return !is_error();
3392 }
3393
3394 bool reqid_is_indexed() const {
3395 return reqid != osd_reqid_t() &&
3396 (op == MODIFY || op == DELETE || op == ERROR);
3397 }
3398
3399 string get_key_name() const;
3400 void encode_with_checksum(bufferlist& bl) const;
3401 void decode_with_checksum(bufferlist::iterator& p);
3402
3403 void encode(bufferlist &bl) const;
3404 void decode(bufferlist::iterator &bl);
3405 void dump(Formatter *f) const;
3406 static void generate_test_instances(list<pg_log_entry_t*>& o);
3407
3408 };
3409 WRITE_CLASS_ENCODER(pg_log_entry_t)
3410
3411 ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3412
3413 struct pg_log_dup_t {
3414 osd_reqid_t reqid; // caller+tid to uniquely identify request
3415 eversion_t version;
3416 version_t user_version; // the user version for this entry
3417 int32_t return_code; // only stored for ERRORs for dup detection
3418
3419 pg_log_dup_t()
3420 : user_version(0), return_code(0)
3421 {}
3422 explicit pg_log_dup_t(const pg_log_entry_t& entry)
3423 : reqid(entry.reqid), version(entry.version),
3424 user_version(entry.user_version), return_code(entry.return_code)
3425 {}
3426 pg_log_dup_t(const eversion_t& v, version_t uv,
3427 const osd_reqid_t& rid, int return_code)
3428 : reqid(rid), version(v), user_version(uv),
3429 return_code(return_code)
3430 {}
3431
3432 string get_key_name() const;
3433 void encode(bufferlist &bl) const;
3434 void decode(bufferlist::iterator &bl);
3435 void dump(Formatter *f) const;
3436 static void generate_test_instances(list<pg_log_dup_t*>& o);
3437
3438 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3439 };
3440 WRITE_CLASS_ENCODER(pg_log_dup_t)
3441
3442 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3443
3444 /**
3445 * pg_log_t - incremental log of recent pg changes.
3446 *
3447 * serves as a recovery queue for recent changes.
3448 */
3449 struct pg_log_t {
3450 /*
3451 * head - newest entry (update|delete)
3452 * tail - entry previous to oldest (update|delete) for which we have
3453 * complete negative information.
3454 * i.e. we can infer pg contents for any store whose last_update >= tail.
3455 */
3456 eversion_t head; // newest entry
3457 eversion_t tail; // version prior to oldest
3458
3459 protected:
3460 // We can rollback rollback-able entries > can_rollback_to
3461 eversion_t can_rollback_to;
3462
3463 // always <= can_rollback_to, indicates how far stashed rollback
3464 // data can be found
3465 eversion_t rollback_info_trimmed_to;
3466
3467 public:
3468 // the actual log
3469 mempool::osd_pglog::list<pg_log_entry_t> log;
3470
3471 // entries just for dup op detection ordered oldest to newest
3472 mempool::osd_pglog::list<pg_log_dup_t> dups;
3473
3474 pg_log_t() = default;
3475 pg_log_t(const eversion_t &last_update,
3476 const eversion_t &log_tail,
3477 const eversion_t &can_rollback_to,
3478 const eversion_t &rollback_info_trimmed_to,
3479 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3480 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
3481 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3482 rollback_info_trimmed_to(rollback_info_trimmed_to),
3483 log(std::move(entries)), dups(std::move(dup_entries)) {}
3484 pg_log_t(const eversion_t &last_update,
3485 const eversion_t &log_tail,
3486 const eversion_t &can_rollback_to,
3487 const eversion_t &rollback_info_trimmed_to,
3488 const std::list<pg_log_entry_t> &entries,
3489 const std::list<pg_log_dup_t> &dup_entries)
3490 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3491 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3492 for (auto &&entry: entries) {
3493 log.push_back(entry);
3494 }
3495 for (auto &&entry: dup_entries) {
3496 dups.push_back(entry);
3497 }
3498 }
3499
3500 void clear() {
3501 eversion_t z;
3502 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3503 log.clear();
3504 dups.clear();
3505 }
3506
3507 eversion_t get_rollback_info_trimmed_to() const {
3508 return rollback_info_trimmed_to;
3509 }
3510 eversion_t get_can_rollback_to() const {
3511 return can_rollback_to;
3512 }
3513
3514
3515 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
3516 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
3517 oldlog.swap(log);
3518
3519 eversion_t old_tail;
3520 unsigned mask = ~((~0)<<split_bits);
3521 for (auto i = oldlog.begin();
3522 i != oldlog.end();
3523 ) {
3524 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3525 childlog.push_back(*i);
3526 } else {
3527 log.push_back(*i);
3528 }
3529 oldlog.erase(i++);
3530 }
3531
3532 // osd_reqid is unique, so it doesn't matter if there are extra
3533 // dup entries in each pg. To avoid storing oid with the dup
3534 // entries, just copy the whole list.
3535 auto childdups(dups);
3536
3537 return pg_log_t(
3538 head,
3539 tail,
3540 can_rollback_to,
3541 rollback_info_trimmed_to,
3542 std::move(childlog),
3543 std::move(childdups));
3544 }
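// Worked example (illustrative, hypothetical values): with split_bits = 2
// the mask above is 0x3, so only the low two bits of each object's hash
// are compared against the child's seed. If child_pgid.m_seed == 0x1, an
// entry whose soid hashes to 0x...5 (0x5 & 0x3 == 0x1) moves to childlog,
// while one hashing to 0x...4 (0x4 & 0x3 == 0x0) stays in the parent log.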
3545
3546 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
3547 assert(newhead >= tail);
3548
3549 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
3550 mempool::osd_pglog::list<pg_log_entry_t> divergent;
3551 while (true) {
3552 if (p == log.begin()) {
3553 // yikes, the whole thing is divergent!
3554 using std::swap;
3555 swap(divergent, log);
3556 break;
3557 }
3558 --p;
3559 if (p->version.version <= newhead.version) {
3560 /*
3561 * look at eversion.version here. we want to avoid a situation like:
3562 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3563 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3564 * lower_bound = 100'9
3565 * i.e., same request, different version. If the eversion.version is > the
3566 * lower_bound, it is divergent (see the worked example after this function).
3567 */
3568 ++p;
3569 divergent.splice(divergent.begin(), log, p, log.end());
3570 break;
3571 }
3572 assert(p->version > newhead);
3573 }
3574 head = newhead;
3575
3576 if (can_rollback_to > newhead)
3577 can_rollback_to = newhead;
3578
3579 if (rollback_info_trimmed_to > newhead)
3580 rollback_info_trimmed_to = newhead;
3581
3582 return divergent;
3583 }
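// Worked example (illustrative, hypothetical versions): rewinding a log
// holding 100'8, 100'9, 100'10 to newhead = 100'9 splices 100'10 into the
// returned divergent list, sets head to 100'9, and clamps can_rollback_to
// and rollback_info_trimmed_to down to 100'9 if they pointed past it.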
3584
3585 bool empty() const {
3586 return log.empty();
3587 }
3588
3589 bool null() const {
3590 return head.version == 0 && head.epoch == 0;
3591 }
3592
3593 size_t approx_size() const {
3594 return head.version - tail.version;
3595 }
3596
3597 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
3598 const string &hit_set_namespace, const pg_log_t &in,
3599 pg_log_t &out, pg_log_t &reject);
3600
3601 /**
3602 * copy entries from the tail of another pg_log_t
3603 *
3604 * @param other pg_log_t to copy from
3605 * @param from copy entries after this version
3606 */
3607 void copy_after(const pg_log_t &other, eversion_t from);
3608
3609 /**
3610 * copy a range of entries from another pg_log_t
3611 *
3612 * @param other pg_log_t to copy from
3613 * @param from copy entries after this version
3614 * @param to up to and including this version
3615 */
3616 void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
3617
3618 /**
3619 * copy up to N entries
3620 *
3621 * @param other source log
3622 * @param max max number of entries to copy
3623 */
3624 void copy_up_to(const pg_log_t &other, int max);
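// Illustrative (hypothetical versions): given a source log spanning
// (100'1, 100'9], copy_after(src, eversion_t(100, 6)) copies entries
// 100'7 through 100'9, while copy_up_to(src, 2) copies only the two
// newest entries, 100'8 and 100'9.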
3625
3626 ostream& print(ostream& out) const;
3627
3628 void encode(bufferlist &bl) const;
3629 void decode(bufferlist::iterator &bl, int64_t pool = -1);
3630 void dump(Formatter *f) const;
3631 static void generate_test_instances(list<pg_log_t*>& o);
3632 };
3633 WRITE_CLASS_ENCODER(pg_log_t)
3634
3635 inline ostream& operator<<(ostream& out, const pg_log_t& log)
3636 {
3637 out << "log((" << log.tail << "," << log.head << "], crt="
3638 << log.get_can_rollback_to() << ")";
3639 return out;
3640 }
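// Illustrative (hypothetical versions): a pg_log_t with tail = 100'8 and
// head = 100'10 covers the half-open interval (100'8, 100'10]; any store
// whose last_update >= 100'8 can be brought current by replaying just
// those entries, which is what lets the log double as a recovery queue.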
3641
3642
3643 /**
3644 * pg_missing_t - summary of missing objects.
3645 *
3646 * kept in memory, as a supplement to pg_log_t
3647 * also used to pass missing info in messages.
3648 */
3649 struct pg_missing_item {
3650 eversion_t need, have;
3651 enum missing_flags_t {
3652 FLAG_NONE = 0,
3653 FLAG_DELETE = 1,
3654 } flags;
3655 pg_missing_item() : flags(FLAG_NONE) {}
3656 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
3657 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
3658 set_delete(is_delete);
3659 }
3660
3661 void encode(bufferlist& bl, uint64_t features) const {
3662 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
3663 // encoding a zeroed eversion_t to differentiate between this and
3664 // legacy unversioned encoding - a need value of 0'0 is not
3665 // possible. This can be replaced with the legacy encoding
3666 // macros post-luminous.
3667 eversion_t e;
3668 ::encode(e, bl);
3669 ::encode(need, bl);
3670 ::encode(have, bl);
3671 ::encode(static_cast<uint8_t>(flags), bl);
3672 } else {
3673 // legacy unversioned encoding
3674 ::encode(need, bl);
3675 ::encode(have, bl);
3676 }
3677 }
3678 void decode(bufferlist::iterator& bl) {
3679 eversion_t e;
3680 ::decode(e, bl);
3681 if (e != eversion_t()) {
3682 // legacy encoding, this is the need value
3683 need = e;
3684 ::decode(have, bl);
3685 } else {
3686 ::decode(need, bl);
3687 ::decode(have, bl);
3688 uint8_t f;
3689 ::decode(f, bl);
3690 flags = static_cast<missing_flags_t>(f);
3691 }
3692 }
3693
3694 void set_delete(bool is_delete) {
3695 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
3696 }
3697
3698 bool is_delete() const {
3699 return (flags & FLAG_DELETE) == FLAG_DELETE;
3700 }
3701
3702 string flag_str() const {
3703 if (flags == FLAG_NONE) {
3704 return "none";
3705 } else {
3706 return "delete";
3707 }
3708 }
3709
3710 void dump(Formatter *f) const {
3711 f->dump_stream("need") << need;
3712 f->dump_stream("have") << have;
3713 f->dump_stream("flags") << flag_str();
3714 }
3715 static void generate_test_instances(list<pg_missing_item*>& o) {
3716 o.push_back(new pg_missing_item);
3717 o.push_back(new pg_missing_item);
3718 o.back()->need = eversion_t(1, 2);
3719 o.back()->have = eversion_t(1, 1);
3720 o.push_back(new pg_missing_item);
3721 o.back()->need = eversion_t(3, 5);
3722 o.back()->have = eversion_t(3, 4);
3723 o.back()->flags = FLAG_DELETE;
3724 }
3725 bool operator==(const pg_missing_item &rhs) const {
3726 return need == rhs.need && have == rhs.have && flags == rhs.flags;
3727 }
3728 bool operator!=(const pg_missing_item &rhs) const {
3729 return !(*this == rhs);
3730 }
3731 };
3732 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
3733 ostream& operator<<(ostream& out, const pg_missing_item &item);
3734
3735 class pg_missing_const_i {
3736 public:
3737 virtual const map<hobject_t, pg_missing_item> &
3738 get_items() const = 0;
3739 virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
3740 virtual bool get_may_include_deletes() const = 0;
3741 virtual unsigned int num_missing() const = 0;
3742 virtual bool have_missing() const = 0;
3743 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
3744 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
3745 virtual eversion_t have_old(const hobject_t& oid) const = 0;
3746 virtual ~pg_missing_const_i() {}
3747 };
3748
3749
3750 template <bool Track>
3751 class ChangeTracker {
3752 public:
3753 void changed(const hobject_t &obj) {}
3754 template <typename F>
3755 void get_changed(F &&f) const {}
3756 void flush() {}
3757 bool is_clean() const {
3758 return true;
3759 }
3760 };
3761 template <>
3762 class ChangeTracker<true> {
3763 set<hobject_t> _changed;
3764 public:
3765 void changed(const hobject_t &obj) {
3766 _changed.insert(obj);
3767 }
3768 template <typename F>
3769 void get_changed(F &&f) const {
3770 for (auto const &i: _changed) {
3771 f(i);
3772 }
3773 }
3774 void flush() {
3775 _changed.clear();
3776 }
3777 bool is_clean() const {
3778 return _changed.empty();
3779 }
3780 };
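// Illustrative use of the compile-time toggle above: the Track=false
// primary template is all no-ops the compiler can drop, while the true
// specialization records every changed object (some_hoid below stands in
// for any hobject_t):
//
//   ChangeTracker<true> t;
//   t.changed(some_hoid);
//   t.get_changed([](const hobject_t &h) { /* sees some_hoid */ });
//   t.flush();                                  // t.is_clean() again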
3781
3782 template <bool TrackChanges>
3783 class pg_missing_set : public pg_missing_const_i {
3784 using item = pg_missing_item;
3785 map<hobject_t, item> missing; // oid -> (need v, have v)
3786 map<version_t, hobject_t> rmissing; // v -> oid
3787 ChangeTracker<TrackChanges> tracker;
3788
3789 public:
3790 pg_missing_set() = default;
3791
3792 template <typename missing_type>
3793 pg_missing_set(const missing_type &m) {
3794 missing = m.get_items();
3795 rmissing = m.get_rmissing();
3796 may_include_deletes = m.get_may_include_deletes();
3797 for (auto &&i: missing)
3798 tracker.changed(i.first);
3799 }
3800
3801 bool may_include_deletes = false;
3802
3803 const map<hobject_t, item> &get_items() const override {
3804 return missing;
3805 }
3806 const map<version_t, hobject_t> &get_rmissing() const override {
3807 return rmissing;
3808 }
3809 bool get_may_include_deletes() const override {
3810 return may_include_deletes;
3811 }
3812 unsigned int num_missing() const override {
3813 return missing.size();
3814 }
3815 bool have_missing() const override {
3816 return !missing.empty();
3817 }
3818 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
3819 auto iter = missing.find(oid);
3820 if (iter == missing.end())
3821 return false;
3822 if (out)
3823 *out = iter->second;
3824 return true;
3825 }
3826 bool is_missing(const hobject_t& oid, eversion_t v) const override {
3827 map<hobject_t, item>::const_iterator m =
3828 missing.find(oid);
3829 if (m == missing.end())
3830 return false;
3831 const item &item(m->second);
3832 if (item.need > v)
3833 return false;
3834 return true;
3835 }
3836 eversion_t have_old(const hobject_t& oid) const override {
3837 map<hobject_t, item>::const_iterator m =
3838 missing.find(oid);
3839 if (m == missing.end())
3840 return eversion_t();
3841 const item &item(m->second);
3842 return item.have;
3843 }
3844
3845 void claim(pg_missing_set& o) {
3846 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
3847 missing.swap(o.missing);
3848 rmissing.swap(o.rmissing);
3849 }
3850
3851 /*
3852 * this needs to be called in log order as we extend the log. it
3853 * assumes missing is accurate up through the previous log entry.
3854 */
3855 void add_next_event(const pg_log_entry_t& e) {
3856 map<hobject_t, item>::iterator missing_it;
3857 missing_it = missing.find(e.soid);
3858 bool is_missing_divergent_item = missing_it != missing.end();
3859 if (e.prior_version == eversion_t() || e.is_clone()) {
3860 // new object.
3861 if (is_missing_divergent_item) { // use iterator
3862 rmissing.erase((missing_it->second).need.version);
3863 missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
3864 } else // create new element in missing map
3865 missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
3866 } else if (is_missing_divergent_item) {
3867 // already missing (prior).
3868 rmissing.erase((missing_it->second).need.version);
3869 (missing_it->second).need = e.version; // leave .have unchanged.
3870 missing_it->second.set_delete(e.is_delete());
3871 } else if (e.is_backlog()) {
3872 // May not have prior version
3873 assert(0 == "these don't exist anymore");
3874 } else {
3875 // not missing, we must have prior_version (if any)
3876 assert(!is_missing_divergent_item);
3877 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
3878 }
3879 rmissing[e.version.version] = e.soid;
3880 tracker.changed(e.soid);
3881 }
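// Illustrative sequence (hypothetical versions): because add_next_event()
// assumes missing is accurate up through the prior entry, callers feed it
// entries in log order. A modify of an object at 10'2 with prior_version
// 10'1 records need=10'2, have=10'1; a later modify at 10'3 only advances
// need to 10'3, leaving have untouched at 10'1.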
3882
3883 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
3884 if (missing.count(oid)) {
3885 rmissing.erase(missing[oid].need.version);
3886 missing[oid].need = need; // do not adjust .have
3887 missing[oid].set_delete(is_delete);
3888 } else {
3889 missing[oid] = item(need, eversion_t(), is_delete);
3890 }
3891 rmissing[need.version] = oid;
3892
3893 tracker.changed(oid);
3894 }
3895
3896 void revise_have(hobject_t oid, eversion_t have) {
3897 if (missing.count(oid)) {
3898 tracker.changed(oid);
3899 missing[oid].have = have;
3900 }
3901 }
3902
3903 void add(const hobject_t& oid, eversion_t need, eversion_t have,
3904 bool is_delete) {
3905 missing[oid] = item(need, have, is_delete);
3906 rmissing[need.version] = oid;
3907 tracker.changed(oid);
3908 }
3909
3910 void rm(const hobject_t& oid, eversion_t v) {
3911 std::map<hobject_t, item>::iterator p = missing.find(oid);
3912 if (p != missing.end() && p->second.need <= v)
3913 rm(p);
3914 }
3915
3916 void rm(std::map<hobject_t, item>::const_iterator m) {
3917 tracker.changed(m->first);
3918 rmissing.erase(m->second.need.version);
3919 missing.erase(m);
3920 }
3921
3922 void got(const hobject_t& oid, eversion_t v) {
3923 std::map<hobject_t, item>::iterator p = missing.find(oid);
3924 assert(p != missing.end());
3925 assert(p->second.need <= v || p->second.is_delete());
3926 got(p);
3927 }
3928
3929 void got(std::map<hobject_t, item>::const_iterator m) {
3930 tracker.changed(m->first);
3931 rmissing.erase(m->second.need.version);
3932 missing.erase(m);
3933 }
3934
3935 void split_into(
3936 pg_t child_pgid,
3937 unsigned split_bits,
3938 pg_missing_set *omissing) {
3939 omissing->may_include_deletes = may_include_deletes;
3940 unsigned mask = ~((~0)<<split_bits);
3941 for (map<hobject_t, item>::iterator i = missing.begin();
3942 i != missing.end();
3943 ) {
3944 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
3945 omissing->add(i->first, i->second.need, i->second.have,
3946 i->second.is_delete());
3947 rm(i++);
3948 } else {
3949 ++i;
3950 }
3951 }
3952 }
3953
3954 void clear() {
3955 for (auto const &i: missing)
3956 tracker.changed(i.first);
3957 missing.clear();
3958 rmissing.clear();
3959 }
3960
3961 void encode(bufferlist &bl) const {
3962 ENCODE_START(4, 2, bl);
3963 ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
3964 ::encode(may_include_deletes, bl);
3965 ENCODE_FINISH(bl);
3966 }
3967 void decode(bufferlist::iterator &bl, int64_t pool = -1) {
3968 for (auto const &i: missing)
3969 tracker.changed(i.first);
3970 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
3971 ::decode(missing, bl);
3972 if (struct_v >= 4) {
3973 ::decode(may_include_deletes, bl);
3974 }
3975 DECODE_FINISH(bl);
3976
3977 if (struct_v < 3) {
3978 // Handle hobject_t upgrade
3979 map<hobject_t, item> tmp;
3980 for (map<hobject_t, item>::iterator i =
3981 missing.begin();
3982 i != missing.end();
3983 ) {
3984 if (!i->first.is_max() && i->first.pool == -1) {
3985 hobject_t to_insert(i->first);
3986 to_insert.pool = pool;
3987 tmp[to_insert] = i->second;
3988 missing.erase(i++);
3989 } else {
3990 ++i;
3991 }
3992 }
3993 missing.insert(tmp.begin(), tmp.end());
3994 }
3995
3996 for (map<hobject_t,item>::iterator it =
3997 missing.begin();
3998 it != missing.end();
3999 ++it)
4000 rmissing[it->second.need.version] = it->first;
4001 for (auto const &i: missing)
4002 tracker.changed(i.first);
4003 }
4004 void dump(Formatter *f) const {
4005 f->open_array_section("missing");
4006 for (map<hobject_t,item>::const_iterator p =
4007 missing.begin(); p != missing.end(); ++p) {
4008 f->open_object_section("item");
4009 f->dump_stream("object") << p->first;
4010 p->second.dump(f);
4011 f->close_section();
4012 }
4013 f->close_section();
4014 f->dump_bool("may_include_deletes", may_include_deletes);
4015 }
4016 template <typename F>
4017 void filter_objects(F &&f) {
4018 for (auto i = missing.begin(); i != missing.end();) {
4019 if (f(i->first)) {
4020 rm(i++);
4021 } else {
4022 ++i;
4023 }
4024 }
4025 }
4026 static void generate_test_instances(list<pg_missing_set*>& o) {
4027 o.push_back(new pg_missing_set);
4028 o.push_back(new pg_missing_set);
4029 o.back()->add(
4030 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4031 eversion_t(5, 6), eversion_t(5, 1), false);
4032 o.push_back(new pg_missing_set);
4033 o.back()->add(
4034 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4035 eversion_t(5, 6), eversion_t(5, 1), true);
4036 o.back()->may_include_deletes = true;
4037 }
4038 template <typename F>
4039 void get_changed(F &&f) const {
4040 tracker.get_changed(f);
4041 }
4042 void flush() {
4043 tracker.flush();
4044 }
4045 bool is_clean() const {
4046 return tracker.is_clean();
4047 }
4048 template <typename missing_t>
4049 bool debug_verify_from_init(
4050 const missing_t &init_missing,
4051 ostream *oss) const {
4052 if (!TrackChanges)
4053 return true;
4054 auto check_missing(init_missing.get_items());
4055 tracker.get_changed([&](const hobject_t &hoid) {
4056 check_missing.erase(hoid);
4057 if (missing.count(hoid)) {
4058 check_missing.insert(*(missing.find(hoid)));
4059 }
4060 });
4061 bool ok = true;
4062 if (check_missing.size() != missing.size()) {
4063 if (oss) {
4064 *oss << "Size mismatch, check: " << check_missing.size()
4065 << ", actual: " << missing.size() << "\n";
4066 }
4067 ok = false;
4068 }
4069 for (auto &i: missing) {
4070 if (!check_missing.count(i.first)) {
4071 if (oss)
4072 *oss << "check_missing missing " << i.first << "\n";
4073 ok = false;
4074 } else if (check_missing[i.first] != i.second) {
4075 if (oss)
4076 *oss << "check_missing missing item mismatch on " << i.first
4077 << ", check: " << check_missing[i.first]
4078 << ", actual: " << i.second << "\n";
4079 ok = false;
4080 }
4081 }
4082 if (oss && !ok) {
4083 *oss << "check_missing: " << check_missing << "\n";
4084 set<hobject_t> changed;
4085 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4086 *oss << "changed: " << changed << "\n";
4087 }
4088 return ok;
4089 }
4090 };
4091 template <bool TrackChanges>
4092 void encode(
4093 const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
4094 ENCODE_DUMP_PRE();
4095 c.encode(bl);
4096 ENCODE_DUMP_POST(cl);
4097 }
4098 template <bool TrackChanges>
4099 void decode(pg_missing_set<TrackChanges> &c, bufferlist::iterator &p) {
4100 c.decode(p);
4101 }
4102 template <bool TrackChanges>
4103 ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
4104 {
4105 out << "missing(" << missing.num_missing()
4106 << " may_include_deletes = " << missing.may_include_deletes;
4107 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4108 out << ")";
4109 return out;
4110 }
4111
4112 using pg_missing_t = pg_missing_set<false>;
4113 using pg_missing_tracker_t = pg_missing_set<true>;
4114
4115
4116 /**
4117 * pg list objects response format
4118 *
4119 */
4120 struct pg_nls_response_t {
4121 collection_list_handle_t handle;
4122 list<librados::ListObjectImpl> entries;
4123
4124 void encode(bufferlist& bl) const {
4125 ENCODE_START(1, 1, bl);
4126 ::encode(handle, bl);
4127 __u32 n = (__u32)entries.size();
4128 ::encode(n, bl);
4129 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
4130 ::encode(i->nspace, bl);
4131 ::encode(i->oid, bl);
4132 ::encode(i->locator, bl);
4133 }
4134 ENCODE_FINISH(bl);
4135 }
4136 void decode(bufferlist::iterator& bl) {
4137 DECODE_START(1, bl);
4138 ::decode(handle, bl);
4139 __u32 n;
4140 ::decode(n, bl);
4141 entries.clear();
4142 while (n--) {
4143 librados::ListObjectImpl i;
4144 ::decode(i.nspace, bl);
4145 ::decode(i.oid, bl);
4146 ::decode(i.locator, bl);
4147 entries.push_back(i);
4148 }
4149 DECODE_FINISH(bl);
4150 }
4151 void dump(Formatter *f) const {
4152 f->dump_stream("handle") << handle;
4153 f->open_array_section("entries");
4154 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4155 f->open_object_section("object");
4156 f->dump_string("namespace", p->nspace);
4157 f->dump_string("object", p->oid);
4158 f->dump_string("key", p->locator);
4159 f->close_section();
4160 }
4161 f->close_section();
4162 }
4163 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4164 o.push_back(new pg_nls_response_t);
4165 o.push_back(new pg_nls_response_t);
4166 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4167 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4168 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4169 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4170 o.push_back(new pg_nls_response_t);
4171 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4172 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4173 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4174 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4175 o.push_back(new pg_nls_response_t);
4176 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4177 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4178 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4179 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4180 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4181 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4182 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4183 }
4184 };
4185
4186 WRITE_CLASS_ENCODER(pg_nls_response_t)
4187
4188 // For backwards compatibility with older OSD requests
4189 struct pg_ls_response_t {
4190 collection_list_handle_t handle;
4191 list<pair<object_t, string> > entries;
4192
4193 void encode(bufferlist& bl) const {
4194 __u8 v = 1;
4195 ::encode(v, bl);
4196 ::encode(handle, bl);
4197 ::encode(entries, bl);
4198 }
4199 void decode(bufferlist::iterator& bl) {
4200 __u8 v;
4201 ::decode(v, bl);
4202 assert(v == 1);
4203 ::decode(handle, bl);
4204 ::decode(entries, bl);
4205 }
4206 void dump(Formatter *f) const {
4207 f->dump_stream("handle") << handle;
4208 f->open_array_section("entries");
4209 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4210 f->open_object_section("object");
4211 f->dump_stream("object") << p->first;
4212 f->dump_string("key", p->second);
4213 f->close_section();
4214 }
4215 f->close_section();
4216 }
4217 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4218 o.push_back(new pg_ls_response_t);
4219 o.push_back(new pg_ls_response_t);
4220 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4221 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4222 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4223 }
4224 };
4225
4226 WRITE_CLASS_ENCODER(pg_ls_response_t)
4227
4228 /**
4229 * object_copy_cursor_t
4230 */
4231 struct object_copy_cursor_t {
4232 uint64_t data_offset;
4233 string omap_offset;
4234 bool attr_complete;
4235 bool data_complete;
4236 bool omap_complete;
4237
4238 object_copy_cursor_t()
4239 : data_offset(0),
4240 attr_complete(false),
4241 data_complete(false),
4242 omap_complete(false)
4243 {}
4244
4245 bool is_initial() const {
4246 return !attr_complete && data_offset == 0 && omap_offset.empty();
4247 }
4248 bool is_complete() const {
4249 return attr_complete && data_complete && omap_complete;
4250 }
4251
4252 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4253 void encode(bufferlist& bl) const;
4254 void decode(bufferlist::iterator &bl);
4255 void dump(Formatter *f) const;
4256 };
4257 WRITE_CLASS_ENCODER(object_copy_cursor_t)
4258
4259 /**
4260 * object_copy_data_t
4261 *
4262 * Return data from a copy request. The semantics are a little strange
4263 * as a result of the encoding's heritage.
4264 *
4265 * In particular, the sender unconditionally fills in the cursor (from what
4266 * it receives and sends), the size, and the mtime, but is responsible for
4267 * figuring out whether it should put any data in the attrs, data, or
4268 * omap members (corresponding to xattrs, object data, and the omap entries)
4269 * based on external data (the client includes a max amount to return with
4270 * the copy request). The client then looks into the attrs, data, and/or omap
4271 * based on the contents of the cursor.
4272 */
4273 struct object_copy_data_t {
4274 enum {
4275 FLAG_DATA_DIGEST = 1<<0,
4276 FLAG_OMAP_DIGEST = 1<<1,
4277 };
4278 object_copy_cursor_t cursor;
4279 uint64_t size;
4280 utime_t mtime;
4281 uint32_t data_digest, omap_digest;
4282 uint32_t flags;
4283 map<string, bufferlist> attrs;
4284 bufferlist data;
4285 bufferlist omap_header;
4286 bufferlist omap_data;
4287
4288 /// which snaps we are defined for (if a snap and not the head)
4289 vector<snapid_t> snaps;
4290 /// latest snap seq for the object (if head)
4291 snapid_t snap_seq;
4292
4293 /// recent reqids on this object
4294 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
4295
4296 uint64_t truncate_seq;
4297 uint64_t truncate_size;
4298
4299 public:
4300 object_copy_data_t() :
4301 size((uint64_t)-1), data_digest(-1),
4302 omap_digest(-1), flags(0),
4303 truncate_seq(0),
4304 truncate_size(0) {}
4305
4306 static void generate_test_instances(list<object_copy_data_t*>& o);
4307 void encode(bufferlist& bl, uint64_t features) const;
4308 void decode(bufferlist::iterator& bl);
4309 void dump(Formatter *f) const;
4310 };
4311 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
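// Illustrative receiver-side sketch (do_copy_get is a hypothetical
// helper; the real driver is the OSD copy-get machinery): all resume
// state lives in the cursor, so a caller keeps re-issuing the request
// until the returned cursor reports completion.
//
//   object_copy_cursor_t cursor;                       // is_initial() == true
//   while (!cursor.is_complete()) {
//     object_copy_data_t reply = do_copy_get(cursor);  // hypothetical
//     // consume reply.attrs / reply.data / reply.omap_data as the
//     // cursor's attr/data/omap completion state dictates
//     cursor = reply.cursor;
//   }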
4312
4313 /**
4314 * pg creation info
4315 */
4316 struct pg_create_t {
4317 epoch_t created; // epoch pg created
4318 pg_t parent; // split from parent (if != pg_t())
4319 __s32 split_bits;
4320
4321 pg_create_t()
4322 : created(0), split_bits(0) {}
4323 pg_create_t(unsigned c, pg_t p, int s)
4324 : created(c), parent(p), split_bits(s) {}
4325
4326 void encode(bufferlist &bl) const;
4327 void decode(bufferlist::iterator &bl);
4328 void dump(Formatter *f) const;
4329 static void generate_test_instances(list<pg_create_t*>& o);
4330 };
4331 WRITE_CLASS_ENCODER(pg_create_t)
4332
4333 // -----------------------------------------
4334
4335 struct osd_peer_stat_t {
4336 utime_t stamp;
4337
4338 osd_peer_stat_t() { }
4339
4340 void encode(bufferlist &bl) const;
4341 void decode(bufferlist::iterator &bl);
4342 void dump(Formatter *f) const;
4343 static void generate_test_instances(list<osd_peer_stat_t*>& o);
4344 };
4345 WRITE_CLASS_ENCODER(osd_peer_stat_t)
4346
4347 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat);
4348
4349
4350 // -----------------------------------------
4351
4352 class ObjectExtent {
4353 /**
4354 * ObjectExtents are used for specifying IO behavior against RADOS
4355 * objects when one is using the ObjectCacher.
4356 *
4357 * To use this in a real system, *every member* must be filled
4358 * out correctly. In particular, make sure to initialize the
4359 * oloc correctly, as its default values are deliberate poison
4360 * and will cause internal ObjectCacher asserts.
4361 *
4362 * Similarly, your buffer_extents vector *must* specify a total
4363 * size equal to your length. If the buffer_extents inadvertently
4364 * contain less space than the length member specifies, you
4365 * will get unintelligible asserts deep in the ObjectCacher.
4366 *
4367 * If you are trying to do testing and don't care about actual
4368 * RADOS function, the simplest thing to do is to initialize
4369 * the ObjectExtent (truncate_size can be 0), create a single entry
4370 * in buffer_extents matching the length, and set oloc.pool to 0; a
4370 * minimal sketch of this setup follows the class.
4371 */
4372 public:
4373 object_t oid; // object id
4374 uint64_t objectno;
4375 uint64_t offset; // in object
4376 uint64_t length; // in object
4377 uint64_t truncate_size; // in object
4378
4379 object_locator_t oloc; // object locator (pool etc)
4380
4381 vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented because of striping!)
4382
4383 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4384 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
4385 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
4386 };
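// Minimal test setup sketched by the comment above (values hypothetical):
// buffer_extents must cover exactly `length` bytes and oloc must be given
// a real pool, since the defaults are poison.
//
//   ObjectExtent ex(object_t("test-obj"), 0 /*objectno*/,
//                   0 /*offset*/, 4096 /*length*/, 0 /*truncate_size*/);
//   ex.oloc.pool = 0;                                  // not the poison default
//   ex.buffer_extents.push_back(make_pair(0, 4096));   // total == length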
4387
4388 inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4389 {
4390 return out << "extent("
4391 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4392 << " " << ex.offset << "~" << ex.length
4393 << " -> " << ex.buffer_extents
4394 << ")";
4395 }
4396
4397
4398 // ---------------------------------------
4399
4400 class OSDSuperblock {
4401 public:
4402 uuid_d cluster_fsid, osd_fsid;
4403 int32_t whoami; // my role in this fs.
4404 epoch_t current_epoch; // most recent epoch
4405 epoch_t oldest_map, newest_map; // oldest/newest maps we have.
4406 double weight;
4407
4408 CompatSet compat_features;
4409
4410 // last interval over which i mounted and was then active
4411 epoch_t mounted; // last epoch i mounted
4412 epoch_t clean_thru; // epoch i was active and clean thru
4413
4414 OSDSuperblock() :
4415 whoami(-1),
4416 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4417 mounted(0), clean_thru(0) {
4418 }
4419
4420 void encode(bufferlist &bl) const;
4421 void decode(bufferlist::iterator &bl);
4422 void dump(Formatter *f) const;
4423 static void generate_test_instances(list<OSDSuperblock*>& o);
4424 };
4425 WRITE_CLASS_ENCODER(OSDSuperblock)
4426
4427 inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4428 {
4429 return out << "sb(" << sb.cluster_fsid
4430 << " osd." << sb.whoami
4431 << " " << sb.osd_fsid
4432 << " e" << sb.current_epoch
4433 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4434 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4435 << ")";
4436 }
4437
4438
4439 // -------
4440
4441
4442
4443
4444
4445
4446 /*
4447 * attached to object head. describes most recent snap context, and
4448 * set of existing clones.
4449 */
4450 struct SnapSet {
4451 snapid_t seq;
4452 bool head_exists;
4453 vector<snapid_t> snaps; // descending
4454 vector<snapid_t> clones; // ascending
4455 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4456 map<snapid_t, uint64_t> clone_size;
4457 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4458
4459 SnapSet() : seq(0), head_exists(false) {}
4460 explicit SnapSet(bufferlist& bl) {
4461 bufferlist::iterator p = bl.begin();
4462 decode(p);
4463 }
4464
4465 bool is_legacy() const {
4466 return clone_snaps.size() < clones.size() || !head_exists;
4467 }
4468
4469 /// populate SnapSet from a librados::snap_set_t
4470 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4471
4472 /// get space accounted to clone
4473 uint64_t get_clone_bytes(snapid_t clone) const;
4474
4475 void encode(bufferlist& bl) const;
4476 void decode(bufferlist::iterator& bl);
4477 void dump(Formatter *f) const;
4478 static void generate_test_instances(list<SnapSet*>& o);
4479
4480 SnapContext get_ssc_as_of(snapid_t as_of) const {
4481 SnapContext out;
4482 out.seq = as_of;
4483 for (vector<snapid_t>::const_iterator i = snaps.begin();
4484 i != snaps.end();
4485 ++i) {
4486 if (*i <= as_of)
4487 out.snaps.push_back(*i);
4488 }
4489 return out;
4490 }
4491
4492 // return min element of snaps > after, return max if no such element
4493 snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
4494 for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
4495 i != snaps.rend();
4496 ++i) {
4497 if (*i > after)
4498 return *i;
4499 }
4500 return max;
4501 }
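// Worked examples (illustrative, hypothetical snaps): with
// snaps = [8, 5, 2] (descending), get_ssc_as_of(6) yields seq=6 and
// snaps [5, 2]; get_first_snap_after(3, CEPH_NOSNAP) scans from the back
// and returns 5, the smallest snap > 3, while
// get_first_snap_after(9, CEPH_NOSNAP) finds no such snap and returns
// the supplied max, CEPH_NOSNAP.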
4502
4503 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4504 void filter(const pg_pool_t &pinfo);
4505 };
4506 WRITE_CLASS_ENCODER(SnapSet)
4507
4508 ostream& operator<<(ostream& out, const SnapSet& cs);
4509
4510
4511
4512 #define OI_ATTR "_"
4513 #define SS_ATTR "snapset"
4514
4515 struct watch_info_t {
4516 uint64_t cookie;
4517 uint32_t timeout_seconds;
4518 entity_addr_t addr;
4519
4520 watch_info_t() : cookie(0), timeout_seconds(0) { }
4521 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
4522
4523 void encode(bufferlist& bl, uint64_t features) const;
4524 void decode(bufferlist::iterator& bl);
4525 void dump(Formatter *f) const;
4526 static void generate_test_instances(list<watch_info_t*>& o);
4527 };
4528 WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4529
4530 static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4531 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4532 && l.addr == r.addr;
4533 }
4534
4535 static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
4536 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
4537 << " " << w.addr << ")";
4538 }
4539
4540 struct notify_info_t {
4541 uint64_t cookie;
4542 uint64_t notify_id;
4543 uint32_t timeout;
4544 bufferlist bl;
4545 };
4546
4547 static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
4548 return out << "notify(cookie " << n.cookie
4549 << " notify" << n.notify_id
4550 << " " << n.timeout << "s)";
4551 }
4552
4553 struct object_info_t;
4554 struct object_manifest_t {
4555 enum {
4556 TYPE_NONE = 0,
4557 TYPE_REDIRECT = 1, // start with this
4558 TYPE_CHUNKED = 2, // do this later
4559 };
4560 uint8_t type; // redirect, chunked, ...
4561 hobject_t redirect_target;
4562
4563 object_manifest_t() : type(0) { }
4564 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
4565 : type(type), redirect_target(redirect_target) { }
4566
4567 bool is_empty() const {
4568 return type == TYPE_NONE;
4569 }
4570 bool is_redirect() const {
4571 return type == TYPE_REDIRECT;
4572 }
4573 bool is_chunked() const {
4574 return type == TYPE_CHUNKED;
4575 }
4576 static const char *get_type_name(uint8_t m) {
4577 switch (m) {
4578 case TYPE_NONE: return "none";
4579 case TYPE_REDIRECT: return "redirect";
4580 case TYPE_CHUNKED: return "chunked";
4581 default: return "unknown";
4582 }
4583 }
4584 const char *get_type_name() const {
4585 return get_type_name(type);
4586 }
4587 static void generate_test_instances(list<object_manifest_t*>& o);
4588 void encode(bufferlist &bl) const;
4589 void decode(bufferlist::iterator &bl);
4590 void dump(Formatter *f) const;
4591 friend ostream& operator<<(ostream& out, const object_manifest_t& oi);
4592 };
4593 WRITE_CLASS_ENCODER(object_manifest_t)
4594 ostream& operator<<(ostream& out, const object_manifest_t& oi);
4595
4596 struct object_info_t {
4597 hobject_t soid;
4598 eversion_t version, prior_version;
4599 version_t user_version;
4600 osd_reqid_t last_reqid;
4601
4602 uint64_t size;
4603 utime_t mtime;
4604 utime_t local_mtime; // local mtime
4605
4606 // note: these are currently encoded into a total 16 bits; see
4607 // encode()/decode() for the weirdness.
4608 typedef enum {
4609 FLAG_LOST = 1<<0,
4610 FLAG_WHITEOUT = 1<<1, // object logically does not exist
4611 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
4612 FLAG_OMAP = 1 << 3, // has (or may have) some/any omap data
4613 FLAG_DATA_DIGEST = 1 << 4, // has data crc
4614 FLAG_OMAP_DIGEST = 1 << 5, // has omap crc
4615 FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier
4616 FLAG_MANIFEST = 1 << 7, // has manifest
4617 // ...
4618 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
4619 } flag_t;
4620
4621 flag_t flags;
4622
4623 static string get_flag_string(flag_t flags) {
4624 string s;
4625 if (flags & FLAG_LOST)
4626 s += "|lost";
4627 if (flags & FLAG_WHITEOUT)
4628 s += "|whiteout";
4629 if (flags & FLAG_DIRTY)
4630 s += "|dirty";
4631 if (flags & FLAG_USES_TMAP)
4632 s += "|uses_tmap";
4633 if (flags & FLAG_OMAP)
4634 s += "|omap";
4635 if (flags & FLAG_DATA_DIGEST)
4636 s += "|data_digest";
4637 if (flags & FLAG_OMAP_DIGEST)
4638 s += "|omap_digest";
4639 if (flags & FLAG_CACHE_PIN)
4640 s += "|cache_pin";
4641 if (flags & FLAG_MANIFEST)
4642 s += "|manifest";
4643 if (s.length())
4644 return s.substr(1);
4645 return s;
4646 }
4647 string get_flag_string() const {
4648 return get_flag_string(flags);
4649 }
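// Illustrative: get_flag_string(flag_t(FLAG_DIRTY|FLAG_OMAP)) first
// builds "|dirty|omap", then drops the leading '|' and returns
// "dirty|omap".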
4650
4651 /// [clone] descending. pre-luminous; moved to SnapSet
4652 vector<snapid_t> legacy_snaps;
4653
4654 uint64_t truncate_seq, truncate_size;
4655
4656 map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
4657
4658 // opportunistic checksums; may or may not be present
4659 __u32 data_digest; ///< data crc32c
4660 __u32 omap_digest; ///< omap crc32c
4661
4662 // alloc hint attribute
4663 uint64_t expected_object_size, expected_write_size;
4664 uint32_t alloc_hint_flags;
4665
4666 struct object_manifest_t manifest;
4667
4668 void copy_user_bits(const object_info_t& other);
4669
4670 static ps_t legacy_object_locator_to_ps(const object_t &oid,
4671 const object_locator_t &loc);
4672
4673 bool test_flag(flag_t f) const {
4674 return (flags & f) == f;
4675 }
4676 void set_flag(flag_t f) {
4677 flags = (flag_t)(flags | f);
4678 }
4679 void clear_flag(flag_t f) {
4680 flags = (flag_t)(flags & ~f);
4681 }
4682 bool is_lost() const {
4683 return test_flag(FLAG_LOST);
4684 }
4685 bool is_whiteout() const {
4686 return test_flag(FLAG_WHITEOUT);
4687 }
4688 bool is_dirty() const {
4689 return test_flag(FLAG_DIRTY);
4690 }
4691 bool is_omap() const {
4692 return test_flag(FLAG_OMAP);
4693 }
4694 bool is_data_digest() const {
4695 return test_flag(FLAG_DATA_DIGEST);
4696 }
4697 bool is_omap_digest() const {
4698 return test_flag(FLAG_OMAP_DIGEST);
4699 }
4700 bool is_cache_pinned() const {
4701 return test_flag(FLAG_CACHE_PIN);
4702 }
4703 bool has_manifest() const {
4704 return test_flag(FLAG_MANIFEST);
4705 }
4706
4707 void set_data_digest(__u32 d) {
4708 set_flag(FLAG_DATA_DIGEST);
4709 data_digest = d;
4710 }
4711 void set_omap_digest(__u32 d) {
4712 set_flag(FLAG_OMAP_DIGEST);
4713 omap_digest = d;
4714 }
4715 void clear_data_digest() {
4716 clear_flag(FLAG_DATA_DIGEST);
4717 data_digest = -1;
4718 }
4719 void clear_omap_digest() {
4720 clear_flag(FLAG_OMAP_DIGEST);
4721 omap_digest = -1;
4722 }
4723 void new_object() {
4724 set_data_digest(-1);
4725 set_omap_digest(-1);
4726 }
4727
4728 void encode(bufferlist& bl, uint64_t features) const;
4729 void decode(bufferlist::iterator& bl);
4730 void decode(bufferlist& bl) {
4731 bufferlist::iterator p = bl.begin();
4732 decode(p);
4733 }
4734 void dump(Formatter *f) const;
4735 static void generate_test_instances(list<object_info_t*>& o);
4736
4737 explicit object_info_t()
4738 : user_version(0), size(0), flags((flag_t)0),
4739 truncate_seq(0), truncate_size(0),
4740 data_digest(-1), omap_digest(-1),
4741 expected_object_size(0), expected_write_size(0),
4742 alloc_hint_flags(0)
4743 {}
4744
4745 explicit object_info_t(const hobject_t& s)
4746 : soid(s),
4747 user_version(0), size(0), flags((flag_t)0),
4748 truncate_seq(0), truncate_size(0),
4749 data_digest(-1), omap_digest(-1),
4750 expected_object_size(0), expected_write_size(0),
4751 alloc_hint_flags(0)
4752 {}
4753
4754 explicit object_info_t(bufferlist& bl) {
4755 decode(bl);
4756 }
4757 };
4758 WRITE_CLASS_ENCODER_FEATURES(object_info_t)
4759
4760 ostream& operator<<(ostream& out, const object_info_t& oi);
4761
4762
4763
4764 // Object recovery
4765 struct ObjectRecoveryInfo {
4766 hobject_t soid;
4767 eversion_t version;
4768 uint64_t size;
4769 object_info_t oi;
4770 SnapSet ss; // only populated if soid is_snap()
4771 interval_set<uint64_t> copy_subset;
4772 map<hobject_t, interval_set<uint64_t>> clone_subset;
4773
4774 ObjectRecoveryInfo() : size(0) { }
4775
4776 static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
4777 void encode(bufferlist &bl, uint64_t features) const;
4778 void decode(bufferlist::iterator &bl, int64_t pool = -1);
4779 ostream &print(ostream &out) const;
4780 void dump(Formatter *f) const;
4781 };
4782 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
4783 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
4784
4785 struct ObjectRecoveryProgress {
4786 uint64_t data_recovered_to;
4787 string omap_recovered_to;
4788 bool first;
4789 bool data_complete;
4790 bool omap_complete;
4791 bool error = false;
4792
4793 ObjectRecoveryProgress()
4794 : data_recovered_to(0),
4795 first(true),
4796 data_complete(false), omap_complete(false) { }
4797
4798 bool is_complete(const ObjectRecoveryInfo& info) const {
4799 return (data_recovered_to >= (
4800 info.copy_subset.empty() ?
4801 0 : info.copy_subset.range_end())) &&
4802 omap_complete;
4803 }
4804
4805 static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
4806 void encode(bufferlist &bl) const;
4807 void decode(bufferlist::iterator &bl);
4808 ostream &print(ostream &out) const;
4809 void dump(Formatter *f) const;
4810 };
4811 WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
4812 ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
4813
4814 struct PushReplyOp {
4815 hobject_t soid;
4816
4817 static void generate_test_instances(list<PushReplyOp*>& o);
4818 void encode(bufferlist &bl) const;
4819 void decode(bufferlist::iterator &bl);
4820 ostream &print(ostream &out) const;
4821 void dump(Formatter *f) const;
4822
4823 uint64_t cost(CephContext *cct) const;
4824 };
4825 WRITE_CLASS_ENCODER(PushReplyOp)
4826 ostream& operator<<(ostream& out, const PushReplyOp &op);
4827
4828 struct PullOp {
4829 hobject_t soid;
4830
4831 ObjectRecoveryInfo recovery_info;
4832 ObjectRecoveryProgress recovery_progress;
4833
4834 static void generate_test_instances(list<PullOp*>& o);
4835 void encode(bufferlist &bl, uint64_t features) const;
4836 void decode(bufferlist::iterator &bl);
4837 ostream &print(ostream &out) const;
4838 void dump(Formatter *f) const;
4839
4840 uint64_t cost(CephContext *cct) const;
4841 };
4842 WRITE_CLASS_ENCODER_FEATURES(PullOp)
4843 ostream& operator<<(ostream& out, const PullOp &op);
4844
4845 struct PushOp {
4846 hobject_t soid;
4847 eversion_t version;
4848 bufferlist data;
4849 interval_set<uint64_t> data_included;
4850 bufferlist omap_header;
4851 map<string, bufferlist> omap_entries;
4852 map<string, bufferlist> attrset;
4853
4854 ObjectRecoveryInfo recovery_info;
4855 ObjectRecoveryProgress before_progress;
4856 ObjectRecoveryProgress after_progress;
4857
4858 static void generate_test_instances(list<PushOp*>& o);
4859 void encode(bufferlist &bl, uint64_t features) const;
4860 void decode(bufferlist::iterator &bl);
4861 ostream &print(ostream &out) const;
4862 void dump(Formatter *f) const;
4863
4864 uint64_t cost(CephContext *cct) const;
4865 };
4866 WRITE_CLASS_ENCODER_FEATURES(PushOp)
4867 ostream& operator<<(ostream& out, const PushOp &op);
4868
4869
4870 /*
4871 * summarize pg contents for purposes of a scrub
4872 */
4873 struct ScrubMap {
4874 struct object {
4875 map<string,bufferptr> attrs;
4876 uint64_t size;
4877 __u32 omap_digest; ///< omap crc32c
4878 __u32 digest; ///< data crc32c
4879 bool negative:1;
4880 bool digest_present:1;
4881 bool omap_digest_present:1;
4882 bool read_error:1;
4883 bool stat_error:1;
4884 bool ec_hash_mismatch:1;
4885 bool ec_size_mismatch:1;
4886
4887 object() :
4888 // Init invalid size so it won't match if we get a stat EIO error
4889 size(-1), omap_digest(0), digest(0),
4890 negative(false), digest_present(false), omap_digest_present(false),
4891 read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
4892
4893 void encode(bufferlist& bl) const;
4894 void decode(bufferlist::iterator& bl);
4895 void dump(Formatter *f) const;
4896 static void generate_test_instances(list<object*>& o);
4897 };
4898 WRITE_CLASS_ENCODER(object)
4899
4900 map<hobject_t,object> objects;
4901 eversion_t valid_through;
4902 eversion_t incr_since;
4903
4904 void merge_incr(const ScrubMap &l);
4905 void insert(const ScrubMap &r) {
4906 objects.insert(r.objects.begin(), r.objects.end());
4907 }
4908 void swap(ScrubMap &r) {
4909 using std::swap;
4910 swap(objects, r.objects);
4911 swap(valid_through, r.valid_through);
4912 swap(incr_since, r.incr_since);
4913 }
4914
4915 void encode(bufferlist& bl) const;
4916 void decode(bufferlist::iterator& bl, int64_t pool=-1);
4917 void dump(Formatter *f) const;
4918 static void generate_test_instances(list<ScrubMap*>& o);
4919 };
4920 WRITE_CLASS_ENCODER(ScrubMap::object)
4921 WRITE_CLASS_ENCODER(ScrubMap)
4922
4923 struct OSDOp {
4924 ceph_osd_op op;
4925 sobject_t soid;
4926
4927 bufferlist indata, outdata;
4928 errorcode32_t rval;
4929
4930 OSDOp() : rval(0) {
4931 memset(&op, 0, sizeof(ceph_osd_op));
4932 }
4933
4934 /**
4935 * split a bufferlist into constituent indata members of a vector of OSDOps
4936 *
4937 * @param ops [out] vector of OSDOps
4938 * @param in [in] combined data buffer
4939 */
4940 static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);
4941
4942 /**
4943 * merge indata members of a vector of OSDOp into a single bufferlist
4944 *
4945 * Notably this also encodes certain other OSDOp data into the data
4946 * buffer, including the sobject_t soid.
4947 *
4948 * @param ops [in] vector of OSDOps
4949 * @param out [out] combined data buffer
4950 */
4951 static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
4952
4953 /**
4954 * split a bufferlist into constituent outdata members of a vector of OSDOps
4955 *
4956 * @param ops [out] vector of OSDOps
4957 * @param in [in] combined data buffer
4958 */
4959 static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);
4960
4961 /**
4962 * merge outdata members of a vector of OSDOps into a single bufferlist
4963 *
4964 * @param ops [in] vector of OSDOps
4965 * @param out [out] combined data buffer
4966 */
4967 static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
4968
4969 /**
4970 * Clear data as much as possible, leave minimal data for historical op dump
4971 *
4972 * @param ops [in] vector of OSDOps
4973 */
4974 static void clear_data(vector<OSDOp>& ops);
4975 };
4976
4977 ostream& operator<<(ostream& out, const OSDOp& op);
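// Illustrative round trip (a sketch, not the full wire protocol): the
// merge/split pairs above are inverses over the per-op data members,
// assuming the receiving ops already carry their per-op lengths from the
// decoded message header.
//
//   vector<OSDOp> ops(2);
//   ops[0].indata.append("a");
//   ops[1].indata.append("bc");
//   bufferlist combined;
//   OSDOp::merge_osd_op_vector_in_data(ops, combined);
//   // ...combined travels in a message; on receipt, ops2 is rebuilt
//   // with matching op headers, then:
//   // OSDOp::split_osd_op_vector_in_data(ops2, combined);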
4978
4979 struct watch_item_t {
4980 entity_name_t name;
4981 uint64_t cookie;
4982 uint32_t timeout_seconds;
4983 entity_addr_t addr;
4984
4985 watch_item_t() : cookie(0), timeout_seconds(0) { }
4986 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
4987 const entity_addr_t& addr)
4988 : name(name), cookie(cookie), timeout_seconds(timeout),
4989 addr(addr) { }
4990
4991 void encode(bufferlist &bl, uint64_t features) const {
4992 ENCODE_START(2, 1, bl);
4993 ::encode(name, bl);
4994 ::encode(cookie, bl);
4995 ::encode(timeout_seconds, bl);
4996 ::encode(addr, bl, features);
4997 ENCODE_FINISH(bl);
4998 }
4999 void decode(bufferlist::iterator &bl) {
5000 DECODE_START(2, bl);
5001 ::decode(name, bl);
5002 ::decode(cookie, bl);
5003 ::decode(timeout_seconds, bl);
5004 if (struct_v >= 2) {
5005 ::decode(addr, bl);
5006 }
5007 DECODE_FINISH(bl);
5008 }
5009 };
5010 WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5011
5012 struct obj_watch_item_t {
5013 hobject_t obj;
5014 watch_item_t wi;
5015 };
5016
5017 /**
5018 * obj list watch response format
5019 *
5020 */
5021 struct obj_list_watch_response_t {
5022 list<watch_item_t> entries;
5023
5024 void encode(bufferlist& bl, uint64_t features) const {
5025 ENCODE_START(1, 1, bl);
5026 ::encode(entries, bl, features);
5027 ENCODE_FINISH(bl);
5028 }
5029 void decode(bufferlist::iterator& bl) {
5030 DECODE_START(1, bl);
5031 ::decode(entries, bl);
5032 DECODE_FINISH(bl);
5033 }
5034 void dump(Formatter *f) const {
5035 f->open_array_section("entries");
5036 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5037 f->open_object_section("watch");
5038 f->dump_stream("watcher") << p->name;
5039 f->dump_int("cookie", p->cookie);
5040 f->dump_int("timeout", p->timeout_seconds);
5041 f->open_object_section("addr");
5042 p->addr.dump(f);
5043 f->close_section();
5044 f->close_section();
5045 }
5046 f->close_section();
5047 }
5048 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5049 entity_addr_t ea;
5050 o.push_back(new obj_list_watch_response_t);
5051 o.push_back(new obj_list_watch_response_t);
5052 ea.set_type(entity_addr_t::TYPE_LEGACY);
5053 ea.set_nonce(1000);
5054 ea.set_family(AF_INET);
5055 ea.set_in4_quad(0, 127);
5056 ea.set_in4_quad(1, 0);
5057 ea.set_in4_quad(2, 0);
5058 ea.set_in4_quad(3, 1);
5059 ea.set_port(1024);
5060 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5061 ea.set_nonce(1001);
5062 ea.set_in4_quad(3, 2);
5063 ea.set_port(1025);
5064 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5065 }
5066 };
5067 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5068
5069 struct clone_info {
5070 snapid_t cloneid;
5071 vector<snapid_t> snaps; // ascending
5072 vector< pair<uint64_t,uint64_t> > overlap;
5073 uint64_t size;
5074
5075 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5076
5077 void encode(bufferlist& bl) const {
5078 ENCODE_START(1, 1, bl);
5079 ::encode(cloneid, bl);
5080 ::encode(snaps, bl);
5081 ::encode(overlap, bl);
5082 ::encode(size, bl);
5083 ENCODE_FINISH(bl);
5084 }
5085 void decode(bufferlist::iterator& bl) {
5086 DECODE_START(1, bl);
5087 ::decode(cloneid, bl);
5088 ::decode(snaps, bl);
5089 ::decode(overlap, bl);
5090 ::decode(size, bl);
5091 DECODE_FINISH(bl);
5092 }
5093 void dump(Formatter *f) const {
5094 if (cloneid == CEPH_NOSNAP)
5095 f->dump_string("cloneid", "HEAD");
5096 else
5097 f->dump_unsigned("cloneid", cloneid.val);
5098 f->open_array_section("snapshots");
5099 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5100 f->open_object_section("snap");
5101 f->dump_unsigned("id", p->val);
5102 f->close_section();
5103 }
5104 f->close_section();
5105 f->open_array_section("overlaps");
5106 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5107 q != overlap.end(); ++q) {
5108 f->open_object_section("overlap");
5109 f->dump_unsigned("offset", q->first);
5110 f->dump_unsigned("length", q->second);
5111 f->close_section();
5112 }
5113 f->close_section();
5114 f->dump_unsigned("size", size);
5115 }
5116 static void generate_test_instances(list<clone_info*>& o) {
5117 o.push_back(new clone_info);
5118 o.push_back(new clone_info);
5119 o.back()->cloneid = 1;
5120 o.back()->snaps.push_back(1);
5121 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5122 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5123 o.back()->size = 16384;
5124 o.push_back(new clone_info);
5125 o.back()->cloneid = CEPH_NOSNAP;
5126 o.back()->size = 32768;
5127 }
5128 };
5129 WRITE_CLASS_ENCODER(clone_info)
5130
5131 /**
5132 * obj list snaps response format
5133 *
5134 */
5135 struct obj_list_snap_response_t {
5136 vector<clone_info> clones; // ascending
5137 snapid_t seq;
5138
5139 void encode(bufferlist& bl) const {
5140 ENCODE_START(2, 1, bl);
5141 ::encode(clones, bl);
5142 ::encode(seq, bl);
5143 ENCODE_FINISH(bl);
5144 }
5145 void decode(bufferlist::iterator& bl) {
5146 DECODE_START(2, bl);
5147 ::decode(clones, bl);
5148 if (struct_v >= 2)
5149 ::decode(seq, bl);
5150 else
5151 seq = CEPH_NOSNAP;
5152 DECODE_FINISH(bl);
5153 }
5154 void dump(Formatter *f) const {
5155 f->open_array_section("clones");
5156 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5157 f->open_object_section("clone");
5158 p->dump(f);
5159 f->close_section();
5160 }
5161 f->dump_unsigned("seq", seq);
5162 f->close_section();
5163 }
5164 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5165 o.push_back(new obj_list_snap_response_t);
5166 o.push_back(new obj_list_snap_response_t);
5167 clone_info cl;
5168 cl.cloneid = 1;
5169 cl.snaps.push_back(1);
5170 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5171 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5172 cl.size = 16384;
5173 o.back()->clones.push_back(cl);
5174 cl.cloneid = CEPH_NOSNAP;
5175 cl.snaps.clear();
5176 cl.overlap.clear();
5177 cl.size = 32768;
5178 o.back()->clones.push_back(cl);
5179 o.back()->seq = 123;
5180 }
5181 };
5182
5183 WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5184
5185 // PromoteCounter
5186
5187 struct PromoteCounter {
5188 std::atomic_ullong attempts{0};
5189 std::atomic_ullong objects{0};
5190 std::atomic_ullong bytes{0};
5191
5192 void attempt() {
5193 attempts++;
5194 }
5195
5196 void finish(uint64_t size) {
5197 objects++;
5198 bytes += size;
5199 }
5200
5201 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
5202 *a = attempts;
5203 *o = objects;
5204 *b = bytes;
5205 attempts = *a / 2;
5206 objects = *o / 2;
5207 bytes = *b / 2;
5208 }
5209 };
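// Illustrative (hypothetical numbers): sample_and_attenuate() both reads
// and halves the counters, so repeated sampling sees an exponentially
// decaying view of promote activity. With attempts = 8, a first sample
// returns 8 and leaves 4; a second sample with no new attempts returns 4
// and leaves 2.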
5210
5211 /** store_statfs_t
5212 * ObjectStore full statfs information
5213 */
5214 struct store_statfs_t
5215 {
5216 uint64_t total = 0; // Total bytes
5217 uint64_t available = 0; // Free bytes available
5218
5219 int64_t allocated = 0; // Bytes allocated by the store
5220 int64_t stored = 0; // Bytes actually stored by the user
5221 int64_t compressed = 0; // Bytes stored after compression
5222 int64_t compressed_allocated = 0; // Bytes allocated for compressed data
5223 int64_t compressed_original = 0; // Bytes that were successfully compressed
5224
5225 void reset() {
5226 *this = store_statfs_t();
5227 }
5228 bool operator ==(const store_statfs_t& other) const;
5229 void dump(Formatter *f) const;
5230 };
5231 ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
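// Illustrative relation between the compression fields (hypothetical
// numbers): if 1 MiB of user data compresses to 400 KiB, which is then
// rounded up to 512 KiB of allocation units, then
// compressed_original = 1 MiB, compressed = 400 KiB, and
// compressed_allocated = 512 KiB.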
5232
5233 #endif