]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.h
55c1bc582b49f36daf116f46a0233690c5632ac2
[ceph.git] / ceph / src / osd / osd_types.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
20
21 #include <atomic>
22 #include <sstream>
23 #include <cstdio>
24 #include <memory>
25 #include <string_view>
26
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/optional/optional_io.hpp>
29 #include <boost/variant.hpp>
30 #include <boost/smart_ptr/local_shared_ptr.hpp>
31
32 #include "include/rados/rados_types.hpp"
33 #include "include/mempool.h"
34
35 #include "msg/msg_types.h"
36 #include "include/compat.h"
37 #include "include/types.h"
38 #include "include/utime.h"
39 #include "include/CompatSet.h"
40 #include "common/ceph_context.h"
41 #include "common/histogram.h"
42 #include "include/interval_set.h"
43 #include "include/inline_memory.h"
44 #include "common/Formatter.h"
45 #include "common/bloom_filter.hpp"
46 #include "common/hobject.h"
47 #include "common/snap_types.h"
48 #include "HitSet.h"
49 #include "Watch.h"
50 #include "include/cmp.h"
51 #include "librados/ListObjectImpl.h"
52 #include "compressor/Compressor.h"
53 #include "osd_perf_counters.h"
54
55 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
56
57 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
58 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
59 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
60 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
61 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
62 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
63 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
64 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
65 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
66 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
67 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
68 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
69 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
70 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
71 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
72 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
73 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
74
75
76 /// pool priority range set by user
77 #define OSD_POOL_PRIORITY_MAX 10
78 #define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
79
80 /// min recovery priority for MBackfillReserve
81 #define OSD_RECOVERY_PRIORITY_MIN 0
82
83 /// base backfill priority for MBackfillReserve
84 #define OSD_BACKFILL_PRIORITY_BASE 100
85
86 /// base backfill priority for MBackfillReserve (degraded PG)
87 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
88
89 /// base recovery priority for MBackfillReserve
90 #define OSD_RECOVERY_PRIORITY_BASE 180
91
92 /// base backfill priority for MBackfillReserve (inactive PG)
93 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
94
95 /// base recovery priority for MRecoveryReserve (inactive PG)
96 #define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
97
98 /// max manually/automatically set recovery priority for MBackfillReserve
99 #define OSD_RECOVERY_PRIORITY_MAX 253
100
101 /// backfill priority for MBackfillReserve, when forced manually
102 #define OSD_BACKFILL_PRIORITY_FORCED 254
103
104 /// recovery priority for MRecoveryReserve, when forced manually
105 #define OSD_RECOVERY_PRIORITY_FORCED 255
106
107 /// priority for pg deletion when osd is not fullish
108 #define OSD_DELETE_PRIORITY_NORMAL 179
109
110 /// priority for pg deletion when osd is approaching full
111 #define OSD_DELETE_PRIORITY_FULLISH 219
112
113 /// priority when more full
114 #define OSD_DELETE_PRIORITY_FULL 255
115
116 static std::map<int, int> max_prio_map = {
117 {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
118 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
119 {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
120 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
121 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
122 };
123
124 typedef hobject_t collection_list_handle_t;
125
126 /// convert a single CEPH_OSD_FLAG_* to a std::string
127 const char *ceph_osd_flag_name(unsigned flag);
128 /// convert a single CEPH_OSD_OP_FLAG_* to a std::string
129 const char *ceph_osd_op_flag_name(unsigned flag);
130
131 /// convert CEPH_OSD_FLAG_* op flags to a std::string
132 std::string ceph_osd_flag_string(unsigned flags);
133 /// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
134 std::string ceph_osd_op_flag_string(unsigned flags);
135 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
136 std::string ceph_osd_alloc_hint_flag_string(unsigned flags);
137
138 typedef std::map<std::string,std::string> osd_alert_list_t;
139 /// map osd id -> alert_list_t
140 typedef std::map<int, osd_alert_list_t> osd_alerts_t;
141 void dump(ceph::Formatter* f, const osd_alerts_t& alerts);
142
143
144 typedef interval_set<
145 snapid_t,
146 mempool::osdmap::flat_map> snap_interval_set_t;
147
148
149 /**
150 * osd request identifier
151 *
152 * caller name + incarnation# + tid to unique identify this request.
153 */
154 struct osd_reqid_t {
155 entity_name_t name; // who
156 ceph_tid_t tid;
157 int32_t inc; // incarnation
158
159 osd_reqid_t()
160 : tid(0), inc(0)
161 {}
162 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
163 : name(a), tid(t), inc(i)
164 {}
165
166 DENC(osd_reqid_t, v, p) {
167 DENC_START(2, 2, p);
168 denc(v.name, p);
169 denc(v.tid, p);
170 denc(v.inc, p);
171 DENC_FINISH(p);
172 }
173 void dump(ceph::Formatter *f) const;
174 static void generate_test_instances(std::list<osd_reqid_t*>& o);
175 };
176 WRITE_CLASS_DENC(osd_reqid_t)
177
178
179
180 struct pg_shard_t {
181 static const int32_t NO_OSD = 0x7fffffff;
182 int32_t osd;
183 shard_id_t shard;
184 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
185 explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
186 pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
187 bool is_undefined() const {
188 return osd == -1;
189 }
190 std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
191 void encode(ceph::buffer::list &bl) const;
192 void decode(ceph::buffer::list::const_iterator &bl);
193 void dump(ceph::Formatter *f) const {
194 f->dump_unsigned("osd", osd);
195 if (shard != shard_id_t::NO_SHARD) {
196 f->dump_unsigned("shard", shard);
197 }
198 }
199 };
200 WRITE_CLASS_ENCODER(pg_shard_t)
201 WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
202 WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
203 std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);
204
205 using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>;
206
207 class IsPGRecoverablePredicate {
208 public:
209 /**
210 * have encodes the shards available
211 */
212 virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
213 virtual ~IsPGRecoverablePredicate() {}
214 };
215
216 class IsPGReadablePredicate {
217 public:
218 /**
219 * have encodes the shards available
220 */
221 virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
222 virtual ~IsPGReadablePredicate() {}
223 };
224
225 inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
226 return out << r.name << "." << r.inc << ":" << r.tid;
227 }
228
229 inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
230 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
231 }
232 inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
233 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
234 }
235 inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
236 return (l.name < r.name) || (l.inc < r.inc) ||
237 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
238 }
239 inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
240 return (l.name < r.name) || (l.inc < r.inc) ||
241 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
242 }
243 inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
244 inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
245
246 namespace std {
247 template<> struct hash<osd_reqid_t> {
248 size_t operator()(const osd_reqid_t &r) const {
249 static hash<uint64_t> H;
250 return H(r.name.num() ^ r.tid ^ r.inc);
251 }
252 };
253 } // namespace std
254
255
256 // -----
257
258 // a locator constrains the placement of an object. mainly, which pool
259 // does it go in.
260 struct object_locator_t {
261 // You specify either the hash or the key -- not both
262 std::int64_t pool; ///< pool id
263 std::string key; ///< key string (if non-empty)
264 std::string nspace; ///< namespace
265 std::int64_t hash; ///< hash position (if >= 0)
266
267 explicit object_locator_t()
268 : pool(-1), hash(-1) {}
269 explicit object_locator_t(int64_t po)
270 : pool(po), hash(-1) {}
271 explicit object_locator_t(int64_t po, int64_t ps)
272 : pool(po), hash(ps) {}
273 explicit object_locator_t(int64_t po, std::string_view ns)
274 : pool(po), nspace(ns), hash(-1) {}
275 explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps)
276 : pool(po), nspace(ns), hash(ps) {}
277 explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s)
278 : pool(po), key(s), nspace(ns), hash(-1) {}
279 explicit object_locator_t(const hobject_t& soid)
280 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
281
282 int64_t get_pool() const {
283 return pool;
284 }
285
286 void clear() {
287 pool = -1;
288 key = "";
289 nspace = "";
290 hash = -1;
291 }
292
293 bool empty() const {
294 return pool == -1;
295 }
296
297 void encode(ceph::buffer::list& bl) const;
298 void decode(ceph::buffer::list::const_iterator& p);
299 void dump(ceph::Formatter *f) const;
300 static void generate_test_instances(std::list<object_locator_t*>& o);
301 };
302 WRITE_CLASS_ENCODER(object_locator_t)
303
304 inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
305 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
306 }
307 inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
308 return !(l == r);
309 }
310
311 inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
312 {
313 out << "@" << loc.pool;
314 if (loc.nspace.length())
315 out << ";" << loc.nspace;
316 if (loc.key.length())
317 out << ":" << loc.key;
318 return out;
319 }
320
321 struct request_redirect_t {
322 private:
323 object_locator_t redirect_locator; ///< this is authoritative
324 std::string redirect_object; ///< If non-empty, the request goes to this object name
325
326 friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
327 public:
328
329 request_redirect_t() {}
330 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
331 redirect_locator(orig) { redirect_locator.pool = rpool; }
332 explicit request_redirect_t(const object_locator_t& rloc) :
333 redirect_locator(rloc) {}
334 explicit request_redirect_t(const object_locator_t& orig,
335 const std::string& robj) :
336 redirect_locator(orig), redirect_object(robj) {}
337
338 bool empty() const { return redirect_locator.empty() &&
339 redirect_object.empty(); }
340
341 void combine_with_locator(object_locator_t& orig, std::string& obj) const {
342 orig = redirect_locator;
343 if (!redirect_object.empty())
344 obj = redirect_object;
345 }
346
347 void encode(ceph::buffer::list& bl) const;
348 void decode(ceph::buffer::list::const_iterator& bl);
349 void dump(ceph::Formatter *f) const;
350 static void generate_test_instances(std::list<request_redirect_t*>& o);
351 };
352 WRITE_CLASS_ENCODER(request_redirect_t)
353
354 inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
355 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
356 return out;
357 }
358
// Internal OSD op flags - set by the OSD based on the op types
// (note: bit 0 is intentionally unused; values start at 1 << 1)
enum {
  CEPH_OSD_RMW_FLAG_READ = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
  CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
};
373
374
375 // pg stuff
376
377 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
378
379 // placement seed (a hash value)
380 typedef uint32_t ps_t;
381
// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;  // the legacy wire struct, encoded/decoded raw (byte-for-byte)
  void encode(ceph::buffer::list& bl) const {
    ceph::encode_raw(v, bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    ceph::decode_raw(v, bl);
  }
};
392 WRITE_CLASS_ENCODER(old_pg_t)
393
394 // placement group id
395 struct pg_t {
396 uint64_t m_pool;
397 uint32_t m_seed;
398
399 pg_t() : m_pool(0), m_seed(0) {}
400 pg_t(ps_t seed, uint64_t pool) :
401 m_pool(pool), m_seed(seed) {}
402 // cppcheck-suppress noExplicitConstructor
403 pg_t(const ceph_pg& cpg) :
404 m_pool(cpg.pool), m_seed(cpg.ps) {}
405
406 // cppcheck-suppress noExplicitConstructor
407 pg_t(const old_pg_t& opg) {
408 *this = opg.v;
409 }
410
411 old_pg_t get_old_pg() const {
412 old_pg_t o;
413 ceph_assert(m_pool < 0xffffffffull);
414 o.v.pool = m_pool;
415 o.v.ps = m_seed;
416 o.v.preferred = (__s16)-1;
417 return o;
418 }
419
420 ps_t ps() const {
421 return m_seed;
422 }
423 int64_t pool() const {
424 return m_pool;
425 }
426
427 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
428 char *calc_name(char *buf, const char *suffix_backwords) const;
429
430 void set_ps(ps_t p) {
431 m_seed = p;
432 }
433 void set_pool(uint64_t p) {
434 m_pool = p;
435 }
436
437 pg_t get_parent() const;
438 pg_t get_ancestor(unsigned old_pg_num) const;
439
440 int print(char *o, int maxlen) const;
441 bool parse(const char *s);
442
443 bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;
444
445 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
446 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
447 return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
448 }
449
450 /**
451 * Returns b such that for all object o:
452 * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
453 */
454 unsigned get_split_bits(unsigned pg_num) const;
455
456 bool contains(int bits, const ghobject_t& oid) const {
457 return
458 (int64_t)m_pool == oid.hobj.get_logical_pool() &&
459 oid.match(bits, ps());
460 }
461 bool contains(int bits, const hobject_t& oid) const {
462 return
463 (int64_t)m_pool == oid.get_logical_pool() &&
464 oid.match(bits, ps());
465 }
466
467 hobject_t get_hobj_start() const;
468 hobject_t get_hobj_end(unsigned pg_num) const;
469
470 // strong ordering is supported
471 inline int compare(const pg_t& p) const noexcept {
472 if (auto delta = pool() - p.pool(); delta != 0) {
473 return delta;
474 } else if (ps() < p.ps()) {
475 return -1;
476 } else if (ps() > p.ps()) {
477 return 1;
478 } else {
479 return 0;
480 }
481 }
482
483 void encode(ceph::buffer::list& bl) const {
484 using ceph::encode;
485 __u8 v = 1;
486 encode(v, bl);
487 encode(m_pool, bl);
488 encode(m_seed, bl);
489 encode((int32_t)-1, bl); // was preferred
490 }
491 void decode(ceph::buffer::list::const_iterator& bl) {
492 using ceph::decode;
493 __u8 v;
494 decode(v, bl);
495 decode(m_pool, bl);
496 decode(m_seed, bl);
497 bl += sizeof(int32_t); // was preferred
498 }
499 void decode_old(ceph::buffer::list::const_iterator& bl) {
500 using ceph::decode;
501 old_pg_t opg;
502 decode(opg, bl);
503 *this = opg;
504 }
505 void dump(ceph::Formatter *f) const;
506 static void generate_test_instances(std::list<pg_t*>& o);
507 };
508 WRITE_CLASS_ENCODER(pg_t)
509
510 inline bool operator<(const pg_t& l, const pg_t& r) {
511 return l.compare(r) < 0;
512 }
513 inline bool operator<=(const pg_t& l, const pg_t& r) {
514 return l.compare(r) <= 0;
515 }
516 inline bool operator==(const pg_t& l, const pg_t& r) {
517 return l.compare(r) == 0;
518 }
519 inline bool operator!=(const pg_t& l, const pg_t& r) {
520 return l.compare(r) != 0;
521 }
522 inline bool operator>(const pg_t& l, const pg_t& r) {
523 return l.compare(r) > 0;
524 }
525 inline bool operator>=(const pg_t& l, const pg_t& r) {
526 return l.compare(r) >= 0;
527 }
528
529 std::ostream& operator<<(std::ostream& out, const pg_t &pg);
530
531 namespace std {
532 template<> struct hash< pg_t >
533 {
534 size_t operator()( const pg_t& x ) const
535 {
536 static hash<uint32_t> H;
537 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
538 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
539 }
540 };
541 } // namespace std
542
543 struct spg_t {
544 pg_t pgid;
545 shard_id_t shard;
546 spg_t() : shard(shard_id_t::NO_SHARD) {}
547 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
548 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
549 unsigned get_split_bits(unsigned pg_num) const {
550 return pgid.get_split_bits(pg_num);
551 }
552 spg_t get_parent() const {
553 return spg_t(pgid.get_parent(), shard);
554 }
555 ps_t ps() const {
556 return pgid.ps();
557 }
558 uint64_t pool() const {
559 return pgid.pool();
560 }
561 void reset_shard(shard_id_t s) {
562 shard = s;
563 }
564
565 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
566 char *calc_name(char *buf, const char *suffix_backwords) const;
567
568 bool parse(const char *s);
569 bool parse(const std::string& s) {
570 return parse(s.c_str());
571 }
572
573 spg_t get_ancestor(unsigned old_pg_num) const {
574 return spg_t(pgid.get_ancestor(old_pg_num), shard);
575 }
576
577 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
578 std::set<spg_t> *pchildren) const {
579 std::set<pg_t> _children;
580 std::set<pg_t> *children = pchildren ? &_children : NULL;
581 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
582 if (pchildren && is_split) {
583 for (std::set<pg_t>::iterator i = _children.begin();
584 i != _children.end();
585 ++i) {
586 pchildren->insert(spg_t(*i, shard));
587 }
588 }
589 return is_split;
590 }
591 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
592 return pgid.is_merge_target(old_pg_num, new_pg_num);
593 }
594 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
595 spg_t *parent) const {
596 spg_t out = *this;
597 bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
598 if (r && parent) {
599 *parent = out;
600 }
601 return r;
602 }
603
604 bool is_no_shard() const {
605 return shard == shard_id_t::NO_SHARD;
606 }
607
608 ghobject_t make_pgmeta_oid() const {
609 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
610 }
611
612 void encode(ceph::buffer::list &bl) const {
613 ENCODE_START(1, 1, bl);
614 encode(pgid, bl);
615 encode(shard, bl);
616 ENCODE_FINISH(bl);
617 }
618 void decode(ceph::buffer::list::const_iterator& bl) {
619 DECODE_START(1, bl);
620 decode(pgid, bl);
621 decode(shard, bl);
622 DECODE_FINISH(bl);
623 }
624
625 ghobject_t make_temp_ghobject(const std::string& name) const {
626 return ghobject_t(
627 hobject_t(object_t(name), "", CEPH_NOSNAP,
628 pgid.ps(),
629 hobject_t::get_temp_pool(pgid.pool()),
630 ""),
631 ghobject_t::NO_GEN,
632 shard);
633 }
634
635 unsigned hash_to_shard(unsigned num_shards) const {
636 return ps() % num_shards;
637 }
638 };
639 WRITE_CLASS_ENCODER(spg_t)
640 WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
641 WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
642
643 namespace std {
644 template<> struct hash< spg_t >
645 {
646 size_t operator()( const spg_t& x ) const
647 {
648 static hash<uint32_t> H;
649 return H(hash<pg_t>()(x.pgid) ^ x.shard);
650 }
651 };
652 } // namespace std
653
654 std::ostream& operator<<(std::ostream& out, const spg_t &pg);
655
656 // ----------------------
657
658 class coll_t {
659 enum type_t : uint8_t {
660 TYPE_META = 0,
661 TYPE_LEGACY_TEMP = 1, /* no longer used */
662 TYPE_PG = 2,
663 TYPE_PG_TEMP = 3,
664 };
665 type_t type;
666 spg_t pgid;
667 uint64_t removal_seq; // note: deprecated, not encoded
668
669 char _str_buff[spg_t::calc_name_buf_size];
670 char *_str;
671
672 void calc_str();
673
674 coll_t(type_t t, spg_t p, uint64_t r)
675 : type(t), pgid(p), removal_seq(r) {
676 calc_str();
677 }
678
679 friend class denc_coll_t;
680 public:
681 coll_t() : type(TYPE_META), removal_seq(0)
682 {
683 calc_str();
684 }
685
686 coll_t(const coll_t& other)
687 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
688 calc_str();
689 }
690
691 explicit coll_t(spg_t pgid)
692 : type(TYPE_PG), pgid(pgid), removal_seq(0)
693 {
694 calc_str();
695 }
696
697 coll_t& operator=(const coll_t& rhs)
698 {
699 this->type = rhs.type;
700 this->pgid = rhs.pgid;
701 this->removal_seq = rhs.removal_seq;
702 this->calc_str();
703 return *this;
704 }
705
706 // named constructors
707 static coll_t meta() {
708 return coll_t();
709 }
710 static coll_t pg(spg_t p) {
711 return coll_t(p);
712 }
713
714 const std::string to_str() const {
715 return std::string(_str);
716 }
717 const char *c_str() const {
718 return _str;
719 }
720
721 bool parse(const std::string& s);
722
723 int operator<(const coll_t &rhs) const {
724 return type < rhs.type ||
725 (type == rhs.type && pgid < rhs.pgid);
726 }
727
728 bool is_meta() const {
729 return type == TYPE_META;
730 }
731 bool is_pg_prefix(spg_t *pgid_) const {
732 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
733 *pgid_ = pgid;
734 return true;
735 }
736 return false;
737 }
738 bool is_pg() const {
739 return type == TYPE_PG;
740 }
741 bool is_pg(spg_t *pgid_) const {
742 if (type == TYPE_PG) {
743 *pgid_ = pgid;
744 return true;
745 }
746 return false;
747 }
748 bool is_temp() const {
749 return type == TYPE_PG_TEMP;
750 }
751 bool is_temp(spg_t *pgid_) const {
752 if (type == TYPE_PG_TEMP) {
753 *pgid_ = pgid;
754 return true;
755 }
756 return false;
757 }
758 int64_t pool() const {
759 return pgid.pool();
760 }
761
762 void encode(ceph::buffer::list& bl) const;
763 void decode(ceph::buffer::list::const_iterator& bl);
764 size_t encoded_size() const;
765
766 inline bool operator==(const coll_t& rhs) const {
767 // only compare type if meta
768 if (type != rhs.type)
769 return false;
770 if (type == TYPE_META)
771 return true;
772 return type == rhs.type && pgid == rhs.pgid;
773 }
774 inline bool operator!=(const coll_t& rhs) const {
775 return !(*this == rhs);
776 }
777
778 // get a TEMP collection that corresponds to the current collection,
779 // which we presume is a pg collection.
780 coll_t get_temp() const {
781 ceph_assert(type == TYPE_PG);
782 return coll_t(TYPE_PG_TEMP, pgid, 0);
783 }
784
785 ghobject_t get_min_hobj() const {
786 ghobject_t o;
787 switch (type) {
788 case TYPE_PG:
789 o.hobj.pool = pgid.pool();
790 o.set_shard(pgid.shard);
791 break;
792 case TYPE_META:
793 o.hobj.pool = -1;
794 break;
795 default:
796 break;
797 }
798 return o;
799 }
800
801 unsigned hash_to_shard(unsigned num_shards) const {
802 if (type == TYPE_PG)
803 return pgid.hash_to_shard(num_shards);
804 return 0; // whatever.
805 }
806
807 void dump(ceph::Formatter *f) const;
808 static void generate_test_instances(std::list<coll_t*>& o);
809 };
810
811 WRITE_CLASS_ENCODER(coll_t)
812
813 inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
814 out << c.to_str();
815 return out;
816 }
817
818 namespace std {
819 template<> struct hash<coll_t> {
820 size_t operator()(const coll_t &c) const {
821 size_t h = 0;
822 std::string str(c.to_str());
823 std::string::const_iterator end(str.end());
824 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
825 h += *s;
826 h += (h << 10);
827 h ^= (h >> 6);
828 }
829 h += (h << 3);
830 h ^= (h >> 11);
831 h += (h << 15);
832 return h;
833 }
834 };
835 } // namespace std
836
837 inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
838 {
839 out << pg_t(ol.ol_pgid);
840 int su = ol.ol_stripe_unit;
841 if (su)
842 out << ".su=" << su;
843 return out;
844 }
845
// Thin wrapper that gives coll_t a DENC-style encoding; declared a friend
// of coll_t so it can reach the private type/pgid members directly.
struct denc_coll_t {
  coll_t coll;

  // accessors into coll_t's private state (rely on the friendship above)
  auto &get_type() const { return coll.type; }
  auto &get_type() { return coll.type; }
  auto &get_pgid() const { return coll.pgid; }
  auto &get_pgid() { return coll.pgid; }

  denc_coll_t() = default;
  denc_coll_t(const denc_coll_t &) = default;
  denc_coll_t(denc_coll_t &&) = default;

  denc_coll_t &operator=(const denc_coll_t &) = default;
  denc_coll_t &operator=(denc_coll_t &&) = default;

  explicit denc_coll_t(const coll_t &coll) : coll(coll) {}
  // implicit conversion back to a plain coll_t
  operator coll_t() const {
    return coll;
  }

  bool operator<(const denc_coll_t &rhs) const {
    return coll < rhs.coll;
  }

  // wire format v1: type byte, pool, seed, shard id (order matters)
  DENC(denc_coll_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.get_type(), p);
    denc(v.get_pgid().pgid.m_pool, p);
    denc(v.get_pgid().pgid.m_seed, p);
    denc(v.get_pgid().shard.id, p);
    DENC_FINISH(p);
  }
};
879 WRITE_CLASS_DENC(denc_coll_t)
880
881
// compound rados version type
/* WARNING: if you add a member to eversion_t, make sure the encode/decode
 * functions still work.  The little-endian fast path below copies raw bytes,
 * so there must be no padding between 'version' and 'epoch' on either
 * 32-bit or 64-bit machines.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;  // explicit trailing pad; never encoded (see encode/decode)
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }

  // the largest representable version (epoch and version both all-ones)
  static const eversion_t& max() {
    static const eversion_t max(-1,-1);
    return max;
  }

  // convert to the wire struct ceph_eversion
  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  std::string get_key_name() const;

  // key must point to the beginning of a block of 32 chars
  inline void get_key_name(char* key) const {
    // Below is equivalent of sprintf("%010u.%020llu");
    key[31] = 0;
    ritoa<uint64_t, 10, 20>(version, key + 31);
    key[10] = '.';
    ritoa<uint32_t, 10, 10>(epoch, key + 10);
  }

  void encode(ceph::buffer::list &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    // fast path: version and epoch sit adjacent at the start of the object,
    // so their raw little-endian bytes can be appended directly
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    using ceph::encode;
    encode(version, bl);
    encode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    // mirror of the encode fast path: copy raw bytes straight into *this
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    using ceph::decode;
    decode(version, bl);
    decode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
};
949 WRITE_CLASS_ENCODER(eversion_t)
950
951 inline bool operator==(const eversion_t& l, const eversion_t& r) {
952 return (l.epoch == r.epoch) && (l.version == r.version);
953 }
954 inline bool operator!=(const eversion_t& l, const eversion_t& r) {
955 return (l.epoch != r.epoch) || (l.version != r.version);
956 }
957 inline bool operator<(const eversion_t& l, const eversion_t& r) {
958 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
959 }
960 inline bool operator<=(const eversion_t& l, const eversion_t& r) {
961 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
962 }
963 inline bool operator>(const eversion_t& l, const eversion_t& r) {
964 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
965 }
966 inline bool operator>=(const eversion_t& l, const eversion_t& r) {
967 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
968 }
969 inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
970 return out << e.epoch << "'" << e.version;
971 }
972
973 /**
974 * objectstore_perf_stat_t
975 *
976 * current perf information about the osd
977 */
978 struct objectstore_perf_stat_t {
979 // cur_op_latency is in ns since double add/sub are not associative
980 uint64_t os_commit_latency_ns;
981 uint64_t os_apply_latency_ns;
982
983 objectstore_perf_stat_t() :
984 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
985
986 bool operator==(const objectstore_perf_stat_t &r) const {
987 return os_commit_latency_ns == r.os_commit_latency_ns &&
988 os_apply_latency_ns == r.os_apply_latency_ns;
989 }
990
991 void add(const objectstore_perf_stat_t &o) {
992 os_commit_latency_ns += o.os_commit_latency_ns;
993 os_apply_latency_ns += o.os_apply_latency_ns;
994 }
995 void sub(const objectstore_perf_stat_t &o) {
996 os_commit_latency_ns -= o.os_commit_latency_ns;
997 os_apply_latency_ns -= o.os_apply_latency_ns;
998 }
999 void dump(ceph::Formatter *f) const;
1000 void encode(ceph::buffer::list &bl, uint64_t features) const;
1001 void decode(ceph::buffer::list::const_iterator &bl);
1002 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
1003 };
1004 WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
1005
1006 /*
1007 * pg states
1008 */
1009 #define PG_STATE_CREATING (1ULL << 0) // creating
1010 #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
1011 #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
1012 #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
1013 #define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
1014 #define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
1015 #define PG_STATE_PREMERGE (1ULL << 7)  // i am preparing to merge
1016 #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
1017 //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
1018 #define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
1019 #define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
1020 #define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
1021 #define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
1022 #define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
1023 #define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
1024 #define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
1025 #define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
1026 #define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
1027 #define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
1028 #define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
1029 #define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
1030 #define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
1031 #define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
1032 #define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
1033 #define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
1034 #define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
1035 #define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
1036 #define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
1037 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
1038 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
1039 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
1040 #define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
1042 #define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
1043
1044 std::string pg_state_string(uint64_t state);
1045 std::string pg_vector_string(const std::vector<int32_t> &a);
1046 std::optional<uint64_t> pg_string_state(const std::string& state);
1047
1048
1049 /*
1050 * pool_snap_info_t
1051 *
1052 * attributes for a single pool snapshot.
1053 */
struct pool_snap_info_t {
  snapid_t snapid;   ///< id of this pool snapshot
  utime_t stamp;     ///< time the snapshot was taken
  std::string name;  ///< user-visible snapshot name

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<pool_snap_info_t*>& o);
};
1064 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1065
1066 inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
1067 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1068 }
1069
1070
1071 /*
1072 * pool_opts_t
1073 *
1074 * pool options.
1075 */
1076
1077 // The order of items in the list is important, therefore,
1078 // you should always add to the end of the list when adding new options.
1079
class pool_opts_t {
public:
  /// option identifiers; enum values are persisted, so only append new
  /// entries at the end (see the ordering note above this class)
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN, // min pg_num
    TARGET_SIZE_BYTES, // total bytes in pool
    TARGET_SIZE_RATIO, // fraction of total cluster
    PG_AUTOSCALE_BIAS,
    READ_LEASE_INTERVAL,
    DEDUP_TIER,
    DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE,
    PG_NUM_MAX, // max pg_num
  };

  /// declared value type for an option
  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  /// (key, type) descriptor associated with a named option
  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  /// union of all supported option value types
  typedef boost::variant<std::string,int64_t,double> value_t;

  /// true if 'name' is a recognized option name
  static bool is_opt_name(const std::string& name);
  /// look up the descriptor for an option name
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  /// true if a value has been set for 'key'
  bool is_set(key_t key) const;

  /// set 'key' to 'val'; T must be one of the value_t alternatives
  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  /// copy the value for 'key' into *val; returns false if unset.
  /// NOTE: boost::get throws if T does not match the stored alternative.
  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  /// return the value for 'key', or 'default_value' if it is unset
  template<typename T>
  T value_or(key_t key, T&& default_value) const {
    auto i = opts.find(key);
    if (i == opts.end()) {
      return std::forward<T>(default_value);
    }
    return boost::get<T>(i->second);
  }

  /// reference to the stored value; 'key' must be set (see is_set())
  const value_t& get(key_t key) const;

  /// clear any value stored for 'key'
  bool unset(key_t key);

  /// dump a single named option
  void dump(const std::string& name, ceph::Formatter *f) const;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;  ///< explicitly-set options only

  friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
};
1176 WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
1177
/**
 * pg_merge_meta_t - bookkeeping for the most recent PG merge in a pool.
 *
 * Filled in by pg_pool_t::dec_pg_num() when pg_num is decreased, so the
 * merge target can reconcile its history with the merge source.
 */
struct pg_merge_meta_t {
  pg_t source_pgid;               ///< PG merged into the target
  epoch_t ready_epoch = 0;        ///< epoch the merge became ready
  epoch_t last_epoch_started = 0; ///< last_epoch_started recorded for the merge
  epoch_t last_epoch_clean = 0;   ///< last_epoch_clean recorded for the merge
  eversion_t source_version;      ///< source PG version at merge time
  eversion_t target_version;      ///< target PG version at merge time

  // NOTE: encode order below is the wire format; do not reorder.
  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
  }
};
1216
1217 class OSDMap;
1218
1219 /*
1220 * pg_pool
1221 */
/**
 * pg_pool_t - per-pool metadata carried in the OSDMap.
 *
 * Describes a RADOS pool: replication/EC type, size, PG counts, flags,
 * snapshots, cache-tier configuration, quotas, dedup settings, and the
 * typed option set (pool_opts_t).  This structure is encoded into the
 * OSDMap, so member meanings and the encode layout must remain stable.
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  /// pool data-protection type; values are persisted
  enum {
    TYPE_REPLICATED = 1, // replication
    //TYPE_RAID4 = 2, // raid4 (never implemented)
    TYPE_ERASURE = 3, // erasure-coded
  };
  static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */
  /// human-readable name for a TYPE_* value
  static std::string_view get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
    //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }

  /// pool flag bits; values are persisted
  enum {
    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
    FLAG_FULL = 1<<1, // pool is full
    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
    FLAG_NODELETE = 1<<4, // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8, // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
    FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11, // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
    FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
    FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
    FLAG_CREATING = 1<<15, // initial pool PGs are being created
    FLAG_EIO = 1<<16, // return EIO for all client ops
    FLAG_BULK = 1<<17, //pool is large
  };

  /// name for a single FLAG_* bit
  static const char *get_flag_name(uint64_t f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_QUOTA: return "full_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
    case FLAG_POOL_SNAPS: return "pool_snaps";
    case FLAG_CREATING: return "creating";
    case FLAG_EIO: return "eio";
    case FLAG_BULK: return "bulk";
    default: return "???";
    }
  }
  /// comma-separated names for every flag bit set in f
  static std::string get_flags_string(uint64_t f) {
    std::string s;
    for (unsigned n=0; f && n<64; ++n) {
      if (f & (1ull << n)) {
	if (s.length())
	  s += ",";
	s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }
  /// flag bit for a name, or 0 if the name is unknown
  static uint64_t get_flag_by_name(const std::string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_quota")
      return FLAG_FULL_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    if (name == "selfmanaged_snaps")
      return FLAG_SELFMANAGED_SNAPS;
    if (name == "pool_snaps")
      return FLAG_POOL_SNAPS;
    if (name == "creating")
      return FLAG_CREATING;
    if (name == "eio")
      return FLAG_EIO;
    if (name == "bulk")
      return FLAG_BULK;
    return 0;
  }

  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;

  /// cache-tier behavior; values are persisted
  typedef enum {
    CACHEMODE_NONE = 0, ///< no caching
    CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
    CACHEMODE_FORWARD = 2, ///< forward if not in cache
    CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
    CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
    CACHEMODE_PROXY = 6, ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  /// parse a cache mode name; (cache_mode_t)-1 on unknown input
  static cache_mode_t get_cache_mode_from_str(const std::string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  /// true if the current cache mode needs HitSet tracking to operate
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      ceph_abort_msg("implement me");
    }
  }

  /// pg autoscaler mode for this pool
  enum class pg_autoscale_mode_t : uint8_t {
    OFF = 0,
    WARN = 1,
    ON = 2,
    UNKNOWN = UINT8_MAX,
  };
  static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
    switch (m) {
    case pg_autoscale_mode_t::OFF: return "off";
    case pg_autoscale_mode_t::ON: return "on";
    case pg_autoscale_mode_t::WARN: return "warn";
    default: return "???";
    }
  }
  /// parse an autoscale mode name; UNKNOWN on unrecognized input
  static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
    if (m == "off") {
      return pg_autoscale_mode_t::OFF;
    }
    if (m == "warn") {
      return pg_autoscale_mode_t::WARN;
    }
    if (m == "on") {
      return pg_autoscale_mode_t::ON;
    }
    return pg_autoscale_mode_t::UNKNOWN;
  }

  utime_t create_time;
  uint64_t flags = 0; ///< FLAG_*
  __u8 type = 0; ///< TYPE_*
  __u8 size = 0, min_size = 0; ///< number of osds in each pg
  __u8 crush_rule = 0; ///< crush placement rule
  __u8 object_hash = 0; ///< hash mapping object name to ps
  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;

private:
  __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
  __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
  __u32 pg_num_target = 0; ///< pg_num we should converge toward
  __u32 pgp_num_target = 0; ///< pgp_num we should converge toward

public:
  std::map<std::string, std::string> properties; ///< OBSOLETE
  std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
  epoch_t last_change = 0; ///< most recent epoch changed, excluding snapshot changes
  // If non-zero, require OSDs in at least this many different instances...
  uint32_t peering_crush_bucket_count = 0;
  // of this bucket type...
  uint32_t peering_crush_bucket_barrier = 0;
  // including this one
  int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE;
  // The per-bucket replica count is calculated with this "target"
  // instead of the above crush_bucket_count. This means we can maintain a
  // target size of 4 without attempting to place them all in 1 DC
  uint32_t peering_crush_bucket_target = 0;
  /// last epoch that forced clients to resend
  epoch_t last_force_op_resend = 0;
  /// last epoch that forced clients to resend (pre-nautilus clients only)
  epoch_t last_force_op_resend_prenautilus = 0;
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous = 0;

  /// metadata for the most recent PG merge
  pg_merge_meta_t last_pg_merge_meta;

  snapid_t snap_seq = 0; ///< seq for per-pool snapshot
  epoch_t snap_epoch = 0; ///< osdmap epoch of last snap
  uint64_t auid = 0; ///< who owns the pg

  uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
  uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool). These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  std::map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used). Snaps and removed_snaps are to be used exclusive of each
   * other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask = 0, pgp_num_mask = 0; ///< see calc_pg_masks()

  std::set<uint64_t> tiers; ///< pools that are tiers of us
  int64_t tier_of = -1; ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to
  int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  /// detach from any tiering relationship and reset all tier tunables
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  /// reset cache mode and all cache/HitSet tunables to defaults; a pool
  /// that had caching enabled is marked FLAG_INCOMPLETE_CLONES
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  /// a stretch pool requires OSDs from a minimum number of CRUSH buckets
  bool is_stretch_pool() const {
    return peering_crush_bucket_count != 0;
  }

  bool stretch_set_can_peer(const std::set<int>& want, const OSDMap& osdmap,
			    std::ostream *out) const;
  /// vector overload; trivially true for non-stretch pools
  bool stretch_set_can_peer(const std::vector<int>& want, const OSDMap& osdmap,
			    std::ostream *out) const {
    if (!is_stretch_pool()) return true;
    std::set<int> swant;
    for (auto i : want) swant.insert(i);
    return stretch_set_can_peer(swant, osdmap, out);
  }

  uint64_t target_max_bytes = 0; ///< tiering: target max pool size
  uint64_t target_max_objects = 0; ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
  uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
  uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count = 0; ///< number of periods to retain
  bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read
  uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write
  uint32_t hit_set_grade_decay_rate = 0; ///< percentage by which each older HitSet's
                                         ///< priority (temperature weight) decays relative
                                         ///< to the next newer one; the current HitSet has
                                         ///< the highest priority (see calc_grade_table())
  uint32_t hit_set_search_last_n = 0; ///< accumulate at most N hit_sets for temperature

  uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates
                                     ///< user does not specify any expected value
  bool fast_read = false; ///< whether turn on fast read on the pool or not

  pool_opts_t opts; ///< options

  /// fingerprint algorithm for dedup; values are persisted
  typedef enum {
    TYPE_FINGERPRINT_NONE = 0,
    TYPE_FINGERPRINT_SHA1 = 1,
    TYPE_FINGERPRINT_SHA256 = 2,
    TYPE_FINGERPRINT_SHA512 = 3,
  } fingerprint_t;
  /// parse a fingerprint name; (fingerprint_t)-1 on unknown input
  static fingerprint_t get_fingerprint_from_str(const std::string& s) {
    if (s == "none")
      return TYPE_FINGERPRINT_NONE;
    if (s == "sha1")
      return TYPE_FINGERPRINT_SHA1;
    if (s == "sha256")
      return TYPE_FINGERPRINT_SHA256;
    if (s == "sha512")
      return TYPE_FINGERPRINT_SHA512;
    return (fingerprint_t)-1;
  }
  /// fingerprint type from the FINGERPRINT_ALGORITHM pool option
  const fingerprint_t get_fingerprint_type() const {
    std::string fp_str;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    return get_fingerprint_from_str(fp_str);
  }
  const char *get_fingerprint_name() const {
    std::string fp_str;
    fingerprint_t fp_t;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    fp_t = get_fingerprint_from_str(fp_str);
    return get_fingerprint_name(fp_t);
  }
  static const char *get_fingerprint_name(fingerprint_t m) {
    switch (m) {
    case TYPE_FINGERPRINT_NONE: return "none";
    case TYPE_FINGERPRINT_SHA1: return "sha1";
    case TYPE_FINGERPRINT_SHA256: return "sha256";
    case TYPE_FINGERPRINT_SHA512: return "sha512";
    default: return "unknown";
    }
  }

  /// dedup chunking algorithm; values are persisted
  typedef enum {
    TYPE_DEDUP_CHUNK_NONE = 0,
    TYPE_DEDUP_CHUNK_FASTCDC = 1,
    TYPE_DEDUP_CHUNK_FIXEDCDC = 2,
  } dedup_chunk_algo_t;
  /// parse a chunk-algorithm name; (dedup_chunk_algo_t)-1 on unknown input
  static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) {
    if (s == "none")
      return TYPE_DEDUP_CHUNK_NONE;
    if (s == "fastcdc")
      return TYPE_DEDUP_CHUNK_FASTCDC;
    if (s == "fixed")
      return TYPE_DEDUP_CHUNK_FIXEDCDC;
    return (dedup_chunk_algo_t)-1;
  }
  /// chunk algorithm from the DEDUP_CHUNK_ALGORITHM pool option
  const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const {
    std::string algo_str;
    opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str);
    return get_dedup_chunk_algorithm_from_str(algo_str);
  }
  const char *get_dedup_chunk_algorithm_name() const {
    std::string dedup_chunk_algo_str;
    // NOTE(review): this local variable shadows the dedup_chunk_algo_t type name
    dedup_chunk_algo_t dedup_chunk_algo_t;
    opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str);
    dedup_chunk_algo_t = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str);
    return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t);
  }
  static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) {
    switch (m) {
    case TYPE_DEDUP_CHUNK_NONE: return "none";
    case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc";
    case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed";
    default: return "unknown";
    }
  }

  /// DEDUP_TIER pool option, or 0 if unset
  int64_t get_dedup_tier() const {
    int64_t tier_id = 0;
    opts.get(pool_opts_t::DEDUP_TIER, &tier_id);
    return tier_id;
  }
  /// DEDUP_CDC_CHUNK_SIZE pool option, or 0 if unset
  int64_t get_dedup_cdc_chunk_size() const {
    int64_t chunk_size = 0;
    opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size);
    return chunk_size;
  }

  /// application -> key/value metadata
  std::map<std::string, std::map<std::string, std::string>> application_metadata;

private:
  std::vector<uint32_t> grade_table; ///< per-HitSet decayed priority, see calc_grade_table()

public:
  /// grade (priority weight) of the i-th HitSet; 0 if out of range
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  /// rebuild grade_table from hit_set_count and hit_set_grade_decay_rate;
  /// each entry decays by hit_set_grade_decay_rate percent (truncated to
  /// an integer) from the previous one, starting at 1000000
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }

  pg_pool_t() = default;

  void dump(ceph::Formatter *f) const;

  const utime_t &get_create_time() const { return create_time; }
  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool require_rollback() const {
    return is_erasure();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_prenautilus() const {
    return last_force_op_resend_prenautilus;
  }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated()   const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return !(get_type() == TYPE_ERASURE);
  }

  /// EC pools without the overwrites flag only allow aligned appends
  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  /// replicated pools can reorder OSDs in the acting set; EC pools cannot
  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      ceph_abort_msg("unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }
  unsigned get_pg_num_target() const { return pg_num_target; }
  unsigned get_pgp_num_target() const { return pgp_num_target; }
  unsigned get_pg_num_pending() const { return pg_num_pending; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

  // if pg_num is not a multiple of two, pgs are not equally sized.
  // return, for a given pg, the fraction (denominator) of the total
  // pool size that it represents.
  unsigned get_pg_num_divisor(pg_t pgid) const;

  bool is_pending_merge(pg_t pgid, bool *target) const;

  void set_pg_num(int p) {
    pg_num = p;
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pgp_num(int p) {
    pgp_num = p;
    calc_pg_masks();
  }
  void set_pg_num_pending(int p) {
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pg_num_target(int p) {
    pg_num_target = p;
  }
  void set_pgp_num_target(int p) {
    pgp_num_target = p;
  }
  /// decrement pg_num by one (a PG merge), recording the merge metadata
  void dec_pg_num(pg_t source_pgid,
                  epoch_t ready_epoch,
                  eversion_t source_version,
                  eversion_t target_version,
                  epoch_t last_epoch_started,
                  epoch_t last_epoch_clean) {
    --pg_num;
    last_pg_merge_meta.source_pgid = source_pgid;
    last_pg_merge_meta.ready_epoch = ready_epoch;
    last_pg_merge_meta.source_version = source_version;
    last_pg_merge_meta.target_version = target_version;
    last_pg_merge_meta.last_epoch_started = last_epoch_started;
    last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
    calc_pg_masks();
  }

  void set_quota_max_bytes(uint64_t m) {
    quota_max_bytes = m;
  }
  uint64_t get_quota_max_bytes() {
    return quota_max_bytes;
  }

  void set_quota_max_objects(uint64_t m) {
    quota_max_objects = m;
  }
  uint64_t get_quota_max_objects() {
    return quota_max_objects;
  }

  /// updates all three resend epochs (current, pre-nautilus, pre-luminous)
  void set_last_force_op_resend(uint64_t t) {
    last_force_op_resend = t;
    last_force_op_resend_prenautilus = t;
    last_force_op_resend_preluminous = t;
  }

  void calc_pg_masks();

  /*
   * we have two snap modes:
   *  - pool global snaps
   *    - snap existence/non-existence defined by snaps[] and snap_seq
   *  - user managed snaps
   *    - removal governed by removed_snaps
   *
   * we know which mode we're using based on whether removed_snaps is empty.
   * If nothing has been created, both functions report false.
   */
  bool is_pool_snaps_mode() const;
  bool is_unmanaged_snaps_mode() const;
  bool is_removed_snap(snapid_t s) const;

  snapid_t snap_exists(std::string_view s) const;
  void add_snap(const char *n, utime_t stamp);
  uint64_t add_unmanaged_snap(bool preoctopus_compat);
  void remove_snap(snapid_t s);
  void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);

  SnapContext get_snap_context() const;

  /// hash a object name+namespace key to a hash position
  uint32_t hash_key(const std::string& key, const std::string& ns) const;

  /// round a hash position down to a pg num
  uint32_t raw_hash_to_pg(uint32_t v) const;

  /*
   * map a raw pg (with full precision ps) into an actual pg, for storage
   */
  pg_t raw_pg_to_pg(pg_t pg) const;

  /*
   * map raw pg (full precision ps) into a placement seed. include
   * pool id in that value so that different pools don't use the same
   * seeds.
   */
  ps_t raw_pg_to_pps(pg_t pg) const;

  /// choose a random hash position within a pg
  uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);

  static void generate_test_instances(std::list<pg_pool_t*>& o);
};
1870 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1871
1872 std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
1873
1874
1875 /**
1876 * a summation of object stats
1877 *
1878 * This is just a container for object stats; we don't know what for.
1879 *
1880 * If you add members in object_stat_sum_t, you should make sure there are
1881 * not padding among these members.
1882 * You should also modify the padding_check function.
1883
1884 */
struct object_stat_sum_t {
  /**************************************************************************
   * WARNING: be sure to update operator==, floor, and split when
   * adding/removing fields!
   **************************************************************************/
  int64_t num_bytes;    // in bytes
  int64_t num_objects;
  int64_t num_object_clones;
  int64_t num_object_copies;  // num_objects * num_replicas
  int64_t num_objects_missing_on_primary;
  int64_t num_objects_degraded;
  int64_t num_objects_unfound;
  int64_t num_rd;
  int64_t num_rd_kb;
  int64_t num_wr;
  int64_t num_wr_kb;
  int64_t num_scrub_errors;	// total deep and shallow scrub errors
  int64_t num_objects_recovered;
  int64_t num_bytes_recovered;
  int64_t num_keys_recovered;
  int64_t num_shallow_scrub_errors;
  int64_t num_deep_scrub_errors;
  int64_t num_objects_dirty;
  int64_t num_whiteouts;
  int64_t num_objects_omap;
  int64_t num_objects_hit_set_archive;
  int64_t num_objects_misplaced;
  int64_t num_bytes_hit_set_archive;
  int64_t num_flush;
  int64_t num_flush_kb;
  int64_t num_evict;
  int64_t num_evict_kb;
  int64_t num_promote;
  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
  int64_t num_objects_pinned;
  int64_t num_objects_missing;
  int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
  // the fields below carry in-class `= 0` initializers; the older fields
  // above are zeroed by the constructor instead
  int64_t num_large_omap_objects = 0;
  int64_t num_objects_manifest = 0;
  int64_t num_omap_bytes = 0;
  int64_t num_omap_keys = 0;
  int64_t num_objects_repaired = 0;

  /// zero-initialize every counter that lacks an in-class initializer
  object_stat_sum_t()
    : num_bytes(0),
      num_objects(0), num_object_clones(0), num_object_copies(0),
      num_objects_missing_on_primary(0), num_objects_degraded(0),
      num_objects_unfound(0),
      num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
      num_scrub_errors(0),
      num_objects_recovered(0),
      num_bytes_recovered(0),
      num_keys_recovered(0),
      num_shallow_scrub_errors(0),
      num_deep_scrub_errors(0),
      num_objects_dirty(0),
      num_whiteouts(0),
      num_objects_omap(0),
      num_objects_hit_set_archive(0),
      num_objects_misplaced(0),
      num_bytes_hit_set_archive(0),
      num_flush(0),
      num_flush_kb(0),
      num_evict(0),
      num_evict_kb(0),
      num_promote(0),
      num_flush_mode_high(0), num_flush_mode_low(0),
      num_evict_mode_some(0), num_evict_mode_full(0),
      num_objects_pinned(0),
      num_objects_missing(0),
      num_legacy_snapsets(0)
  {}

  /// clamp every counter to be at least f (typically f == 0, to strip
  /// negative values).  num_scrub_errors is not floored directly: it is
  /// rederived from the floored shallow + deep counters so the aggregate
  /// stays consistent with its parts.
  void floor(int64_t f) {
#define FLOOR(x) if (x < f) x = f
    FLOOR(num_bytes);
    FLOOR(num_objects);
    FLOOR(num_object_clones);
    FLOOR(num_object_copies);
    FLOOR(num_objects_missing_on_primary);
    FLOOR(num_objects_missing);
    FLOOR(num_objects_degraded);
    FLOOR(num_objects_misplaced);
    FLOOR(num_objects_unfound);
    FLOOR(num_rd);
    FLOOR(num_rd_kb);
    FLOOR(num_wr);
    FLOOR(num_wr_kb);
    FLOOR(num_large_omap_objects);
    FLOOR(num_objects_manifest);
    FLOOR(num_omap_bytes);
    FLOOR(num_omap_keys);
    FLOOR(num_shallow_scrub_errors);
    FLOOR(num_deep_scrub_errors);
    num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
    FLOOR(num_objects_recovered);
    FLOOR(num_bytes_recovered);
    FLOOR(num_keys_recovered);
    FLOOR(num_objects_dirty);
    FLOOR(num_whiteouts);
    FLOOR(num_objects_omap);
    FLOOR(num_objects_hit_set_archive);
    FLOOR(num_bytes_hit_set_archive);
    FLOOR(num_flush);
    FLOOR(num_flush_kb);
    FLOOR(num_evict);
    FLOOR(num_evict_kb);
    FLOOR(num_promote);
    FLOOR(num_flush_mode_high);
    FLOOR(num_flush_mode_low);
    FLOOR(num_evict_mode_some);
    FLOOR(num_evict_mode_full);
    FLOOR(num_objects_pinned);
    FLOOR(num_legacy_snapsets);
    FLOOR(num_objects_repaired);
#undef FLOOR
  }

  /// distribute this sum across out.size() buckets (e.g. the children of
  /// a splitting PG): each counter is divided evenly, with the first
  /// (value % out.size()) buckets absorbing the remainder.  Counters that
  /// are upper bounds rather than exact values use SPLIT_PRESERVE_NONZERO,
  /// which keeps a nonzero value nonzero in every bucket.
  void split(std::vector<object_stat_sum_t> &out) const {
#define SPLIT(PARAM)                            \
    for (unsigned i = 0; i < out.size(); ++i) { \
      out[i].PARAM = PARAM / out.size();        \
      if (i < (PARAM % out.size())) {           \
        out[i].PARAM++;                         \
      }                                         \
    }
#define SPLIT_PRESERVE_NONZERO(PARAM)           \
    for (unsigned i = 0; i < out.size(); ++i) { \
      if (PARAM)                                \
        out[i].PARAM = 1 + PARAM / out.size();  \
      else                                      \
        out[i].PARAM = 0;                       \
    }

    SPLIT(num_bytes);
    SPLIT(num_objects);
    SPLIT(num_object_clones);
    SPLIT(num_object_copies);
    SPLIT(num_objects_missing_on_primary);
    SPLIT(num_objects_missing);
    SPLIT(num_objects_degraded);
    SPLIT(num_objects_misplaced);
    SPLIT(num_objects_unfound);
    SPLIT(num_rd);
    SPLIT(num_rd_kb);
    SPLIT(num_wr);
    SPLIT(num_wr_kb);
    SPLIT(num_large_omap_objects);
    SPLIT(num_objects_manifest);
    SPLIT(num_omap_bytes);
    SPLIT(num_omap_keys);
    SPLIT(num_objects_repaired);
    SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
    SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
    // keep each bucket's aggregate consistent with its shallow/deep parts
    for (unsigned i = 0; i < out.size(); ++i) {
      out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
				out[i].num_deep_scrub_errors;
    }
    SPLIT(num_objects_recovered);
    SPLIT(num_bytes_recovered);
    SPLIT(num_keys_recovered);
    SPLIT(num_objects_dirty);
    SPLIT(num_whiteouts);
    SPLIT(num_objects_omap);
    SPLIT(num_objects_hit_set_archive);
    SPLIT(num_bytes_hit_set_archive);
    SPLIT(num_flush);
    SPLIT(num_flush_kb);
    SPLIT(num_evict);
    SPLIT(num_evict_kb);
    SPLIT(num_promote);
    SPLIT(num_flush_mode_high);
    SPLIT(num_flush_mode_low);
    SPLIT(num_evict_mode_some);
    SPLIT(num_evict_mode_full);
    SPLIT(num_objects_pinned);
    SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
#undef SPLIT
#undef SPLIT_PRESERVE_NONZERO
  }

  /// zero all counters; the raw memset is safe because padding_check()
  /// guarantees the struct has no padding and every member is integral
  void clear() {
    // FIPS zeroization audit 20191117: this memset is not security related.
    memset(this, 0, sizeof(*this));
  }

  /// recompute num_object_copies for a replica/shard count of nrep
  void calc_copies(int nrep) {
    num_object_copies = nrep * num_objects;
  }

  /// true iff every byte of the struct is zero (again relies on the
  /// padding-free layout; see padding_check)
  bool is_zero() const {
    return mem_is_zero((char*)this, sizeof(*this));
  }

  // element-wise accumulate / subtract; defined out of line
  void add(const object_stat_sum_t& o);
  void sub(const object_stat_sum_t& o);

  void dump(ceph::Formatter *f) const;
  /// compile-time guarantee that the members tile the struct exactly (no
  /// padding), which clear()/is_zero() depend on.  The sizeof terms are
  /// listed in historical rather than declaration order; only the total
  /// matters.
  void padding_check() {
    static_assert(
      sizeof(object_stat_sum_t) ==
        sizeof(num_bytes) +
        sizeof(num_objects) +
        sizeof(num_object_clones) +
        sizeof(num_object_copies) +
        sizeof(num_objects_missing_on_primary) +
        sizeof(num_objects_degraded) +
        sizeof(num_objects_unfound) +
        sizeof(num_rd) +
        sizeof(num_rd_kb) +
        sizeof(num_wr) +
        sizeof(num_wr_kb) +
        sizeof(num_scrub_errors) +
        sizeof(num_large_omap_objects) +
        sizeof(num_objects_manifest) +
        sizeof(num_omap_bytes) +
        sizeof(num_omap_keys) +
        sizeof(num_objects_repaired) +
        sizeof(num_objects_recovered) +
        sizeof(num_bytes_recovered) +
        sizeof(num_keys_recovered) +
        sizeof(num_shallow_scrub_errors) +
        sizeof(num_deep_scrub_errors) +
        sizeof(num_objects_dirty) +
        sizeof(num_whiteouts) +
        sizeof(num_objects_omap) +
        sizeof(num_objects_hit_set_archive) +
        sizeof(num_objects_misplaced) +
        sizeof(num_bytes_hit_set_archive) +
        sizeof(num_flush) +
        sizeof(num_flush_kb) +
        sizeof(num_evict) +
        sizeof(num_evict_kb) +
        sizeof(num_promote) +
        sizeof(num_flush_mode_high) +
        sizeof(num_flush_mode_low) +
        sizeof(num_evict_mode_some) +
        sizeof(num_evict_mode_full) +
        sizeof(num_objects_pinned) +
        sizeof(num_objects_missing) +
        sizeof(num_legacy_snapsets)
      ,
      "object_stat_sum_t have padding");
  }
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<object_stat_sum_t*>& o);
};
WRITE_CLASS_ENCODER(object_stat_sum_t)
2137
2138 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
2139
2140 /**
2141 * a collection of object stat sums
2142 *
2143 * This is a collection of stat sums over different categories.
2144 */
2145 struct object_stat_collection_t {
2146 /**************************************************************************
2147 * WARNING: be sure to update the operator== when adding/removing fields! *
2148 **************************************************************************/
2149 object_stat_sum_t sum;
2150
2151 void calc_copies(int nrep) {
2152 sum.calc_copies(nrep);
2153 }
2154
2155 void dump(ceph::Formatter *f) const;
2156 void encode(ceph::buffer::list& bl) const;
2157 void decode(ceph::buffer::list::const_iterator& bl);
2158 static void generate_test_instances(std::list<object_stat_collection_t*>& o);
2159
2160 bool is_zero() const {
2161 return sum.is_zero();
2162 }
2163
2164 void clear() {
2165 sum.clear();
2166 }
2167
2168 void floor(int64_t f) {
2169 sum.floor(f);
2170 }
2171
2172 void add(const object_stat_sum_t& o) {
2173 sum.add(o);
2174 }
2175
2176 void add(const object_stat_collection_t& o) {
2177 sum.add(o.sum);
2178 }
2179 void sub(const object_stat_collection_t& o) {
2180 sum.sub(o.sum);
2181 }
2182 };
2183 WRITE_CLASS_ENCODER(object_stat_collection_t)
2184
2185 inline bool operator==(const object_stat_collection_t& l,
2186 const object_stat_collection_t& r) {
2187 return l.sum == r.sum;
2188 }
2189
/// scrub depth selector: shallow vs deep scrub
enum class scrub_level_t : bool { shallow = false, deep = true };
/// whether the scrub should also repair the errors it finds
enum class scrub_type_t : bool { not_repair = false, do_repair = true };

/// is there a scrub in our future?
enum class pg_scrub_sched_status_t : uint16_t {
  unknown,		///< status not reported yet
  not_queued,		///< not in the OSD's scrub queue. Probably not active.
  active,		///< scrubbing
  scheduled,		///< scheduled for a scrub at an already determined time
  queued		///< queued to be scrubbed
};
2201
/// a snapshot of a PG's scrub scheduling/progress state (published as part
/// of pg_stat_t::scrub_sched_status)
struct pg_scrubbing_status_t {
  utime_t m_scheduled_at{};        ///< when the scrub is/was scheduled for
  int32_t m_duration_seconds{0};   // relevant when scrubbing
  pg_scrub_sched_status_t m_sched_status{pg_scrub_sched_status_t::unknown};
  bool m_is_active{false};         ///< a scrub is currently in progress
  scrub_level_t m_is_deep{scrub_level_t::shallow};  ///< shallow or deep scrub
  bool m_is_periodic{true};  // presumably periodic (timed) vs operator-requested -- verify against callers
};
2210
2211 bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r);
2212
/** pg_stat
 * aggregate stats for a single PG.
 */
struct pg_stat_t {
  /**************************************************************************
   * WARNING: be sure to update the operator== when adding/removing fields! *
   **************************************************************************/
  eversion_t version;
  version_t reported_seq;  // sequence number
  epoch_t reported_epoch;  // epoch of this report
  uint64_t state;
  utime_t last_fresh;   // last reported
  utime_t last_change;  // new state != previous state
  utime_t last_active;  // state & PG_STATE_ACTIVE
  utime_t last_peered;  // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
  utime_t last_clean;   // state & PG_STATE_CLEAN
  utime_t last_unstale; // (state & PG_STATE_STALE) == 0
  utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
  utime_t last_fullsized;  // (state & PG_STATE_UNDERSIZED) == 0

  eversion_t log_start;         // (log_start,version]
  eversion_t ondisk_log_start;  // there may be more on disk

  epoch_t created;
  epoch_t last_epoch_clean;
  pg_t parent;
  __u32 parent_split_bits;

  eversion_t last_scrub;
  eversion_t last_deep_scrub;
  utime_t last_scrub_stamp;
  utime_t last_deep_scrub_stamp;
  utime_t last_clean_scrub_stamp;
  int32_t last_scrub_duration{0};

  object_stat_collection_t stats;

  int64_t log_size;
  int64_t ondisk_log_size;    // >= active_log_size
  int64_t objects_scrubbed;
  double scrub_duration;

  std::vector<int32_t> up, acting;
  std::vector<pg_shard_t> avail_no_missing;
  std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
  epoch_t mapping_epoch;

  std::vector<int32_t> blocked_by;  ///< osds on which the pg is blocked

  interval_set<snapid_t> purged_snaps;  ///< recently removed snaps that we've purged

  utime_t last_became_active;
  utime_t last_became_peered;

  /// up, acting primaries
  int32_t up_primary;
  int32_t acting_primary;

  // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
  // absurd already, so cap it at 2^31 (see add()) and save 4 bytes at the
  // same time
  uint32_t snaptrimq_len;
  int64_t objects_trimmed;
  double snaptrim_duration;

  pg_scrubbing_status_t scrub_sched_status;

  // the *_invalid bitfields mark stats whose values cannot be trusted
  bool stats_invalid:1;
  /// true if num_objects_dirty is not accurate (because it was not
  /// maintained starting from pool creation)
  bool dirty_stats_invalid:1;
  bool omap_stats_invalid:1;
  bool hitset_stats_invalid:1;
  bool hitset_bytes_stats_invalid:1;
  bool pin_stats_invalid:1;
  bool manifest_stats_invalid:1;

  /// default-construct with all counters zeroed and primaries unset (-1)
  pg_stat_t()
    : reported_seq(0),
      reported_epoch(0),
      state(0),
      created(0), last_epoch_clean(0),
      parent_split_bits(0),
      log_size(0), ondisk_log_size(0),
      objects_scrubbed(0),
      scrub_duration(0),
      mapping_epoch(0),
      up_primary(-1),
      acting_primary(-1),
      snaptrimq_len(0),
      objects_trimmed(0),
      snaptrim_duration(0.0),
      stats_invalid(false),
      dirty_stats_invalid(false),
      omap_stats_invalid(false),
      hitset_stats_invalid(false),
      hitset_bytes_stats_invalid(false),
      pin_stats_invalid(false),
      manifest_stats_invalid(false)
  { }

  /// the epoch to treat as "last clean": if we are clean as of this report,
  /// the report's own epoch; otherwise the recorded lower bound
  epoch_t get_effective_last_epoch_clean() const {
    if (state & PG_STATE_CLEAN) {
      // we are clean as of this report, and should thus take the
      // reported epoch
      return reported_epoch;
    } else {
      return last_epoch_clean;
    }
  }

  /// (reported_epoch, reported_seq) pair, for ordering reports
  std::pair<epoch_t, version_t> get_version_pair() const {
    return { reported_epoch, reported_seq };
  }

  /// clamp counters to at least f; note snaptrimq_len is unsigned, so a
  /// negative f never changes it (the comparison promotes to int64_t)
  void floor(int64_t f) {
    stats.floor(f);
    if (log_size < f)
      log_size = f;
    if (ondisk_log_size < f)
      ondisk_log_size = f;
    if (snaptrimq_len < f)
      snaptrimq_len = f;
  }

  /// propagate "invalid" markers when merging stats in either direction
  void add_sub_invalid_flags(const pg_stat_t& o) {
    // adding (or subtracting!) invalid stats render our stats invalid too
    stats_invalid |= o.stats_invalid;
    dirty_stats_invalid |= o.dirty_stats_invalid;
    omap_stats_invalid |= o.omap_stats_invalid;
    hitset_stats_invalid |= o.hitset_stats_invalid;
    hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
    pin_stats_invalid |= o.pin_stats_invalid;
    manifest_stats_invalid |= o.manifest_stats_invalid;
  }
  /// accumulate another PG's stats into this one
  void add(const pg_stat_t& o) {
    stats.add(o.stats);
    log_size += o.log_size;
    ondisk_log_size += o.ondisk_log_size;
    // saturate at 2^31 so the 32-bit counter cannot wrap
    snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
                             (uint64_t)(1ull << 31));
    add_sub_invalid_flags(o);
  }
  /// remove another PG's stats from this one (clamping snaptrimq_len at 0)
  void sub(const pg_stat_t& o) {
    stats.sub(o.stats);
    log_size -= o.log_size;
    ondisk_log_size -= o.ondisk_log_size;
    if (o.snaptrimq_len < snaptrimq_len) {
      snaptrimq_len -= o.snaptrimq_len;
    } else {
      snaptrimq_len = 0;
    }
    add_sub_invalid_flags(o);
  }

  bool is_acting_osd(int32_t osd, bool primary) const;
  void dump(ceph::Formatter *f) const;
  void dump_brief(ceph::Formatter *f) const;
  std::string dump_scrub_schedule() const;
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  static void generate_test_instances(std::list<pg_stat_t*>& o);
};
WRITE_CLASS_ENCODER(pg_stat_t)
2376
2377 bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2378
/** store_statfs_t
 * ObjectStore full statfs information
 */
struct store_statfs_t
{
  uint64_t total = 0;                  ///< Total bytes
  uint64_t available = 0;              ///< Free bytes available
  uint64_t internally_reserved = 0;    ///< Bytes reserved for internal purposes

  int64_t allocated = 0;               ///< Bytes allocated by the store

  int64_t data_stored = 0;                ///< Bytes actually stored by the user
  int64_t data_compressed = 0;            ///< Bytes stored after compression
  int64_t data_compressed_allocated = 0;  ///< Bytes allocated for compressed data
  int64_t data_compressed_original = 0;   ///< Bytes that were compressed

  int64_t omap_allocated = 0;         ///< approx usage of omap data
  int64_t internal_metadata = 0;      ///< approx usage of internal metadata

  /// reset every field to zero
  void reset() {
    *this = store_statfs_t();
  }
  /// clamp every field to be at least f; the int64_t cast makes the
  /// unsigned fields participate in a signed comparison
  void floor(int64_t f) {
#define FLOOR(x) if (int64_t(x) < f) x = f
    FLOOR(total);
    FLOOR(available);
    FLOOR(internally_reserved);
    FLOOR(allocated);
    FLOOR(data_stored);
    FLOOR(data_compressed);
    FLOOR(data_compressed_allocated);
    FLOOR(data_compressed_original);

    FLOOR(omap_allocated);
    FLOOR(internal_metadata);
#undef FLOOR
  }

  bool operator ==(const store_statfs_t& other) const;
  /// true iff every field equals its default (zero)
  bool is_zero() const {
    return *this == store_statfs_t();
  }

  /// bytes in use, excluding the internal reservation
  /// (unsigned arithmetic: assumes available + internally_reserved <= total)
  uint64_t get_used() const {
    return total - available - internally_reserved;
  }

  // this accumulates both actually used and statfs's internally_reserved
  uint64_t get_used_raw() const {
    return total - available;
  }

  /// raw utilization as a fraction of total; 0.0 when total is unknown/zero
  float get_used_raw_ratio() const {
    if (total) {
      return (float)get_used_raw() / (float)total;
    } else {
      return 0.0;
    }
  }

  // helpers to ease legacy code porting (byte counts expressed in KiB,
  // via truncating right-shift)
  uint64_t kb_avail() const {
    return available >> 10;
  }
  uint64_t kb() const {
    return total >> 10;
  }
  uint64_t kb_used() const {
    return (total - available - internally_reserved) >> 10;
  }
  uint64_t kb_used_raw() const {
    return get_used_raw() >> 10;
  }

  uint64_t kb_used_data() const {
    return allocated >> 10;
  }
  uint64_t kb_used_omap() const {
    return omap_allocated >> 10;
  }

  uint64_t kb_used_internal_metadata() const {
    return internal_metadata >> 10;
  }

  /// field-wise accumulate another statfs into this one
  void add(const store_statfs_t& o) {
    total += o.total;
    available += o.available;
    internally_reserved += o.internally_reserved;
    allocated += o.allocated;
    data_stored += o.data_stored;
    data_compressed += o.data_compressed;
    data_compressed_allocated += o.data_compressed_allocated;
    data_compressed_original += o.data_compressed_original;
    omap_allocated += o.omap_allocated;
    internal_metadata += o.internal_metadata;
  }
  /// field-wise subtract another statfs from this one
  void sub(const store_statfs_t& o) {
    total -= o.total;
    available -= o.available;
    internally_reserved -= o.internally_reserved;
    allocated -= o.allocated;
    data_stored -= o.data_stored;
    data_compressed -= o.data_compressed;
    data_compressed_allocated -= o.data_compressed_allocated;
    data_compressed_original -= o.data_compressed_original;
    omap_allocated -= o.omap_allocated;
    internal_metadata -= o.internal_metadata;
  }
  void dump(ceph::Formatter *f) const;
  // versioned (v1) encoder; fields are encoded in this order, so any new
  // field must be appended and the version bumped
  DENC(store_statfs_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.total, p);
    denc(v.available, p);
    denc(v.internally_reserved, p);
    denc(v.allocated, p);
    denc(v.data_stored, p);
    denc(v.data_compressed, p);
    denc(v.data_compressed_allocated, p);
    denc(v.data_compressed_original, p);
    denc(v.omap_allocated, p);
    denc(v.internal_metadata, p);
    DENC_FINISH(p);
  }
  static void generate_test_instances(std::list<store_statfs_t*>& o);
};
WRITE_CLASS_DENC(store_statfs_t)
2506
2507 std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
2508
2509 /** osd_stat
2510 * aggregate stats for an osd
2511 */
2512 struct osd_stat_t {
2513 store_statfs_t statfs;
2514 std::vector<int> hb_peers;
2515 int32_t snap_trim_queue_len, num_snap_trimming;
2516 uint64_t num_shards_repaired;
2517
2518 pow2_hist_t op_queue_age_hist;
2519
2520 objectstore_perf_stat_t os_perf_stat;
2521 osd_alerts_t os_alerts;
2522
2523 epoch_t up_from = 0;
2524 uint64_t seq = 0;
2525
2526 uint32_t num_pgs = 0;
2527
2528 uint32_t num_osds = 0;
2529 uint32_t num_per_pool_osds = 0;
2530 uint32_t num_per_pool_omap_osds = 0;
2531
2532 struct Interfaces {
2533 uint32_t last_update; // in seconds
2534 uint32_t back_pingtime[3];
2535 uint32_t back_min[3];
2536 uint32_t back_max[3];
2537 uint32_t back_last;
2538 uint32_t front_pingtime[3];
2539 uint32_t front_min[3];
2540 uint32_t front_max[3];
2541 uint32_t front_last;
2542 };
2543 std::map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
2544
2545 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2546 num_shards_repaired(0) {}
2547
2548 void add(const osd_stat_t& o) {
2549 statfs.add(o.statfs);
2550 snap_trim_queue_len += o.snap_trim_queue_len;
2551 num_snap_trimming += o.num_snap_trimming;
2552 num_shards_repaired += o.num_shards_repaired;
2553 op_queue_age_hist.add(o.op_queue_age_hist);
2554 os_perf_stat.add(o.os_perf_stat);
2555 num_pgs += o.num_pgs;
2556 num_osds += o.num_osds;
2557 num_per_pool_osds += o.num_per_pool_osds;
2558 num_per_pool_omap_osds += o.num_per_pool_omap_osds;
2559 for (const auto& a : o.os_alerts) {
2560 auto& target = os_alerts[a.first];
2561 for (auto& i : a.second) {
2562 target.emplace(i.first, i.second);
2563 }
2564 }
2565 }
2566 void sub(const osd_stat_t& o) {
2567 statfs.sub(o.statfs);
2568 snap_trim_queue_len -= o.snap_trim_queue_len;
2569 num_snap_trimming -= o.num_snap_trimming;
2570 num_shards_repaired -= o.num_shards_repaired;
2571 op_queue_age_hist.sub(o.op_queue_age_hist);
2572 os_perf_stat.sub(o.os_perf_stat);
2573 num_pgs -= o.num_pgs;
2574 num_osds -= o.num_osds;
2575 num_per_pool_osds -= o.num_per_pool_osds;
2576 num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
2577 for (const auto& a : o.os_alerts) {
2578 auto& target = os_alerts[a.first];
2579 for (auto& i : a.second) {
2580 target.erase(i.first);
2581 }
2582 if (target.empty()) {
2583 os_alerts.erase(a.first);
2584 }
2585 }
2586 }
2587 void dump(ceph::Formatter *f, bool with_net = true) const;
2588 void dump_ping_time(ceph::Formatter *f) const;
2589 void encode(ceph::buffer::list &bl, uint64_t features) const;
2590 void decode(ceph::buffer::list::const_iterator &bl);
2591 static void generate_test_instances(std::list<osd_stat_t*>& o);
2592 };
2593 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2594
2595 inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2596 return l.statfs == r.statfs &&
2597 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2598 l.num_snap_trimming == r.num_snap_trimming &&
2599 l.num_shards_repaired == r.num_shards_repaired &&
2600 l.hb_peers == r.hb_peers &&
2601 l.op_queue_age_hist == r.op_queue_age_hist &&
2602 l.os_perf_stat == r.os_perf_stat &&
2603 l.num_pgs == r.num_pgs &&
2604 l.num_osds == r.num_osds &&
2605 l.num_per_pool_osds == r.num_per_pool_osds &&
2606 l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
2607 }
/// negation of operator== (which itself compares only a subset of fields)
inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
  return !(l == r);
}
2611
2612 inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
2613 return out << "osd_stat(" << s.statfs << ", "
2614 << "peers " << s.hb_peers
2615 << " op hist " << s.op_queue_age_hist.h
2616 << ")";
2617 }
2618
2619 /*
2620 * summation over an entire pool
2621 */
2622 struct pool_stat_t {
2623 object_stat_collection_t stats;
2624 store_statfs_t store_stats;
2625 int64_t log_size;
2626 int64_t ondisk_log_size; // >= active_log_size
2627 int32_t up; ///< number of up replicas or shards
2628 int32_t acting; ///< number of acting replicas or shards
2629 int32_t num_store_stats; ///< amount of store_stats accumulated
2630
2631 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2632 num_store_stats(0)
2633 { }
2634
2635 void floor(int64_t f) {
2636 stats.floor(f);
2637 store_stats.floor(f);
2638 if (log_size < f)
2639 log_size = f;
2640 if (ondisk_log_size < f)
2641 ondisk_log_size = f;
2642 if (up < f)
2643 up = f;
2644 if (acting < f)
2645 acting = f;
2646 if (num_store_stats < f)
2647 num_store_stats = f;
2648 }
2649
2650 void add(const store_statfs_t& o) {
2651 store_stats.add(o);
2652 ++num_store_stats;
2653 }
2654 void sub(const store_statfs_t& o) {
2655 store_stats.sub(o);
2656 --num_store_stats;
2657 }
2658
2659 void add(const pg_stat_t& o) {
2660 stats.add(o.stats);
2661 log_size += o.log_size;
2662 ondisk_log_size += o.ondisk_log_size;
2663 up += o.up.size();
2664 acting += o.acting.size();
2665 }
2666 void sub(const pg_stat_t& o) {
2667 stats.sub(o.stats);
2668 log_size -= o.log_size;
2669 ondisk_log_size -= o.ondisk_log_size;
2670 up -= o.up.size();
2671 acting -= o.acting.size();
2672 }
2673
2674 bool is_zero() const {
2675 return (stats.is_zero() &&
2676 store_stats.is_zero() &&
2677 log_size == 0 &&
2678 ondisk_log_size == 0 &&
2679 up == 0 &&
2680 acting == 0 &&
2681 num_store_stats == 0);
2682 }
2683
2684 // helper accessors to retrieve used/netto bytes depending on the
2685 // collection method: new per-pool objectstore report or legacy PG
2686 // summation at OSD.
2687 // In legacy mode used and netto values are the same. But for new per-pool
2688 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2689 // and 'netto' is amount of stored user data.
2690 uint64_t get_allocated_data_bytes(bool per_pool) const {
2691 if (per_pool) {
2692 return store_stats.allocated;
2693 } else {
2694 // legacy mode, use numbers from 'stats'
2695 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2696 }
2697 }
2698 uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
2699 if (per_pool_omap) {
2700 return store_stats.omap_allocated;
2701 } else {
2702 // omap is not broken out by pool by nautilus bluestore; report the
2703 // scrub value. this will be imprecise in that it won't account for
2704 // any storage overhead/efficiency.
2705 return stats.sum.num_omap_bytes;
2706 }
2707 }
2708 uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
2709 bool per_pool) const {
2710 // NOTE: we need the space amp factor so that we can work backwards from
2711 // the raw utilization to the amount of data that the user actually stored.
2712 if (per_pool) {
2713 return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2714 } else {
2715 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2716 // raw_used_rate factor here because we are working from the PG stats
2717 // directly.
2718 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2719 }
2720 }
2721 uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
2722 bool per_pool_omap) const {
2723 if (per_pool_omap) {
2724 return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
2725 } else {
2726 // omap usage is lazily reported during scrub; this value may lag.
2727 return stats.sum.num_omap_bytes;
2728 }
2729 }
2730
2731 void dump(ceph::Formatter *f) const;
2732 void encode(ceph::buffer::list &bl, uint64_t features) const;
2733 void decode(ceph::buffer::list::const_iterator &bl);
2734 static void generate_test_instances(std::list<pool_stat_t*>& o);
2735 };
2736 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2737
2738
2739 // -----------------------------------------
2740
2741 /**
2742 * pg_hit_set_info_t - information about a single recorded HitSet
2743 *
2744 * Track basic metadata about a HitSet, like the number of insertions
2745 * and the time range it covers.
2746 */
2747 struct pg_hit_set_info_t {
2748 utime_t begin, end; ///< time interval
2749 eversion_t version; ///< version this HitSet object was written
2750 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2751
2752 friend bool operator==(const pg_hit_set_info_t& l,
2753 const pg_hit_set_info_t& r) {
2754 return
2755 l.begin == r.begin &&
2756 l.end == r.end &&
2757 l.version == r.version &&
2758 l.using_gmt == r.using_gmt;
2759 }
2760
2761 explicit pg_hit_set_info_t(bool using_gmt = true)
2762 : using_gmt(using_gmt) {}
2763
2764 void encode(ceph::buffer::list &bl) const;
2765 void decode(ceph::buffer::list::const_iterator &bl);
2766 void dump(ceph::Formatter *f) const;
2767 static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
2768 };
2769 WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2770
2771 /**
2772 * pg_hit_set_history_t - information about a history of hitsets
2773 *
2774 * Include information about the currently accumulating hit set as well
2775 * as archived/historical ones.
2776 */
2777 struct pg_hit_set_history_t {
2778 eversion_t current_last_update; ///< last version inserted into current set
2779 std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2780
2781 friend bool operator==(const pg_hit_set_history_t& l,
2782 const pg_hit_set_history_t& r) {
2783 return
2784 l.current_last_update == r.current_last_update &&
2785 l.history == r.history;
2786 }
2787
2788 void encode(ceph::buffer::list &bl) const;
2789 void decode(ceph::buffer::list::const_iterator &bl);
2790 void dump(ceph::Formatter *f) const;
2791 static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
2792 };
2793 WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2794
2795
2796 // -----------------------------------------
2797
2798 /**
2799 * pg_history_t - information about recent pg peering/mapping history
2800 *
2801 * This is aggressively shared between OSDs to bound the amount of past
2802 * history they need to worry about.
2803 */
2804 struct pg_history_t {
2805 epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
2806 epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
2807 // (note: may be pg creation epoch for
2808 // pre-luminous clusters)
2809 epoch_t last_epoch_started = 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2810 // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
2811 epoch_t last_interval_started = 0;; // first epoch of last_epoch_started interval
2812 epoch_t last_epoch_clean = 0;; // lower bound on last epoch the PG was completely clean.
2813 epoch_t last_interval_clean = 0;; // first epoch of last_epoch_clean interval
2814 epoch_t last_epoch_split = 0;; // as parent or child
2815 epoch_t last_epoch_marked_full = 0;; // pool or cluster
2816
2817 /**
2818 * In the event of a map discontinuity, same_*_since may reflect the first
2819 * map the osd has seen in the new map sequence rather than the actual start
2820 * of the interval. This is ok since a discontinuity at epoch e means there
2821 * must have been a clean interval between e and now and that we cannot be
2822 * in the active set during the interval containing e.
2823 */
2824 epoch_t same_up_since = 0;; // same acting set since
2825 epoch_t same_interval_since = 0;; // same acting AND up set since
2826 epoch_t same_primary_since = 0;; // same primary at least back through this epoch.
2827
2828 eversion_t last_scrub;
2829 eversion_t last_deep_scrub;
2830 utime_t last_scrub_stamp;
2831 utime_t last_deep_scrub_stamp;
2832 utime_t last_clean_scrub_stamp;
2833
2834 /// upper bound on how long prior interval readable (relative to encode time)
2835 ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
2836
2837 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2838 return
2839 l.epoch_created == r.epoch_created &&
2840 l.epoch_pool_created == r.epoch_pool_created &&
2841 l.last_epoch_started == r.last_epoch_started &&
2842 l.last_interval_started == r.last_interval_started &&
2843 l.last_epoch_clean == r.last_epoch_clean &&
2844 l.last_interval_clean == r.last_interval_clean &&
2845 l.last_epoch_split == r.last_epoch_split &&
2846 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2847 l.same_up_since == r.same_up_since &&
2848 l.same_interval_since == r.same_interval_since &&
2849 l.same_primary_since == r.same_primary_since &&
2850 l.last_scrub == r.last_scrub &&
2851 l.last_deep_scrub == r.last_deep_scrub &&
2852 l.last_scrub_stamp == r.last_scrub_stamp &&
2853 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2854 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2855 l.prior_readable_until_ub == r.prior_readable_until_ub;
2856 }
2857
  pg_history_t() {}
  /// history for a pg created in epoch @p created at time @p stamp;
  /// the creation epoch also seeds the same_*_since fields, and the
  /// stamp seeds all three scrub stamps
  pg_history_t(epoch_t created, utime_t stamp)
    : epoch_created(created),
      epoch_pool_created(created),
      same_up_since(created),
      same_interval_since(created),
      same_primary_since(created),
      last_scrub_stamp(stamp),
      last_deep_scrub_stamp(stamp),
      last_clean_scrub_stamp(stamp) {}
2868
2869 bool merge(const pg_history_t &other) {
2870 // Here, we only update the fields which cannot be calculated from the OSDmap.
2871 bool modified = false;
2872 if (epoch_created < other.epoch_created) {
2873 epoch_created = other.epoch_created;
2874 modified = true;
2875 }
2876 if (epoch_pool_created < other.epoch_pool_created) {
2877 // FIXME: for jewel compat only; this should either be 0 or always the
2878 // same value across all pg instances.
2879 epoch_pool_created = other.epoch_pool_created;
2880 modified = true;
2881 }
2882 if (last_epoch_started < other.last_epoch_started) {
2883 last_epoch_started = other.last_epoch_started;
2884 modified = true;
2885 }
2886 if (last_interval_started < other.last_interval_started) {
2887 last_interval_started = other.last_interval_started;
2888 // if we are learning about a newer *started* interval, our
2889 // readable_until_ub is obsolete
2890 prior_readable_until_ub = other.prior_readable_until_ub;
2891 modified = true;
2892 } else if (other.last_interval_started == last_interval_started &&
2893 other.prior_readable_until_ub < prior_readable_until_ub) {
2894 // if other is the *same* interval, than pull our upper bound in
2895 // if they have a tighter bound.
2896 prior_readable_until_ub = other.prior_readable_until_ub;
2897 modified = true;
2898 }
2899 if (last_epoch_clean < other.last_epoch_clean) {
2900 last_epoch_clean = other.last_epoch_clean;
2901 modified = true;
2902 }
2903 if (last_interval_clean < other.last_interval_clean) {
2904 last_interval_clean = other.last_interval_clean;
2905 modified = true;
2906 }
2907 if (last_epoch_split < other.last_epoch_split) {
2908 last_epoch_split = other.last_epoch_split;
2909 modified = true;
2910 }
2911 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2912 last_epoch_marked_full = other.last_epoch_marked_full;
2913 modified = true;
2914 }
2915 if (other.last_scrub > last_scrub) {
2916 last_scrub = other.last_scrub;
2917 modified = true;
2918 }
2919 if (other.last_scrub_stamp > last_scrub_stamp) {
2920 last_scrub_stamp = other.last_scrub_stamp;
2921 modified = true;
2922 }
2923 if (other.last_deep_scrub > last_deep_scrub) {
2924 last_deep_scrub = other.last_deep_scrub;
2925 modified = true;
2926 }
2927 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2928 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2929 modified = true;
2930 }
2931 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2932 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2933 modified = true;
2934 }
2935 return modified;
2936 }
2937
2938 void encode(ceph::buffer::list& bl) const;
2939 void decode(ceph::buffer::list::const_iterator& p);
2940 void dump(ceph::Formatter *f) const;
2941 static void generate_test_instances(std::list<pg_history_t*>& o);
2942
2943 ceph::signedspan refresh_prior_readable_until_ub(
2944 ceph::signedspan now, ///< now, relative to osd startup_time
2945 ceph::signedspan ub) { ///< ub, relative to osd startup_time
2946 if (now >= ub) {
2947 // prior interval(s) are unreadable; we can zero the upper bound
2948 prior_readable_until_ub = ceph::signedspan::zero();
2949 return ceph::signedspan::zero();
2950 } else {
2951 prior_readable_until_ub = ub - now;
2952 return ub;
2953 }
2954 }
2955 ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
2956 if (prior_readable_until_ub == ceph::signedspan::zero()) {
2957 return ceph::signedspan::zero();
2958 }
2959 return now + prior_readable_until_ub;
2960 }
2961 };
2962 WRITE_CLASS_ENCODER(pg_history_t)
2963
2964 inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
2965 out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2966 << " lis/c=" << h.last_interval_started
2967 << "/" << h.last_interval_clean
2968 << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
2969 << "/" << h.last_epoch_marked_full
2970 << " sis=" << h.same_interval_since;
2971 if (h.prior_readable_until_ub != ceph::timespan::zero()) {
2972 out << " pruub=" << h.prior_readable_until_ub;
2973 }
2974 return out;
2975 }
2976
2977
2978 /**
2979 * pg_info_t - summary of PG statistics.
2980 *
2981 * some notes:
2982 * - last_complete implies we have all objects that existed as of that
2983 * stamp, OR a newer object, OR have already applied a later delete.
2984 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2985 * otherwise, we have no idea what the pg is supposed to contain.
2986 */
struct pg_info_t {
  spg_t pgid;
  eversion_t last_update;      ///< last object version applied to store.
  eversion_t last_complete;    ///< last version pg was complete through.
  epoch_t last_epoch_started;  ///< last epoch at which this pg started on this osd
  epoch_t last_interval_started; ///< first epoch of last_epoch_started interval

  version_t last_user_version; ///< last user object version applied to store

  eversion_t log_tail;         ///< oldest log entry.

  hobject_t last_backfill;     ///< objects >= this and < last_complete may be missing

  interval_set<snapid_t> purged_snaps;  ///< snap ids that have been purged

  pg_stat_t stats;             ///< latest pg statistics

  pg_history_t history;        ///< high-level pg history
  pg_hit_set_history_t hit_set; ///< hit set history

  /// equal iff every field matches
  friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
    return
      l.pgid == r.pgid &&
      l.last_update == r.last_update &&
      l.last_complete == r.last_complete &&
      l.last_epoch_started == r.last_epoch_started &&
      l.last_interval_started == r.last_interval_started &&
      l.last_user_version == r.last_user_version &&
      l.log_tail == r.log_tail &&
      l.last_backfill == r.last_backfill &&
      l.purged_snaps == r.purged_snaps &&
      l.stats == r.stats &&
      l.history == r.history &&
      l.hit_set == r.hit_set;
  }

  // last_backfill starts at max(), i.e. "nothing left to backfill"
  // (see is_incomplete() below)
  pg_info_t()
    : last_epoch_started(0),
      last_interval_started(0),
      last_user_version(0),
      last_backfill(hobject_t::get_max())
  { }
  // cppcheck-suppress noExplicitConstructor
  pg_info_t(spg_t p)
    : pgid(p),
      last_epoch_started(0),
      last_interval_started(0),
      last_user_version(0),
      last_backfill(hobject_t::get_max())
  { }

  void set_last_backfill(hobject_t pos) {
    last_backfill = pos;
  }

  /// true if no update has ever been applied (last_update still zero)
  bool is_empty() const { return last_update.version == 0; }
  /// true if this pg does not exist (no creation epoch recorded)
  bool dne() const { return history.epoch_created == 0; }

  /// true if we are not complete through last_update
  bool has_missing() const { return last_complete != last_update; }
  /// true if backfill has not finished (last_backfill short of max)
  bool is_incomplete() const { return !last_backfill.is_max(); }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_info_t*>& o);
};
3053 WRITE_CLASS_ENCODER(pg_info_t)
3054
3055 inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
3056 {
3057 out << pgi.pgid << "(";
3058 if (pgi.dne())
3059 out << " DNE";
3060 if (pgi.is_empty())
3061 out << " empty";
3062 else {
3063 out << " v " << pgi.last_update;
3064 if (pgi.last_complete != pgi.last_update)
3065 out << " lc " << pgi.last_complete;
3066 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
3067 }
3068 if (pgi.is_incomplete())
3069 out << " lb " << pgi.last_backfill;
3070 //out << " c " << pgi.epoch_created;
3071 out << " local-lis/les=" << pgi.last_interval_started
3072 << "/" << pgi.last_epoch_started;
3073 out << " n=" << pgi.stats.stats.sum.num_objects;
3074 out << " " << pgi.history
3075 << ")";
3076 return out;
3077 }
3078
3079 /**
3080 * pg_fast_info_t - common pg_info_t fields
3081 *
3082 * These are the fields of pg_info_t (and children) that are updated for
3083 * most IO operations.
3084 *
3085 * ** WARNING **
3086 * Because we rely on these fields to be applied to the normal
3087 * info struct, adding a new field here that is not also new in info
3088 * means that we must set an incompat OSD feature bit!
3089 */
struct pg_fast_info_t {
  eversion_t last_update;
  eversion_t last_complete;
  version_t last_user_version;
  struct { // pg_stat_t stats
    eversion_t version;
    version_t reported_seq;
    utime_t last_fresh;
    utime_t last_active;
    utime_t last_peered;
    utime_t last_clean;
    utime_t last_unstale;
    utime_t last_undegraded;
    utime_t last_fullsized;
    int64_t log_size;  // (also ondisk_log_size, which has the same value)
    struct { // object_stat_collection_t stats;
      struct { // object_stat_sum_t sum
	int64_t num_bytes;    // in bytes
	int64_t num_objects;
	int64_t num_object_copies;
	int64_t num_rd;
	int64_t num_rd_kb;
	int64_t num_wr;
	int64_t num_wr_kb;
	int64_t num_objects_dirty;
      } sum;
    } stats;
  } stats;

  /// copy the fast-changing subset of fields out of a full pg_info_t
  void populate_from(const pg_info_t& info) {
    last_update = info.last_update;
    last_complete = info.last_complete;
    last_user_version = info.last_user_version;
    stats.version = info.stats.version;
    stats.reported_seq = info.stats.reported_seq;
    stats.last_fresh = info.stats.last_fresh;
    stats.last_active = info.stats.last_active;
    stats.last_peered = info.stats.last_peered;
    stats.last_clean = info.stats.last_clean;
    stats.last_unstale = info.stats.last_unstale;
    stats.last_undegraded = info.stats.last_undegraded;
    stats.last_fullsized = info.stats.last_fullsized;
    stats.log_size = info.stats.log_size;
    stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
    stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
    stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
    stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
    stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
    stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
    stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
    stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
  }

  /// apply our fields to *info; a no-op returning false unless our
  /// last_update is strictly newer than info's
  bool try_apply_to(pg_info_t* info) {
    if (last_update <= info->last_update)
      return false;
    info->last_update = last_update;
    info->last_complete = last_complete;
    info->last_user_version = last_user_version;
    info->stats.version = stats.version;
    info->stats.reported_seq = stats.reported_seq;
    info->stats.last_fresh = stats.last_fresh;
    info->stats.last_active = stats.last_active;
    info->stats.last_peered = stats.last_peered;
    info->stats.last_clean = stats.last_clean;
    info->stats.last_unstale = stats.last_unstale;
    info->stats.last_undegraded = stats.last_undegraded;
    info->stats.last_fullsized = stats.last_fullsized;
    info->stats.log_size = stats.log_size;
    // log_size feeds both fields (see comment on log_size above)
    info->stats.ondisk_log_size = stats.log_size;
    info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
    info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
    info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
    info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
    info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
    info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
    info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
    info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
    return true;
  }

  // v1 encoding; the field order below must stay in sync with decode()
  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(last_update, bl);
    encode(last_complete, bl);
    encode(last_user_version, bl);
    encode(stats.version, bl);
    encode(stats.reported_seq, bl);
    encode(stats.last_fresh, bl);
    encode(stats.last_active, bl);
    encode(stats.last_peered, bl);
    encode(stats.last_clean, bl);
    encode(stats.last_unstale, bl);
    encode(stats.last_undegraded, bl);
    encode(stats.last_fullsized, bl);
    encode(stats.log_size, bl);
    encode(stats.stats.sum.num_bytes, bl);
    encode(stats.stats.sum.num_objects, bl);
    encode(stats.stats.sum.num_object_copies, bl);
    encode(stats.stats.sum.num_rd, bl);
    encode(stats.stats.sum.num_rd_kb, bl);
    encode(stats.stats.sum.num_wr, bl);
    encode(stats.stats.sum.num_wr_kb, bl);
    encode(stats.stats.sum.num_objects_dirty, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(last_update, p);
    decode(last_complete, p);
    decode(last_user_version, p);
    decode(stats.version, p);
    decode(stats.reported_seq, p);
    decode(stats.last_fresh, p);
    decode(stats.last_active, p);
    decode(stats.last_peered, p);
    decode(stats.last_clean, p);
    decode(stats.last_unstale, p);
    decode(stats.last_undegraded, p);
    decode(stats.last_fullsized, p);
    decode(stats.log_size, p);
    decode(stats.stats.sum.num_bytes, p);
    decode(stats.stats.sum.num_objects, p);
    decode(stats.stats.sum.num_object_copies, p);
    decode(stats.stats.sum.num_rd, p);
    decode(stats.stats.sum.num_rd_kb, p);
    decode(stats.stats.sum.num_wr, p);
    decode(stats.stats.sum.num_wr_kb, p);
    decode(stats.stats.sum.num_objects_dirty, p);
    DECODE_FINISH(p);
  }
};
3222 WRITE_CLASS_ENCODER(pg_fast_info_t)
3223
3224
3225 /**
3226 * PastIntervals -- information needed to determine the PriorSet and
3227 * the might_have_unfound set
3228 */
3229 class PastIntervals {
3230 #ifdef WITH_SEASTAR
3231 using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
3232 #else
3233 using OSDMapRef = std::shared_ptr<const OSDMap>;
3234 #endif
3235 public:
3236 struct pg_interval_t {
3237 std::vector<int32_t> up, acting;
3238 epoch_t first, last;
3239 bool maybe_went_rw;
3240 int32_t primary;
3241 int32_t up_primary;
3242
3243 pg_interval_t()
3244 : first(0), last(0),
3245 maybe_went_rw(false),
3246 primary(-1),
3247 up_primary(-1)
3248 {}
3249
3250 pg_interval_t(
3251 std::vector<int32_t> &&up,
3252 std::vector<int32_t> &&acting,
3253 epoch_t first,
3254 epoch_t last,
3255 bool maybe_went_rw,
3256 int32_t primary,
3257 int32_t up_primary)
3258 : up(up), acting(acting), first(first), last(last),
3259 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3260 {}
3261
3262 void encode(ceph::buffer::list& bl) const;
3263 void decode(ceph::buffer::list::const_iterator& bl);
3264 void dump(ceph::Formatter *f) const;
3265 static void generate_test_instances(std::list<pg_interval_t*>& o);
3266 };
3267
3268 PastIntervals();
3269 PastIntervals(PastIntervals &&rhs) = default;
3270 PastIntervals &operator=(PastIntervals &&rhs) = default;
3271
3272 PastIntervals(const PastIntervals &rhs);
3273 PastIntervals &operator=(const PastIntervals &rhs);
3274
3275 class interval_rep {
3276 public:
3277 virtual size_t size() const = 0;
3278 virtual bool empty() const = 0;
3279 virtual void clear() = 0;
3280 virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
3281 virtual std::set<pg_shard_t> get_all_participants(
3282 bool ec_pool) const = 0;
3283 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3284 virtual std::unique_ptr<interval_rep> clone() const = 0;
3285 virtual std::ostream &print(std::ostream &out) const = 0;
3286 virtual void encode(ceph::buffer::list &bl) const = 0;
3287 virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
3288 virtual void dump(ceph::Formatter *f) const = 0;
3289 virtual void iterate_mayberw_back_to(
3290 epoch_t les,
3291 std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
3292
3293 virtual bool has_full_intervals() const { return false; }
3294 virtual void iterate_all_intervals(
3295 std::function<void(const pg_interval_t &)> &&f) const {
3296 ceph_assert(!has_full_intervals());
3297 ceph_abort_msg("not valid for this implementation");
3298 }
3299 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
3300
3301 virtual ~interval_rep() {}
3302 };
3303 friend class pi_compact_rep;
3304 private:
3305
3306 std::unique_ptr<interval_rep> past_intervals;
3307
3308 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
3309
3310 public:
3311 void add_interval(bool ec_pool, const pg_interval_t &interval) {
3312 ceph_assert(past_intervals);
3313 return past_intervals->add_interval(ec_pool, interval);
3314 }
3315
3316 void encode(ceph::buffer::list &bl) const {
3317 ENCODE_START(1, 1, bl);
3318 if (past_intervals) {
3319 __u8 type = 2;
3320 encode(type, bl);
3321 past_intervals->encode(bl);
3322 } else {
3323 encode((__u8)0, bl);
3324 }
3325 ENCODE_FINISH(bl);
3326 }
3327
3328 void decode(ceph::buffer::list::const_iterator &bl);
3329
3330 void dump(ceph::Formatter *f) const {
3331 ceph_assert(past_intervals);
3332 past_intervals->dump(f);
3333 }
3334 static void generate_test_instances(std::list<PastIntervals *> & o);
3335
3336 /**
3337 * Determines whether there is an interval change
3338 */
3339 static bool is_new_interval(
3340 int old_acting_primary,
3341 int new_acting_primary,
3342 const std::vector<int> &old_acting,
3343 const std::vector<int> &new_acting,
3344 int old_up_primary,
3345 int new_up_primary,
3346 const std::vector<int> &old_up,
3347 const std::vector<int> &new_up,
3348 int old_size,
3349 int new_size,
3350 int old_min_size,
3351 int new_min_size,
3352 unsigned old_pg_num,
3353 unsigned new_pg_num,
3354 unsigned old_pg_num_pending,
3355 unsigned new_pg_num_pending,
3356 bool old_sort_bitwise,
3357 bool new_sort_bitwise,
3358 bool old_recovery_deletes,
3359 bool new_recovery_deletes,
3360 uint32_t old_crush_count,
3361 uint32_t new_crush_count,
3362 uint32_t old_crush_target,
3363 uint32_t new_crush_target,
3364 uint32_t old_crush_barrier,
3365 uint32_t new_crush_barrier,
3366 int32_t old_crush_member,
3367 int32_t new_crush_member,
3368 pg_t pgid
3369 );
3370
3371 /**
3372 * Determines whether there is an interval change
3373 */
3374 static bool is_new_interval(
3375 int old_acting_primary, ///< [in] primary as of lastmap
3376 int new_acting_primary, ///< [in] primary as of lastmap
3377 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3378 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3379 int old_up_primary, ///< [in] up primary of lastmap
3380 int new_up_primary, ///< [in] up primary of osdmap
3381 const std::vector<int> &old_up, ///< [in] up as of lastmap
3382 const std::vector<int> &new_up, ///< [in] up as of osdmap
3383 const OSDMap *osdmap, ///< [in] current map
3384 const OSDMap *lastmap, ///< [in] last map
3385 pg_t pgid ///< [in] pgid for pg
3386 );
3387
3388 /**
3389 * Integrates a new map into *past_intervals, returns true
3390 * if an interval was closed out.
3391 */
3392 static bool check_new_interval(
3393 int old_acting_primary, ///< [in] primary as of lastmap
3394 int new_acting_primary, ///< [in] primary as of osdmap
3395 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3396 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3397 int old_up_primary, ///< [in] up primary of lastmap
3398 int new_up_primary, ///< [in] up primary of osdmap
3399 const std::vector<int> &old_up, ///< [in] up as of lastmap
3400 const std::vector<int> &new_up, ///< [in] up as of osdmap
3401 epoch_t same_interval_since, ///< [in] as of osdmap
3402 epoch_t last_epoch_clean, ///< [in] current
3403 const OSDMap *osdmap, ///< [in] current map
3404 const OSDMap *lastmap, ///< [in] last map
3405 pg_t pgid, ///< [in] pgid for pg
3406 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3407 PastIntervals *past_intervals, ///< [out] intervals
3408 std::ostream *out = 0 ///< [out] debug ostream
3409 );
3410 static bool check_new_interval(
3411 int old_acting_primary, ///< [in] primary as of lastmap
3412 int new_acting_primary, ///< [in] primary as of osdmap
3413 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3414 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3415 int old_up_primary, ///< [in] up primary of lastmap
3416 int new_up_primary, ///< [in] up primary of osdmap
3417 const std::vector<int> &old_up, ///< [in] up as of lastmap
3418 const std::vector<int> &new_up, ///< [in] up as of osdmap
3419 epoch_t same_interval_since, ///< [in] as of osdmap
3420 epoch_t last_epoch_clean, ///< [in] current
3421 OSDMapRef osdmap, ///< [in] current map
3422 OSDMapRef lastmap, ///< [in] last map
3423 pg_t pgid, ///< [in] pgid for pg
3424 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3425 PastIntervals *past_intervals, ///< [out] intervals
3426 std::ostream *out = 0 ///< [out] debug ostream
3427 ) {
3428 return check_new_interval(
3429 old_acting_primary, new_acting_primary,
3430 old_acting, new_acting,
3431 old_up_primary, new_up_primary,
3432 old_up, new_up,
3433 same_interval_since, last_epoch_clean,
3434 osdmap.get(), lastmap.get(),
3435 pgid,
3436 could_have_gone_active,
3437 past_intervals,
3438 out);
3439 }
3440
3441 friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3442
3443 template <typename F>
3444 void iterate_mayberw_back_to(
3445 epoch_t les,
3446 F &&f) const {
3447 ceph_assert(past_intervals);
3448 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
3449 }
3450 void clear() {
3451 ceph_assert(past_intervals);
3452 past_intervals->clear();
3453 }
3454
3455 /**
3456 * Should return a value which gives an indication of the amount
3457 * of state contained
3458 */
3459 size_t size() const {
3460 ceph_assert(past_intervals);
3461 return past_intervals->size();
3462 }
3463
3464 bool empty() const {
3465 ceph_assert(past_intervals);
3466 return past_intervals->empty();
3467 }
3468
3469 void swap(PastIntervals &other) {
3470 using std::swap;
3471 swap(other.past_intervals, past_intervals);
3472 }
3473
3474 /**
3475 * Return all shards which have been in the acting set back to the
3476 * latest epoch to which we have trimmed except for pg_whoami
3477 */
3478 std::set<pg_shard_t> get_might_have_unfound(
3479 pg_shard_t pg_whoami,
3480 bool ec_pool) const {
3481 ceph_assert(past_intervals);
3482 auto ret = past_intervals->get_all_participants(ec_pool);
3483 ret.erase(pg_whoami);
3484 return ret;
3485 }
3486
3487 /**
3488 * Return all shards which we might want to talk to for peering
3489 */
3490 std::set<pg_shard_t> get_all_probe(
3491 bool ec_pool) const {
3492 ceph_assert(past_intervals);
3493 return past_intervals->get_all_participants(ec_pool);
3494 }
3495
3496 /* Return the set of epochs [start, end) represented by the
3497 * past_interval set.
3498 */
3499 std::pair<epoch_t, epoch_t> get_bounds() const {
3500 ceph_assert(past_intervals);
3501 return past_intervals->get_bounds();
3502 }
3503
3504 void adjust_start_backwards(epoch_t last_epoch_clean) {
3505 ceph_assert(past_intervals);
3506 past_intervals->adjust_start_backwards(last_epoch_clean);
3507 }
3508
3509 enum osd_state_t {
3510 UP,
3511 DOWN,
3512 DNE,
3513 LOST
3514 };
3515 struct PriorSet {
3516 bool ec_pool = false;
3517 std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3518 std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3519 std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3520
3521 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3522 const IsPGRecoverablePredicate* pcontdec = nullptr;
3523
3524 PriorSet() = default;
3525 PriorSet(PriorSet &&) = default;
3526 PriorSet &operator=(PriorSet &&) = default;
3527
3528 PriorSet &operator=(const PriorSet &) = delete;
3529 PriorSet(const PriorSet &) = delete;
3530
3531 bool operator==(const PriorSet &rhs) const {
3532 return (ec_pool == rhs.ec_pool) &&
3533 (probe == rhs.probe) &&
3534 (down == rhs.down) &&
3535 (blocked_by == rhs.blocked_by) &&
3536 (pg_down == rhs.pg_down);
3537 }
3538
3539 bool affected_by_map(
3540 const OSDMap &osdmap,
3541 const DoutPrefixProvider *dpp) const;
3542
3543 // For verifying tests
3544 PriorSet(
3545 bool ec_pool,
3546 std::set<pg_shard_t> probe,
3547 std::set<int> down,
3548 std::map<int, epoch_t> blocked_by,
3549 bool pg_down,
3550 const IsPGRecoverablePredicate *pcontdec)
3551 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3552 pg_down(pg_down), pcontdec(pcontdec) {}
3553
3554 private:
3555 template <typename F>
3556 PriorSet(
3557 const PastIntervals &past_intervals,
3558 bool ec_pool,
3559 epoch_t last_epoch_started,
3560 const IsPGRecoverablePredicate *c,
3561 F f,
3562 const std::vector<int> &up,
3563 const std::vector<int> &acting,
3564 const DoutPrefixProvider *dpp);
3565
3566 friend class PastIntervals;
3567 };
3568
3569 template <typename... Args>
3570 PriorSet get_prior_set(Args&&... args) const {
3571 return PriorSet(*this, std::forward<Args>(args)...);
3572 }
3573 };
3574 WRITE_CLASS_ENCODER(PastIntervals)
3575
3576 std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
3577 std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3578 std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
3579
/// Build a PriorSet from the recorded past intervals.
/// @param f  callable (epoch, osd, epoch_t *lost_at) -> osd_state_t used to
///           classify each candidate osd; see the switch cases below.
///           NOTE(review): called with epoch 0 for the "all probe" pass —
///           presumably meaning "current state"; confirm against callers.
template <typename F>
PastIntervals::PriorSet::PriorSet(
  const PastIntervals &past_intervals,
  bool ec_pool,
  epoch_t last_epoch_started,
  const IsPGRecoverablePredicate *c,
  F f,
  const std::vector<int> &up,
  const std::vector<int> &acting,
  const DoutPrefixProvider *dpp)
  : ec_pool(ec_pool), pg_down(false), pcontdec(c)
{
  /*
   * We have to be careful to gracefully deal with situations like
   * so. Say we have a power outage or something that takes out both
   * OSDs, but the monitor doesn't mark them down in the same epoch.
   * The history may look like
   *
   *  1: A B
   *  2:   B
   *  3:       let's say B dies for good, too (say, from the power spike)
   *  4: A
   *
   * which makes it look like B may have applied updates to the PG
   * that we need in order to proceed.  This sucks...
   *
   * To minimize the risk of this happening, we CANNOT go active if
   * _any_ OSDs in the prior set are down until we send an MOSDAlive
   * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
   * Then, we have something like
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:
   *  4: A
   *
   * -> we can ignore B, bc it couldn't have gone active (alive_thru
   *    still 0).
   *
   * or,
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:   B   up_thru[B]=2
   *  4:
   *  5: A
   *
   * -> we must wait for B, bc it was alive through 2, and could have
   *    written to the pg.
   *
   * If B is really dead, then an administrator will need to manually
   * intervene by marking the OSD as "lost."
   */

  // Include current acting and up nodes... not because they may
  // contain old data (this interval hasn't gone active, obviously),
  // but because we want their pg_info to inform choose_acting(), and
  // so that we know what they do/do not have explicitly before
  // sending them any new info/logs/whatever.
  for (unsigned i = 0; i < acting.size(); i++) {
    if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
      // for EC pools the slot index is the shard id; replicated pools
      // use NO_SHARD
      probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  // It may be possible to exclude the up nodes, but let's keep them in
  // there for now.
  for (unsigned i = 0; i < up.size(); i++) {
    if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
      probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }

  // classify every participant from the past intervals: up osds get
  // probed, everything else is recorded as down
  std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
  ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
  for (auto &&i: all_probe) {
    switch (f(0, i.osd, nullptr)) {
    case UP: {
      probe.insert(i);
      break;
    }
    case DNE:
    case LOST:
    case DOWN: {
      down.insert(i.osd);
      break;
    }
    }
  }

  past_intervals.iterate_mayberw_back_to(
    last_epoch_started,
    [&](epoch_t start, const std::set<pg_shard_t> &acting) {
      ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
			 << ", acting: " << acting << dendl;

      // look at candidate osds during this interval. each falls into
      // one of three categories: up, down (but potentially
      // interesting), or lost (down, but we won't wait for it).
      std::set<pg_shard_t> up_now;
      std::map<int, epoch_t> candidate_blocked_by;
      // any candidates down now (that might have useful data)
      bool any_down_now = false;

      // consider ACTING osds
      for (auto &&so: acting) {
	epoch_t lost_at = 0;
	switch (f(start, so.osd, &lost_at)) {
	case UP: {
	  // include past acting osds if they are up.
	  up_now.insert(so);
	  break;
	}
	case DNE: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " no longer exists" << dendl;
	  break;
	}
	case LOST: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down, but lost_at " << lost_at << dendl;
	  // treated as "up" for recoverability: we will not wait for it
	  up_now.insert(so);
	  break;
	}
	case DOWN: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down" << dendl;
	  candidate_blocked_by[so.osd] = lost_at;
	  any_down_now = true;
	  break;
	}
	}
      }

      // if not enough osds survived this interval, and we may have gone rw,
      // then we need to wait for one of those osds to recover to
      // ensure that we haven't lost any information.
      if (!(*pcontdec)(up_now) && any_down_now) {
	// fixme: how do we identify a "clean" shutdown anyway?
	ldpp_dout(dpp, 10) << "build_prior  possibly went active+rw,"
			   << " insufficient up; including down osds" << dendl;
	ceph_assert(!candidate_blocked_by.empty());
	pg_down = true;
	blocked_by.insert(
	  candidate_blocked_by.begin(),
	  candidate_blocked_by.end());
      }
    });

  ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
	   << " down " << down
	   << " blocked_by " << blocked_by
	   << (pg_down ? " pg_down":"")
	   << dendl;
}
3732
/// pg info (plus past intervals) sent from one osd shard to another
struct pg_notify_t {
  epoch_t query_epoch;           ///< epoch of the originating query
  epoch_t epoch_sent;            ///< epoch at which this notify was sent
  pg_info_t info;                ///< sender's pg info
  shard_id_t to;                 ///< destination shard
  shard_id_t from;               ///< sending shard; must match info.pgid.shard
  PastIntervals past_intervals;  ///< sender's past intervals
  pg_notify_t() :
    query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
    from(shard_id_t::NO_SHARD) {}
  pg_notify_t(
    shard_id_t to,
    shard_id_t from,
    epoch_t query_epoch,
    epoch_t epoch_sent,
    const pg_info_t &info,
    const PastIntervals& pi)
    : query_epoch(query_epoch),
      epoch_sent(epoch_sent),
      info(info), to(to), from(from),
      past_intervals(pi) {
    // the sending shard must be the one the info describes
    ceph_assert(from == info.pgid.shard);
  }
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_notify_t*> &o);
};
3761 WRITE_CLASS_ENCODER(pg_notify_t)
3762 std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
3763
3764
3765 /**
3766 * pg_query_t - used to ask a peer for information about a pg.
3767 *
3768 * note: if version=0, type=LOG, then we just provide our full log.
3769 */
struct pg_query_t {
  enum {
    INFO = 0,
    LOG = 1,
    // note: values 2 and 3 are not defined here — presumably retired
    // query types; do not reuse without checking wire compatibility
    MISSING = 4,
    FULLLOG = 5,
  };
  /// human-readable name for the query type (for logging)
  std::string_view get_type_name() const {
    switch (type) {
    case INFO: return "info";
    case LOG: return "log";
    case MISSING: return "missing";
    case FULLLOG: return "fulllog";
    default: return "???";
    }
  }

  __s32 type;           ///< one of the enum values above; -1 if unset
  eversion_t since;     ///< log lower bound (LOG queries only; see note above)
  pg_history_t history; ///< requester's view of the pg history
  epoch_t epoch_sent;
  shard_id_t to;        ///< destination shard
  shard_id_t from;      ///< requesting shard

  pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
		 from(shard_id_t::NO_SHARD) {}
  /// ctor for all non-LOG query types (asserts t != LOG)
  pg_query_t(
    int t,
    shard_id_t to,
    shard_id_t from,
    const pg_history_t& h,
    epoch_t epoch_sent)
    : type(t),
      history(h),
      epoch_sent(epoch_sent),
      to(to), from(from) {
    ceph_assert(t != LOG);
  }
  /// ctor for LOG queries, which additionally carry `since`
  pg_query_t(
    int t,
    shard_id_t to,
    shard_id_t from,
    eversion_t s,
    const pg_history_t& h,
    epoch_t epoch_sent)
    : type(t), since(s), history(h),
      epoch_sent(epoch_sent), to(to), from(from) {
    ceph_assert(t == LOG);
  }

  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_query_t*>& o);
};
3826 WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3827
3828 inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
3829 out << "query(" << q.get_type_name() << " " << q.since;
3830 if (q.type == pg_query_t::LOG)
3831 out << " " << q.history;
3832 out << " epoch_sent " << q.epoch_sent;
3833 out << ")";
3834 return out;
3835 }
3836
/**
 * pg_lease_t - readable lease metadata, from primary -> non-primary
 *
 * This metadata serves to increase either or both of the lease expiration
 * and upper bound on the non-primary.
 */
struct pg_lease_t {
  /// pg readable_until value; replicas must not be readable beyond this
  ceph::signedspan readable_until = ceph::signedspan::zero();

  /// upper bound on any acting osd's readable_until
  ceph::signedspan readable_until_ub = ceph::signedspan::zero();

  /// duration of the lease (in case clock deltas aren't available)
  ceph::signedspan interval = ceph::signedspan::zero();

  pg_lease_t() {}
  /// @param ru   readable_until
  /// @param ruub upper bound on readable_until across acting OSDs
  /// @param i    lease duration
  pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
	     ceph::signedspan i)
    : readable_until(ru),
      readable_until_ub(ruub),
      interval(i) {}

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_lease_t*>& o);

  friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) {
    return out << "pg_lease(ru " << l.readable_until
	       << " ub " << l.readable_until_ub
	       << " int " << l.interval << ")";
  }
};
3871 WRITE_CLASS_ENCODER(pg_lease_t)
3872
/**
 * pg_lease_ack_t - lease ack, from non-primary -> primary
 *
 * This metadata acknowledges to the primary what a non-primary's noted
 * upper bound is.
 */
struct pg_lease_ack_t {
  /// highest upper bound non-primary has recorded (primary's clock)
  ceph::signedspan readable_until_ub = ceph::signedspan::zero();

  pg_lease_ack_t() {}
  // NOTE(review): single-arg ctor is not explicit, so signedspan converts
  // implicitly to pg_lease_ack_t — confirm callers rely on this before
  // tightening.
  pg_lease_ack_t(ceph::signedspan ub)
    : readable_until_ub(ub) {}

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_lease_ack_t*>& o);

  friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) {
    return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
  }
};
3896 WRITE_CLASS_ENCODER(pg_lease_ack_t)
3897
3898
3899
3900 class PGBackend;
3901 class ObjectModDesc {
3902 bool can_local_rollback;
3903 bool rollback_info_completed;
3904
3905 // version required to decode, reflected in encode/decode version
3906 __u8 max_required_version = 1;
3907 public:
3908 class Visitor {
3909 public:
3910 virtual void append(uint64_t old_offset) {}
3911 virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
3912 virtual void rmobject(version_t old_version) {}
3913 /**
3914 * Used to support the unfound_lost_delete log event: if the stashed
3915 * version exists, we unstash it, otherwise, we do nothing. This way
3916 * each replica rolls back to whatever state it had prior to the attempt
3917 * at mark unfound lost delete
3918 */
3919 virtual void try_rmobject(version_t old_version) {
3920 rmobject(old_version);
3921 }
3922 virtual void create() {}
3923 virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
3924 virtual void rollback_extents(
3925 version_t gen,
3926 const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
3927 virtual ~Visitor() {}
3928 };
3929 void visit(Visitor *visitor) const;
3930 mutable ceph::buffer::list bl;
3931 enum ModID {
3932 APPEND = 1,
3933 SETATTRS = 2,
3934 DELETE = 3,
3935 CREATE = 4,
3936 UPDATE_SNAPS = 5,
3937 TRY_DELETE = 6,
3938 ROLLBACK_EXTENTS = 7
3939 };
3940 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3941 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3942 }
3943 void claim(ObjectModDesc &other) {
3944 bl = std::move(other.bl);
3945 can_local_rollback = other.can_local_rollback;
3946 rollback_info_completed = other.rollback_info_completed;
3947 }
3948 void claim_append(ObjectModDesc &other) {
3949 if (!can_local_rollback || rollback_info_completed)
3950 return;
3951 if (!other.can_local_rollback) {
3952 mark_unrollbackable();
3953 return;
3954 }
3955 bl.claim_append(other.bl);
3956 rollback_info_completed = other.rollback_info_completed;
3957 }
3958 void swap(ObjectModDesc &other) {
3959 bl.swap(other.bl);
3960
3961 using std::swap;
3962 swap(other.can_local_rollback, can_local_rollback);
3963 swap(other.rollback_info_completed, rollback_info_completed);
3964 swap(other.max_required_version, max_required_version);
3965 }
3966 void append_id(ModID id) {
3967 using ceph::encode;
3968 uint8_t _id(id);
3969 encode(_id, bl);
3970 }
3971 void append(uint64_t old_size) {
3972 if (!can_local_rollback || rollback_info_completed)
3973 return;
3974 ENCODE_START(1, 1, bl);
3975 append_id(APPEND);
3976 encode(old_size, bl);
3977 ENCODE_FINISH(bl);
3978 }
3979 void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
3980 if (!can_local_rollback || rollback_info_completed)
3981 return;
3982 ENCODE_START(1, 1, bl);
3983 append_id(SETATTRS);
3984 encode(old_attrs, bl);
3985 ENCODE_FINISH(bl);
3986 }
3987 bool rmobject(version_t deletion_version) {
3988 if (!can_local_rollback || rollback_info_completed)
3989 return false;
3990 ENCODE_START(1, 1, bl);
3991 append_id(DELETE);
3992 encode(deletion_version, bl);
3993 ENCODE_FINISH(bl);
3994 rollback_info_completed = true;
3995 return true;
3996 }
3997 bool try_rmobject(version_t deletion_version) {
3998 if (!can_local_rollback || rollback_info_completed)
3999 return false;
4000 ENCODE_START(1, 1, bl);
4001 append_id(TRY_DELETE);
4002 encode(deletion_version, bl);
4003 ENCODE_FINISH(bl);
4004 rollback_info_completed = true;
4005 return true;
4006 }
4007 void create() {
4008 if (!can_local_rollback || rollback_info_completed)
4009 return;
4010 rollback_info_completed = true;
4011 ENCODE_START(1, 1, bl);
4012 append_id(CREATE);
4013 ENCODE_FINISH(bl);
4014 }
4015 void update_snaps(const std::set<snapid_t> &old_snaps) {
4016 if (!can_local_rollback || rollback_info_completed)
4017 return;
4018 ENCODE_START(1, 1, bl);
4019 append_id(UPDATE_SNAPS);
4020 encode(old_snaps, bl);
4021 ENCODE_FINISH(bl);
4022 }
4023 void rollback_extents(
4024 version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
4025 ceph_assert(can_local_rollback);
4026 ceph_assert(!rollback_info_completed);
4027 if (max_required_version < 2)
4028 max_required_version = 2;
4029 ENCODE_START(2, 2, bl);
4030 append_id(ROLLBACK_EXTENTS);
4031 encode(gen, bl);
4032 encode(extents, bl);
4033 ENCODE_FINISH(bl);
4034 }
4035
4036 // cannot be rolled back
4037 void mark_unrollbackable() {
4038 can_local_rollback = false;
4039 bl.clear();
4040 }
4041 bool can_rollback() const {
4042 return can_local_rollback;
4043 }
4044 bool empty() const {
4045 return can_local_rollback && (bl.length() == 0);
4046 }
4047
4048 bool requires_kraken() const {
4049 return max_required_version >= 2;
4050 }
4051
4052 /**
4053 * Create fresh copy of bl bytes to avoid keeping large buffers around
4054 * in the case that bl contains ptrs which point into a much larger
4055 * message buffer
4056 */
4057 void trim_bl() const {
4058 if (bl.length() > 0)
4059 bl.rebuild();
4060 }
4061 void encode(ceph::buffer::list &bl) const;
4062 void decode(ceph::buffer::list::const_iterator &bl);
4063 void dump(ceph::Formatter *f) const;
4064 static void generate_test_instances(std::list<ObjectModDesc*>& o);
4065 };
4066 WRITE_CLASS_ENCODER(ObjectModDesc)
4067
/**
 * ObjectCleanRegions - tracks which portions of an object are known clean.
 *
 * Holds a set of clean byte intervals (clean_offsets), whether the omap is
 * clean, and whether the object is newly created, so recovery can skip
 * clean data.  The interval count is capped at max_num_intervals via trim().
 */
class ObjectCleanRegions {
private:
  bool new_object;                       ///< true if the object is newly created
  bool clean_omap;                       ///< true if the omap is unmodified
  interval_set<uint64_t> clean_offsets;  ///< byte ranges known to be clean
  static std::atomic<uint32_t> max_num_intervals;  ///< cap on interval count

  /**
   * trim the number of intervals if clean_offsets.num_intervals()
   * exceeds the given upbound max_num_intervals
   * e.g. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
   * then new interval [30~10] will evict out the shortest one [20~5]
   * finally, clean_offsets becomes {[5~10], [30~10]}
   */
  void trim();
  friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
public:
  /// default: everything clean — [0, 2^64) clean and omap clean
  ObjectCleanRegions() : new_object(false), clean_omap(true) {
    clean_offsets.insert(0, (uint64_t)-1);
  }
  /// only [offset, offset+len) is clean; omap cleanliness given by co
  ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
    : new_object(false), clean_omap(co) {
    clean_offsets.insert(offset, len);
  }
  bool operator==(const ObjectCleanRegions &orc) const {
    return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
  }
  static void set_max_num_intervals(uint32_t num);
  void merge(const ObjectCleanRegions &other);
  void mark_data_region_dirty(uint64_t offset, uint64_t len);
  void mark_omap_dirty();
  void mark_object_new();
  void mark_fully_dirty();
  interval_set<uint64_t> get_dirty_regions() const;
  bool omap_is_dirty() const;
  bool object_is_exist() const;
  bool is_clean_region(uint64_t offset, uint64_t len) const;

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<ObjectCleanRegions*>& o);
};
4111 WRITE_CLASS_ENCODER(ObjectCleanRegions)
4112 std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
4113
4114
4115 struct OSDOp {
4116 ceph_osd_op op;
4117 sobject_t soid;
4118
4119 ceph::buffer::list indata, outdata;
4120 errorcode32_t rval = 0;
4121
4122 OSDOp() {
4123 // FIPS zeroization audit 20191115: this memset clean for security
4124 memset(&op, 0, sizeof(ceph_osd_op));
4125 }
4126
4127 OSDOp(const int op_code) {
4128 // FIPS zeroization audit 20191115: this memset clean for security
4129 memset(&op, 0, sizeof(ceph_osd_op));
4130 op.op = op_code;
4131 }
4132
4133 /**
4134 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
4135 *
4136 * @param ops [out] vector of OSDOps
4137 * @param in [in] combined data buffer
4138 */
4139 template<typename V>
4140 static void split_osd_op_vector_in_data(V& ops,
4141 ceph::buffer::list& in) {
4142 ceph::buffer::list::iterator datap = in.begin();
4143 for (unsigned i = 0; i < ops.size(); i++) {
4144 if (ops[i].op.payload_len) {
4145 datap.copy(ops[i].op.payload_len, ops[i].indata);
4146 }
4147 }
4148 }
4149
4150 /**
4151 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
4152 *
4153 * Notably this also encodes certain other OSDOp data into the data
4154 * buffer, including the sobject_t soid.
4155 *
4156 * @param ops [in] vector of OSDOps
4157 * @param out [out] combined data buffer
4158 */
4159 template<typename V>
4160 static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) {
4161 for (unsigned i = 0; i < ops.size(); i++) {
4162 if (ops[i].indata.length()) {
4163 ops[i].op.payload_len = ops[i].indata.length();
4164 out.append(ops[i].indata);
4165 }
4166 }
4167 }
4168
4169 /**
4170 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
4171 *
4172 * @param ops [out] vector of OSDOps
4173 * @param in [in] combined data buffer
4174 */
4175 static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
4176
4177 /**
4178 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
4179 *
4180 * @param ops [in] vector of OSDOps
4181 * @param out [out] combined data buffer
4182 */
4183 static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
4184
4185 /**
4186 * Clear data as much as possible, leave minimal data for historical op dump
4187 *
4188 * @param ops [in] vector of OSDOps
4189 */
4190 template<typename V>
4191 static void clear_data(V& ops) {
4192 for (unsigned i = 0; i < ops.size(); i++) {
4193 OSDOp& op = ops[i];
4194 op.outdata.clear();
4195 if (ceph_osd_op_type_attr(op.op.op) &&
4196 op.op.xattr.name_len &&
4197 op.indata.length() >= op.op.xattr.name_len) {
4198 ceph::buffer::list bl;
4199 bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len));
4200 bl.begin().copy_in(op.op.xattr.name_len, op.indata);
4201 op.indata = std::move(bl);
4202 } else if (ceph_osd_op_type_exec(op.op.op) &&
4203 op.op.cls.class_len &&
4204 op.indata.length() >
4205 (op.op.cls.class_len + op.op.cls.method_len)) {
4206 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
4207 ceph::buffer::list bl;
4208 bl.push_back(ceph::buffer::ptr_node::create(len));
4209 bl.begin().copy_in(len, op.indata);
4210 op.indata = std::move(bl);
4211 } else {
4212 op.indata.clear();
4213 }
4214 }
4215 }
4216 };
4217 std::ostream& operator<<(std::ostream& out, const OSDOp& op);
4218
/**
 * pg_log_op_return_item_t - per-op result (return code + output data),
 * recorded alongside log entries (see pg_log_entry_t::op_returns).
 */
struct pg_log_op_return_item_t {
  int32_t rval;           ///< per-op return code
  ceph::buffer::list bl;  ///< per-op output data
  void encode(ceph::buffer::list& p) const {
    using ceph::encode;
    encode(rval, p);
    encode(bl, p);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    using ceph::decode;
    decode(rval, p);
    decode(bl, p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_int("rval", rval);
    f->dump_unsigned("bl_length", bl.length());
  }
  /// equal iff rval matches and payload bytes compare equal
  friend bool operator==(const pg_log_op_return_item_t& lhs,
			 const pg_log_op_return_item_t& rhs) {
    return lhs.rval == rhs.rval &&
      lhs.bl.contents_equal(rhs.bl);
  }
  friend bool operator!=(const pg_log_op_return_item_t& lhs,
			 const pg_log_op_return_item_t& rhs) {
    return !(lhs == rhs);
  }
  friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) {
    return out << "r=" << i.rval << "+" << i.bl.length() << "b";
  }
};
4249 WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
4250
/**
 * pg_log_entry_t - single entry/event in pg log
 *
 */
struct pg_log_entry_t {
  /// log entry op codes
  enum {
    MODIFY = 1,       // some unspecified modification (but not *all* modifications)
    CLONE = 2,        // cloned object from head
    DELETE = 3,       // deleted object
    //BACKLOG = 4,    // event invented by generate_backlog [obsolete]
    LOST_REVERT = 5,  // lost new version, revert to an older version.
    LOST_DELETE = 6,  // lost new version, revert to no object (deleted).
    LOST_MARK = 7,    // lost new version, now EIO
    PROMOTE = 8,      // promoted object from another tier
    CLEAN = 9,        // mark an object clean
    ERROR = 10,       // write that returned an error
  };
  /// human-readable name for an op code ("unknown" if unrecognized)
  static const char *get_op_name(int op) {
    switch (op) {
    case MODIFY:
      return "modify";
    case PROMOTE:
      return "promote";
    case CLONE:
      return "clone";
    case DELETE:
      return "delete";
    case LOST_REVERT:
      return "l_revert";
    case LOST_DELETE:
      return "l_delete";
    case LOST_MARK:
      return "l_mark";
    case CLEAN:
      return "clean";
    case ERROR:
      return "error";
    default:
      return "unknown";
    }
  }
  const char *get_op_name() const {
    return get_op_name(op);
  }

  // describes state for a locally-rollbackable entry
  ObjectModDesc mod_desc;
  ceph::buffer::list snaps;  // only for clone entries
  hobject_t soid;            ///< object this entry applies to
  osd_reqid_t reqid;         // caller+tid to uniquely identify request
  mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;

  /// map extra_reqids by index to error return code (if any)
  mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;

  eversion_t version, prior_version, reverting_to;
  version_t user_version;  // the user version for this entry
  utime_t mtime;           // this is the _user_ mtime, mind you
  int32_t return_code;     // only stored for ERRORs for dup detection

  /// per-op results captured by set_op_returns()
  std::vector<pg_log_op_return_item_t> op_returns;

  __s32 op;           ///< one of the op codes above
  bool invalid_hash;  // only when decoding sobject_t based entries
  bool invalid_pool;  // only when decoding pool-less hobject based entries
  ObjectCleanRegions clean_regions;  ///< dirty-region tracking for recovery

  pg_log_entry_t()
   : user_version(0), return_code(0), op(0),
     invalid_hash(false), invalid_pool(false) {
    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
  }
  pg_log_entry_t(int _op, const hobject_t& _soid,
                const eversion_t& v, const eversion_t& pv,
                version_t uv,
                const osd_reqid_t& rid, const utime_t& mt,
                int return_code)
   : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
     mtime(mt), return_code(return_code), op(_op),
     invalid_hash(false), invalid_pool(false) {
    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
  }

  bool is_clone() const { return op == CLONE; }
  bool is_modify() const { return op == MODIFY; }
  bool is_promote() const { return op == PROMOTE; }
  bool is_clean() const { return op == CLEAN; }
  bool is_lost_revert() const { return op == LOST_REVERT; }
  bool is_lost_delete() const { return op == LOST_DELETE; }
  bool is_lost_mark() const { return op == LOST_MARK; }
  bool is_error() const { return op == ERROR; }

  /// true for ops that leave the object present (excludes deletes and errors)
  bool is_update() const {
    return
      is_clone() || is_modify() || is_promote() || is_clean() ||
      is_lost_revert() || is_lost_mark();
  }
  bool is_delete() const {
    return op == DELETE || op == LOST_DELETE;
  }

  bool can_rollback() const {
    return mod_desc.can_rollback();
  }

  void mark_unrollbackable() {
    mod_desc.mark_unrollbackable();
  }

  bool requires_kraken() const {
    return mod_desc.requires_kraken();
  }

  // Errors are only used for dup detection, whereas
  // the index by objects is used by recovery, copy_get,
  // and other facilities that don't expect or need to
  // be aware of error entries.
  bool object_is_indexed() const {
    return !is_error();
  }

  bool reqid_is_indexed() const {
    return reqid != osd_reqid_t() &&
      (op == MODIFY || op == DELETE || op == ERROR);
  }

  /// capture per-op return values (rval + outdata) from the executed ops
  void set_op_returns(const std::vector<OSDOp>& ops) {
    op_returns.resize(ops.size());
    for (unsigned i = 0; i < ops.size(); ++i) {
      op_returns[i].rval = ops[i].rval;
      op_returns[i].bl = ops[i].outdata;
    }
  }

  std::string get_key_name() const;
  void encode_with_checksum(ceph::buffer::list& bl) const;
  void decode_with_checksum(ceph::buffer::list::const_iterator& p);

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_log_entry_t*>& o);

};
4395 WRITE_CLASS_ENCODER(pg_log_entry_t)
4396
4397 std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
4398
/**
 * pg_log_dup_t - compact record used for dup op detection.
 *
 * Holds only the reqid/version/result of a log entry — no oid — so it can
 * be kept cheaply in pg_log_t::dups.
 */
struct pg_log_dup_t {
  osd_reqid_t reqid;       // caller+tid to uniquely identify request
  eversion_t version;
  version_t user_version;  // the user version for this entry
  int32_t return_code;     // only stored for ERRORs for dup detection

  std::vector<pg_log_op_return_item_t> op_returns;  ///< per-op results

  pg_log_dup_t()
    : user_version(0), return_code(0)
  {}
  /// build a dup record from a full log entry
  explicit pg_log_dup_t(const pg_log_entry_t& entry)
    : reqid(entry.reqid), version(entry.version),
      user_version(entry.user_version),
      return_code(entry.return_code),
      op_returns(entry.op_returns)
  {}
  pg_log_dup_t(const eversion_t& v, version_t uv,
	       const osd_reqid_t& rid, int return_code)
    : reqid(rid), version(v), user_version(uv),
      return_code(return_code)
  {}

  std::string get_key_name() const;
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_log_dup_t*>& o);

  bool operator==(const pg_log_dup_t &rhs) const {
    return reqid == rhs.reqid &&
      version == rhs.version &&
      user_version == rhs.user_version &&
      return_code == rhs.return_code &&
      op_returns == rhs.op_returns;
  }
  bool operator!=(const pg_log_dup_t &rhs) const {
    return !(*this == rhs);
  }

  friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
};
4441 WRITE_CLASS_ENCODER(pg_log_dup_t)
4442
4443 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4444
4445 /**
4446 * pg_log_t - incremental log of recent pg changes.
4447 *
4448 * serves as a recovery queue for recent changes.
4449 */
4450 struct pg_log_t {
4451 /*
4452 * head - newest entry (update|delete)
4453 * tail - entry previous to oldest (update|delete) for which we have
4454 * complete negative information.
4455 * i.e. we can infer pg contents for any store whose last_update >= tail.
4456 */
4457 eversion_t head; // newest entry
4458 eversion_t tail; // version prior to oldest
4459
4460 protected:
4461 // We can rollback rollback-able entries > can_rollback_to
4462 eversion_t can_rollback_to;
4463
4464 // always <= can_rollback_to, indicates how far stashed rollback
4465 // data can be found
4466 eversion_t rollback_info_trimmed_to;
4467
4468 public:
4469 // the actual log
4470 mempool::osd_pglog::list<pg_log_entry_t> log;
4471
4472 // entries just for dup op detection ordered oldest to newest
4473 mempool::osd_pglog::list<pg_log_dup_t> dups;
4474
4475 pg_log_t() = default;
4476 pg_log_t(const eversion_t &last_update,
4477 const eversion_t &log_tail,
4478 const eversion_t &can_rollback_to,
4479 const eversion_t &rollback_info_trimmed_to,
4480 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
4481 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
4482 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4483 rollback_info_trimmed_to(rollback_info_trimmed_to),
4484 log(std::move(entries)), dups(std::move(dup_entries)) {}
4485 pg_log_t(const eversion_t &last_update,
4486 const eversion_t &log_tail,
4487 const eversion_t &can_rollback_to,
4488 const eversion_t &rollback_info_trimmed_to,
4489 const std::list<pg_log_entry_t> &entries,
4490 const std::list<pg_log_dup_t> &dup_entries)
4491 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4492 rollback_info_trimmed_to(rollback_info_trimmed_to) {
4493 for (auto &&entry: entries) {
4494 log.push_back(entry);
4495 }
4496 for (auto &&entry: dup_entries) {
4497 dups.push_back(entry);
4498 }
4499 }
4500
4501 void clear() {
4502 eversion_t z;
4503 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
4504 log.clear();
4505 dups.clear();
4506 }
4507
4508 eversion_t get_rollback_info_trimmed_to() const {
4509 return rollback_info_trimmed_to;
4510 }
4511 eversion_t get_can_rollback_to() const {
4512 return can_rollback_to;
4513 }
4514
4515
4516 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
4517 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
4518 oldlog.swap(log);
4519
4520 eversion_t old_tail;
4521 unsigned mask = ~((~0)<<split_bits);
4522 for (auto i = oldlog.begin();
4523 i != oldlog.end();
4524 ) {
4525 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4526 childlog.push_back(*i);
4527 } else {
4528 log.push_back(*i);
4529 }
4530 oldlog.erase(i++);
4531 }
4532
4533 // osd_reqid is unique, so it doesn't matter if there are extra
4534 // dup entries in each pg. To avoid storing oid with the dup
4535 // entries, just copy the whole list.
4536 auto childdups(dups);
4537
4538 return pg_log_t(
4539 head,
4540 tail,
4541 can_rollback_to,
4542 rollback_info_trimmed_to,
4543 std::move(childlog),
4544 std::move(childdups));
4545 }
4546
4547 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
4548 ceph_assert(newhead >= tail);
4549
4550 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4551 mempool::osd_pglog::list<pg_log_entry_t> divergent;
4552 while (true) {
4553 if (p == log.begin()) {
4554 // yikes, the whole thing is divergent!
4555 using std::swap;
4556 swap(divergent, log);
4557 break;
4558 }
4559 --p;
4560 if (p->version.version <= newhead.version) {
4561 /*
4562 * look at eversion.version here. we want to avoid a situation like:
4563 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4564 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4565 * lower_bound = 100'9
4566 * i.e, same request, different version. If the eversion.version is > the
4567 * lower_bound, we it is divergent.
4568 */
4569 ++p;
4570 divergent.splice(divergent.begin(), log, p, log.end());
4571 break;
4572 }
4573 ceph_assert(p->version > newhead);
4574 }
4575 head = newhead;
4576
4577 if (can_rollback_to > newhead)
4578 can_rollback_to = newhead;
4579
4580 if (rollback_info_trimmed_to > newhead)
4581 rollback_info_trimmed_to = newhead;
4582
4583 return divergent;
4584 }
4585
4586 void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
4587 log.clear();
4588
4589 // sort and merge dups
4590 std::multimap<eversion_t,pg_log_dup_t> sorted;
4591 for (auto& d : dups) {
4592 sorted.emplace(d.version, d);
4593 }
4594 for (auto l : slogs) {
4595 for (auto& d : l->dups) {
4596 sorted.emplace(d.version, d);
4597 }
4598 }
4599 dups.clear();
4600 for (auto& i : sorted) {
4601 dups.push_back(i.second);
4602 }
4603
4604 head = last_update;
4605 tail = last_update;
4606 can_rollback_to = last_update;
4607 rollback_info_trimmed_to = last_update;
4608 }
4609
4610 bool empty() const {
4611 return log.empty();
4612 }
4613
4614 bool null() const {
4615 return head.version == 0 && head.epoch == 0;
4616 }
4617
4618 uint64_t approx_size() const {
4619 return head.version - tail.version;
4620 }
4621
4622 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4623 const std::string &hit_set_namespace, const pg_log_t &in,
4624 pg_log_t &out, pg_log_t &reject);
4625
4626 /**
4627 * copy entries from the tail of another pg_log_t
4628 *
4629 * @param other pg_log_t to copy from
4630 * @param from copy entries after this version
4631 */
4632 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
4633
4634 /**
4635 * copy up to N entries
4636 *
4637 * @param other source log
4638 * @param max max number of entries to copy
4639 */
4640 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
4641
4642 std::ostream& print(std::ostream& out) const;
4643
4644 void encode(ceph::buffer::list &bl) const;
4645 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
4646 void dump(ceph::Formatter *f) const;
4647 static void generate_test_instances(std::list<pg_log_t*>& o);
4648 };
4649 WRITE_CLASS_ENCODER(pg_log_t)
4650
4651 inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
4652 {
4653 out << "log((" << log.tail << "," << log.head << "], crt="
4654 << log.get_can_rollback_to() << ")";
4655 return out;
4656 }
4657
4658
/**
 * pg_missing_t - summary of missing objects.
 *
 *  kept in memory, as a supplement to pg_log_t
 *  also used to pass missing info in messages.
 */
struct pg_missing_item {
  eversion_t need, have;             ///< version we need vs. version we have
  ObjectCleanRegions clean_regions;  ///< regions that need not be recovered
  enum missing_flags_t {
    FLAG_NONE = 0,
    FLAG_DELETE = 1,  ///< object is missing because it was deleted
  } flags;
  pg_missing_item() : flags(FLAG_NONE) {}
  explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {}  // have no old version
  /// old_style marks everything dirty (pre-clean_regions peers)
  pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
    need(n), have(h) {
    set_delete(is_delete);
    if (old_style)
      clean_regions.mark_fully_dirty();
  }

  void encode(ceph::buffer::list& bl, uint64_t features) const {
    using ceph::encode;
    if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
      // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
      // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
      // possible. This can be replaced with the legacy encoding
      // the sentinel eversion_t(-1, -1) marks the octopus format below
      encode(eversion_t(), bl);
      encode(eversion_t(-1, -1), bl);
      encode(need, bl);
      encode(have, bl);
      encode(static_cast<uint8_t>(flags), bl);
      encode(clean_regions, bl);
    } else {
      encode(eversion_t(), bl);
      encode(need, bl);
      encode(have, bl);
      encode(static_cast<uint8_t>(flags), bl);
    }
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    eversion_t e, l;
    decode(e, bl);
    decode(l, bl);
    if(l == eversion_t(-1, -1)) {
      // support all
      // sentinel seen: SERVER_OCTOPUS format, clean_regions follows
      decode(need, bl);
      decode(have, bl);
      uint8_t f;
      decode(f, bl);
      flags = static_cast<missing_flags_t>(f);
      decode(clean_regions, bl);
    } else {
      // support OSD_RECOVERY_DELETES
      // no sentinel: 'l' was actually the need value; no clean_regions on
      // the wire, so conservatively treat the whole object as dirty
      need = l;
      decode(have, bl);
      uint8_t f;
      decode(f, bl);
      flags = static_cast<missing_flags_t>(f);
      clean_regions.mark_fully_dirty();
    }
  }

  void set_delete(bool is_delete) {
    flags = is_delete ? FLAG_DELETE : FLAG_NONE;
  }

  bool is_delete() const {
    return (flags & FLAG_DELETE) == FLAG_DELETE;
  }

  std::string flag_str() const {
    if (flags == FLAG_NONE) {
      return "none";
    } else {
      return "delete";
    }
  }

  void dump(ceph::Formatter *f) const {
    f->dump_stream("need") << need;
    f->dump_stream("have") << have;
    f->dump_stream("flags") << flag_str();
    f->dump_stream("clean_regions") << clean_regions;
  }
  static void generate_test_instances(std::list<pg_missing_item*>& o) {
    o.push_back(new pg_missing_item);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(1, 2);
    o.back()->have = eversion_t(1, 1);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(3, 5);
    o.back()->have = eversion_t(3, 4);
    o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
    o.back()->clean_regions.mark_omap_dirty();
    o.back()->flags = FLAG_DELETE;
  }
  // NOTE(review): clean_regions is not part of equality — presumably
  // intentional (region state is advisory), but confirm before relying
  // on operator== to compare full recovery state.
  bool operator==(const pg_missing_item &rhs) const {
    return need == rhs.need && have == rhs.have && flags == rhs.flags;
  }
  bool operator!=(const pg_missing_item &rhs) const {
    return !(*this == rhs);
  }
};
4765 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
4766 std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
4767
/**
 * pg_missing_const_i - read-only interface to a pg missing set
 * (implemented by pg_missing_set below).
 */
class pg_missing_const_i {
public:
  /// oid -> missing item (need/have versions)
  virtual const std::map<hobject_t, pg_missing_item> &
    get_items() const = 0;
  /// need-version -> oid reverse index
  virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
  virtual bool get_may_include_deletes() const = 0;
  virtual unsigned int num_missing() const = 0;
  virtual bool have_missing() const = 0;
  virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
  virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
  virtual ~pg_missing_const_i() {}
};
4780
4781
4782 template <bool Track>
4783 class ChangeTracker {
4784 public:
4785 void changed(const hobject_t &obj) {}
4786 template <typename F>
4787 void get_changed(F &&f) const {}
4788 void flush() {}
4789 bool is_clean() const {
4790 return true;
4791 }
4792 };
4793 template <>
4794 class ChangeTracker<true> {
4795 std::set<hobject_t> _changed;
4796 public:
4797 void changed(const hobject_t &obj) {
4798 _changed.insert(obj);
4799 }
4800 template <typename F>
4801 void get_changed(F &&f) const {
4802 for (auto const &i: _changed) {
4803 f(i);
4804 }
4805 }
4806 void flush() {
4807 _changed.clear();
4808 }
4809 bool is_clean() const {
4810 return _changed.empty();
4811 }
4812 };
4813
4814 template <bool TrackChanges>
4815 class pg_missing_set : public pg_missing_const_i {
4816 using item = pg_missing_item;
4817 std::map<hobject_t, item> missing; // oid -> (need v, have v)
4818 std::map<version_t, hobject_t> rmissing; // v -> oid
4819 ChangeTracker<TrackChanges> tracker;
4820
4821 public:
4822 pg_missing_set() = default;
4823
4824 template <typename missing_type>
4825 pg_missing_set(const missing_type &m) {
4826 missing = m.get_items();
4827 rmissing = m.get_rmissing();
4828 may_include_deletes = m.get_may_include_deletes();
4829 for (auto &&i: missing)
4830 tracker.changed(i.first);
4831 }
4832
4833 bool may_include_deletes = false;
4834
4835 const std::map<hobject_t, item> &get_items() const override {
4836 return missing;
4837 }
4838 const std::map<version_t, hobject_t> &get_rmissing() const override {
4839 return rmissing;
4840 }
4841 bool get_may_include_deletes() const override {
4842 return may_include_deletes;
4843 }
4844 unsigned int num_missing() const override {
4845 return missing.size();
4846 }
4847 bool have_missing() const override {
4848 return !missing.empty();
4849 }
4850 void merge(const pg_log_entry_t& e) {
4851 auto miter = missing.find(e.soid);
4852 if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
4853 miter->second.clean_regions.merge(e.clean_regions);
4854 }
4855 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4856 auto iter = missing.find(oid);
4857 if (iter == missing.end())
4858 return false;
4859 if (out)
4860 *out = iter->second;
4861 return true;
4862 }
4863 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4864 std::map<hobject_t, item>::const_iterator m =
4865 missing.find(oid);
4866 if (m == missing.end())
4867 return false;
4868 const item &item(m->second);
4869 if (item.need > v)
4870 return false;
4871 return true;
4872 }
4873 eversion_t get_oldest_need() const {
4874 if (missing.empty()) {
4875 return eversion_t();
4876 }
4877 auto it = missing.find(rmissing.begin()->second);
4878 ceph_assert(it != missing.end());
4879 return it->second.need;
4880 }
4881
4882 void claim(pg_missing_set&& o) {
4883 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4884 missing = std::move(o.missing);
4885 rmissing = std::move(o.rmissing);
4886 }
4887
4888 /*
4889 * this needs to be called in log order as we extend the log. it
4890 * assumes missing is accurate up through the previous log entry.
4891 */
4892 void add_next_event(const pg_log_entry_t& e) {
4893 std::map<hobject_t, item>::iterator missing_it;
4894 missing_it = missing.find(e.soid);
4895 bool is_missing_divergent_item = missing_it != missing.end();
4896 if (e.prior_version == eversion_t() || e.is_clone()) {
4897 // new object.
4898 if (is_missing_divergent_item) { // use iterator
4899 rmissing.erase(missing_it->second.need.version);
4900 // .have = nil
4901 missing_it->second = item(e.version, eversion_t(), e.is_delete());
4902 missing_it->second.clean_regions.mark_fully_dirty();
4903 } else {
4904 // create new element in missing map
4905 // .have = nil
4906 missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
4907 missing[e.soid].clean_regions.mark_fully_dirty();
4908 }
4909 } else if (is_missing_divergent_item) {
4910 // already missing (prior).
4911 rmissing.erase((missing_it->second).need.version);
4912 missing_it->second.need = e.version; // leave .have unchanged.
4913 missing_it->second.set_delete(e.is_delete());
4914 if (e.is_lost_revert())
4915 missing_it->second.clean_regions.mark_fully_dirty();
4916 else
4917 missing_it->second.clean_regions.merge(e.clean_regions);
4918 } else {
4919 // not missing, we must have prior_version (if any)
4920 ceph_assert(!is_missing_divergent_item);
4921 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
4922 if (e.is_lost_revert())
4923 missing[e.soid].clean_regions.mark_fully_dirty();
4924 else
4925 missing[e.soid].clean_regions = e.clean_regions;
4926 }
4927 rmissing[e.version.version] = e.soid;
4928 tracker.changed(e.soid);
4929 }
4930
4931 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
4932 auto p = missing.find(oid);
4933 if (p != missing.end()) {
4934 rmissing.erase((p->second).need.version);
4935 p->second.need = need; // do not adjust .have
4936 p->second.set_delete(is_delete);
4937 p->second.clean_regions.mark_fully_dirty();
4938 } else {
4939 missing[oid] = item(need, eversion_t(), is_delete);
4940 missing[oid].clean_regions.mark_fully_dirty();
4941 }
4942 rmissing[need.version] = oid;
4943
4944 tracker.changed(oid);
4945 }
4946
4947 void revise_have(hobject_t oid, eversion_t have) {
4948 auto p = missing.find(oid);
4949 if (p != missing.end()) {
4950 tracker.changed(oid);
4951 (p->second).have = have;
4952 }
4953 }
4954
4955 void mark_fully_dirty(const hobject_t& oid) {
4956 auto p = missing.find(oid);
4957 if (p != missing.end()) {
4958 tracker.changed(oid);
4959 (p->second).clean_regions.mark_fully_dirty();
4960 }
4961 }
4962
4963 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4964 bool is_delete) {
4965 missing[oid] = item(need, have, is_delete, true);
4966 rmissing[need.version] = oid;
4967 tracker.changed(oid);
4968 }
4969
4970 void add(const hobject_t& oid, pg_missing_item&& item) {
4971 rmissing[item.need.version] = oid;
4972 missing.insert({oid, std::move(item)});
4973 tracker.changed(oid);
4974 }
4975
4976 void rm(const hobject_t& oid, eversion_t v) {
4977 std::map<hobject_t, item>::iterator p = missing.find(oid);
4978 if (p != missing.end() && p->second.need <= v)
4979 rm(p);
4980 }
4981
4982 void rm(std::map<hobject_t, item>::const_iterator m) {
4983 tracker.changed(m->first);
4984 rmissing.erase(m->second.need.version);
4985 missing.erase(m);
4986 }
4987
4988 void got(const hobject_t& oid, eversion_t v) {
4989 std::map<hobject_t, item>::iterator p = missing.find(oid);
4990 ceph_assert(p != missing.end());
4991 ceph_assert(p->second.need <= v || p->second.is_delete());
4992 got(p);
4993 }
4994
4995 void got(std::map<hobject_t, item>::const_iterator m) {
4996 tracker.changed(m->first);
4997 rmissing.erase(m->second.need.version);
4998 missing.erase(m);
4999 }
5000
5001 void split_into(
5002 pg_t child_pgid,
5003 unsigned split_bits,
5004 pg_missing_set *omissing) {
5005 omissing->may_include_deletes = may_include_deletes;
5006 unsigned mask = ~((~0)<<split_bits);
5007 for (std::map<hobject_t, item>::iterator i = missing.begin();
5008 i != missing.end();
5009 ) {
5010 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
5011 omissing->add(i->first, i->second.need, i->second.have,
5012 i->second.is_delete());
5013 rm(i++);
5014 } else {
5015 ++i;
5016 }
5017 }
5018 }
5019
5020 void clear() {
5021 for (auto const &i: missing)
5022 tracker.changed(i.first);
5023 missing.clear();
5024 rmissing.clear();
5025 }
5026
5027 void encode(ceph::buffer::list &bl, uint64_t features) const {
5028 ENCODE_START(5, 2, bl)
5029 encode(missing, bl, features);
5030 encode(may_include_deletes, bl);
5031 ENCODE_FINISH(bl);
5032 }
5033 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
5034 for (auto const &i: missing)
5035 tracker.changed(i.first);
5036 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
5037 decode(missing, bl);
5038 if (struct_v >= 4) {
5039 decode(may_include_deletes, bl);
5040 }
5041 DECODE_FINISH(bl);
5042
5043 if (struct_v < 3) {
5044 // Handle hobject_t upgrade
5045 std::map<hobject_t, item> tmp;
5046 for (std::map<hobject_t, item>::iterator i =
5047 missing.begin();
5048 i != missing.end();
5049 ) {
5050 if (!i->first.is_max() && i->first.pool == -1) {
5051 hobject_t to_insert(i->first);
5052 to_insert.pool = pool;
5053 tmp[to_insert] = i->second;
5054 missing.erase(i++);
5055 } else {
5056 ++i;
5057 }
5058 }
5059 missing.insert(tmp.begin(), tmp.end());
5060 }
5061
5062 for (std::map<hobject_t,item>::iterator it =
5063 missing.begin();
5064 it != missing.end();
5065 ++it)
5066 rmissing[it->second.need.version] = it->first;
5067 for (auto const &i: missing)
5068 tracker.changed(i.first);
5069 }
5070 void dump(ceph::Formatter *f) const {
5071 f->open_array_section("missing");
5072 for (std::map<hobject_t,item>::const_iterator p =
5073 missing.begin(); p != missing.end(); ++p) {
5074 f->open_object_section("item");
5075 f->dump_stream("object") << p->first;
5076 p->second.dump(f);
5077 f->close_section();
5078 }
5079 f->close_section();
5080 f->dump_bool("may_include_deletes", may_include_deletes);
5081 }
5082 template <typename F>
5083 void filter_objects(F &&f) {
5084 for (auto i = missing.begin(); i != missing.end();) {
5085 if (f(i->first)) {
5086 rm(i++);
5087 } else {
5088 ++i;
5089 }
5090 }
5091 }
5092 static void generate_test_instances(std::list<pg_missing_set*>& o) {
5093 o.push_back(new pg_missing_set);
5094 o.back()->may_include_deletes = true;
5095 o.push_back(new pg_missing_set);
5096 o.back()->add(
5097 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5098 eversion_t(5, 6), eversion_t(5, 1), false);
5099 o.back()->may_include_deletes = true;
5100 o.push_back(new pg_missing_set);
5101 o.back()->add(
5102 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5103 eversion_t(5, 6), eversion_t(5, 1), true);
5104 o.back()->may_include_deletes = true;
5105 }
5106 template <typename F>
5107 void get_changed(F &&f) const {
5108 tracker.get_changed(f);
5109 }
5110 void flush() {
5111 tracker.flush();
5112 }
5113 bool is_clean() const {
5114 return tracker.is_clean();
5115 }
5116 template <typename missing_t>
5117 bool debug_verify_from_init(
5118 const missing_t &init_missing,
5119 std::ostream *oss) const {
5120 if (!TrackChanges)
5121 return true;
5122 auto check_missing(init_missing.get_items());
5123 tracker.get_changed([&](const hobject_t &hoid) {
5124 check_missing.erase(hoid);
5125 if (missing.count(hoid)) {
5126 check_missing.insert(*(missing.find(hoid)));
5127 }
5128 });
5129 bool ok = true;
5130 if (check_missing.size() != missing.size()) {
5131 if (oss) {
5132 *oss << "Size mismatch, check: " << check_missing.size()
5133 << ", actual: " << missing.size() << "\n";
5134 }
5135 ok = false;
5136 }
5137 for (auto &i: missing) {
5138 if (!check_missing.count(i.first)) {
5139 if (oss)
5140 *oss << "check_missing missing " << i.first << "\n";
5141 ok = false;
5142 } else if (check_missing[i.first] != i.second) {
5143 if (oss)
5144 *oss << "check_missing missing item mismatch on " << i.first
5145 << ", check: " << check_missing[i.first]
5146 << ", actual: " << i.second << "\n";
5147 ok = false;
5148 }
5149 }
5150 if (oss && !ok) {
5151 *oss << "check_missing: " << check_missing << "\n";
5152 std::set<hobject_t> changed;
5153 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
5154 *oss << "changed: " << changed << "\n";
5155 }
5156 return ok;
5157 }
5158 };
// Free-function encode shim so generic encode() callers can serialize a
// pg_missing_set; forwards to the member encode().
template <bool TrackChanges>
void encode(
  const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  // NOTE(review): ENCODE_DUMP_POST takes a *name token* used to label
  // dump files when ENCODE_DUMP debugging is compiled in; "cl" looks like
  // a leftover of the WRITE_CLASS_ENCODER macro parameter.  Harmless in
  // normal builds — confirm before relying on dump-file names.
  ENCODE_DUMP_POST(cl);
}
// Free-function decode shim: forwards to the member decode() with the
// default pool argument (-1, i.e. no pre-hobject_t pool upgrade).
template <bool TrackChanges>
void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
  c.decode(p);
}
5170 template <bool TrackChanges>
5171 std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
5172 {
5173 out << "missing(" << missing.num_missing()
5174 << " may_include_deletes = " << missing.may_include_deletes;
5175 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
5176 out << ")";
5177 return out;
5178 }
5179
5180 using pg_missing_t = pg_missing_set<false>;
5181 using pg_missing_tracker_t = pg_missing_set<true>;
5182
5183
5184
5185
5186 /**
5187 * pg list objects response format
5188 *
5189 */
5190
5191 template<typename T>
5192 struct pg_nls_response_template {
5193 collection_list_handle_t handle;
5194 std::vector<T> entries;
5195
5196 void encode(ceph::buffer::list& bl) const {
5197 ENCODE_START(1, 1, bl);
5198 encode(handle, bl);
5199 __u32 n = (__u32)entries.size();
5200 encode(n, bl);
5201 for (auto i = entries.begin(); i != entries.end(); ++i) {
5202 encode(i->nspace, bl);
5203 encode(i->oid, bl);
5204 encode(i->locator, bl);
5205 }
5206 ENCODE_FINISH(bl);
5207 }
5208 void decode(ceph::buffer::list::const_iterator& bl) {
5209 DECODE_START(1, bl);
5210 decode(handle, bl);
5211 __u32 n;
5212 decode(n, bl);
5213 entries.clear();
5214 while (n--) {
5215 T i;
5216 decode(i.nspace, bl);
5217 decode(i.oid, bl);
5218 decode(i.locator, bl);
5219 entries.push_back(i);
5220 }
5221 DECODE_FINISH(bl);
5222 }
5223 void dump(ceph::Formatter *f) const {
5224 f->dump_stream("handle") << handle;
5225 f->open_array_section("entries");
5226 for (auto p = entries.begin(); p != entries.end(); ++p) {
5227 f->open_object_section("object");
5228 f->dump_string("namespace", p->nspace);
5229 f->dump_string("object", p->oid);
5230 f->dump_string("key", p->locator);
5231 f->close_section();
5232 }
5233 f->close_section();
5234 }
5235 static void generate_test_instances(std::list<pg_nls_response_template<T>*>& o) {
5236 o.push_back(new pg_nls_response_template<T>);
5237 o.push_back(new pg_nls_response_template<T>);
5238 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5239 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5240 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5241 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5242 o.push_back(new pg_nls_response_template<T>);
5243 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5244 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5245 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5246 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5247 o.push_back(new pg_nls_response_template<T>);
5248 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5249 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5250 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5251 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5252 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5253 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5254 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5255 }
5256 };
5257
5258 using pg_nls_response_t = pg_nls_response_template<librados::ListObjectImpl>;
5259
5260 WRITE_CLASS_ENCODER(pg_nls_response_t)
5261
5262 // For backwards compatibility with older OSD requests
5263 struct pg_ls_response_t {
5264 collection_list_handle_t handle;
5265 std::list<std::pair<object_t, std::string> > entries;
5266
5267 void encode(ceph::buffer::list& bl) const {
5268 using ceph::encode;
5269 __u8 v = 1;
5270 encode(v, bl);
5271 encode(handle, bl);
5272 encode(entries, bl);
5273 }
5274 void decode(ceph::buffer::list::const_iterator& bl) {
5275 using ceph::decode;
5276 __u8 v;
5277 decode(v, bl);
5278 ceph_assert(v == 1);
5279 decode(handle, bl);
5280 decode(entries, bl);
5281 }
5282 void dump(ceph::Formatter *f) const {
5283 f->dump_stream("handle") << handle;
5284 f->open_array_section("entries");
5285 for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5286 f->open_object_section("object");
5287 f->dump_stream("object") << p->first;
5288 f->dump_string("key", p->second);
5289 f->close_section();
5290 }
5291 f->close_section();
5292 }
5293 static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
5294 o.push_back(new pg_ls_response_t);
5295 o.push_back(new pg_ls_response_t);
5296 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5297 o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
5298 o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5299 }
5300 };
5301
5302 WRITE_CLASS_ENCODER(pg_ls_response_t)
5303
5304 /**
5305 * object_copy_cursor_t
5306 */
5307 struct object_copy_cursor_t {
5308 uint64_t data_offset;
5309 std::string omap_offset;
5310 bool attr_complete;
5311 bool data_complete;
5312 bool omap_complete;
5313
5314 object_copy_cursor_t()
5315 : data_offset(0),
5316 attr_complete(false),
5317 data_complete(false),
5318 omap_complete(false)
5319 {}
5320
5321 bool is_initial() const {
5322 return !attr_complete && data_offset == 0 && omap_offset.empty();
5323 }
5324 bool is_complete() const {
5325 return attr_complete && data_complete && omap_complete;
5326 }
5327
5328 static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
5329 void encode(ceph::buffer::list& bl) const;
5330 void decode(ceph::buffer::list::const_iterator &bl);
5331 void dump(ceph::Formatter *f) const;
5332 };
5333 WRITE_CLASS_ENCODER(object_copy_cursor_t)
5334
5335 /**
5336 * object_copy_data_t
5337 *
5338 * Return data from a copy request. The semantics are a little strange
5339 * as a result of the encoding's heritage.
5340 *
5341 * In particular, the sender unconditionally fills in the cursor (from what
5342 * it receives and sends), the size, and the mtime, but is responsible for
5343 * figuring out whether it should put any data in the attrs, data, or
5344 * omap members (corresponding to xattrs, object data, and the omap entries)
5345 * based on external data (the client includes a max amount to return with
5346 * the copy request). The client then looks into the attrs, data, and/or omap
5347 * based on the contents of the cursor.
5348 */
5349 struct object_copy_data_t {
5350 enum {
5351 FLAG_DATA_DIGEST = 1<<0,
5352 FLAG_OMAP_DIGEST = 1<<1,
5353 };
5354 object_copy_cursor_t cursor;
5355 uint64_t size;
5356 utime_t mtime;
5357 uint32_t data_digest, omap_digest;
5358 uint32_t flags;
5359 std::map<std::string, ceph::buffer::list, std::less<>> attrs;
5360 ceph::buffer::list data;
5361 ceph::buffer::list omap_header;
5362 ceph::buffer::list omap_data;
5363
5364 /// which snaps we are defined for (if a snap and not the head)
5365 std::vector<snapid_t> snaps;
5366 /// latest snap seq for the object (if head)
5367 snapid_t snap_seq;
5368
5369 /// recent reqids on this object
5370 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
5371
5372 /// map reqids by index to error return code (if any)
5373 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
5374
5375 uint64_t truncate_seq;
5376 uint64_t truncate_size;
5377
5378 public:
5379 object_copy_data_t() :
5380 size((uint64_t)-1), data_digest(-1),
5381 omap_digest(-1), flags(0),
5382 truncate_seq(0),
5383 truncate_size(0) {}
5384
5385 static void generate_test_instances(std::list<object_copy_data_t*>& o);
5386 void encode(ceph::buffer::list& bl, uint64_t features) const;
5387 void decode(ceph::buffer::list::const_iterator& bl);
5388 void dump(ceph::Formatter *f) const;
5389 };
5390 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
5391
5392 /**
5393 * pg creation info
5394 */
5395 struct pg_create_t {
5396 epoch_t created; // epoch pg created
5397 pg_t parent; // split from parent (if != pg_t())
5398 __s32 split_bits;
5399
5400 pg_create_t()
5401 : created(0), split_bits(0) {}
5402 pg_create_t(unsigned c, pg_t p, int s)
5403 : created(c), parent(p), split_bits(s) {}
5404
5405 void encode(ceph::buffer::list &bl) const;
5406 void decode(ceph::buffer::list::const_iterator &bl);
5407 void dump(ceph::Formatter *f) const;
5408 static void generate_test_instances(std::list<pg_create_t*>& o);
5409 };
5410 WRITE_CLASS_ENCODER(pg_create_t)
5411
5412 // -----------------------------------------
5413
5414 class ObjectExtent {
5415 /**
5416 * ObjectExtents are used for specifying IO behavior against RADOS
5417 * objects when one is using the ObjectCacher.
5418 *
5419 * To use this in a real system, *every member* must be filled
5420 * out correctly. In particular, make sure to initialize the
5421 * oloc correctly, as its default values are deliberate poison
5422 * and will cause internal ObjectCacher asserts.
5423 *
5424 * Similarly, your buffer_extents vector *must* specify a total
5425 * size equal to your length. If the buffer_extents inadvertently
5426 * contain less space than the length member specifies, you
5427 * will get unintelligible asserts deep in the ObjectCacher.
5428 *
5429 * If you are trying to do testing and don't care about actual
5430 * RADOS function, the simplest thing to do is to initialize
5431 * the ObjectExtent (truncate_size can be 0), create a single entry
5432 * in buffer_extents matching the length, and set oloc.pool to 0.
5433 */
5434 public:
5435 object_t oid; // object id
5436 uint64_t objectno;
5437 uint64_t offset; // in object
5438 uint64_t length; // in object
5439 uint64_t truncate_size; // in object
5440
5441 object_locator_t oloc; // object locator (pool etc)
5442
5443 std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
5444
5445 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5446 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
5447 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
5448 };
5449
5450 inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
5451 {
5452 return out << "extent("
5453 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
5454 << " " << ex.offset << "~" << ex.length
5455 << " -> " << ex.buffer_extents
5456 << ")";
5457 }
5458
5459
5460 // ---------------------------------------
5461
/**
 * OSDSuperblock
 *
 * Persistent per-OSD metadata: cluster/OSD identity, the range of OSD
 * maps stored locally, compat features, and snap-purge bookkeeping.
 */
class OSDSuperblock {
public:
  uuid_d cluster_fsid, osd_fsid;  // cluster identity and this OSD's identity
  int32_t whoami = -1;   // my role in this fs.
  epoch_t current_epoch = 0;             // most recent epoch
  epoch_t oldest_map = 0, newest_map = 0;    // oldest/newest maps we have.
  double weight = 0.0;

  CompatSet compat_features;  // on-disk feature flags this OSD understands

  // last interval over which i mounted and was then active
  epoch_t mounted = 0;     // last epoch i mounted
  epoch_t clean_thru = 0;  // epoch i was active and clean thru

  epoch_t purged_snaps_last = 0;        // last epoch purged_snaps was scrubbed to
  utime_t last_purged_snaps_scrub;      // wall-clock time of that scrub

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<OSDSuperblock*>& o);
};
5484 WRITE_CLASS_ENCODER(OSDSuperblock)
5485
5486 inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
5487 {
5488 return out << "sb(" << sb.cluster_fsid
5489 << " osd." << sb.whoami
5490 << " " << sb.osd_fsid
5491 << " e" << sb.current_epoch
5492 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
5493 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
5494 << ")";
5495 }
5496
5497
5498 // -------
5499
5500
5501
5502
5503
5504
5505 /*
5506 * attached to object head. describes most recent snap context, and
5507 * set of existing clones.
5508 */
5509 struct SnapSet {
5510 snapid_t seq;
5511 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5512 std::vector<snapid_t> snaps; // descending
5513 std::vector<snapid_t> clones; // ascending
5514 std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
5515 std::map<snapid_t, uint64_t> clone_size;
5516 std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
5517
5518 SnapSet() : seq(0) {}
5519 explicit SnapSet(ceph::buffer::list& bl) {
5520 auto p = std::cbegin(bl);
5521 decode(p);
5522 }
5523
5524 /// populate SnapSet from a librados::snap_set_t
5525 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
5526
5527 /// get space accounted to clone
5528 uint64_t get_clone_bytes(snapid_t clone) const;
5529
5530 void encode(ceph::buffer::list& bl) const;
5531 void decode(ceph::buffer::list::const_iterator& bl);
5532 void dump(ceph::Formatter *f) const;
5533 static void generate_test_instances(std::list<SnapSet*>& o);
5534
5535 SnapContext get_ssc_as_of(snapid_t as_of) const {
5536 SnapContext out;
5537 out.seq = as_of;
5538 for (auto p = clone_snaps.rbegin();
5539 p != clone_snaps.rend();
5540 ++p) {
5541 for (auto snap : p->second) {
5542 if (snap <= as_of) {
5543 out.snaps.push_back(snap);
5544 }
5545 }
5546 }
5547 return out;
5548 }
5549
5550
5551 SnapSet get_filtered(const pg_pool_t &pinfo) const;
5552 void filter(const pg_pool_t &pinfo);
5553 };
5554 WRITE_CLASS_ENCODER(SnapSet)
5555
5556 std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
5557
5558
5559
5560 #define OI_ATTR "_"
5561 #define SS_ATTR "snapset"
5562
5563 struct watch_info_t {
5564 uint64_t cookie;
5565 uint32_t timeout_seconds;
5566 entity_addr_t addr;
5567
5568 watch_info_t() : cookie(0), timeout_seconds(0) { }
5569 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
5570
5571 void encode(ceph::buffer::list& bl, uint64_t features) const;
5572 void decode(ceph::buffer::list::const_iterator& bl);
5573 void dump(ceph::Formatter *f) const;
5574 static void generate_test_instances(std::list<watch_info_t*>& o);
5575 };
5576 WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
5577
5578 static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
5579 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
5580 && l.addr == r.addr;
5581 }
5582
5583 static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
5584 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5585 << " " << w.addr << ")";
5586 }
5587
// In-flight notify state.  Fields are printed by operator<< below;
// presumably `cookie` identifies the watch the notify targets (matches
// watch_info_t::cookie) — confirm against the notify dispatch path.
struct notify_info_t {
  uint64_t cookie;        // watch cookie the notify is associated with
  uint64_t notify_id;     // unique id of this notify
  uint32_t timeout;       // timeout in seconds (see operator<< "...s")
  ceph::buffer::list bl;  // opaque notify payload
};
5594
5595 static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
5596 return out << "notify(cookie " << n.cookie
5597 << " notify" << n.notify_id
5598 << " " << n.timeout << "s)";
5599 }
5600
5601 class object_ref_delta_t {
5602 std::map<hobject_t, int> ref_delta;
5603
5604 public:
5605 object_ref_delta_t() = default;
5606 object_ref_delta_t(const object_ref_delta_t &) = default;
5607 object_ref_delta_t(object_ref_delta_t &&) = default;
5608
5609 object_ref_delta_t(decltype(ref_delta) &&ref_delta)
5610 : ref_delta(std::move(ref_delta)) {}
5611 object_ref_delta_t(const decltype(ref_delta) &ref_delta)
5612 : ref_delta(ref_delta) {}
5613
5614 object_ref_delta_t &operator=(const object_ref_delta_t &) = default;
5615 object_ref_delta_t &operator=(object_ref_delta_t &&) = default;
5616
5617 void dec_ref(const hobject_t &hoid, unsigned num=1) {
5618 mut_ref(hoid, -num);
5619 }
5620 void inc_ref(const hobject_t &hoid, unsigned num=1) {
5621 mut_ref(hoid, num);
5622 }
5623 void mut_ref(const hobject_t &hoid, int num) {
5624 [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0);
5625 iter->second += num;
5626 if (iter->second == 0)
5627 ref_delta.erase(iter);
5628 }
5629
5630 auto begin() const { return ref_delta.begin(); }
5631 auto end() const { return ref_delta.end(); }
5632 auto find(hobject_t &key) const { return ref_delta.find(key); }
5633
5634 bool operator==(const object_ref_delta_t &rhs) const {
5635 return ref_delta == rhs.ref_delta;
5636 }
5637 bool operator!=(const object_ref_delta_t &rhs) const {
5638 return !(*this == rhs);
5639 }
5640 bool is_empty() {
5641 return ref_delta.empty();
5642 }
5643 uint64_t size() {
5644 return ref_delta.size();
5645 }
5646 friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci);
5647 };
5648
5649 struct chunk_info_t {
5650 typedef enum {
5651 FLAG_DIRTY = 1,
5652 FLAG_MISSING = 2,
5653 FLAG_HAS_REFERENCE = 4,
5654 FLAG_HAS_FINGERPRINT = 8,
5655 } cflag_t;
5656 uint32_t offset;
5657 uint32_t length;
5658 hobject_t oid;
5659 cflag_t flags; // FLAG_*
5660
5661 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5662 chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) :
5663 offset(offset), length(length), oid(oid), flags((cflag_t)0) { }
5664
5665 static std::string get_flag_string(uint64_t flags) {
5666 std::string r;
5667 if (flags & FLAG_DIRTY) {
5668 r += "|dirty";
5669 }
5670 if (flags & FLAG_MISSING) {
5671 r += "|missing";
5672 }
5673 if (flags & FLAG_HAS_REFERENCE) {
5674 r += "|has_reference";
5675 }
5676 if (flags & FLAG_HAS_FINGERPRINT) {
5677 r += "|has_fingerprint";
5678 }
5679 if (r.length())
5680 return r.substr(1);
5681 return r;
5682 }
5683 bool test_flag(cflag_t f) const {
5684 return (flags & f) == f;
5685 }
5686 void set_flag(cflag_t f) {
5687 flags = (cflag_t)(flags | f);
5688 }
5689 void set_flags(cflag_t f) {
5690 flags = f;
5691 }
5692 void clear_flag(cflag_t f) {
5693 flags = (cflag_t)(flags & ~f);
5694 }
5695 void clear_flags() {
5696 flags = (cflag_t)0;
5697 }
5698 bool is_dirty() const {
5699 return test_flag(FLAG_DIRTY);
5700 }
5701 bool is_missing() const {
5702 return test_flag(FLAG_MISSING);
5703 }
5704 bool has_reference() const {
5705 return test_flag(FLAG_HAS_REFERENCE);
5706 }
5707 bool has_fingerprint() const {
5708 return test_flag(FLAG_HAS_FINGERPRINT);
5709 }
5710 void encode(ceph::buffer::list &bl) const;
5711 void decode(ceph::buffer::list::const_iterator &bl);
5712 void dump(ceph::Formatter *f) const;
5713 friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5714 bool operator==(const chunk_info_t& cit) const;
5715 bool operator!=(const chunk_info_t& cit) const {
5716 return !(cit == *this);
5717 }
5718 };
5719 WRITE_CLASS_ENCODER(chunk_info_t)
5720 std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5721
5722 struct object_info_t;
/**
 * object_manifest_t
 *
 * Describes how an object's data is actually stored: not at all special
 * (TYPE_NONE), redirected wholesale to another object (TYPE_REDIRECT),
 * or assembled from chunks living in other objects (TYPE_CHUNKED).
 */
struct object_manifest_t {
  enum {
    TYPE_NONE = 0,      // plain object; redirect_target/chunk_map unused
    TYPE_REDIRECT = 1,  // data lives at redirect_target
    TYPE_CHUNKED = 2,   // data is composed of chunk_map entries
  };
  uint8_t type;  // redirect, chunked, ...
  hobject_t redirect_target;
  std::map<uint64_t, chunk_info_t> chunk_map;  // logical offset -> chunk

  object_manifest_t() : type(0) { }
  object_manifest_t(uint8_t type, const hobject_t& redirect_target)
    : type(type), redirect_target(redirect_target) { }

  bool is_empty() const {
    return type == TYPE_NONE;
  }
  bool is_redirect() const {
    return type == TYPE_REDIRECT;
  }
  bool is_chunked() const {
    return type == TYPE_CHUNKED;
  }
  /// Name for a manifest type value (for logging/dumping).
  static std::string_view get_type_name(uint8_t m) {
    switch (m) {
    case TYPE_NONE: return "none";
    case TYPE_REDIRECT: return "redirect";
    case TYPE_CHUNKED: return "chunked";
    default: return "unknown";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }
  /// Reset to TYPE_NONE and drop all redirect/chunk state.
  void clear() {
    type = 0;
    redirect_target = hobject_t();
    chunk_map.clear();
  }

  /**
   * calc_refs_to_inc_on_set
   *
   * Takes a manifest and returns the set of refs to
   * increment upon set-chunk
   *
   * l should be nullptr if there are no clones, or
   * l and g may each be null if the corresponding clone does not exist.
   * *this contains the set of new references to set
   *
   */
  void calc_refs_to_inc_on_set(
    const object_manifest_t* g, ///< [in] manifest for clone > *this
    const object_manifest_t* l, ///< [in] manifest for clone < *this
    object_ref_delta_t &delta   ///< [out] set of refs to drop
  ) const;

  /**
   * calc_refs_to_drop_on_modify
   *
   * Takes a manifest and returns the set of refs to
   * drop upon modification
   *
   * l should be nullptr if there are no clones, or
   * l may be null if the corresponding clone does not exist.
   *
   */
  void calc_refs_to_drop_on_modify(
    const object_manifest_t* l,              ///< [in] manifest for previous clone
    const ObjectCleanRegions& clean_regions, ///< [in] clean regions
    object_ref_delta_t &delta                ///< [out] set of refs to drop
  ) const;

  /**
   * calc_refs_to_drop_on_removal
   *
   * Takes the two adjacent manifests and returns the set of refs to
   * drop upon removal of the clone containing *this.
   *
   * g should be nullptr if *this is on HEAD, l should be nullptr if
   * *this is on the oldest clone (or head if there are no clones).
   */
  void calc_refs_to_drop_on_removal(
    const object_manifest_t* g, ///< [in] manifest for clone > *this
    const object_manifest_t* l, ///< [in] manifest for clone < *this
    object_ref_delta_t &delta   ///< [out] set of refs to drop
  ) const;

  static void generate_test_instances(std::list<object_manifest_t*>& o);
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  // NOTE(review): this friend declaration names object_info_t's stream
  // operator rather than object_manifest_t's (declared just below the
  // struct).  All members here are public, so the friendship looks
  // redundant — confirm before removing it.
  friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
};
5817 WRITE_CLASS_ENCODER(object_manifest_t)
5818 std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
5819
5820 struct object_info_t {
5821 hobject_t soid;
5822 eversion_t version, prior_version;
5823 version_t user_version;
5824 osd_reqid_t last_reqid;
5825
5826 uint64_t size;
5827 utime_t mtime;
5828 utime_t local_mtime; // local mtime
5829
5830 // note: these are currently encoded into a total 16 bits; see
5831 // encode()/decode() for the weirdness.
5832 typedef enum {
5833 FLAG_LOST = 1<<0,
5834 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5835 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5836 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5837 FLAG_DATA_DIGEST = 1<<4, // has data crc
5838 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5839 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5840 FLAG_MANIFEST = 1<<7, // has manifest
5841 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5842 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
5843 } flag_t;
5844
5845 flag_t flags;
5846
5847 static std::string get_flag_string(flag_t flags) {
5848 std::string s;
5849 std::vector<std::string> sv = get_flag_vector(flags);
5850 for (auto ss : sv) {
5851 s += std::string("|") + ss;
5852 }
5853 if (s.length())
5854 return s.substr(1);
5855 return s;
5856 }
5857 static std::vector<std::string> get_flag_vector(flag_t flags) {
5858 std::vector<std::string> sv;
5859 if (flags & FLAG_LOST)
5860 sv.insert(sv.end(), "lost");
5861 if (flags & FLAG_WHITEOUT)
5862 sv.insert(sv.end(), "whiteout");
5863 if (flags & FLAG_DIRTY)
5864 sv.insert(sv.end(), "dirty");
5865 if (flags & FLAG_USES_TMAP)
5866 sv.insert(sv.end(), "uses_tmap");
5867 if (flags & FLAG_OMAP)
5868 sv.insert(sv.end(), "omap");
5869 if (flags & FLAG_DATA_DIGEST)
5870 sv.insert(sv.end(), "data_digest");
5871 if (flags & FLAG_OMAP_DIGEST)
5872 sv.insert(sv.end(), "omap_digest");
5873 if (flags & FLAG_CACHE_PIN)
5874 sv.insert(sv.end(), "cache_pin");
5875 if (flags & FLAG_MANIFEST)
5876 sv.insert(sv.end(), "manifest");
5877 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5878 sv.insert(sv.end(), "redirect_has_reference");
5879 return sv;
5880 }
5881 std::string get_flag_string() const {
5882 return get_flag_string(flags);
5883 }
5884
5885 uint64_t truncate_seq, truncate_size;
5886
5887 std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
5888
5889 // opportunistic checksums; may or may not be present
5890 __u32 data_digest; ///< data crc32c
5891 __u32 omap_digest; ///< omap crc32c
5892
5893 // alloc hint attribute
5894 uint64_t expected_object_size, expected_write_size;
5895 uint32_t alloc_hint_flags;
5896
5897 struct object_manifest_t manifest;
5898
5899 void copy_user_bits(const object_info_t& other);
5900
5901 bool test_flag(flag_t f) const {
5902 return (flags & f) == f;
5903 }
5904 void set_flag(flag_t f) {
5905 flags = (flag_t)(flags | f);
5906 }
5907 void clear_flag(flag_t f) {
5908 flags = (flag_t)(flags & ~f);
5909 }
5910 bool is_lost() const {
5911 return test_flag(FLAG_LOST);
5912 }
5913 bool is_whiteout() const {
5914 return test_flag(FLAG_WHITEOUT);
5915 }
5916 bool is_dirty() const {
5917 return test_flag(FLAG_DIRTY);
5918 }
5919 bool is_omap() const {
5920 return test_flag(FLAG_OMAP);
5921 }
5922 bool is_data_digest() const {
5923 return test_flag(FLAG_DATA_DIGEST);
5924 }
5925 bool is_omap_digest() const {
5926 return test_flag(FLAG_OMAP_DIGEST);
5927 }
5928 bool is_cache_pinned() const {
5929 return test_flag(FLAG_CACHE_PIN);
5930 }
5931 bool has_manifest() const {
5932 return test_flag(FLAG_MANIFEST);
5933 }
5934 void set_data_digest(__u32 d) {
5935 set_flag(FLAG_DATA_DIGEST);
5936 data_digest = d;
5937 }
5938 void set_omap_digest(__u32 d) {
5939 set_flag(FLAG_OMAP_DIGEST);
5940 omap_digest = d;
5941 }
5942 void clear_data_digest() {
5943 clear_flag(FLAG_DATA_DIGEST);
5944 data_digest = -1;
5945 }
5946 void clear_omap_digest() {
5947 clear_flag(FLAG_OMAP_DIGEST);
5948 omap_digest = -1;
5949 }
5950 void new_object() {
5951 clear_data_digest();
5952 clear_omap_digest();
5953 }
5954
5955 void encode(ceph::buffer::list& bl, uint64_t features) const;
5956 void decode(ceph::buffer::list::const_iterator& bl);
5957 void decode(const ceph::buffer::list& bl) {
5958 auto p = std::cbegin(bl);
5959 decode(p);
5960 }
5961
5962 void encode_no_oid(ceph::buffer::list& bl, uint64_t features) {
5963 // TODO: drop soid field and remove the denc no_oid methods
5964 auto tmp_oid = hobject_t(hobject_t::get_max());
5965 tmp_oid.swap(soid);
5966 encode(bl, features);
5967 soid = tmp_oid;
5968 }
5969 void decode_no_oid(ceph::buffer::list::const_iterator& bl) {
5970 decode(bl);
5971 ceph_assert(soid.is_max());
5972 }
5973 void decode_no_oid(const ceph::buffer::list& bl) {
5974 auto p = std::cbegin(bl);
5975 decode_no_oid(p);
5976 }
5977 void decode_no_oid(const ceph::buffer::list& bl, const hobject_t& _soid) {
5978 auto p = std::cbegin(bl);
5979 decode_no_oid(p);
5980 soid = _soid;
5981 }
5982
5983 void dump(ceph::Formatter *f) const;
5984 static void generate_test_instances(std::list<object_info_t*>& o);
5985
5986 explicit object_info_t()
5987 : user_version(0), size(0), flags((flag_t)0),
5988 truncate_seq(0), truncate_size(0),
5989 data_digest(-1), omap_digest(-1),
5990 expected_object_size(0), expected_write_size(0),
5991 alloc_hint_flags(0)
5992 {}
5993
5994 explicit object_info_t(const hobject_t& s)
5995 : soid(s),
5996 user_version(0), size(0), flags((flag_t)0),
5997 truncate_seq(0), truncate_size(0),
5998 data_digest(-1), omap_digest(-1),
5999 expected_object_size(0), expected_write_size(0),
6000 alloc_hint_flags(0)
6001 {}
6002
6003 explicit object_info_t(const ceph::buffer::list& bl) {
6004 decode(bl);
6005 }
6006
6007 explicit object_info_t(const ceph::buffer::list& bl, const hobject_t& _soid) {
6008 decode_no_oid(bl);
6009 soid = _soid;
6010 }
6011 };
6012 WRITE_CLASS_ENCODER_FEATURES(object_info_t)
6013
6014 std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
6015
6016
6017
// Object recovery

/**
 * ObjectRecoveryInfo
 *
 * Describes one object being recovered: its identity and version, its
 * metadata, and which byte ranges must be copied from the source versus
 * cloned from clones already present on the target.
 */
struct ObjectRecoveryInfo {
  hobject_t soid;
  eversion_t version;
  uint64_t size;
  object_info_t oi;
  SnapSet ss; // only populated if soid is_snap()
  interval_set<uint64_t> copy_subset;  ///< ranges to copy from the source
  std::map<hobject_t, interval_set<uint64_t>> clone_subset;  ///< ranges obtainable from local clones
  bool object_exist;  ///< whether the object already exists on the target

  ObjectRecoveryInfo() : size(0), object_exist(true) { }

  static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  // pool is needed to reconstruct the hobject on legacy decodes
  void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
6039
/**
 * ObjectRecoveryProgress
 *
 * Cursor tracking how far recovery of a single object has advanced,
 * both through its byte data and its omap keys.
 */
struct ObjectRecoveryProgress {
  uint64_t data_recovered_to;     ///< byte offset recovered so far
  std::string omap_recovered_to;  ///< last omap key recovered
  bool first;                     ///< true before the first push/pull
  bool data_complete;
  bool omap_complete;
  bool error = false;

  ObjectRecoveryProgress()
    : data_recovered_to(0),
      first(true),
      data_complete(false), omap_complete(false) { }

  /// complete when all bytes in copy_subset are recovered and omap is done
  bool is_complete(const ObjectRecoveryInfo& info) const {
    return (data_recovered_to >= (
      info.copy_subset.empty() ?
      0 : info.copy_subset.range_end())) &&
      omap_complete;
  }

  static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;
};
WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
6068
/**
 * PushReplyOp
 *
 * Acknowledgement for a push: identifies the object whose push was
 * received, letting the primary continue/complete its recovery.
 */
struct PushReplyOp {
  hobject_t soid;

  static void generate_test_instances(std::list<PushReplyOp*>& o);
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  /// scheduling cost of handling this reply
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER(PushReplyOp)
std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
6082
/**
 * PullOp
 *
 * Request to pull (read) recovery data for one object, carrying the
 * recovery state so the source knows what to send next.
 */
struct PullOp {
  hobject_t soid;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress recovery_progress;

  static void generate_test_instances(std::list<PullOp*>& o);
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  /// scheduling cost of servicing this pull
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PullOp)
std::ostream& operator<<(std::ostream& out, const PullOp &op);
6099
/**
 * PushOp
 *
 * One installment of recovery data pushed to a replica: a slice of byte
 * data, omap header/entries, attrs, plus before/after progress cursors.
 */
struct PushOp {
  hobject_t soid;
  eversion_t version;
  ceph::buffer::list data;                ///< object byte data for this push
  interval_set<uint64_t> data_included;   ///< which ranges `data` covers
  ceph::buffer::list omap_header;
  std::map<std::string, ceph::buffer::list> omap_entries;
  std::map<std::string, ceph::buffer::list, std::less<>> attrset;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;  ///< progress before this push
  ObjectRecoveryProgress after_progress;   ///< progress after applying it

  static void generate_test_instances(std::list<PushOp*>& o);
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  /// scheduling cost of applying this push
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PushOp)
std::ostream& operator<<(std::ostream& out, const PushOp &op);
6123
/*
 * summarize pg contents for purposes of a scrub
 */
struct ScrubMap {
  /// per-object scrub findings: attrs, size, digests, and error bits
  struct object {
    std::map<std::string, ceph::buffer::ptr, std::less<>> attrs;
    uint64_t size;
    __u32 omap_digest;         ///< omap crc32c
    __u32 digest;              ///< data crc32c
    bool negative:1;           ///< object was expected but not found
    bool digest_present:1;
    bool omap_digest_present:1;
    bool read_error:1;
    bool stat_error:1;
    bool ec_hash_mismatch:1;
    bool ec_size_mismatch:1;
    bool large_omap_object_found:1;
    uint64_t large_omap_object_key_count = 0;
    uint64_t large_omap_object_value_size = 0;
    uint64_t object_omap_bytes = 0;
    uint64_t object_omap_keys = 0;

    object() :
      // Init invalid size so it won't match if we get a stat EIO error
      size(-1), omap_digest(0), digest(0),
      negative(false), digest_present(false), omap_digest_present(false),
      read_error(false), stat_error(false), ec_hash_mismatch(false),
      ec_size_mismatch(false), large_omap_object_found(false) {}

    void encode(ceph::buffer::list& bl) const;
    void decode(ceph::buffer::list::const_iterator& bl);
    void dump(ceph::Formatter *f) const;
    static void generate_test_instances(std::list<object*>& o);
  };
  WRITE_CLASS_ENCODER(object)

  std::map<hobject_t,object> objects;
  eversion_t valid_through;  ///< map is valid through this version
  eversion_t incr_since;     ///< if set, map is incremental since this version
  // NOTE(review): these bitfields have no default initializer and are not
  // set by a constructor here — presumably decode()/scrub code always sets
  // them before use; confirm before relying on a default-constructed ScrubMap.
  bool has_large_omap_object_errors:1;
  bool has_omap_keys:1;

  /// apply an incremental map on top of this one
  void merge_incr(const ScrubMap &l);
  /// drop all entries at or beyond `start`
  void clear_from(const hobject_t& start) {
    objects.erase(objects.lower_bound(start), objects.end());
  }
  void insert(const ScrubMap &r) {
    objects.insert(r.objects.begin(), r.objects.end());
  }
  void swap(ScrubMap &r) {
    using std::swap;
    swap(objects, r.objects);
    swap(valid_through, r.valid_through);
    swap(incr_since, r.incr_since);
  }

  void encode(ceph::buffer::list& bl) const;
  // pool is needed to reconstruct hobjects on legacy decodes
  void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<ScrubMap*>& o);
};
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)
6187
/**
 * ScrubMapBuilder
 *
 * Incremental state for building a ScrubMap: the object listing being
 * walked, the position within it, and per-object data/omap cursors.
 */
struct ScrubMapBuilder {
  bool deep = false;            ///< deep scrub (read data) vs shallow
  std::vector<hobject_t> ls;    ///< objects to scrub
  size_t pos = 0;               ///< index of current object in ls
  int64_t data_pos = 0;         ///< byte offset within current object; <0 when done
  std::string omap_pos;         ///< last omap key scanned in current object
  int ret = 0;                  ///< first error encountered, if any
  ceph::buffer::hash data_hash, omap_hash;  ///< accumulating hash value
  uint64_t omap_keys = 0;
  uint64_t omap_bytes = 0;

  bool empty() {
    return ls.empty();
  }
  bool done() {
    return pos >= ls.size();
  }
  void reset() {
    *this = ScrubMapBuilder();
  }

  bool data_done() {
    return data_pos < 0;
  }

  /// advance to the next object, resetting the per-object cursors
  void next_object() {
    ++pos;
    data_pos = 0;
    omap_pos.clear();
    omap_keys = 0;
    omap_bytes = 0;
  }

  friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
    out << "(" << pos.pos << "/" << pos.ls.size();
    if (pos.pos < pos.ls.size()) {
      out << " " << pos.ls[pos.pos];
    }
    if (pos.data_pos < 0) {
      out << " byte " << pos.data_pos;
    }
    if (!pos.omap_pos.empty()) {
      out << " key " << pos.omap_pos;
    }
    if (pos.deep) {
      out << " deep";
    }
    if (pos.ret) {
      out << " ret " << pos.ret;
    }
    return out << ")";
  }
};
6241
/**
 * watch_item_t
 *
 * One watcher of an object as reported to clients: who is watching,
 * the watch cookie, its timeout, and the watcher's address.
 */
struct watch_item_t {
  entity_name_t name;
  uint64_t cookie;
  uint32_t timeout_seconds;
  entity_addr_t addr;

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
     const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
    addr(addr) { }

  void encode(ceph::buffer::list &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    encode(name, bl);
    encode(cookie, bl);
    encode(timeout_seconds, bl);
    encode(addr, bl, features);  // addr added in v2
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
    DECODE_START(2, bl);
    decode(name, bl);
    decode(cookie, bl);
    decode(timeout_seconds, bl);
    if (struct_v >= 2) {
      // v1 encodings lack the address; addr stays default-constructed
      decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("watcher") << name;
    f->dump_int("cookie", cookie);
    f->dump_int("timeout", timeout_seconds);
    f->open_object_section("addr");
    addr.dump(f);
    f->close_section();
  }
  static void generate_test_instances(std::list<watch_item_t*>& o) {
    entity_addr_t ea;
    ea.set_type(entity_addr_t::TYPE_LEGACY);
    ea.set_nonce(1000);
    ea.set_family(AF_INET);
    ea.set_in4_quad(0, 127);
    ea.set_in4_quad(1, 0);
    ea.set_in4_quad(2, 0);
    ea.set_in4_quad(3, 1);
    ea.set_port(1024);
    o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
    ea.set_nonce(1001);
    ea.set_in4_quad(3, 2);
    ea.set_port(1025);
    o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
  }
};
WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
6298
/// a watch entry paired with the object it applies to
struct obj_watch_item_t {
  hobject_t obj;
  watch_item_t wi;
};
6303
6304 /**
6305 * obj list watch response format
6306 *
6307 */
6308 struct obj_list_watch_response_t {
6309 std::list<watch_item_t> entries;
6310
6311 void encode(ceph::buffer::list& bl, uint64_t features) const {
6312 ENCODE_START(1, 1, bl);
6313 encode(entries, bl, features);
6314 ENCODE_FINISH(bl);
6315 }
6316 void decode(ceph::buffer::list::const_iterator& bl) {
6317 DECODE_START(1, bl);
6318 decode(entries, bl);
6319 DECODE_FINISH(bl);
6320 }
6321 void dump(ceph::Formatter *f) const {
6322 f->open_array_section("entries");
6323 for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
6324 f->open_object_section("watch");
6325 p->dump(f);
6326 f->close_section();
6327 }
6328 f->close_section();
6329 }
6330 static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
6331 entity_addr_t ea;
6332 o.push_back(new obj_list_watch_response_t);
6333 o.push_back(new obj_list_watch_response_t);
6334 std::list<watch_item_t*> test_watchers;
6335 watch_item_t::generate_test_instances(test_watchers);
6336 for (auto &e : test_watchers) {
6337 o.back()->entries.push_back(*e);
6338 delete e;
6339 }
6340 }
6341 };
6342 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
6343
6344 struct clone_info {
6345 snapid_t cloneid;
6346 std::vector<snapid_t> snaps; // ascending
6347 std::vector< std::pair<uint64_t,uint64_t> > overlap;
6348 uint64_t size;
6349
6350 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
6351
6352 void encode(ceph::buffer::list& bl) const {
6353 ENCODE_START(1, 1, bl);
6354 encode(cloneid, bl);
6355 encode(snaps, bl);
6356 encode(overlap, bl);
6357 encode(size, bl);
6358 ENCODE_FINISH(bl);
6359 }
6360 void decode(ceph::buffer::list::const_iterator& bl) {
6361 DECODE_START(1, bl);
6362 decode(cloneid, bl);
6363 decode(snaps, bl);
6364 decode(overlap, bl);
6365 decode(size, bl);
6366 DECODE_FINISH(bl);
6367 }
6368 void dump(ceph::Formatter *f) const {
6369 if (cloneid == CEPH_NOSNAP)
6370 f->dump_string("cloneid", "HEAD");
6371 else
6372 f->dump_unsigned("cloneid", cloneid.val);
6373 f->open_array_section("snapshots");
6374 for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
6375 f->open_object_section("snap");
6376 f->dump_unsigned("id", p->val);
6377 f->close_section();
6378 }
6379 f->close_section();
6380 f->open_array_section("overlaps");
6381 for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
6382 q != overlap.end(); ++q) {
6383 f->open_object_section("overlap");
6384 f->dump_unsigned("offset", q->first);
6385 f->dump_unsigned("length", q->second);
6386 f->close_section();
6387 }
6388 f->close_section();
6389 f->dump_unsigned("size", size);
6390 }
6391 static void generate_test_instances(std::list<clone_info*>& o) {
6392 o.push_back(new clone_info);
6393 o.push_back(new clone_info);
6394 o.back()->cloneid = 1;
6395 o.back()->snaps.push_back(1);
6396 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6397 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6398 o.back()->size = 16384;
6399 o.push_back(new clone_info);
6400 o.back()->cloneid = CEPH_NOSNAP;
6401 o.back()->size = 32768;
6402 }
6403 };
6404 WRITE_CLASS_ENCODER(clone_info)
6405
6406 /**
6407 * obj list snaps response format
6408 *
6409 */
6410 struct obj_list_snap_response_t {
6411 std::vector<clone_info> clones; // ascending
6412 snapid_t seq;
6413
6414 void encode(ceph::buffer::list& bl) const {
6415 ENCODE_START(2, 1, bl);
6416 encode(clones, bl);
6417 encode(seq, bl);
6418 ENCODE_FINISH(bl);
6419 }
6420 void decode(ceph::buffer::list::const_iterator& bl) {
6421 DECODE_START(2, bl);
6422 decode(clones, bl);
6423 if (struct_v >= 2)
6424 decode(seq, bl);
6425 else
6426 seq = CEPH_NOSNAP;
6427 DECODE_FINISH(bl);
6428 }
6429 void dump(ceph::Formatter *f) const {
6430 f->open_array_section("clones");
6431 for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
6432 f->open_object_section("clone");
6433 p->dump(f);
6434 f->close_section();
6435 }
6436 f->dump_unsigned("seq", seq);
6437 f->close_section();
6438 }
6439 static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
6440 o.push_back(new obj_list_snap_response_t);
6441 o.push_back(new obj_list_snap_response_t);
6442 clone_info cl;
6443 cl.cloneid = 1;
6444 cl.snaps.push_back(1);
6445 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6446 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6447 cl.size = 16384;
6448 o.back()->clones.push_back(cl);
6449 cl.cloneid = CEPH_NOSNAP;
6450 cl.snaps.clear();
6451 cl.overlap.clear();
6452 cl.size = 32768;
6453 o.back()->clones.push_back(cl);
6454 o.back()->seq = 123;
6455 }
6456 };
6457
6458 WRITE_CLASS_ENCODER(obj_list_snap_response_t)
6459
// PromoteCounter

/**
 * PromoteCounter
 *
 * Atomic counters tracking cache-tier promotion activity: attempts,
 * completed objects, and bytes promoted.
 */
struct PromoteCounter {
  std::atomic<unsigned long long>  attempts{0};
  std::atomic<unsigned long long>  objects{0};
  std::atomic<unsigned long long>  bytes{0};

  void attempt() {
    attempts++;
  }

  /// record a completed promotion of `size` bytes
  void finish(uint64_t size) {
    objects++;
    bytes += size;
  }

  // Read current values out and halve the counters (exponential decay).
  // NOTE(review): the read and the halving are separate atomic ops, so
  // concurrent updates between them can be lost — presumably acceptable
  // for these statistics; confirm if exactness ever matters.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts;
    *o = objects;
    *b = bytes;
    attempts = *a / 2;
    objects = *o / 2;
    bytes = *b / 2;
  }
};
6485
/**
 * pool_pg_num_history_t
 *
 * History of pg_num changes and pool deletions, per epoch, so that
 * past pg mappings can be reconstructed for old osdmap epochs.
 */
struct pool_pg_num_history_t {
  /// last epoch updated
  epoch_t epoch = 0;
  /// poolid -> epoch -> pg_num
  std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
  /// pair(epoch, poolid)
  std::set<std::pair<epoch_t,int64_t>> deleted_pools;

  void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
    pg_nums[pool][epoch] = pg_num;
  }
  void log_pool_delete(epoch_t epoch, int64_t pool) {
    deleted_pools.insert(std::make_pair(epoch, pool));
  }

  /// prune history based on oldest osdmap epoch in the cluster
  void prune(epoch_t oldest_epoch) {
    // drop deletions (and their pools' histories) that are fully older
    // than the oldest epoch anyone still cares about
    auto i = deleted_pools.begin();
    while (i != deleted_pools.end()) {
      if (i->first >= oldest_epoch) {
        break;  // set is ordered by epoch; nothing newer can be pruned
      }
      pg_nums.erase(i->second);
      i = deleted_pools.erase(i);
    }
    for (auto& j : pg_nums) {
      auto k = j.second.lower_bound(oldest_epoch);
      // keep this and the entry before it (just to be paranoid)
      if (k != j.second.begin()) {
        --k;
        j.second.erase(j.second.begin(), k);
      }
    }
  }

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(epoch, bl);
    encode(pg_nums, bl);
    encode(deleted_pools, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(epoch, p);
    decode(pg_nums, p);
    decode(deleted_pools, p);
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_unsigned("epoch", epoch);
    f->open_object_section("pools");
    for (auto& i : pg_nums) {
      f->open_object_section("pool");
      f->dump_unsigned("pool_id", i.first);
      f->open_array_section("changes");
      for (auto& j : i.second) {
        f->open_object_section("change");
        f->dump_unsigned("epoch", j.first);
        f->dump_unsigned("pg_num", j.second);
        f->close_section();
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
    f->open_array_section("deleted_pools");
    for (auto& i : deleted_pools) {
      f->open_object_section("deletion");
      f->dump_unsigned("pool_id", i.second);
      f->dump_unsigned("epoch", i.first);
      f->close_section();
    }
    f->close_section();
  }
  static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
    ls.push_back(new pool_pg_num_history_t);
  }
  friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
    return out << "pg_num_history(e" << h.epoch
	       << " pg_nums " << h.pg_nums
	       << " deleted_pools " << h.deleted_pools
	       << ")";
  }
};
WRITE_CLASS_ENCODER(pool_pg_num_history_t)
6572
6573 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6574 // easily skip them
6575 static const std::string_view infover_key = "_infover";
6576 static const std::string_view info_key = "_info";
6577 static const std::string_view biginfo_key = "_biginfo";
6578 static const std::string_view epoch_key = "_epoch";
6579 static const std::string_view fastinfo_key = "_fastinfo";
6580
6581 static const __u8 pg_latest_struct_v = 10;
6582 // v10 is the new past_intervals encoding
6583 // v9 was fastinfo_key addition
6584 // v8 was the move to a per-pg pgmeta object
6585 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6586 // (first appeared in cuttlefish).
6587 static const __u8 pg_compat_struct_v = 10;
6588
6589 int prepare_info_keymap(
6590 CephContext* cct,
6591 std::map<std::string,ceph::buffer::list> *km,
6592 std::string *key_to_remove,
6593 epoch_t epoch,
6594 pg_info_t &info,
6595 pg_info_t &last_written_info,
6596 PastIntervals &past_intervals,
6597 bool dirty_big_info,
6598 bool dirty_epoch,
6599 bool try_fast_info,
6600 PerfCounters *logger = nullptr,
6601 DoutPrefixProvider *dpp = nullptr);
6602
6603 namespace ceph::os {
6604 class Transaction;
6605 };
6606
6607 void create_pg_collection(
6608 ceph::os::Transaction& t, spg_t pgid, int bits);
6609
6610 void init_pg_ondisk(
6611 ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
6612
// omap specific stats
/// aggregate omap statistics gathered during a scan
struct omap_stat_t {
  int large_omap_objects;  ///< count of objects exceeding the large-omap thresholds
  int64_t omap_bytes;      ///< total omap value bytes
  int64_t omap_keys;       ///< total omap key count
};
6619
// filter for pg listings

/**
 * PGLSFilter
 *
 * Abstract server-side filter applied during pg object listings.
 * Subclasses parse client-supplied parameters in init() and decide
 * per-object inclusion in filter().
 */
class PGLSFilter {
  CephContext* cct;  // NOTE(review): set elsewhere (ctor defined out of line)
protected:
  std::string xattr;  ///< xattr to fetch for each object; empty = none
public:
  PGLSFilter();
  virtual ~PGLSFilter();
  /// return true if the object should be included in the listing
  virtual bool filter(const hobject_t &obj,
                      const ceph::buffer::list& xattr_data) const = 0;

  /**
   * Arguments passed from the RADOS client. Implementations must
   * handle any encoding errors, and return an appropriate error code,
   * or 0 on valid input.
   */
  virtual int init(ceph::buffer::list::const_iterator &params) = 0;

  /**
   * xattr key, or empty string. If non-empty, this xattr will be fetched
   * and the value passed into ::filter
   */
  virtual const std::string& get_xattr() const { return xattr; }

  /**
   * If true, objects without the named xattr (if xattr name is not empty)
   * will be rejected without calling ::filter
   */
  virtual bool reject_empty_xattr() const { return true; }
};
6650
/// PGLS filter matching objects whose configured xattr equals `val`
class PGLSPlainFilter : public PGLSFilter {
  std::string val;  ///< expected xattr value, parsed from client params
public:
  int init(ceph::buffer::list::const_iterator &params) override;
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t& obj,
              const ceph::buffer::list& xattr_data) const override;
};
6659
6660 // alias name for this structure:
6661 using missing_map_t = std::map<hobject_t,
6662 std::pair<std::optional<uint32_t>,
6663 std::optional<uint32_t>>>;
6664
6665 #endif