// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_TYPES_H
#define CEPH_OSD_TYPES_H

#include <atomic>
#include <sstream>
#include <cstdio>
#include <memory>
#include <string_view>

#include <boost/scoped_ptr.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/variant.hpp>
#include <boost/smart_ptr/local_shared_ptr.hpp>

#include "include/rados/rados_types.hpp"
#include "include/mempool.h"

#include "msg/msg_types.h"
#include "include/compat.h"
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
#include "common/ceph_context.h"
#include "common/histogram.h"
#include "include/interval_set.h"
#include "include/inline_memory.h"
#include "common/Formatter.h"
#include "common/bloom_filter.hpp"
#include "common/hobject.h"
#include "common/snap_types.h"
#include "HitSet.h"
#include "Watch.h"
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
#include "osd_perf_counters.h"

#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")


/// pool priority range set by user
#define OSD_POOL_PRIORITY_MAX 10
#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX

/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// base recovery priority for MRecoveryReserve (inactive PG)
#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 253

/// backfill priority for MBackfillReserve, when forced manually
#define OSD_BACKFILL_PRIORITY_FORCED 254

/// recovery priority for MRecoveryReserve, when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255

/// priority for pg deletion when osd is not fullish
#define OSD_DELETE_PRIORITY_NORMAL 179

/// priority for pg deletion when osd is approaching full
#define OSD_DELETE_PRIORITY_FULLISH 219

/// priority for pg deletion when osd is even more full
#define OSD_DELETE_PRIORITY_FULL 255

static std::map<int, int> max_prio_map = {
  {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
  {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
  {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
  {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
  {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
};
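
// For example (illustrative): max_prio_map caps a boosted priority at the top
// of its band. A degraded backfill computed as
// OSD_BACKFILL_DEGRADED_PRIORITY_BASE plus a pool priority boost is clamped to
// max_prio_map[OSD_BACKFILL_DEGRADED_PRIORITY_BASE], i.e.
// OSD_RECOVERY_PRIORITY_BASE - 1 (179), so it can never outrank a plain
// recovery reservation at OSD_RECOVERY_PRIORITY_BASE (180).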

typedef hobject_t collection_list_handle_t;

/// convert a single CEPH_OSD_FLAG_* to a std::string
const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a std::string
const char *ceph_osd_op_flag_name(unsigned flag);

/// convert CEPH_OSD_FLAG_* op flags to a std::string
std::string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
std::string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
std::string ceph_osd_alloc_hint_flag_string(unsigned flags);

typedef std::map<std::string,std::string> osd_alert_list_t;
/// map osd id -> alert_list_t
typedef std::map<int, osd_alert_list_t> osd_alerts_t;
void dump(ceph::Formatter* f, const osd_alerts_t& alerts);


typedef interval_set<
  snapid_t,
  mempool::osdmap::flat_map> snap_interval_set_t;

/**
 * osd request identifier
 *
 * caller name + incarnation# + tid to uniquely identify this request.
 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t tid;
  int32_t inc; // incarnation

  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)



struct pg_shard_t {
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;
  shard_id_t shard;
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const {
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
  auto operator<=>(const pg_shard_t&) const = default;
};
WRITE_CLASS_ENCODER(pg_shard_t)
std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);

using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>;

class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};

class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};

inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
  return out << r.name << "." << r.inc << ":" << r.tid;
}
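// For example (illustrative values): a request from client.4267 with
// incarnation 0 and tid 1234 prints as "client.4267.0:1234".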

inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
}
inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
}
inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
  // lexicographic (name, inc, tid) ordering
  return (l.name < r.name) ||
    (l.name == r.name && (l.inc < r.inc ||
                          (l.inc == r.inc && l.tid < r.tid)));
}
inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && (l.inc < r.inc ||
                          (l.inc == r.inc && l.tid <= r.tid)));
}
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }

namespace std {
  template<> struct hash<osd_reqid_t> {
    size_t operator()(const osd_reqid_t &r) const {
      static hash<uint64_t> H;
      return H(r.name.num() ^ r.tid ^ r.inc);
    }
  };
} // namespace std


// -----

// a locator constrains the placement of an object. mainly, which pool
// does it go in.
struct object_locator_t {
  // You specify either the hash or the key -- not both
  std::int64_t pool;  ///< pool id
  std::string key;    ///< key string (if non-empty)
  std::string nspace; ///< namespace
  std::int64_t hash;  ///< hash position (if >= 0)

  explicit object_locator_t()
    : pool(-1), hash(-1) {}
  explicit object_locator_t(int64_t po)
    : pool(po), hash(-1) {}
  explicit object_locator_t(int64_t po, int64_t ps)
    : pool(po), hash(ps) {}
  explicit object_locator_t(int64_t po, std::string_view ns)
    : pool(po), nspace(ns), hash(-1) {}
  explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps)
    : pool(po), nspace(ns), hash(ps) {}
  explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s)
    : pool(po), key(s), nspace(ns), hash(-1) {}
  explicit object_locator_t(const hobject_t& soid)
    : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}

  int64_t get_pool() const {
    return pool;
  }

  void clear() {
    pool = -1;
    key = "";
    nspace = "";
    hash = -1;
  }

  bool empty() const {
    return pool == -1;
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<object_locator_t*>& o);
};
WRITE_CLASS_ENCODER(object_locator_t)

inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
  return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
}
inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
  return !(l == r);
}

inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
{
  out << "@" << loc.pool;
  if (loc.nspace.length())
    out << ";" << loc.nspace;
  if (loc.key.length())
    out << ":" << loc.key;
  return out;
}
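// For example (illustrative values): pool 7, namespace "ns", key "foo" prints
// as "@7;ns:foo"; a locator with only a pool prints as "@7".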

struct request_redirect_t {
private:
  object_locator_t redirect_locator; ///< this is authoritative
  std::string redirect_object; ///< If non-empty, the request goes to this object name

  friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
public:

  request_redirect_t() {}
  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
      redirect_locator(orig) { redirect_locator.pool = rpool; }
  explicit request_redirect_t(const object_locator_t& rloc) :
      redirect_locator(rloc) {}
  explicit request_redirect_t(const object_locator_t& orig,
                              const std::string& robj) :
      redirect_locator(orig), redirect_object(robj) {}

  bool empty() const { return redirect_locator.empty() &&
                              redirect_object.empty(); }

  void combine_with_locator(object_locator_t& orig, std::string& obj) const {
    orig = redirect_locator;
    if (!redirect_object.empty())
      obj = redirect_object;
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<request_redirect_t*>& o);
};
WRITE_CLASS_ENCODER(request_redirect_t)

inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
  out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
  return out;
}

// Internal OSD op flags - set by the OSD based on the op types
enum {
  CEPH_OSD_RMW_FLAG_READ = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
  CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
};


// pg stuff

#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
typedef uint32_t ps_t;

// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  void encode(ceph::buffer::list& bl) const {
    ceph::encode_raw(v, bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    ceph::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)

// placement group id
struct pg_t {
  uint64_t m_pool;
  uint32_t m_seed;

  pg_t() : m_pool(0), m_seed(0) {}
  pg_t(ps_t seed, uint64_t pool) :
    m_pool(pool), m_seed(seed) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  old_pg_t get_old_pg() const {
    old_pg_t o;
    ceph_assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)-1;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  int64_t pool() const {
    return m_pool;
  }

  static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;

  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
  }

  /**
   * Returns b such that for all objects o:
   * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;
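  // Illustrative example (assuming the usual stable-mod split rule): with
  // pg_num = 12 the seed space spans [8, 16), so pgs whose seed % 8 is less
  // than 12 % 8 == 4 have already split and match objects on 4 hash bits,
  // while the remaining pgs (seeds 4..7) match on 3 bits.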

  bool contains(int bits, const ghobject_t& oid) const {
    return
      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
      oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) const {
    return
      (int64_t)m_pool == oid.get_logical_pool() &&
      oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  // strong ordering is supported
  auto operator<=>(const pg_t&) const noexcept = default;

  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    __u8 v = 1;
    encode(v, bl);
    encode(m_pool, bl);
    encode(m_seed, bl);
    encode((int32_t)-1, bl); // was preferred
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    __u8 v;
    decode(v, bl);
    decode(m_pool, bl);
    decode(m_seed, bl);
    bl += sizeof(int32_t); // was preferred
  }
  void decode_old(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    old_pg_t opg;
    decode(opg, bl);
    *this = opg;
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)

std::ostream& operator<<(std::ostream& out, const pg_t &pg);

namespace std {
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
    }
  };
} // namespace std

struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  auto operator<=>(const spg_t&) const = default;
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  void reset_shard(shard_id_t s) {
    shard = s;
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;
  // and a (limited) version that uses an internal buffer:
  std::string calc_name_sring() const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }

  spg_t get_ancestor(unsigned old_pg_num) const {
    return spg_t(pgid.get_ancestor(old_pg_num), shard);
  }

  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
                std::set<spg_t> *pchildren) const {
    std::set<pg_t> _children;
    std::set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (std::set<pg_t>::iterator i = _children.begin();
           i != _children.end();
           ++i) {
        pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return pgid.is_merge_target(old_pg_num, new_pg_num);
  }
  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
                       spg_t *parent) const {
    spg_t out = *this;
    bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
    if (r && parent) {
      *parent = out;
    }
    return r;
  }

  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  void encode(ceph::buffer::list &bl) const {
    ENCODE_START(1, 1, bl);
    encode(pgid, bl);
    encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(pgid, bl);
    decode(shard, bl);
    DECODE_FINISH(bl);
  }

  ghobject_t make_temp_ghobject(const std::string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
                pgid.ps(),
                hobject_t::get_temp_pool(pgid.pool()),
                ""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)

namespace std {
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
    {
      static hash<uint32_t> H;
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std

std::ostream& operator<<(std::ostream& out, const spg_t &pg);

// ----------------------

class coll_t {
  enum type_t : uint8_t {
    TYPE_META = 0,
    TYPE_LEGACY_TEMP = 1, /* no longer used */
    TYPE_PG = 2,
    TYPE_PG_TEMP = 3,
  };
  type_t type;
  spg_t pgid;
  uint64_t removal_seq; // note: deprecated, not encoded

  char _str_buff[spg_t::calc_name_buf_size];
  char *_str;

  void calc_str();

  coll_t(type_t t, spg_t p, uint64_t r)
    : type(t), pgid(p), removal_seq(r) {
    calc_str();
  }

  friend class denc_coll_t;
public:
  coll_t() : type(TYPE_META), removal_seq(0)
  {
    calc_str();
  }

  coll_t(const coll_t& other)
    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
    calc_str();
  }

  explicit coll_t(spg_t pgid)
    : type(TYPE_PG), pgid(pgid), removal_seq(0)
  {
    calc_str();
  }

  coll_t& operator=(const coll_t& rhs)
  {
    this->type = rhs.type;
    this->pgid = rhs.pgid;
    this->removal_seq = rhs.removal_seq;
    this->calc_str();
    return *this;
  }

  // named constructors
  static coll_t meta() {
    return coll_t();
  }
  static coll_t pg(spg_t p) {
    return coll_t(p);
  }

  const std::string to_str() const {
    return std::string(_str);
  }
  const char *c_str() const {
    return _str;
  }

  bool parse(const std::string& s);

  bool operator<(const coll_t &rhs) const {
    return type < rhs.type ||
      (type == rhs.type && pgid < rhs.pgid);
  }

  bool is_meta() const {
    return type == TYPE_META;
  }
  bool is_pg_prefix(spg_t *pgid_) const {
    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_pg() const {
    return type == TYPE_PG;
  }
  bool is_pg(spg_t *pgid_) const {
    if (type == TYPE_PG) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_temp() const {
    return type == TYPE_PG_TEMP;
  }
  bool is_temp(spg_t *pgid_) const {
    if (type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  int64_t pool() const {
    return pgid.pool();
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  size_t encoded_size() const;

  inline bool operator==(const coll_t& rhs) const {
    // only compare type if meta
    if (type != rhs.type)
      return false;
    if (type == TYPE_META)
      return true;
    return type == rhs.type && pgid == rhs.pgid;
  }
  inline bool operator!=(const coll_t& rhs) const {
    return !(*this == rhs);
  }

  // get a TEMP collection that corresponds to the current collection,
  // which we presume is a pg collection.
  coll_t get_temp() const {
    ceph_assert(type == TYPE_PG);
    return coll_t(TYPE_PG_TEMP, pgid, 0);
  }

  ghobject_t get_min_hobj() const {
    ghobject_t o;
    switch (type) {
    case TYPE_PG:
      o.hobj.pool = pgid.pool();
      o.set_shard(pgid.shard);
      break;
    case TYPE_META:
      o.hobj.pool = -1;
      break;
    default:
      break;
    }
    return o;
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    if (type == TYPE_PG)
      return pgid.hash_to_shard(num_shards);
    return 0; // whatever.
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<coll_t*>& o);
};

WRITE_CLASS_ENCODER(coll_t)

inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
  out << c.to_str();
  return out;
}

#if FMT_VERSION >= 90000
template <> struct fmt::formatter<coll_t> : fmt::ostream_formatter {};
#endif

namespace std {
  template<> struct hash<coll_t> {
    size_t operator()(const coll_t &c) const {
      size_t h = 0;
      std::string str(c.to_str());
      std::string::const_iterator end(str.end());
      for (std::string::const_iterator s = str.begin(); s != end; ++s) {
        h += *s;
        h += (h << 10);
        h ^= (h >> 6);
      }
      h += (h << 3);
      h ^= (h >> 11);
      h += (h << 15);
      return h;
    }
  };
} // namespace std

inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
{
  out << pg_t(ol.ol_pgid);
  int su = ol.ol_stripe_unit;
  if (su)
    out << ".su=" << su;
  return out;
}

struct denc_coll_t {
  coll_t coll;

  auto &get_type() const { return coll.type; }
  auto &get_type() { return coll.type; }
  auto &get_pgid() const { return coll.pgid; }
  auto &get_pgid() { return coll.pgid; }

  denc_coll_t() = default;
  denc_coll_t(const denc_coll_t &) = default;
  denc_coll_t(denc_coll_t &&) = default;

  denc_coll_t &operator=(const denc_coll_t &) = default;
  denc_coll_t &operator=(denc_coll_t &&) = default;

  explicit denc_coll_t(const coll_t &coll) : coll(coll) {}
  operator coll_t() const {
    return coll;
  }

  bool operator<(const denc_coll_t &rhs) const {
    return coll < rhs.coll;
  }

  DENC(denc_coll_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.get_type(), p);
    denc(v.get_pgid().pgid.m_pool, p);
    denc(v.get_pgid().pgid.m_seed, p);
    denc(v.get_pgid().shard.id, p);
    DENC_FINISH(p);
  }
};
WRITE_CLASS_DENC(denc_coll_t)

// compound rados version type
/* WARNING: if you add a member to eversion_t, make sure the encode/decode
 * functions still work correctly. On little-endian machines the raw struct
 * bytes are appended directly, so there must be no padding on either 32-bit
 * or 64-bit builds.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }

  static const eversion_t& max() {
    static const eversion_t max(-1,-1);
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  std::string get_key_name() const;

  // key must point to the beginning of a block of 32 chars
  inline void get_key_name(char* key) const {
    // Below is equivalent to sprintf("%010u.%020llu");
    key[31] = 0;
    ritoa<uint64_t, 10, 20>(version, key + 31);
    key[10] = '.';
    ritoa<uint32_t, 10, 10>(epoch, key + 10);
  }
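  // Example (illustrative): epoch 42, version 7 produces the fixed-width key
  // "0000000042.00000000000000000007" (10-digit epoch, '.', 20-digit version),
  // so keys sort lexicographically in (epoch, version) order.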

  void encode(ceph::buffer::list &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    using ceph::encode;
    encode(version, bl);
    encode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    using ceph::decode;
    decode(version, bl);
    decode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)

inline bool operator==(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) && (l.version == r.version);
}
inline bool operator!=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch != r.epoch) || (l.version != r.version);
}
inline bool operator<(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version < r.version) : (l.epoch < r.epoch);
}
inline bool operator<=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version <= r.version) : (l.epoch <= r.epoch);
}
inline bool operator>(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version > r.version) : (l.epoch > r.epoch);
}
inline bool operator>=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version >= r.version) : (l.epoch >= r.epoch);
}
inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
  return out << e.epoch << "'" << e.version;
}

/**
 * objectstore_perf_stat_t
 *
 * current perf information about the osd
 */
struct objectstore_perf_stat_t {
  // cur_op_latency is in ns since double add/sub are not associative
  uint64_t os_commit_latency_ns;
  uint64_t os_apply_latency_ns;

  objectstore_perf_stat_t() :
    os_commit_latency_ns(0), os_apply_latency_ns(0) {}

  bool operator==(const objectstore_perf_stat_t &r) const {
    return os_commit_latency_ns == r.os_commit_latency_ns &&
      os_apply_latency_ns == r.os_apply_latency_ns;
  }

  void add(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns += o.os_commit_latency_ns;
    os_apply_latency_ns += o.os_apply_latency_ns;
  }
  void sub(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns -= o.os_commit_latency_ns;
    os_apply_latency_ns -= o.os_apply_latency_ns;
  }
  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)

/*
 * pg states
 */
#define PG_STATE_CREATING (1ULL << 0)  // creating
#define PG_STATE_ACTIVE (1ULL << 1)  // i am active. (primary: replicas too)
#define PG_STATE_CLEAN (1ULL << 2)  // peers are complete, clean of stray replicas.
#define PG_STATE_DOWN (1ULL << 4)  // a needed replica is down, PG offline
#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5)  // recovery stopped due to unfound
#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6)  // backfill stopped due to unfound
#define PG_STATE_PREMERGE (1ULL << 7)  // i am preparing to merge
#define PG_STATE_SCRUBBING (1ULL << 8)  // scrubbing
//#define PG_STATE_SCRUBQ (1ULL << 9)  // queued for scrub
#define PG_STATE_DEGRADED (1ULL << 10)  // pg contains objects with reduced redundancy
#define PG_STATE_INCONSISTENT (1ULL << 11)  // pg replicas are inconsistent (but shouldn't be)
#define PG_STATE_PEERING (1ULL << 12)  // pg is (re)peering
#define PG_STATE_REPAIR (1ULL << 13)  // pg should repair on next scrub
#define PG_STATE_RECOVERING (1ULL << 14)  // pg is recovering/migrating objects
#define PG_STATE_BACKFILL_WAIT (1ULL << 15)  // [active] reserving backfill
#define PG_STATE_INCOMPLETE (1ULL << 16)  // incomplete content, peering failed.
#define PG_STATE_STALE (1ULL << 17)  // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED (1ULL << 18)  // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB (1ULL << 19)  // deep scrub: check CRC32 on files
#define PG_STATE_BACKFILLING (1ULL << 20)  // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21)  // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT (1ULL << 22)  // waiting for recovery reservations
#define PG_STATE_UNDERSIZED (1ULL << 23)  // pg acting < pool size
#define PG_STATE_ACTIVATING (1ULL << 24)  // pg is peered but not yet active
#define PG_STATE_PEERED (1ULL << 25)  // peered, cannot go active, can recover
#define PG_STATE_SNAPTRIM (1ULL << 26)  // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27)  // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28)  // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29)  // error stopped trimming snaps
#define PG_STATE_FORCED_RECOVERY (1ULL << 30)  // force recovery of this pg before any other
#define PG_STATE_FORCED_BACKFILL (1ULL << 31)  // force backfill of this pg before any other
#define PG_STATE_FAILED_REPAIR (1ULL << 32)  // a repair failed to fix all errors
#define PG_STATE_LAGGY (1ULL << 33)  // PG is laggy/unreachable due to slow/delayed pings
#define PG_STATE_WAIT (1ULL << 34)  // PG is waiting for prior intervals' readable period to expire

std::string pg_state_string(uint64_t state);
std::string pg_vector_string(const std::vector<int32_t> &a);
std::optional<uint64_t> pg_string_state(const std::string& state);
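// Example (illustrative): pg_state_string(PG_STATE_ACTIVE | PG_STATE_CLEAN)
// yields the familiar "active+clean", and pg_string_state("active") maps back
// to PG_STATE_ACTIVE, returning std::nullopt for unrecognized names.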


/*
 * pool_snap_info_t
 *
 * attributes for a single pool snapshot.
 */
struct pool_snap_info_t {
  snapid_t snapid;
  utime_t stamp;
  std::string name;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)

inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
}


/*
 * pool_opts_t
 *
 * pool options.
 */

// The order of items in the list is important, therefore,
// you should always add to the end of the list when adding new options.

class pool_opts_t {
public:
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN, // min pg_num
    TARGET_SIZE_BYTES, // total bytes in pool
    TARGET_SIZE_RATIO, // fraction of total cluster
    PG_AUTOSCALE_BIAS,
    READ_LEASE_INTERVAL,
    DEDUP_TIER,
    DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE,
    PG_NUM_MAX, // max pg_num
  };

  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int64_t,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  template<typename T>
  T value_or(key_t key, T&& default_value) const {
    auto i = opts.find(key);
    if (i == opts.end()) {
      return std::forward<T>(default_value);
    }
    return boost::get<T>(i->second);
  }
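
  // Usage sketch (illustrative): values are stored in a boost::variant, so the
  // requested type must match the stored alternative exactly:
  //
  //   pool_opts_t o;
  //   o.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(16));
  //   int64_t v;
  //   if (o.get(pool_opts_t::PG_NUM_MIN, &v)) { /* v == 16 */ }
  //   double ratio = o.value_or(pool_opts_t::TARGET_SIZE_RATIO, 0.0);
  //
  // Requesting the wrong alternative makes boost::get throw boost::bad_get.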

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, ceph::Formatter *f) const;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
};
WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)

struct pg_merge_meta_t {
  pg_t source_pgid;
  epoch_t ready_epoch = 0;
  epoch_t last_epoch_started = 0;
  epoch_t last_epoch_clean = 0;
  eversion_t source_version;
  eversion_t target_version;

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
  }
};
WRITE_CLASS_ENCODER(pg_merge_meta_t)

class OSDMap;

/*
 * pg_pool
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  enum {
    TYPE_REPLICATED = 1, // replication
    //TYPE_RAID4 = 2,    // raid4 (never implemented)
    TYPE_ERASURE = 3,    // erasure-coded
  };
  static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */
  static std::string_view get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
    //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }

  enum {
    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
    FLAG_FULL = 1<<1, // pool is full
    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
    FLAG_NODELETE = 1<<4, // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8, // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
    FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11, // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
    FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
    FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
    FLAG_CREATING = 1<<15, // initial pool PGs are being created
    FLAG_EIO = 1<<16, // return EIO for all client ops
    FLAG_BULK = 1<<17, // pool is large
    // PGs from this pool are allowed to be created on crimson osds.
    // Pool features are restricted to those supported by crimson-osd.
    // Note, does not prohibit being created on classic osd.
    FLAG_CRIMSON = 1<<18,
  };

  static const char *get_flag_name(uint64_t f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_QUOTA: return "full_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
    case FLAG_POOL_SNAPS: return "pool_snaps";
    case FLAG_CREATING: return "creating";
    case FLAG_EIO: return "eio";
    case FLAG_BULK: return "bulk";
    case FLAG_CRIMSON: return "crimson";
    default: return "???";
    }
  }
  static std::string get_flags_string(uint64_t f) {
    std::string s;
    for (unsigned n=0; f && n<64; ++n) {
      if (f & (1ull << n)) {
        if (s.length())
          s += ",";
        s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
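  // Example (illustrative): get_flags_string(FLAG_HASHPSPOOL | FLAG_NODELETE)
  // returns "hashpspool,nodelete"; bits are scanned from low to high and the
  // names joined with commas.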
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }
  static uint64_t get_flag_by_name(const std::string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_quota")
      return FLAG_FULL_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    if (name == "selfmanaged_snaps")
      return FLAG_SELFMANAGED_SNAPS;
    if (name == "pool_snaps")
      return FLAG_POOL_SNAPS;
    if (name == "creating")
      return FLAG_CREATING;
    if (name == "eio")
      return FLAG_EIO;
    if (name == "bulk")
      return FLAG_BULK;
    if (name == "crimson")
      return FLAG_CRIMSON;
    return 0;
  }

  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;

  typedef enum {
    CACHEMODE_NONE = 0,        ///< no caching
    CACHEMODE_WRITEBACK = 1,   ///< write to cache, flush later
    CACHEMODE_FORWARD = 2,     ///< forward if not in cache
    CACHEMODE_READONLY = 3,    ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
    CACHEMODE_READPROXY = 5,   ///< proxy reads, write to cache flush later
    CACHEMODE_PROXY = 6,       ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  static cache_mode_t get_cache_mode_from_str(const std::string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      ceph_abort_msg("implement me");
    }
  }

  enum class pg_autoscale_mode_t : uint8_t {
    OFF = 0,
    WARN = 1,
    ON = 2,
    UNKNOWN = UINT8_MAX,
  };
  static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
    switch (m) {
    case pg_autoscale_mode_t::OFF: return "off";
    case pg_autoscale_mode_t::ON: return "on";
    case pg_autoscale_mode_t::WARN: return "warn";
    default: return "???";
    }
  }
  static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
    if (m == "off") {
      return pg_autoscale_mode_t::OFF;
    }
    if (m == "warn") {
      return pg_autoscale_mode_t::WARN;
    }
    if (m == "on") {
      return pg_autoscale_mode_t::ON;
    }
    return pg_autoscale_mode_t::UNKNOWN;
  }

  utime_t create_time;
  uint64_t flags = 0; ///< FLAG_*
  __u8 type = 0; ///< TYPE_*
  __u8 size = 0, min_size = 0; ///< number of osds in each pg
  __u8 crush_rule = 0; ///< crush placement rule
  __u8 object_hash = 0; ///< hash mapping object name to ps
  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;

private:
  __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
  __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
  __u32 pg_num_target = 0; ///< pg_num we should converge toward
  __u32 pgp_num_target = 0; ///< pgp_num we should converge toward

public:
  std::map<std::string, std::string> properties; ///< OBSOLETE
  std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
  epoch_t last_change = 0; ///< most recent epoch changed, excluding snapshot changes
  // If non-zero, require OSDs in at least this many different instances...
  uint32_t peering_crush_bucket_count = 0;
  // of this bucket type...
  uint32_t peering_crush_bucket_barrier = 0;
  // including this one
  int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE;
  // The per-bucket replica count is calculated with this "target"
  // instead of the above crush_bucket_count. This means we can maintain a
  // target size of 4 without attempting to place them all in 1 DC
  uint32_t peering_crush_bucket_target = 0;
  /// last epoch that forced clients to resend
  epoch_t last_force_op_resend = 0;
  /// last epoch that forced clients to resend (pre-nautilus clients only)
  epoch_t last_force_op_resend_prenautilus = 0;
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous = 0;

  /// metadata for the most recent PG merge
  pg_merge_meta_t last_pg_merge_meta;

  snapid_t snap_seq = 0; ///< seq for per-pool snapshot
  epoch_t snap_epoch = 0; ///< osdmap epoch of last snap
  uint64_t auid = 0; ///< who owns the pg

  uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
  uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool). These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  std::map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used). Snaps and removed_snaps are to be used exclusive of each
   * other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask = 0, pgp_num_mask = 0;

  std::set<uint64_t> tiers; ///< pools that are tiers of us
  int64_t tier_of = -1; ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to
  int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  bool has_snaps() const {
    return snaps.size() > 0;
  }

  bool is_stretch_pool() const {
    return peering_crush_bucket_count != 0;
  }

  bool stretch_set_can_peer(const std::set<int>& want, const OSDMap& osdmap,
                            std::ostream *out) const;
  bool stretch_set_can_peer(const std::vector<int>& want, const OSDMap& osdmap,
                            std::ostream *out) const {
    if (!is_stretch_pool()) return true;
    std::set<int> swant;
    for (auto i : want) swant.insert(i);
    return stretch_set_can_peer(swant, osdmap, out);
  }

  uint64_t target_max_bytes = 0; ///< tiering: target max pool size
  uint64_t target_max_objects = 0; ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
  uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
  uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count = 0; ///< number of periods to retain
  bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read
  uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write
  uint32_t hit_set_grade_decay_rate = 0; ///< the current hit_set carries the highest weight in the
                                         ///< object temperature count; each older hit_set's weight
                                         ///< decays by this rate relative to the one before it
  uint32_t hit_set_search_last_n = 0; ///< accumulate at most N hit_sets for temperature

  uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates
                                     ///< user does not specify any expected value
  bool fast_read = false; ///< whether fast read is enabled on the pool

  pool_opts_t opts; ///< options

  typedef enum {
    TYPE_FINGERPRINT_NONE = 0,
    TYPE_FINGERPRINT_SHA1 = 1,
    TYPE_FINGERPRINT_SHA256 = 2,
    TYPE_FINGERPRINT_SHA512 = 3,
  } fingerprint_t;
  static fingerprint_t get_fingerprint_from_str(const std::string& s) {
    if (s == "none")
      return TYPE_FINGERPRINT_NONE;
    if (s == "sha1")
      return TYPE_FINGERPRINT_SHA1;
    if (s == "sha256")
      return TYPE_FINGERPRINT_SHA256;
    if (s == "sha512")
      return TYPE_FINGERPRINT_SHA512;
    return (fingerprint_t)-1;
  }
  const fingerprint_t get_fingerprint_type() const {
    std::string fp_str;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    return get_fingerprint_from_str(fp_str);
  }
  const char *get_fingerprint_name() const {
    std::string fp_str;
    fingerprint_t fp_t;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    fp_t = get_fingerprint_from_str(fp_str);
    return get_fingerprint_name(fp_t);
  }
  static const char *get_fingerprint_name(fingerprint_t m) {
    switch (m) {
    case TYPE_FINGERPRINT_NONE: return "none";
    case TYPE_FINGERPRINT_SHA1: return "sha1";
    case TYPE_FINGERPRINT_SHA256: return "sha256";
    case TYPE_FINGERPRINT_SHA512: return "sha512";
    default: return "unknown";
    }
  }

  typedef enum {
    TYPE_DEDUP_CHUNK_NONE = 0,
    TYPE_DEDUP_CHUNK_FASTCDC = 1,
    TYPE_DEDUP_CHUNK_FIXEDCDC = 2,
  } dedup_chunk_algo_t;
  static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) {
    if (s == "none")
      return TYPE_DEDUP_CHUNK_NONE;
    if (s == "fastcdc")
      return TYPE_DEDUP_CHUNK_FASTCDC;
    if (s == "fixed")
      return TYPE_DEDUP_CHUNK_FIXEDCDC;
    return (dedup_chunk_algo_t)-1;
  }
  const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const {
    std::string algo_str;
    opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str);
    return get_dedup_chunk_algorithm_from_str(algo_str);
  }
  const char *get_dedup_chunk_algorithm_name() const {
    std::string dedup_chunk_algo_str;
    opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str);
    // local renamed to 'algo' to avoid shadowing the dedup_chunk_algo_t type
    dedup_chunk_algo_t algo = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str);
    return get_dedup_chunk_algorithm_name(algo);
  }
  static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) {
    switch (m) {
    case TYPE_DEDUP_CHUNK_NONE: return "none";
    case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc";
    case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed";
    default: return "unknown";
    }
  }

  int64_t get_dedup_tier() const {
    int64_t tier_id = 0;
    opts.get(pool_opts_t::DEDUP_TIER, &tier_id);
    return tier_id;
  }
  int64_t get_dedup_cdc_chunk_size() const {
    int64_t chunk_size = 0;
    opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size);
    return chunk_size;
  }

  /// application -> key/value metadata
  std::map<std::string, std::map<std::string, std::string>> application_metadata;

private:
  std::vector<uint32_t> grade_table;

public:
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }
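  // Worked example (illustrative): with hit_set_count = 3 and
  // hit_set_grade_decay_rate = 20, the table becomes {800000, 640000, 512000};
  // each successive (older) hit_set counts for 80% of the one before it.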

  pg_pool_t() = default;

  void dump(ceph::Formatter *f) const;

  const utime_t &get_create_time() const { return create_time; }
  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool require_rollback() const {
    return is_erasure();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_prenautilus() const {
    return last_force_op_resend_prenautilus;
  }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return !(get_type() == TYPE_ERASURE);
  }

  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  bool is_crimson() const {
    return has_flag(FLAG_CRIMSON);
  }

  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      ceph_abort_msg("unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }
  unsigned get_pg_num_target() const { return pg_num_target; }
  unsigned get_pgp_num_target() const { return pgp_num_target; }
  unsigned get_pg_num_pending() const { return pg_num_pending; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

1747 // if pg_num is not a power of two, pgs are not equally sized.
1748 // return, for a given pg, the fraction (denominator) of the total
1749 // pool size that it represents.
1750 unsigned get_pg_num_divisor(pg_t pgid) const;
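// Example (a sketch of the expected behavior, not normative): with
// pg_num = 12 the pool sits between 8 and 16 PGs. PGs 0-3 have already
// split into 0-3 plus children 8-11, so those eight PGs each cover 1/16
// of the keyspace (divisor 16), while PGs 4-7 are still unsplit and each
// cover 1/8 (divisor 8); 8*(1/16) + 4*(1/8) = 1.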
1751
1752 bool is_pending_merge(pg_t pgid, bool *target) const;
1753
1754 void set_pg_num(int p) {
1755 pg_num = p;
1756 pg_num_pending = p;
1757 calc_pg_masks();
1758 }
1759 void set_pgp_num(int p) {
1760 pgp_num = p;
1761 calc_pg_masks();
1762 }
1763 void set_pg_num_pending(int p) {
1764 pg_num_pending = p;
1765 calc_pg_masks();
1766 }
1767 void set_pg_num_target(int p) {
1768 pg_num_target = p;
1769 }
1770 void set_pgp_num_target(int p) {
1771 pgp_num_target = p;
1772 }
1773 void dec_pg_num(pg_t source_pgid,
1774 epoch_t ready_epoch,
1775 eversion_t source_version,
1776 eversion_t target_version,
1777 epoch_t last_epoch_started,
1778 epoch_t last_epoch_clean) {
1779 --pg_num;
1780 last_pg_merge_meta.source_pgid = source_pgid;
1781 last_pg_merge_meta.ready_epoch = ready_epoch;
1782 last_pg_merge_meta.source_version = source_version;
1783 last_pg_merge_meta.target_version = target_version;
1784 last_pg_merge_meta.last_epoch_started = last_epoch_started;
1785 last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
1786 calc_pg_masks();
1787 }
1788
1789 void set_quota_max_bytes(uint64_t m) {
1790 quota_max_bytes = m;
1791 }
1792 uint64_t get_quota_max_bytes() const {
1793 return quota_max_bytes;
1794 }
1795
1796 void set_quota_max_objects(uint64_t m) {
1797 quota_max_objects = m;
1798 }
1799 uint64_t get_quota_max_objects() const {
1800 return quota_max_objects;
1801 }
1802
1803 void set_last_force_op_resend(uint64_t t) {
1804 last_force_op_resend = t;
1805 last_force_op_resend_prenautilus = t;
1806 last_force_op_resend_preluminous = t;
1807 }
1808
1809 void calc_pg_masks();
1810
1811 /*
1812 * we have two snap modes:
1813 * - pool global snaps
1814 * - snap existence/non-existence defined by snaps[] and snap_seq
1815 * - user managed snaps
1816 * - removal governed by removed_snaps
1817 *
1818 * we know which mode we're using based on whether removed_snaps is empty.
1819 * If nothing has been created, both functions report false.
1820 */
1821 bool is_pool_snaps_mode() const;
1822 bool is_unmanaged_snaps_mode() const;
1823 bool is_removed_snap(snapid_t s) const;
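// For example, a pool snapshot taken with "ceph osd pool mksnap" puts the
// pool into pool-snaps mode, while allocating snap ids through the librados
// self-managed snap calls (as RBD does) puts it into unmanaged-snaps mode;
// once a pool has used one mode it cannot switch to the other.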
1824
1825 snapid_t snap_exists(std::string_view s) const;
1826 void add_snap(const char *n, utime_t stamp);
1827 uint64_t add_unmanaged_snap(bool preoctopus_compat);
1828 void remove_snap(snapid_t s);
1829 void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);
1830
1831 SnapContext get_snap_context() const;
1832
1833 /// hash an object name+namespace key to a hash position
1834 uint32_t hash_key(const std::string& key, const std::string& ns) const;
1835
1836 /// round a hash position down to a pg num
1837 uint32_t raw_hash_to_pg(uint32_t v) const;
1838
1839 /*
1840 * map a raw pg (with full precision ps) into an actual pg, for storage
1841 */
1842 pg_t raw_pg_to_pg(pg_t pg) const;
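// A minimal sketch of the stable-mod placement that raw_pg_to_pg() relies
// on (pg_num_mask is the next power of two minus one; the names here are
// illustrative, not this header's API):
//
//   uint32_t stable_mod(uint32_t x, uint32_t b, uint32_t bmask) {
//     // keep the full-width bucket when it exists, else fold to half width
//     return ((x & bmask) < b) ? (x & bmask) : (x & (bmask >> 1));
//   }
//
// e.g. with pg_num = 12 (mask 15), hashes whose low four bits are < 12 keep
// those bits; the rest fold down to their low three bits.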
1843
1844 /*
1845 * map raw pg (full precision ps) into a placement seed. include
1846 * pool id in that value so that different pools don't use the same
1847 * seeds.
1848 */
1849 ps_t raw_pg_to_pps(pg_t pg) const;
1850
1851 /// choose a random hash position within a pg
1852 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1853
1854 void encode(ceph::buffer::list& bl, uint64_t features) const;
1855 void decode(ceph::buffer::list::const_iterator& bl);
1856
1857 static void generate_test_instances(std::list<pg_pool_t*>& o);
1858 };
1859 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1860
1861 std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
1862
1863
1864 /**
1865 * a summation of object stats
1866 *
1867 * This is just a container for object stats; we don't know what they are for.
1868 *
1869 * If you add members to object_stat_sum_t, you should make sure there is
1870 * no padding among these members.
1871 * You should also modify the padding_check function.
1872 *
1873 */
1874 struct object_stat_sum_t {
1875 /**************************************************************************
1876 * WARNING: be sure to update operator==, floor, and split when
1877 * adding/removing fields!
1878 **************************************************************************/
1879 int64_t num_bytes{0}; // in bytes
1880 int64_t num_objects{0};
1881 int64_t num_object_clones{0};
1882 int64_t num_object_copies{0}; // num_objects * num_replicas
1883 int64_t num_objects_missing_on_primary{0};
1884 int64_t num_objects_degraded{0};
1885 int64_t num_objects_unfound{0};
1886 int64_t num_rd{0};
1887 int64_t num_rd_kb{0};
1888 int64_t num_wr{0};
1889 int64_t num_wr_kb{0};
1890 int64_t num_scrub_errors{0}; // total deep and shallow scrub errors
1891 int64_t num_objects_recovered{0};
1892 int64_t num_bytes_recovered{0};
1893 int64_t num_keys_recovered{0};
1894 int64_t num_shallow_scrub_errors{0};
1895 int64_t num_deep_scrub_errors{0};
1896 int64_t num_objects_dirty{0};
1897 int64_t num_whiteouts{0};
1898 int64_t num_objects_omap{0};
1899 int64_t num_objects_hit_set_archive{0};
1900 int64_t num_objects_misplaced{0};
1901 int64_t num_bytes_hit_set_archive{0};
1902 int64_t num_flush{0};
1903 int64_t num_flush_kb{0};
1904 int64_t num_evict{0};
1905 int64_t num_evict_kb{0};
1906 int64_t num_promote{0};
1907 int32_t num_flush_mode_high{0}; // 1 when in high flush mode, otherwise 0
1908 int32_t num_flush_mode_low{0}; // 1 when in low flush mode, otherwise 0
1909 int32_t num_evict_mode_some{0}; // 1 when in evict some mode, otherwise 0
1910 int32_t num_evict_mode_full{0}; // 1 when in evict full mode, otherwise 0
1911 int64_t num_objects_pinned{0};
1912 int64_t num_objects_missing{0};
1913 int64_t num_legacy_snapsets{0}; ///< upper bound on pre-luminous-style SnapSets
1914 int64_t num_large_omap_objects{0};
1915 int64_t num_objects_manifest{0};
1916 int64_t num_omap_bytes{0};
1917 int64_t num_omap_keys{0};
1918 int64_t num_objects_repaired{0};
1919
1920 object_stat_sum_t() = default;
1921
1922 void floor(int64_t f) {
1923 #define FLOOR(x) if (x < f) x = f
1924 FLOOR(num_bytes);
1925 FLOOR(num_objects);
1926 FLOOR(num_object_clones);
1927 FLOOR(num_object_copies);
1928 FLOOR(num_objects_missing_on_primary);
1929 FLOOR(num_objects_missing);
1930 FLOOR(num_objects_degraded);
1931 FLOOR(num_objects_misplaced);
1932 FLOOR(num_objects_unfound);
1933 FLOOR(num_rd);
1934 FLOOR(num_rd_kb);
1935 FLOOR(num_wr);
1936 FLOOR(num_wr_kb);
1937 FLOOR(num_large_omap_objects);
1938 FLOOR(num_objects_manifest);
1939 FLOOR(num_omap_bytes);
1940 FLOOR(num_omap_keys);
1941 FLOOR(num_shallow_scrub_errors);
1942 FLOOR(num_deep_scrub_errors);
1943 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
1944 FLOOR(num_objects_recovered);
1945 FLOOR(num_bytes_recovered);
1946 FLOOR(num_keys_recovered);
1947 FLOOR(num_objects_dirty);
1948 FLOOR(num_whiteouts);
1949 FLOOR(num_objects_omap);
1950 FLOOR(num_objects_hit_set_archive);
1951 FLOOR(num_bytes_hit_set_archive);
1952 FLOOR(num_flush);
1953 FLOOR(num_flush_kb);
1954 FLOOR(num_evict);
1955 FLOOR(num_evict_kb);
1956 FLOOR(num_promote);
1957 FLOOR(num_flush_mode_high);
1958 FLOOR(num_flush_mode_low);
1959 FLOOR(num_evict_mode_some);
1960 FLOOR(num_evict_mode_full);
1961 FLOOR(num_objects_pinned);
1962 FLOOR(num_legacy_snapsets);
1963 FLOOR(num_objects_repaired);
1964 #undef FLOOR
1965 }
1966
1967 void split(std::vector<object_stat_sum_t> &out) const {
1968 #define SPLIT(PARAM) \
1969 for (unsigned i = 0; i < out.size(); ++i) { \
1970 out[i].PARAM = PARAM / out.size(); \
1971 if (i < (PARAM % out.size())) { \
1972 out[i].PARAM++; \
1973 } \
1974 }
1975 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1976 for (unsigned i = 0; i < out.size(); ++i) { \
1977 if (PARAM) \
1978 out[i].PARAM = 1 + PARAM / out.size(); \
1979 else \
1980 out[i].PARAM = 0; \
1981 }
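// e.g. splitting a sum of 10 across out.size() == 4 children yields
// {3, 3, 2, 2}: each child gets 10/4 = 2 and the remainder 10%4 = 2 is
// handed to the first two children, so no counts are lost.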
1982
1983 SPLIT(num_bytes);
1984 SPLIT(num_objects);
1985 SPLIT(num_object_clones);
1986 SPLIT(num_object_copies);
1987 SPLIT(num_objects_missing_on_primary);
1988 SPLIT(num_objects_missing);
1989 SPLIT(num_objects_degraded);
1990 SPLIT(num_objects_misplaced);
1991 SPLIT(num_objects_unfound);
1992 SPLIT(num_rd);
1993 SPLIT(num_rd_kb);
1994 SPLIT(num_wr);
1995 SPLIT(num_wr_kb);
1996 SPLIT(num_large_omap_objects);
1997 SPLIT(num_objects_manifest);
1998 SPLIT(num_omap_bytes);
1999 SPLIT(num_omap_keys);
2000 SPLIT(num_objects_repaired);
2001 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
2002 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
2003 for (unsigned i = 0; i < out.size(); ++i) {
2004 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
2005 out[i].num_deep_scrub_errors;
2006 }
2007 SPLIT(num_objects_recovered);
2008 SPLIT(num_bytes_recovered);
2009 SPLIT(num_keys_recovered);
2010 SPLIT(num_objects_dirty);
2011 SPLIT(num_whiteouts);
2012 SPLIT(num_objects_omap);
2013 SPLIT(num_objects_hit_set_archive);
2014 SPLIT(num_bytes_hit_set_archive);
2015 SPLIT(num_flush);
2016 SPLIT(num_flush_kb);
2017 SPLIT(num_evict);
2018 SPLIT(num_evict_kb);
2019 SPLIT(num_promote);
2020 SPLIT(num_flush_mode_high);
2021 SPLIT(num_flush_mode_low);
2022 SPLIT(num_evict_mode_some);
2023 SPLIT(num_evict_mode_full);
2024 SPLIT(num_objects_pinned);
2025 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
2026 #undef SPLIT
2027 #undef SPLIT_PRESERVE_NONZERO
2028 }
2029
2030 void clear() {
2031 // FIPS zeroization audit 20191117: this memset is not security related.
2032 memset(this, 0, sizeof(*this));
2033 }
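// (memset is well-defined here: object_stat_sum_t is a trivially copyable
// aggregate of plain integer fields, with no virtuals or pointers.)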
2034
2035 void calc_copies(int nrep) {
2036 num_object_copies = nrep * num_objects;
2037 }
2038
2039 bool is_zero() const {
2040 return mem_is_zero((char*)this, sizeof(*this));
2041 }
2042
2043 void add(const object_stat_sum_t& o);
2044 void sub(const object_stat_sum_t& o);
2045
2046 void dump(ceph::Formatter *f) const;
2047 void padding_check() {
2048 static_assert(
2049 sizeof(object_stat_sum_t) ==
2050 sizeof(num_bytes) +
2051 sizeof(num_objects) +
2052 sizeof(num_object_clones) +
2053 sizeof(num_object_copies) +
2054 sizeof(num_objects_missing_on_primary) +
2055 sizeof(num_objects_degraded) +
2056 sizeof(num_objects_unfound) +
2057 sizeof(num_rd) +
2058 sizeof(num_rd_kb) +
2059 sizeof(num_wr) +
2060 sizeof(num_wr_kb) +
2061 sizeof(num_scrub_errors) +
2062 sizeof(num_large_omap_objects) +
2063 sizeof(num_objects_manifest) +
2064 sizeof(num_omap_bytes) +
2065 sizeof(num_omap_keys) +
2066 sizeof(num_objects_repaired) +
2067 sizeof(num_objects_recovered) +
2068 sizeof(num_bytes_recovered) +
2069 sizeof(num_keys_recovered) +
2070 sizeof(num_shallow_scrub_errors) +
2071 sizeof(num_deep_scrub_errors) +
2072 sizeof(num_objects_dirty) +
2073 sizeof(num_whiteouts) +
2074 sizeof(num_objects_omap) +
2075 sizeof(num_objects_hit_set_archive) +
2076 sizeof(num_objects_misplaced) +
2077 sizeof(num_bytes_hit_set_archive) +
2078 sizeof(num_flush) +
2079 sizeof(num_flush_kb) +
2080 sizeof(num_evict) +
2081 sizeof(num_evict_kb) +
2082 sizeof(num_promote) +
2083 sizeof(num_flush_mode_high) +
2084 sizeof(num_flush_mode_low) +
2085 sizeof(num_evict_mode_some) +
2086 sizeof(num_evict_mode_full) +
2087 sizeof(num_objects_pinned) +
2088 sizeof(num_objects_missing) +
2089 sizeof(num_legacy_snapsets)
2090 ,
2091 "object_stat_sum_t have padding");
2092 }
2093 void encode(ceph::buffer::list& bl) const;
2094 void decode(ceph::buffer::list::const_iterator& bl);
2095 static void generate_test_instances(std::list<object_stat_sum_t*>& o);
2096 };
2097 WRITE_CLASS_ENCODER(object_stat_sum_t)
2098
2099 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
2100
2101 /**
2102 * a collection of object stat sums
2103 *
2104 * This is a collection of stat sums over different categories.
2105 */
2106 struct object_stat_collection_t {
2107 /**************************************************************************
2108 * WARNING: be sure to update the operator== when adding/removing fields! *
2109 **************************************************************************/
2110 object_stat_sum_t sum;
2111
2112 void calc_copies(int nrep) {
2113 sum.calc_copies(nrep);
2114 }
2115
2116 void dump(ceph::Formatter *f) const;
2117 void encode(ceph::buffer::list& bl) const;
2118 void decode(ceph::buffer::list::const_iterator& bl);
2119 static void generate_test_instances(std::list<object_stat_collection_t*>& o);
2120
2121 bool is_zero() const {
2122 return sum.is_zero();
2123 }
2124
2125 void clear() {
2126 sum.clear();
2127 }
2128
2129 void floor(int64_t f) {
2130 sum.floor(f);
2131 }
2132
2133 void add(const object_stat_sum_t& o) {
2134 sum.add(o);
2135 }
2136
2137 void add(const object_stat_collection_t& o) {
2138 sum.add(o.sum);
2139 }
2140 void sub(const object_stat_collection_t& o) {
2141 sum.sub(o.sum);
2142 }
2143 };
2144 WRITE_CLASS_ENCODER(object_stat_collection_t)
2145
2146 inline bool operator==(const object_stat_collection_t& l,
2147 const object_stat_collection_t& r) {
2148 return l.sum == r.sum;
2149 }
2150
2151 enum class scrub_level_t : bool { shallow = false, deep = true };
2152 enum class scrub_type_t : bool { not_repair = false, do_repair = true };
2153
2154 /// is there a scrub in our future?
2155 enum class pg_scrub_sched_status_t : uint16_t {
2156 unknown, ///< status not reported yet
2157 not_queued, ///< not in the OSD's scrub queue. Probably not active.
2158 active, ///< scrubbing
2159 scheduled, ///< scheduled for a scrub at an already determined time
2160 queued, ///< queued to be scrubbed
2161 blocked ///< blocked waiting for objects to be unlocked
2162 };
2163
2164 struct pg_scrubbing_status_t {
2165 utime_t m_scheduled_at{};
2166 int32_t m_duration_seconds{0}; // relevant when scrubbing
2167 pg_scrub_sched_status_t m_sched_status{pg_scrub_sched_status_t::unknown};
2168 bool m_is_active{false};
2169 scrub_level_t m_is_deep{scrub_level_t::shallow};
2170 bool m_is_periodic{true};
2171 };
2172
2173 bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r);
2174
2175 /** pg_stat
2176 * aggregate stats for a single PG.
2177 */
2178 struct pg_stat_t {
2179 /**************************************************************************
2180 * WARNING: be sure to update the operator== when adding/removing fields! *
2181 **************************************************************************/
2182 eversion_t version;
2183 version_t reported_seq; // sequence number
2184 epoch_t reported_epoch; // epoch of this report
2185 uint64_t state;
2186 utime_t last_fresh; // last reported
2187 utime_t last_change; // new state != previous state
2188 utime_t last_active; // state & PG_STATE_ACTIVE
2189 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2190 utime_t last_clean; // state & PG_STATE_CLEAN
2191 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2192 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2193 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2194
2195 eversion_t log_start; // (log_start,version]
2196 eversion_t ondisk_log_start; // there may be more on disk
2197
2198 epoch_t created;
2199 epoch_t last_epoch_clean;
2200 pg_t parent;
2201 __u32 parent_split_bits;
2202
2203 eversion_t last_scrub;
2204 eversion_t last_deep_scrub;
2205 utime_t last_scrub_stamp;
2206 utime_t last_deep_scrub_stamp;
2207 utime_t last_clean_scrub_stamp;
2208 int32_t last_scrub_duration{0};
2209
2210 object_stat_collection_t stats;
2211
2212 int64_t log_size;
2213 int64_t log_dups_size;
2214 int64_t ondisk_log_size; // >= active_log_size
2215 int64_t objects_scrubbed;
2216 double scrub_duration;
2217
2218 std::vector<int32_t> up, acting;
2219 std::vector<pg_shard_t> avail_no_missing;
2220 std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
2221 epoch_t mapping_epoch;
2222
2223 std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked
2224
2225 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2226
2227 utime_t last_became_active;
2228 utime_t last_became_peered;
2229
2230 /// up, acting primaries
2231 int32_t up_primary;
2232 int32_t acting_primary;
2233
2234 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2235 // absurd already, so cap it to 2^31 and save 4 bytes at the same time
2236 uint32_t snaptrimq_len;
2237 int64_t objects_trimmed;
2238 double snaptrim_duration;
2239
2240 pg_scrubbing_status_t scrub_sched_status;
2241
2242 bool stats_invalid:1;
2243 /// true if num_objects_dirty is not accurate (because it was not
2244 /// maintained starting from pool creation)
2245 bool dirty_stats_invalid:1;
2246 bool omap_stats_invalid:1;
2247 bool hitset_stats_invalid:1;
2248 bool hitset_bytes_stats_invalid:1;
2249 bool pin_stats_invalid:1;
2250 bool manifest_stats_invalid:1;
2251
2252 pg_stat_t()
2253 : reported_seq(0),
2254 reported_epoch(0),
2255 state(0),
2256 created(0), last_epoch_clean(0),
2257 parent_split_bits(0),
2258 log_size(0), log_dups_size(0),
2259 ondisk_log_size(0),
2260 objects_scrubbed(0),
2261 scrub_duration(0),
2262 mapping_epoch(0),
2263 up_primary(-1),
2264 acting_primary(-1),
2265 snaptrimq_len(0),
2266 objects_trimmed(0),
2267 snaptrim_duration(0.0),
2268 stats_invalid(false),
2269 dirty_stats_invalid(false),
2270 omap_stats_invalid(false),
2271 hitset_stats_invalid(false),
2272 hitset_bytes_stats_invalid(false),
2273 pin_stats_invalid(false),
2274 manifest_stats_invalid(false)
2275 { }
2276
2277 epoch_t get_effective_last_epoch_clean() const {
2278 if (state & PG_STATE_CLEAN) {
2279 // we are clean as of this report, and should thus take the
2280 // reported epoch
2281 return reported_epoch;
2282 } else {
2283 return last_epoch_clean;
2284 }
2285 }
2286
2287 std::pair<epoch_t, version_t> get_version_pair() const {
2288 return { reported_epoch, reported_seq };
2289 }
2290
2291 void floor(int64_t f) {
2292 stats.floor(f);
2293 if (log_size < f)
2294 log_size = f;
2295 if (ondisk_log_size < f)
2296 ondisk_log_size = f;
2297 if (snaptrimq_len < f)
2298 snaptrimq_len = f;
2299 }
2300
2301 void add_sub_invalid_flags(const pg_stat_t& o) {
2302 // adding (or subtracting!) invalid stats renders our stats invalid too
2303 stats_invalid |= o.stats_invalid;
2304 dirty_stats_invalid |= o.dirty_stats_invalid;
2305 omap_stats_invalid |= o.omap_stats_invalid;
2306 hitset_stats_invalid |= o.hitset_stats_invalid;
2307 hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
2308 pin_stats_invalid |= o.pin_stats_invalid;
2309 manifest_stats_invalid |= o.manifest_stats_invalid;
2310 }
2311 void add(const pg_stat_t& o) {
2312 stats.add(o.stats);
2313 log_size += o.log_size;
2314 log_dups_size += o.log_dups_size;
2315 ondisk_log_size += o.ondisk_log_size;
2316 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2317 (uint64_t)(1ull << 31));
2318 add_sub_invalid_flags(o);
2319 }
2320 void sub(const pg_stat_t& o) {
2321 stats.sub(o.stats);
2322 log_size -= o.log_size;
2323 log_dups_size -= o.log_dups_size;
2324 ondisk_log_size -= o.ondisk_log_size;
2325 if (o.snaptrimq_len < snaptrimq_len) {
2326 snaptrimq_len -= o.snaptrimq_len;
2327 } else {
2328 snaptrimq_len = 0;
2329 }
2330 add_sub_invalid_flags(o);
2331 }
2332
2333 bool is_acting_osd(int32_t osd, bool primary) const;
2334 void dump(ceph::Formatter *f) const;
2335 void dump_brief(ceph::Formatter *f) const;
2336 std::string dump_scrub_schedule() const;
2337 void encode(ceph::buffer::list &bl) const;
2338 void decode(ceph::buffer::list::const_iterator &bl);
2339 static void generate_test_instances(std::list<pg_stat_t*>& o);
2340 };
2341 WRITE_CLASS_ENCODER(pg_stat_t)
2342
2343 bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2344
2345 /** store_statfs_t
2346 * ObjectStore full statfs information
2347 */
2348 struct store_statfs_t
2349 {
2350 uint64_t total = 0; ///< Total bytes
2351 uint64_t available = 0; ///< Free bytes available
2352 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2353
2354 int64_t allocated = 0; ///< Bytes allocated by the store
2355
2356 int64_t data_stored = 0; ///< Bytes actually stored by the user
2357 int64_t data_compressed = 0; ///< Bytes stored after compression
2358 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2359 int64_t data_compressed_original = 0; ///< Original (uncompressed) size of compressed data
2360
2361 int64_t omap_allocated = 0; ///< approx usage of omap data
2362 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2363
2364 void reset() {
2365 *this = store_statfs_t();
2366 }
2367 void floor(int64_t f) {
2368 #define FLOOR(x) if (int64_t(x) < f) x = f
2369 FLOOR(total);
2370 FLOOR(available);
2371 FLOOR(internally_reserved);
2372 FLOOR(allocated);
2373 FLOOR(data_stored);
2374 FLOOR(data_compressed);
2375 FLOOR(data_compressed_allocated);
2376 FLOOR(data_compressed_original);
2377
2378 FLOOR(omap_allocated);
2379 FLOOR(internal_metadata);
2380 #undef FLOOR
2381 }
2382
2383 bool operator ==(const store_statfs_t& other) const;
2384 bool is_zero() const {
2385 return *this == store_statfs_t();
2386 }
2387
2388 uint64_t get_used() const {
2389 return total - available - internally_reserved;
2390 }
2391
2392 // this accumulates both actually used and statfs's internally_reserved
2393 uint64_t get_used_raw() const {
2394 return total - available;
2395 }
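// e.g. with total = 1 TiB, available = 600 GiB, internally_reserved = 24 GiB:
// get_used() = 400 GiB of user-visible usage, while get_used_raw() = 424 GiB
// also counts the internally reserved space.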
2396
2397 float get_used_raw_ratio() const {
2398 if (total) {
2399 return (float)get_used_raw() / (float)total;
2400 } else {
2401 return 0.0;
2402 }
2403 }
2404
2405 // helpers to ease legacy code porting
2406 uint64_t kb_avail() const {
2407 return available >> 10;
2408 }
2409 uint64_t kb() const {
2410 return total >> 10;
2411 }
2412 uint64_t kb_used() const {
2413 return (total - available - internally_reserved) >> 10;
2414 }
2415 uint64_t kb_used_raw() const {
2416 return get_used_raw() >> 10;
2417 }
2418
2419 uint64_t kb_used_data() const {
2420 return allocated >> 10;
2421 }
2422 uint64_t kb_used_omap() const {
2423 return omap_allocated >> 10;
2424 }
2425
2426 uint64_t kb_used_internal_metadata() const {
2427 return internal_metadata >> 10;
2428 }
2429
2430 void add(const store_statfs_t& o) {
2431 total += o.total;
2432 available += o.available;
2433 internally_reserved += o.internally_reserved;
2434 allocated += o.allocated;
2435 data_stored += o.data_stored;
2436 data_compressed += o.data_compressed;
2437 data_compressed_allocated += o.data_compressed_allocated;
2438 data_compressed_original += o.data_compressed_original;
2439 omap_allocated += o.omap_allocated;
2440 internal_metadata += o.internal_metadata;
2441 }
2442 void sub(const store_statfs_t& o) {
2443 total -= o.total;
2444 available -= o.available;
2445 internally_reserved -= o.internally_reserved;
2446 allocated -= o.allocated;
2447 data_stored -= o.data_stored;
2448 data_compressed -= o.data_compressed;
2449 data_compressed_allocated -= o.data_compressed_allocated;
2450 data_compressed_original -= o.data_compressed_original;
2451 omap_allocated -= o.omap_allocated;
2452 internal_metadata -= o.internal_metadata;
2453 }
2454 void dump(ceph::Formatter *f) const;
2455 DENC(store_statfs_t, v, p) {
2456 DENC_START(1, 1, p);
2457 denc(v.total, p);
2458 denc(v.available, p);
2459 denc(v.internally_reserved, p);
2460 denc(v.allocated, p);
2461 denc(v.data_stored, p);
2462 denc(v.data_compressed, p);
2463 denc(v.data_compressed_allocated, p);
2464 denc(v.data_compressed_original, p);
2465 denc(v.omap_allocated, p);
2466 denc(v.internal_metadata, p);
2467 DENC_FINISH(p);
2468 }
2469 static void generate_test_instances(std::list<store_statfs_t*>& o);
2470 };
2471 WRITE_CLASS_DENC(store_statfs_t)
2472
2473 std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
2474
2475 /** osd_stat
2476 * aggregate stats for an osd
2477 */
2478 struct osd_stat_t {
2479 store_statfs_t statfs;
2480 std::vector<int> hb_peers;
2481 int32_t snap_trim_queue_len, num_snap_trimming;
2482 uint64_t num_shards_repaired;
2483
2484 pow2_hist_t op_queue_age_hist;
2485
2486 objectstore_perf_stat_t os_perf_stat;
2487 osd_alerts_t os_alerts;
2488
2489 epoch_t up_from = 0;
2490 uint64_t seq = 0;
2491
2492 uint32_t num_pgs = 0;
2493
2494 uint32_t num_osds = 0;
2495 uint32_t num_per_pool_osds = 0;
2496 uint32_t num_per_pool_omap_osds = 0;
2497
2498 struct Interfaces {
2499 uint32_t last_update; // in seconds
2500 uint32_t back_pingtime[3];
2501 uint32_t back_min[3];
2502 uint32_t back_max[3];
2503 uint32_t back_last;
2504 uint32_t front_pingtime[3];
2505 uint32_t front_min[3];
2506 uint32_t front_max[3];
2507 uint32_t front_last;
2508 };
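// note: the [3] arrays above presumably hold the 1min/5min/15min windows
// reported by dump_ping_time(); treat that reading as an assumption.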
2509 std::map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
2510
2511 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2512 num_shards_repaired(0) {}
2513
2514 void add(const osd_stat_t& o) {
2515 statfs.add(o.statfs);
2516 snap_trim_queue_len += o.snap_trim_queue_len;
2517 num_snap_trimming += o.num_snap_trimming;
2518 num_shards_repaired += o.num_shards_repaired;
2519 op_queue_age_hist.add(o.op_queue_age_hist);
2520 os_perf_stat.add(o.os_perf_stat);
2521 num_pgs += o.num_pgs;
2522 num_osds += o.num_osds;
2523 num_per_pool_osds += o.num_per_pool_osds;
2524 num_per_pool_omap_osds += o.num_per_pool_omap_osds;
2525 for (const auto& a : o.os_alerts) {
2526 auto& target = os_alerts[a.first];
2527 for (auto& i : a.second) {
2528 target.emplace(i.first, i.second);
2529 }
2530 }
2531 }
2532 void sub(const osd_stat_t& o) {
2533 statfs.sub(o.statfs);
2534 snap_trim_queue_len -= o.snap_trim_queue_len;
2535 num_snap_trimming -= o.num_snap_trimming;
2536 num_shards_repaired -= o.num_shards_repaired;
2537 op_queue_age_hist.sub(o.op_queue_age_hist);
2538 os_perf_stat.sub(o.os_perf_stat);
2539 num_pgs -= o.num_pgs;
2540 num_osds -= o.num_osds;
2541 num_per_pool_osds -= o.num_per_pool_osds;
2542 num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
2543 for (const auto& a : o.os_alerts) {
2544 auto& target = os_alerts[a.first];
2545 for (auto& i : a.second) {
2546 target.erase(i.first);
2547 }
2548 if (target.empty()) {
2549 os_alerts.erase(a.first);
2550 }
2551 }
2552 }
2553 void dump(ceph::Formatter *f, bool with_net = true) const;
2554 void dump_ping_time(ceph::Formatter *f) const;
2555 void encode(ceph::buffer::list &bl, uint64_t features) const;
2556 void decode(ceph::buffer::list::const_iterator &bl);
2557 static void generate_test_instances(std::list<osd_stat_t*>& o);
2558 };
2559 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2560
2561 inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2562 return l.statfs == r.statfs &&
2563 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2564 l.num_snap_trimming == r.num_snap_trimming &&
2565 l.num_shards_repaired == r.num_shards_repaired &&
2566 l.hb_peers == r.hb_peers &&
2567 l.op_queue_age_hist == r.op_queue_age_hist &&
2568 l.os_perf_stat == r.os_perf_stat &&
2569 l.num_pgs == r.num_pgs &&
2570 l.num_osds == r.num_osds &&
2571 l.num_per_pool_osds == r.num_per_pool_osds &&
2572 l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
2573 }
2574 inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2575 return !(l == r);
2576 }
2577
2578 inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
2579 return out << "osd_stat(" << s.statfs << ", "
2580 << "peers " << s.hb_peers
2581 << " op hist " << s.op_queue_age_hist.h
2582 << ")";
2583 }
2584
2585 /*
2586 * summation over an entire pool
2587 */
2588 struct pool_stat_t {
2589 object_stat_collection_t stats;
2590 store_statfs_t store_stats;
2591 int64_t log_size;
2592 int64_t ondisk_log_size; // >= active_log_size
2593 int32_t up; ///< number of up replicas or shards
2594 int32_t acting; ///< number of acting replicas or shards
2595 int32_t num_store_stats; ///< amount of store_stats accumulated
2596
2597 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2598 num_store_stats(0)
2599 { }
2600
2601 void floor(int64_t f) {
2602 stats.floor(f);
2603 store_stats.floor(f);
2604 if (log_size < f)
2605 log_size = f;
2606 if (ondisk_log_size < f)
2607 ondisk_log_size = f;
2608 if (up < f)
2609 up = f;
2610 if (acting < f)
2611 acting = f;
2612 if (num_store_stats < f)
2613 num_store_stats = f;
2614 }
2615
2616 void add(const store_statfs_t& o) {
2617 store_stats.add(o);
2618 ++num_store_stats;
2619 }
2620 void sub(const store_statfs_t& o) {
2621 store_stats.sub(o);
2622 --num_store_stats;
2623 }
2624
2625 void add(const pg_stat_t& o) {
2626 stats.add(o.stats);
2627 log_size += o.log_size;
2628 ondisk_log_size += o.ondisk_log_size;
2629 up += o.up.size();
2630 acting += o.acting.size();
2631 }
2632 void sub(const pg_stat_t& o) {
2633 stats.sub(o.stats);
2634 log_size -= o.log_size;
2635 ondisk_log_size -= o.ondisk_log_size;
2636 up -= o.up.size();
2637 acting -= o.acting.size();
2638 }
2639
2640 bool is_zero() const {
2641 return (stats.is_zero() &&
2642 store_stats.is_zero() &&
2643 log_size == 0 &&
2644 ondisk_log_size == 0 &&
2645 up == 0 &&
2646 acting == 0 &&
2647 num_store_stats == 0);
2648 }
2649
2650 // helper accessors to retrieve used/netto bytes depending on the
2651 // collection method: new per-pool objectstore report or legacy PG
2652 // summation at OSD.
2653 // In legacy mode the used and netto values are the same. But for the new
2654 // per-pool collection, 'used' is the amount of space ALLOCATED at all
2655 // related OSDs and 'netto' is the amount of user data actually stored.
2656 uint64_t get_allocated_data_bytes(bool per_pool) const {
2657 if (per_pool) {
2658 return store_stats.allocated;
2659 } else {
2660 // legacy mode, use numbers from 'stats'
2661 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2662 }
2663 }
2664 uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
2665 if (per_pool_omap) {
2666 return store_stats.omap_allocated;
2667 } else {
2668 // nautilus-era bluestore does not break omap usage out by pool; report
2669 // the scrub-derived value instead. this will be imprecise in that it
2670 // won't account for any storage overhead/efficiency.
2671 return stats.sum.num_omap_bytes;
2672 }
2673 }
2674 uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
2675 bool per_pool) const {
2676 // NOTE: we need the space amp factor so that we can work backwards from
2677 // the raw utilization to the amount of data that the user actually stored.
2678 if (per_pool) {
2679 return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2680 } else {
2681 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2682 // raw_used_rate factor here because we are working from the PG stats
2683 // directly.
2684 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2685 }
2686 }
2687 uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
2688 bool per_pool_omap) const {
2689 if (per_pool_omap) {
2690 return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
2691 } else {
2692 // omap usage is lazily reported during scrub; this value may lag.
2693 return stats.sum.num_omap_bytes;
2694 }
2695 }
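// Illustrative use (a sketch with assumed values, not code from this tree):
// a df-style report recovers user-stored bytes by dividing the summed
// per-OSD allocation by the pool's raw-space factor -- roughly `size` for
// replicated pools, (k+m)/k for EC pools:
//
//   // 3x replicated pool; store_stats.data_stored summed to 300 GiB
//   float raw_used_rate = 3.0f;
//   uint64_t stored = ps.get_user_data_bytes(raw_used_rate, true);
//   // stored is ~100 GiB of actual user data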
2696
2697 void dump(ceph::Formatter *f) const;
2698 void encode(ceph::buffer::list &bl, uint64_t features) const;
2699 void decode(ceph::buffer::list::const_iterator &bl);
2700 static void generate_test_instances(std::list<pool_stat_t*>& o);
2701 };
2702 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2703
2704
2705 // -----------------------------------------
2706
2707 /**
2708 * pg_hit_set_info_t - information about a single recorded HitSet
2709 *
2710 * Track basic metadata about a HitSet, like the number of insertions
2711 * and the time range it covers.
2712 */
2713 struct pg_hit_set_info_t {
2714 utime_t begin, end; ///< time interval
2715 eversion_t version; ///< version this HitSet object was written
2716 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2717
2718 friend bool operator==(const pg_hit_set_info_t& l,
2719 const pg_hit_set_info_t& r) {
2720 return
2721 l.begin == r.begin &&
2722 l.end == r.end &&
2723 l.version == r.version &&
2724 l.using_gmt == r.using_gmt;
2725 }
2726
2727 explicit pg_hit_set_info_t(bool using_gmt = true)
2728 : using_gmt(using_gmt) {}
2729
2730 void encode(ceph::buffer::list &bl) const;
2731 void decode(ceph::buffer::list::const_iterator &bl);
2732 void dump(ceph::Formatter *f) const;
2733 static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
2734 };
2735 WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2736
2737 /**
2738 * pg_hit_set_history_t - information about a history of hitsets
2739 *
2740 * Include information about the currently accumulating hit set as well
2741 * as archived/historical ones.
2742 */
2743 struct pg_hit_set_history_t {
2744 eversion_t current_last_update; ///< last version inserted into current set
2745 std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2746
2747 friend bool operator==(const pg_hit_set_history_t& l,
2748 const pg_hit_set_history_t& r) {
2749 return
2750 l.current_last_update == r.current_last_update &&
2751 l.history == r.history;
2752 }
2753
2754 void encode(ceph::buffer::list &bl) const;
2755 void decode(ceph::buffer::list::const_iterator &bl);
2756 void dump(ceph::Formatter *f) const;
2757 static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
2758 };
2759 WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2760
2761
2762 // -----------------------------------------
2763
2764 /**
2765 * pg_history_t - information about recent pg peering/mapping history
2766 *
2767 * This is aggressively shared between OSDs to bound the amount of past
2768 * history they need to worry about.
2769 */
2770 struct pg_history_t {
2771 epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
2772 epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
2773 // (note: may be pg creation epoch for
2774 // pre-luminous clusters)
2775 epoch_t last_epoch_started = 0; // lower bound on last epoch started (anywhere, not necessarily locally)
2776 // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
2777 epoch_t last_interval_started = 0; // first epoch of last_epoch_started interval
2778 epoch_t last_epoch_clean = 0; // lower bound on last epoch the PG was completely clean.
2779 epoch_t last_interval_clean = 0; // first epoch of last_epoch_clean interval
2780 epoch_t last_epoch_split = 0; // as parent or child
2781 epoch_t last_epoch_marked_full = 0; // pool or cluster
2782
2783 /**
2784 * In the event of a map discontinuity, same_*_since may reflect the first
2785 * map the osd has seen in the new map sequence rather than the actual start
2786 * of the interval. This is ok since a discontinuity at epoch e means there
2787 * must have been a clean interval between e and now and that we cannot be
2788 * in the active set during the interval containing e.
2789 */
2790 epoch_t same_up_since = 0; // same up set since
2791 epoch_t same_interval_since = 0; // same acting AND up set since
2792 epoch_t same_primary_since = 0; // same primary at least back through this epoch.
2793
2794 eversion_t last_scrub;
2795 eversion_t last_deep_scrub;
2796 utime_t last_scrub_stamp;
2797 utime_t last_deep_scrub_stamp;
2798 utime_t last_clean_scrub_stamp;
2799
2800 /// upper bound on how long prior interval readable (relative to encode time)
2801 ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
2802
2803 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2804 return
2805 l.epoch_created == r.epoch_created &&
2806 l.epoch_pool_created == r.epoch_pool_created &&
2807 l.last_epoch_started == r.last_epoch_started &&
2808 l.last_interval_started == r.last_interval_started &&
2809 l.last_epoch_clean == r.last_epoch_clean &&
2810 l.last_interval_clean == r.last_interval_clean &&
2811 l.last_epoch_split == r.last_epoch_split &&
2812 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2813 l.same_up_since == r.same_up_since &&
2814 l.same_interval_since == r.same_interval_since &&
2815 l.same_primary_since == r.same_primary_since &&
2816 l.last_scrub == r.last_scrub &&
2817 l.last_deep_scrub == r.last_deep_scrub &&
2818 l.last_scrub_stamp == r.last_scrub_stamp &&
2819 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2820 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2821 l.prior_readable_until_ub == r.prior_readable_until_ub;
2822 }
2823
2824 pg_history_t() {}
2825 pg_history_t(epoch_t created, utime_t stamp)
2826 : epoch_created(created),
2827 epoch_pool_created(created),
2828 same_up_since(created),
2829 same_interval_since(created),
2830 same_primary_since(created),
2831 last_scrub_stamp(stamp),
2832 last_deep_scrub_stamp(stamp),
2833 last_clean_scrub_stamp(stamp) {}
2834
2835 bool merge(const pg_history_t &other) {
2836 // Here, we only update the fields which cannot be calculated from the OSDmap.
2837 bool modified = false;
2838 if (epoch_created < other.epoch_created) {
2839 epoch_created = other.epoch_created;
2840 modified = true;
2841 }
2842 if (epoch_pool_created < other.epoch_pool_created) {
2843 // FIXME: for jewel compat only; this should either be 0 or always the
2844 // same value across all pg instances.
2845 epoch_pool_created = other.epoch_pool_created;
2846 modified = true;
2847 }
2848 if (last_epoch_started < other.last_epoch_started) {
2849 last_epoch_started = other.last_epoch_started;
2850 modified = true;
2851 }
2852 if (last_interval_started < other.last_interval_started) {
2853 last_interval_started = other.last_interval_started;
2854 // if we are learning about a newer *started* interval, our
2855 // readable_until_ub is obsolete
2856 prior_readable_until_ub = other.prior_readable_until_ub;
2857 modified = true;
2858 } else if (other.last_interval_started == last_interval_started &&
2859 other.prior_readable_until_ub < prior_readable_until_ub) {
2860 // if other is the *same* interval, then pull our upper bound in
2861 // if they have a tighter bound.
2862 prior_readable_until_ub = other.prior_readable_until_ub;
2863 modified = true;
2864 }
2865 if (last_epoch_clean < other.last_epoch_clean) {
2866 last_epoch_clean = other.last_epoch_clean;
2867 modified = true;
2868 }
2869 if (last_interval_clean < other.last_interval_clean) {
2870 last_interval_clean = other.last_interval_clean;
2871 modified = true;
2872 }
2873 if (last_epoch_split < other.last_epoch_split) {
2874 last_epoch_split = other.last_epoch_split;
2875 modified = true;
2876 }
2877 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2878 last_epoch_marked_full = other.last_epoch_marked_full;
2879 modified = true;
2880 }
2881 if (other.last_scrub > last_scrub) {
2882 last_scrub = other.last_scrub;
2883 modified = true;
2884 }
2885 if (other.last_scrub_stamp > last_scrub_stamp) {
2886 last_scrub_stamp = other.last_scrub_stamp;
2887 modified = true;
2888 }
2889 if (other.last_deep_scrub > last_deep_scrub) {
2890 last_deep_scrub = other.last_deep_scrub;
2891 modified = true;
2892 }
2893 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2894 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2895 modified = true;
2896 }
2897 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2898 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2899 modified = true;
2900 }
2901 return modified;
2902 }
2903
2904 void encode(ceph::buffer::list& bl) const;
2905 void decode(ceph::buffer::list::const_iterator& p);
2906 void dump(ceph::Formatter *f) const;
2907 static void generate_test_instances(std::list<pg_history_t*>& o);
2908
2909 ceph::signedspan refresh_prior_readable_until_ub(
2910 ceph::signedspan now, ///< now, relative to osd startup_time
2911 ceph::signedspan ub) { ///< ub, relative to osd startup_time
2912 if (now >= ub) {
2913 // prior interval(s) are unreadable; we can zero the upper bound
2914 prior_readable_until_ub = ceph::signedspan::zero();
2915 return ceph::signedspan::zero();
2916 } else {
2917 prior_readable_until_ub = ub - now;
2918 return ub;
2919 }
2920 }
2921 ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
2922 if (prior_readable_until_ub == ceph::signedspan::zero()) {
2923 return ceph::signedspan::zero();
2924 }
2925 return now + prior_readable_until_ub;
2926 }
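// Trace of the two helpers above: refresh_prior_readable_until_ub(100s, 130s)
// stores a 30s delta and returns 130s; once now >= 130s the prior interval
// is no longer readable, so the delta is zeroed and zero is returned.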
2927 };
2928 WRITE_CLASS_ENCODER(pg_history_t)
2929
2930 inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
2931 out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2932 << " lis/c=" << h.last_interval_started
2933 << "/" << h.last_interval_clean
2934 << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
2935 << "/" << h.last_epoch_marked_full
2936 << " sis=" << h.same_interval_since;
2937 if (h.prior_readable_until_ub != ceph::timespan::zero()) {
2938 out << " pruub=" << h.prior_readable_until_ub;
2939 }
2940 return out;
2941 }
2942
2943
2944 /**
2945 * pg_info_t - summary of PG statistics.
2946 *
2947 * some notes:
2948 * - last_complete implies we have all objects that existed as of that
2949 * stamp, OR a newer object, OR have already applied a later delete.
2950 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2951 * otherwise, we have no idea what the pg is supposed to contain.
2952 */
2953 struct pg_info_t {
2954 spg_t pgid;
2955 eversion_t last_update; ///< last object version applied to store.
2956 eversion_t last_complete; ///< last version pg was complete through.
2957 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2958 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2959
2960 version_t last_user_version; ///< last user object version applied to store
2961
2962 eversion_t log_tail; ///< oldest log entry.
2963
2964 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2965
2966 interval_set<snapid_t> purged_snaps;
2967
2968 pg_stat_t stats;
2969
2970 pg_history_t history;
2971 pg_hit_set_history_t hit_set;
2972
2973 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2974 return
2975 l.pgid == r.pgid &&
2976 l.last_update == r.last_update &&
2977 l.last_complete == r.last_complete &&
2978 l.last_epoch_started == r.last_epoch_started &&
2979 l.last_interval_started == r.last_interval_started &&
2980 l.last_user_version == r.last_user_version &&
2981 l.log_tail == r.log_tail &&
2982 l.last_backfill == r.last_backfill &&
2983 l.purged_snaps == r.purged_snaps &&
2984 l.stats == r.stats &&
2985 l.history == r.history &&
2986 l.hit_set == r.hit_set;
2987 }
2988
2989 pg_info_t()
2990 : last_epoch_started(0),
2991 last_interval_started(0),
2992 last_user_version(0),
2993 last_backfill(hobject_t::get_max())
2994 { }
2995 // cppcheck-suppress noExplicitConstructor
2996 pg_info_t(spg_t p)
2997 : pgid(p),
2998 last_epoch_started(0),
2999 last_interval_started(0),
3000 last_user_version(0),
3001 last_backfill(hobject_t::get_max())
3002 { }
3003
3004 void set_last_backfill(hobject_t pos) {
3005 last_backfill = pos;
3006 }
3007
3008 bool is_empty() const { return last_update.version == 0; }
3009 bool dne() const { return history.epoch_created == 0; }
3010
3011 bool has_missing() const { return last_complete != last_update; }
3012 bool is_incomplete() const { return !last_backfill.is_max(); }
3013
3014 void encode(ceph::buffer::list& bl) const;
3015 void decode(ceph::buffer::list::const_iterator& p);
3016 void dump(ceph::Formatter *f) const;
3017 static void generate_test_instances(std::list<pg_info_t*>& o);
3018 };
3019 WRITE_CLASS_ENCODER(pg_info_t)
3020
3021 inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
3022 {
3023 out << pgi.pgid << "(";
3024 if (pgi.dne())
3025 out << " DNE";
3026 if (pgi.is_empty())
3027 out << " empty";
3028 else {
3029 out << " v " << pgi.last_update;
3030 if (pgi.last_complete != pgi.last_update)
3031 out << " lc " << pgi.last_complete;
3032 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
3033 }
3034 if (pgi.is_incomplete())
3035 out << " lb " << pgi.last_backfill;
3036 //out << " c " << pgi.epoch_created;
3037 out << " local-lis/les=" << pgi.last_interval_started
3038 << "/" << pgi.last_epoch_started;
3039 out << " n=" << pgi.stats.stats.sum.num_objects;
3040 out << " " << pgi.history
3041 << ")";
3042 return out;
3043 }
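// With hypothetical values, a pg_info_t for pg 1.7 might render as:
//   1.7( v 27'5 (0'0,27'5] local-lis/les=24/25 n=5 ec=10/10 lis/c=24/24
//   les/c/f=25/25/0 sis=24)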
3044
3045 /**
3046 * pg_fast_info_t - common pg_info_t fields
3047 *
3048 * These are the fields of pg_info_t (and children) that are updated for
3049 * most IO operations.
3050 *
3051 * ** WARNING **
3052 * Because we rely on these fields to be applied to the normal
3053 * info struct, adding a new field here that is not also new in info
3054 * means that we must set an incompat OSD feature bit!
3055 */
3056 struct pg_fast_info_t {
3057 eversion_t last_update;
3058 eversion_t last_complete;
3059 version_t last_user_version;
3060 struct { // pg_stat_t stats
3061 eversion_t version;
3062 version_t reported_seq;
3063 utime_t last_fresh;
3064 utime_t last_active;
3065 utime_t last_peered;
3066 utime_t last_clean;
3067 utime_t last_unstale;
3068 utime_t last_undegraded;
3069 utime_t last_fullsized;
3070 int64_t log_size; // (also ondisk_log_size, which has the same value)
3071 struct { // object_stat_collection_t stats;
3072 struct { // object_stat_sum_t sum
3073 int64_t num_bytes; // in bytes
3074 int64_t num_objects;
3075 int64_t num_object_copies;
3076 int64_t num_rd;
3077 int64_t num_rd_kb;
3078 int64_t num_wr;
3079 int64_t num_wr_kb;
3080 int64_t num_objects_dirty;
3081 } sum;
3082 } stats;
3083 } stats;
3084
3085 void populate_from(const pg_info_t& info) {
3086 last_update = info.last_update;
3087 last_complete = info.last_complete;
3088 last_user_version = info.last_user_version;
3089 stats.version = info.stats.version;
3090 stats.reported_seq = info.stats.reported_seq;
3091 stats.last_fresh = info.stats.last_fresh;
3092 stats.last_active = info.stats.last_active;
3093 stats.last_peered = info.stats.last_peered;
3094 stats.last_clean = info.stats.last_clean;
3095 stats.last_unstale = info.stats.last_unstale;
3096 stats.last_undegraded = info.stats.last_undegraded;
3097 stats.last_fullsized = info.stats.last_fullsized;
3098 stats.log_size = info.stats.log_size;
3099 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
3100 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
3101 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
3102 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
3103 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
3104 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
3105 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
3106 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
3107 }
3108
3109 bool try_apply_to(pg_info_t* info) {
3110 if (last_update <= info->last_update)
3111 return false;
3112 info->last_update = last_update;
3113 info->last_complete = last_complete;
3114 info->last_user_version = last_user_version;
3115 info->stats.version = stats.version;
3116 info->stats.reported_seq = stats.reported_seq;
3117 info->stats.last_fresh = stats.last_fresh;
3118 info->stats.last_active = stats.last_active;
3119 info->stats.last_peered = stats.last_peered;
3120 info->stats.last_clean = stats.last_clean;
3121 info->stats.last_unstale = stats.last_unstale;
3122 info->stats.last_undegraded = stats.last_undegraded;
3123 info->stats.last_fullsized = stats.last_fullsized;
3124 info->stats.log_size = stats.log_size;
3125 info->stats.ondisk_log_size = stats.log_size;
3126 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
3127 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
3128 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
3129 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
3130 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
3131 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
3132 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
3133 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
3134 return true;
3135 }
3136
3137 void encode(ceph::buffer::list& bl) const {
3138 ENCODE_START(1, 1, bl);
3139 encode(last_update, bl);
3140 encode(last_complete, bl);
3141 encode(last_user_version, bl);
3142 encode(stats.version, bl);
3143 encode(stats.reported_seq, bl);
3144 encode(stats.last_fresh, bl);
3145 encode(stats.last_active, bl);
3146 encode(stats.last_peered, bl);
3147 encode(stats.last_clean, bl);
3148 encode(stats.last_unstale, bl);
3149 encode(stats.last_undegraded, bl);
3150 encode(stats.last_fullsized, bl);
3151 encode(stats.log_size, bl);
3152 encode(stats.stats.sum.num_bytes, bl);
3153 encode(stats.stats.sum.num_objects, bl);
3154 encode(stats.stats.sum.num_object_copies, bl);
3155 encode(stats.stats.sum.num_rd, bl);
3156 encode(stats.stats.sum.num_rd_kb, bl);
3157 encode(stats.stats.sum.num_wr, bl);
3158 encode(stats.stats.sum.num_wr_kb, bl);
3159 encode(stats.stats.sum.num_objects_dirty, bl);
3160 ENCODE_FINISH(bl);
3161 }
3162 void decode(ceph::buffer::list::const_iterator& p) {
3163 DECODE_START(1, p);
3164 decode(last_update, p);
3165 decode(last_complete, p);
3166 decode(last_user_version, p);
3167 decode(stats.version, p);
3168 decode(stats.reported_seq, p);
3169 decode(stats.last_fresh, p);
3170 decode(stats.last_active, p);
3171 decode(stats.last_peered, p);
3172 decode(stats.last_clean, p);
3173 decode(stats.last_unstale, p);
3174 decode(stats.last_undegraded, p);
3175 decode(stats.last_fullsized, p);
3176 decode(stats.log_size, p);
3177 decode(stats.stats.sum.num_bytes, p);
3178 decode(stats.stats.sum.num_objects, p);
3179 decode(stats.stats.sum.num_object_copies, p);
3180 decode(stats.stats.sum.num_rd, p);
3181 decode(stats.stats.sum.num_rd_kb, p);
3182 decode(stats.stats.sum.num_wr, p);
3183 decode(stats.stats.sum.num_wr_kb, p);
3184 decode(stats.stats.sum.num_objects_dirty, p);
3185 DECODE_FINISH(p);
3186 }
3187 };
3188 WRITE_CLASS_ENCODER(pg_fast_info_t)
3189
3190
3191 /**
3192 * PastIntervals -- information needed to determine the PriorSet and
3193 * the might_have_unfound set
3194 */
3195 class PastIntervals {
3196 #ifdef WITH_SEASTAR
3197 using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
3198 #else
3199 using OSDMapRef = std::shared_ptr<const OSDMap>;
3200 #endif
3201 public:
3202 struct pg_interval_t {
3203 std::vector<int32_t> up, acting;
3204 epoch_t first, last;
3205 bool maybe_went_rw;
3206 int32_t primary;
3207 int32_t up_primary;
3208
3209 pg_interval_t()
3210 : first(0), last(0),
3211 maybe_went_rw(false),
3212 primary(-1),
3213 up_primary(-1)
3214 {}
3215
3216 pg_interval_t(
3217 std::vector<int32_t> &&up,
3218 std::vector<int32_t> &&acting,
3219 epoch_t first,
3220 epoch_t last,
3221 bool maybe_went_rw,
3222 int32_t primary,
3223 int32_t up_primary)
3224 : up(up), acting(acting), first(first), last(last),
3225 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3226 {}
3227
3228 void encode(ceph::buffer::list& bl) const;
3229 void decode(ceph::buffer::list::const_iterator& bl);
3230 void dump(ceph::Formatter *f) const;
3231 static void generate_test_instances(std::list<pg_interval_t*>& o);
3232 };
3233
3234 PastIntervals();
3235 PastIntervals(PastIntervals &&rhs) = default;
3236 PastIntervals &operator=(PastIntervals &&rhs) = default;
3237
3238 PastIntervals(const PastIntervals &rhs);
3239 PastIntervals &operator=(const PastIntervals &rhs);
3240
3241 class interval_rep {
3242 public:
3243 virtual size_t size() const = 0;
3244 virtual bool empty() const = 0;
3245 virtual void clear() = 0;
3246 virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
3247 virtual std::set<pg_shard_t> get_all_participants(
3248 bool ec_pool) const = 0;
3249 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3250 virtual std::unique_ptr<interval_rep> clone() const = 0;
3251 virtual std::ostream &print(std::ostream &out) const = 0;
3252 virtual void encode(ceph::buffer::list &bl) const = 0;
3253 virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
3254 virtual void dump(ceph::Formatter *f) const = 0;
3255 virtual void iterate_mayberw_back_to(
3256 epoch_t les,
3257 std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
3258
3259 virtual bool has_full_intervals() const { return false; }
3260 virtual void iterate_all_intervals(
3261 std::function<void(const pg_interval_t &)> &&f) const {
3262 ceph_assert(!has_full_intervals());
3263 ceph_abort_msg("not valid for this implementation");
3264 }
3265 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
3266
3267 virtual ~interval_rep() {}
3268 };
3269 friend class pi_compact_rep;
3270 private:
3271
3272 std::unique_ptr<interval_rep> past_intervals;
3273
3274 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
3275
3276 public:
3277 void add_interval(bool ec_pool, const pg_interval_t &interval) {
3278 ceph_assert(past_intervals);
3279 return past_intervals->add_interval(ec_pool, interval);
3280 }
3281
3282 void encode(ceph::buffer::list &bl) const {
3283 ENCODE_START(1, 1, bl);
3284 if (past_intervals) {
3285 __u8 type = 2;
3286 encode(type, bl);
3287 past_intervals->encode(bl);
3288 } else {
3289 encode((__u8)0, bl);
3290 }
3291 ENCODE_FINISH(bl);
3292 }
3293
3294 void decode(ceph::buffer::list::const_iterator &bl);
3295
3296 void dump(ceph::Formatter *f) const {
3297 ceph_assert(past_intervals);
3298 past_intervals->dump(f);
3299 }
3300 static void generate_test_instances(std::list<PastIntervals *> & o);
3301
3302 /**
3303 * Determines whether there is an interval change
3304 */
3305 static bool is_new_interval(
3306 int old_acting_primary,
3307 int new_acting_primary,
3308 const std::vector<int> &old_acting,
3309 const std::vector<int> &new_acting,
3310 int old_up_primary,
3311 int new_up_primary,
3312 const std::vector<int> &old_up,
3313 const std::vector<int> &new_up,
3314 int old_size,
3315 int new_size,
3316 int old_min_size,
3317 int new_min_size,
3318 unsigned old_pg_num,
3319 unsigned new_pg_num,
3320 unsigned old_pg_num_pending,
3321 unsigned new_pg_num_pending,
3322 bool old_sort_bitwise,
3323 bool new_sort_bitwise,
3324 bool old_recovery_deletes,
3325 bool new_recovery_deletes,
3326 uint32_t old_crush_count,
3327 uint32_t new_crush_count,
3328 uint32_t old_crush_target,
3329 uint32_t new_crush_target,
3330 uint32_t old_crush_barrier,
3331 uint32_t new_crush_barrier,
3332 int32_t old_crush_member,
3333 int32_t new_crush_member,
3334 pg_t pgid
3335 );
3336
3337 /**
3338 * Determines whether there is an interval change
3339 */
3340 static bool is_new_interval(
3341 int old_acting_primary, ///< [in] primary as of lastmap
3342 int new_acting_primary, ///< [in] primary as of osdmap
3343 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3344 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3345 int old_up_primary, ///< [in] up primary of lastmap
3346 int new_up_primary, ///< [in] up primary of osdmap
3347 const std::vector<int> &old_up, ///< [in] up as of lastmap
3348 const std::vector<int> &new_up, ///< [in] up as of osdmap
3349 const OSDMap *osdmap, ///< [in] current map
3350 const OSDMap *lastmap, ///< [in] last map
3351 pg_t pgid ///< [in] pgid for pg
3352 );
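/* Editorial note: a minimal sketch (not from this header) of driving the
 * OSDMap-based overload above. `lastmap`, `osdmap` and `pgid` are assumed
 * inputs; pg_to_up_acting_osds() is the OSDMap helper that resolves a PG's
 * up/acting sets and primaries:
 *
 *   std::vector<int> old_up, old_acting, new_up, new_acting;
 *   int old_up_primary, old_acting_primary;
 *   int new_up_primary, new_acting_primary;
 *   lastmap->pg_to_up_acting_osds(pgid, &old_up, &old_up_primary,
 *                                 &old_acting, &old_acting_primary);
 *   osdmap->pg_to_up_acting_osds(pgid, &new_up, &new_up_primary,
 *                                &new_acting, &new_acting_primary);
 *   bool changed = PastIntervals::is_new_interval(
 *     old_acting_primary, new_acting_primary, old_acting, new_acting,
 *     old_up_primary, new_up_primary, old_up, new_up,
 *     osdmap, lastmap, pgid);
 */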
3353
3354 /**
3355 * Integrates a new map into *past_intervals, returns true
3356 * if an interval was closed out.
3357 */
3358 static bool check_new_interval(
3359 int old_acting_primary, ///< [in] primary as of lastmap
3360 int new_acting_primary, ///< [in] primary as of osdmap
3361 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3362 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3363 int old_up_primary, ///< [in] up primary of lastmap
3364 int new_up_primary, ///< [in] up primary of osdmap
3365 const std::vector<int> &old_up, ///< [in] up as of lastmap
3366 const std::vector<int> &new_up, ///< [in] up as of osdmap
3367 epoch_t same_interval_since, ///< [in] as of osdmap
3368 epoch_t last_epoch_clean, ///< [in] current
3369 const OSDMap *osdmap, ///< [in] current map
3370 const OSDMap *lastmap, ///< [in] last map
3371 pg_t pgid, ///< [in] pgid for pg
3372 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate for whether the pg can be active
3373 PastIntervals *past_intervals, ///< [out] intervals
3374 std::ostream *out = 0 ///< [out] debug ostream
3375 );
3376 static bool check_new_interval(
3377 int old_acting_primary, ///< [in] primary as of lastmap
3378 int new_acting_primary, ///< [in] primary as of osdmap
3379 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3380 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3381 int old_up_primary, ///< [in] up primary of lastmap
3382 int new_up_primary, ///< [in] up primary of osdmap
3383 const std::vector<int> &old_up, ///< [in] up as of lastmap
3384 const std::vector<int> &new_up, ///< [in] up as of osdmap
3385 epoch_t same_interval_since, ///< [in] as of osdmap
3386 epoch_t last_epoch_clean, ///< [in] current
3387 OSDMapRef osdmap, ///< [in] current map
3388 OSDMapRef lastmap, ///< [in] last map
3389 pg_t pgid, ///< [in] pgid for pg
3390 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate for whether the pg can be active
3391 PastIntervals *past_intervals, ///< [out] intervals
3392 std::ostream *out = 0 ///< [out] debug ostream
3393 ) {
3394 return check_new_interval(
3395 old_acting_primary, new_acting_primary,
3396 old_acting, new_acting,
3397 old_up_primary, new_up_primary,
3398 old_up, new_up,
3399 same_interval_since, last_epoch_clean,
3400 osdmap.get(), lastmap.get(),
3401 pgid,
3402 could_have_gone_active,
3403 past_intervals,
3404 out);
3405 }
3406
3407 friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3408
3409 template <typename F>
3410 void iterate_mayberw_back_to(
3411 epoch_t les,
3412 F &&f) const {
3413 ceph_assert(past_intervals);
3414 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
3415 }
3416 void clear() {
3417 ceph_assert(past_intervals);
3418 past_intervals->clear();
3419 }
3420
3421 /**
3422 * Returns a value indicating the amount
3423 * of state contained
3424 */
3425 size_t size() const {
3426 ceph_assert(past_intervals);
3427 return past_intervals->size();
3428 }
3429
3430 bool empty() const {
3431 ceph_assert(past_intervals);
3432 return past_intervals->empty();
3433 }
3434
3435 void swap(PastIntervals &other) {
3436 using std::swap;
3437 swap(other.past_intervals, past_intervals);
3438 }
3439
3440 /**
3441 * Return all shards which have been in the acting set back to the
3442 * latest epoch to which we have trimmed, except for pg_whoami
3443 */
3444 std::set<pg_shard_t> get_might_have_unfound(
3445 pg_shard_t pg_whoami,
3446 bool ec_pool) const {
3447 ceph_assert(past_intervals);
3448 auto ret = past_intervals->get_all_participants(ec_pool);
3449 ret.erase(pg_whoami);
3450 return ret;
3451 }
3452
3453 /**
3454 * Return all shards which we might want to talk to for peering
3455 */
3456 std::set<pg_shard_t> get_all_probe(
3457 bool ec_pool) const {
3458 ceph_assert(past_intervals);
3459 return past_intervals->get_all_participants(ec_pool);
3460 }
3461
3462 /* Return the set of epochs [start, end) represented by the
3463 * past_intervals set.
3464 */
3465 std::pair<epoch_t, epoch_t> get_bounds() const {
3466 ceph_assert(past_intervals);
3467 return past_intervals->get_bounds();
3468 }
3469
3470 void adjust_start_backwards(epoch_t last_epoch_clean) {
3471 ceph_assert(past_intervals);
3472 past_intervals->adjust_start_backwards(last_epoch_clean);
3473 }
3474
3475 enum osd_state_t {
3476 UP,
3477 DOWN,
3478 DNE,
3479 LOST
3480 };
3481 struct PriorSet {
3482 bool ec_pool = false;
3483 std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3484 std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3485 std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3486
3487 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3488 const IsPGRecoverablePredicate* pcontdec = nullptr;
3489
3490 PriorSet() = default;
3491 PriorSet(PriorSet &&) = default;
3492 PriorSet &operator=(PriorSet &&) = default;
3493
3494 PriorSet &operator=(const PriorSet &) = delete;
3495 PriorSet(const PriorSet &) = delete;
3496
3497 bool operator==(const PriorSet &rhs) const {
3498 return (ec_pool == rhs.ec_pool) &&
3499 (probe == rhs.probe) &&
3500 (down == rhs.down) &&
3501 (blocked_by == rhs.blocked_by) &&
3502 (pg_down == rhs.pg_down);
3503 }
3504
3505 bool affected_by_map(
3506 const OSDMap &osdmap,
3507 const DoutPrefixProvider *dpp) const;
3508
3509 // For verifying tests
3510 PriorSet(
3511 bool ec_pool,
3512 std::set<pg_shard_t> probe,
3513 std::set<int> down,
3514 std::map<int, epoch_t> blocked_by,
3515 bool pg_down,
3516 const IsPGRecoverablePredicate *pcontdec)
3517 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3518 pg_down(pg_down), pcontdec(pcontdec) {}
3519
3520 private:
3521 template <typename F>
3522 PriorSet(
3523 const PastIntervals &past_intervals,
3524 bool ec_pool,
3525 epoch_t last_epoch_started,
3526 const IsPGRecoverablePredicate *c,
3527 F f,
3528 const std::vector<int> &up,
3529 const std::vector<int> &acting,
3530 const DoutPrefixProvider *dpp);
3531
3532 friend class PastIntervals;
3533 };
3534
3535 template <typename... Args>
3536 PriorSet get_prior_set(Args&&... args) const {
3537 return PriorSet(*this, std::forward<Args>(args)...);
3538 }
3539 };
3540 WRITE_CLASS_ENCODER(PastIntervals)
3541
3542 std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
3543 std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3544 std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
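/* Editorial note: the functor F taken by the templated PriorSet constructor
 * below is invoked as f(epoch_t start, int osd, epoch_t *lost_at) and must
 * return one of PastIntervals::UP, DOWN, DNE or LOST; when lost_at is
 * non-null, the functor is expected to fill in that osd's lost_at epoch.
 * A hedged sketch of a call site via get_prior_set(), all arguments assumed:
 *
 *   auto prior = past_intervals.get_prior_set(
 *     ec_pool, last_epoch_started, &recoverable_pred,
 *     [&](epoch_t start, int osd, epoch_t *lost_at) {
 *       // consult the appropriate OSDMap state for `osd` here
 *       return PastIntervals::UP;
 *     },
 *     up, acting, dpp);
 */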
3545
3546 template <typename F>
3547 PastIntervals::PriorSet::PriorSet(
3548 const PastIntervals &past_intervals,
3549 bool ec_pool,
3550 epoch_t last_epoch_started,
3551 const IsPGRecoverablePredicate *c,
3552 F f,
3553 const std::vector<int> &up,
3554 const std::vector<int> &acting,
3555 const DoutPrefixProvider *dpp)
3556 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3557 {
3558 /*
3559 * We have to be careful to gracefully deal with situations like
3560 * so. Say we have a power outage or something that takes out both
3561 * OSDs, but the monitor doesn't mark them down in the same epoch.
3562 * The history may look like
3563 *
3564 * 1: A B
3565 * 2: B
3566 * 3: let's say B dies for good, too (say, from the power spike)
3567 * 4: A
3568 *
3569 * which makes it look like B may have applied updates to the PG
3570 * that we need in order to proceed. This sucks...
3571 *
3572 * To minimize the risk of this happening, we CANNOT go active if
3573 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3574 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3575 * Then, we have something like
3576 *
3577 * 1: A B
3578 * 2: B up_thru[B]=0
3579 * 3:
3580 * 4: A
3581 *
3582 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3583 * still 0).
3584 *
3585 * or,
3586 *
3587 * 1: A B
3588 * 2: B up_thru[B]=0
3589 * 3: B up_thru[B]=2
3590 * 4:
3591 * 5: A
3592 *
3593 * -> we must wait for B, bc it was alive through 2, and could have
3594 * written to the pg.
3595 *
3596 * If B is really dead, then an administrator will need to manually
3597 * intervene by marking the OSD as "lost."
3598 */
3599
3600 // Include current acting and up nodes... not because they may
3601 // contain old data (this interval hasn't gone active, obviously),
3602 // but because we want their pg_info to inform choose_acting(), and
3603 // so that we know what they do/do not have explicitly before
3604 // sending them any new info/logs/whatever.
3605 for (unsigned i = 0; i < acting.size(); i++) {
3606 if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
3607 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3608 }
3609 // It may be possible to exclude the up nodes, but let's keep them in
3610 // there for now.
3611 for (unsigned i = 0; i < up.size(); i++) {
3612 if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
3613 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3614 }
3615
3616 std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3617 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3618 for (auto &&i: all_probe) {
3619 switch (f(0, i.osd, nullptr)) {
3620 case UP: {
3621 probe.insert(i);
3622 break;
3623 }
3624 case DNE:
3625 case LOST:
3626 case DOWN: {
3627 down.insert(i.osd);
3628 break;
3629 }
3630 }
3631 }
3632
3633 past_intervals.iterate_mayberw_back_to(
3634 last_epoch_started,
3635 [&](epoch_t start, const std::set<pg_shard_t> &acting) {
3636 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3637 << ", acting: " << acting << dendl;
3638
3639 // look at candidate osds during this interval. each falls into
3640 // one of three categories: up, down (but potentially
3641 // interesting), or lost (down, but we won't wait for it).
3642 std::set<pg_shard_t> up_now;
3643 std::map<int, epoch_t> candidate_blocked_by;
3644 // any candidates down now (that might have useful data)
3645 bool any_down_now = false;
3646
3647 // consider ACTING osds
3648 for (auto &&so: acting) {
3649 epoch_t lost_at = 0;
3650 switch (f(start, so.osd, &lost_at)) {
3651 case UP: {
3652 // include past acting osds if they are up.
3653 up_now.insert(so);
3654 break;
3655 }
3656 case DNE: {
3657 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3658 << " no longer exists" << dendl;
3659 break;
3660 }
3661 case LOST: {
3662 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3663 << " is down, but lost_at " << lost_at << dendl;
3664 up_now.insert(so);
3665 break;
3666 }
3667 case DOWN: {
3668 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3669 << " is down" << dendl;
3670 candidate_blocked_by[so.osd] = lost_at;
3671 any_down_now = true;
3672 break;
3673 }
3674 }
3675 }
3676
3677 // if not enough osds survived this interval, and we may have gone rw,
3678 // then we need to wait for one of those osds to recover to
3679 // ensure that we haven't lost any information.
3680 if (!(*pcontdec)(up_now) && any_down_now) {
3681 // fixme: how do we identify a "clean" shutdown anyway?
3682 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3683 << " insufficient up; including down osds" << dendl;
3684 ceph_assert(!candidate_blocked_by.empty());
3685 pg_down = true;
3686 blocked_by.insert(
3687 candidate_blocked_by.begin(),
3688 candidate_blocked_by.end());
3689 }
3690 });
3691
3692 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3693 << " down " << down
3694 << " blocked_by " << blocked_by
3695 << (pg_down ? " pg_down":"")
3696 << dendl;
3697 }
3698
3699 struct pg_notify_t {
3700 epoch_t query_epoch;
3701 epoch_t epoch_sent;
3702 pg_info_t info;
3703 shard_id_t to;
3704 shard_id_t from;
3705 PastIntervals past_intervals;
3706 pg_notify_t() :
3707 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3708 from(shard_id_t::NO_SHARD) {}
3709 pg_notify_t(
3710 shard_id_t to,
3711 shard_id_t from,
3712 epoch_t query_epoch,
3713 epoch_t epoch_sent,
3714 const pg_info_t &info,
3715 const PastIntervals& pi)
3716 : query_epoch(query_epoch),
3717 epoch_sent(epoch_sent),
3718 info(info), to(to), from(from),
3719 past_intervals(pi) {
3720 ceph_assert(from == info.pgid.shard);
3721 }
3722 void encode(ceph::buffer::list &bl) const;
3723 void decode(ceph::buffer::list::const_iterator &p);
3724 void dump(ceph::Formatter *f) const;
3725 static void generate_test_instances(std::list<pg_notify_t*> &o);
3726 };
3727 WRITE_CLASS_ENCODER(pg_notify_t)
3728 std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
3729
3730
3731 /**
3732 * pg_query_t - used to ask a peer for information about a pg.
3733 *
3734 * note: if version=0, type=LOG, then we just provide our full log.
3735 */
3736 struct pg_query_t {
3737 enum {
3738 INFO = 0,
3739 LOG = 1,
3740 MISSING = 4,
3741 FULLLOG = 5,
3742 };
3743 std::string_view get_type_name() const {
3744 switch (type) {
3745 case INFO: return "info";
3746 case LOG: return "log";
3747 case MISSING: return "missing";
3748 case FULLLOG: return "fulllog";
3749 default: return "???";
3750 }
3751 }
3752
3753 __s32 type;
3754 eversion_t since;
3755 pg_history_t history;
3756 epoch_t epoch_sent;
3757 shard_id_t to;
3758 shard_id_t from;
3759
3760 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3761 from(shard_id_t::NO_SHARD) {}
3762 pg_query_t(
3763 int t,
3764 shard_id_t to,
3765 shard_id_t from,
3766 const pg_history_t& h,
3767 epoch_t epoch_sent)
3768 : type(t),
3769 history(h),
3770 epoch_sent(epoch_sent),
3771 to(to), from(from) {
3772 ceph_assert(t != LOG);
3773 }
3774 pg_query_t(
3775 int t,
3776 shard_id_t to,
3777 shard_id_t from,
3778 eversion_t s,
3779 const pg_history_t& h,
3780 epoch_t epoch_sent)
3781 : type(t), since(s), history(h),
3782 epoch_sent(epoch_sent), to(to), from(from) {
3783 ceph_assert(t == LOG);
3784 }
3785
3786 void encode(ceph::buffer::list &bl, uint64_t features) const;
3787 void decode(ceph::buffer::list::const_iterator &bl);
3788
3789 void dump(ceph::Formatter *f) const;
3790 static void generate_test_instances(std::list<pg_query_t*>& o);
3791 };
3792 WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3793
3794 inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
3795 out << "query(" << q.get_type_name() << " " << q.since;
3796 if (q.type == pg_query_t::LOG)
3797 out << " " << q.history;
3798 out << " epoch_sent " << q.epoch_sent;
3799 out << ")";
3800 return out;
3801 }
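/* Editorial note: construction sketch (to_shard, from_shard, history and
 * epoch_sent are assumed). Per the asserts above, the eversion_t overload
 * is required for LOG queries and forbidden otherwise:
 *
 *   // ask the peer for its log entries since version 5'20
 *   pg_query_t log_q(pg_query_t::LOG, to_shard, from_shard,
 *                    eversion_t(5, 20), history, epoch_sent);
 *
 *   // ask the peer only for its pg_info_t
 *   pg_query_t info_q(pg_query_t::INFO, to_shard, from_shard,
 *                     history, epoch_sent);
 */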
3802
3803 /**
3804 * pg_lease_t - readable lease metadata, from primary -> non-primary
3805 *
3806 * This metadata serves to increase either or both of the lease expiration
3807 * and the upper bound on the non-primary.
3808 */
3809 struct pg_lease_t {
3810 /// pg readable_until value; replicas must not be readable beyond this
3811 ceph::signedspan readable_until = ceph::signedspan::zero();
3812
3813 /// upper bound on any acting osd's readable_until
3814 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3815
3816 /// duration of the lease (in case clock deltas aren't available)
3817 ceph::signedspan interval = ceph::signedspan::zero();
3818
3819 pg_lease_t() {}
3820 pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
3821 ceph::signedspan i)
3822 : readable_until(ru),
3823 readable_until_ub(ruub),
3824 interval(i) {}
3825
3826 void encode(ceph::buffer::list &bl) const;
3827 void decode(ceph::buffer::list::const_iterator &bl);
3828 void dump(ceph::Formatter *f) const;
3829 static void generate_test_instances(std::list<pg_lease_t*>& o);
3830
3831 friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) {
3832 return out << "pg_lease(ru " << l.readable_until
3833 << " ub " << l.readable_until_ub
3834 << " int " << l.interval << ")";
3835 }
3836 };
3837 WRITE_CLASS_ENCODER(pg_lease_t)
3838
3839 /**
3840 * pg_lease_ack_t - lease ack, from non-primary -> primary
3841 *
3842 * This metadata reports back to the primary the upper bound that a
3843 * non-primary has recorded.
3844 */
3845 struct pg_lease_ack_t {
3846 /// highest upper bound non-primary has recorded (primary's clock)
3847 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3848
3849 pg_lease_ack_t() {}
3850 pg_lease_ack_t(ceph::signedspan ub)
3851 : readable_until_ub(ub) {}
3852
3853 void encode(ceph::buffer::list &bl) const;
3854 void decode(ceph::buffer::list::const_iterator &bl);
3855 void dump(ceph::Formatter *f) const;
3856 static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
3857
3858 friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) {
3859 return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
3860 }
3861 };
3862 WRITE_CLASS_ENCODER(pg_lease_ack_t)
3863
3864
3865
3866 class PGBackend;
3867 class ObjectModDesc {
3868 bool can_local_rollback;
3869 bool rollback_info_completed;
3870
3871 // version required to decode, reflected in encode/decode version
3872 __u8 max_required_version = 1;
3873 public:
3874 class Visitor {
3875 public:
3876 virtual void append(uint64_t old_offset) {}
3877 virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
3878 virtual void rmobject(version_t old_version) {}
3879 /**
3880 * Used to support the unfound_lost_delete log event: if the stashed
3881 * version exists, we unstash it; otherwise, we do nothing. This way
3882 * each replica rolls back to whatever state it had prior to the
3883 * mark-unfound-lost-delete attempt.
3884 */
3885 virtual void try_rmobject(version_t old_version) {
3886 rmobject(old_version);
3887 }
3888 virtual void create() {}
3889 virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
3890 virtual void rollback_extents(
3891 version_t gen,
3892 const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
3893 virtual ~Visitor() {}
3894 };
3895 void visit(Visitor *visitor) const;
3896 mutable ceph::buffer::list bl;
3897 enum ModID {
3898 APPEND = 1,
3899 SETATTRS = 2,
3900 DELETE = 3,
3901 CREATE = 4,
3902 UPDATE_SNAPS = 5,
3903 TRY_DELETE = 6,
3904 ROLLBACK_EXTENTS = 7
3905 };
3906 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3907 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3908 }
3909 void claim(ObjectModDesc &other) {
3910 bl = std::move(other.bl);
3911 can_local_rollback = other.can_local_rollback;
3912 rollback_info_completed = other.rollback_info_completed;
3913 }
3914 void claim_append(ObjectModDesc &other) {
3915 if (!can_local_rollback || rollback_info_completed)
3916 return;
3917 if (!other.can_local_rollback) {
3918 mark_unrollbackable();
3919 return;
3920 }
3921 bl.claim_append(other.bl);
3922 rollback_info_completed = other.rollback_info_completed;
3923 }
3924 void swap(ObjectModDesc &other) {
3925 bl.swap(other.bl);
3926
3927 using std::swap;
3928 swap(other.can_local_rollback, can_local_rollback);
3929 swap(other.rollback_info_completed, rollback_info_completed);
3930 swap(other.max_required_version, max_required_version);
3931 }
3932 void append_id(ModID id) {
3933 using ceph::encode;
3934 uint8_t _id(id);
3935 encode(_id, bl);
3936 }
3937 void append(uint64_t old_size) {
3938 if (!can_local_rollback || rollback_info_completed)
3939 return;
3940 ENCODE_START(1, 1, bl);
3941 append_id(APPEND);
3942 encode(old_size, bl);
3943 ENCODE_FINISH(bl);
3944 }
3945 void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
3946 if (!can_local_rollback || rollback_info_completed)
3947 return;
3948 ENCODE_START(1, 1, bl);
3949 append_id(SETATTRS);
3950 encode(old_attrs, bl);
3951 ENCODE_FINISH(bl);
3952 }
3953 bool rmobject(version_t deletion_version) {
3954 if (!can_local_rollback || rollback_info_completed)
3955 return false;
3956 ENCODE_START(1, 1, bl);
3957 append_id(DELETE);
3958 encode(deletion_version, bl);
3959 ENCODE_FINISH(bl);
3960 rollback_info_completed = true;
3961 return true;
3962 }
3963 bool try_rmobject(version_t deletion_version) {
3964 if (!can_local_rollback || rollback_info_completed)
3965 return false;
3966 ENCODE_START(1, 1, bl);
3967 append_id(TRY_DELETE);
3968 encode(deletion_version, bl);
3969 ENCODE_FINISH(bl);
3970 rollback_info_completed = true;
3971 return true;
3972 }
3973 void create() {
3974 if (!can_local_rollback || rollback_info_completed)
3975 return;
3976 rollback_info_completed = true;
3977 ENCODE_START(1, 1, bl);
3978 append_id(CREATE);
3979 ENCODE_FINISH(bl);
3980 }
3981 void update_snaps(const std::set<snapid_t> &old_snaps) {
3982 if (!can_local_rollback || rollback_info_completed)
3983 return;
3984 ENCODE_START(1, 1, bl);
3985 append_id(UPDATE_SNAPS);
3986 encode(old_snaps, bl);
3987 ENCODE_FINISH(bl);
3988 }
3989 void rollback_extents(
3990 version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
3991 ceph_assert(can_local_rollback);
3992 ceph_assert(!rollback_info_completed);
3993 if (max_required_version < 2)
3994 max_required_version = 2;
3995 ENCODE_START(2, 2, bl);
3996 append_id(ROLLBACK_EXTENTS);
3997 encode(gen, bl);
3998 encode(extents, bl);
3999 ENCODE_FINISH(bl);
4000 }
4001
4002 // cannot be rolled back
4003 void mark_unrollbackable() {
4004 can_local_rollback = false;
4005 bl.clear();
4006 }
4007 bool can_rollback() const {
4008 return can_local_rollback;
4009 }
4010 bool empty() const {
4011 return can_local_rollback && (bl.length() == 0);
4012 }
4013
4014 bool requires_kraken() const {
4015 return max_required_version >= 2;
4016 }
4017
4018 /**
4019 * Create fresh copy of bl bytes to avoid keeping large buffers around
4020 * in the case that bl contains ptrs which point into a much larger
4021 * message buffer
4022 */
4023 void trim_bl() const {
4024 if (bl.length() > 0)
4025 bl.rebuild();
4026 }
4027 void encode(ceph::buffer::list &bl) const;
4028 void decode(ceph::buffer::list::const_iterator &bl);
4029 void dump(ceph::Formatter *f) const;
4030 static void generate_test_instances(std::list<ObjectModDesc*>& o);
4031 };
4032 WRITE_CLASS_ENCODER(ObjectModDesc)
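/* Editorial note: visit() replays the encoded rollback ops in order, so a
 * read-only Visitor subclass can inspect a description without applying
 * it. A minimal sketch (names are illustrative, not from this header):
 *
 *   struct RollbackProbe : public ObjectModDesc::Visitor {
 *     bool saw_delete = false;
 *     uint64_t append_from = 0;
 *     void append(uint64_t old_size) override { append_from = old_size; }
 *     void rmobject(version_t) override { saw_delete = true; }
 *   };
 *
 *   RollbackProbe probe;
 *   mod_desc.visit(&probe);   // mod_desc: some ObjectModDesc instance
 */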
4033
4034 class ObjectCleanRegions {
4035 private:
4036 bool new_object;
4037 bool clean_omap;
4038 interval_set<uint64_t> clean_offsets;
4039 static std::atomic<uint32_t> max_num_intervals;
4040
4041 /**
4042 * trim the number of intervals if clean_offsets.num_intervals()
4043 * exceeds the given upper bound max_num_intervals
4044 * e.g. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
4045 * then a new interval [30~10] will evict the shortest one, [20~5];
4046 * finally, clean_offsets becomes {[5~10], [30~10]}
4047 */
4048 void trim();
4049 friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
4050 public:
4051 ObjectCleanRegions() : new_object(false), clean_omap(true) {
4052 clean_offsets.insert(0, (uint64_t)-1);
4053 }
4054 ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
4055 : new_object(false), clean_omap(co) {
4056 clean_offsets.insert(offset, len);
4057 }
4058 bool operator==(const ObjectCleanRegions &orc) const {
4059 return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
4060 }
4061 static void set_max_num_intervals(uint32_t num);
4062 void merge(const ObjectCleanRegions &other);
4063 void mark_data_region_dirty(uint64_t offset, uint64_t len);
4064 void mark_omap_dirty();
4065 void mark_object_new();
4066 void mark_fully_dirty();
4067 interval_set<uint64_t> get_dirty_regions() const;
4068 bool omap_is_dirty() const;
4069 bool object_is_exist() const;
4070 bool is_clean_region(uint64_t offset, uint64_t len) const;
4071
4072 void encode(ceph::buffer::list &bl) const;
4073 void decode(ceph::buffer::list::const_iterator &bl);
4074 void dump(ceph::Formatter *f) const;
4075 static void generate_test_instances(std::list<ObjectCleanRegions*>& o);
4076 };
4077 WRITE_CLASS_ENCODER(ObjectCleanRegions)
4078 std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
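/* Editorial note: usage sketch. A default-constructed instance is fully
 * clean; the mark_* calls only ever grow the dirty set:
 *
 *   ObjectCleanRegions ocr;                   // everything clean
 *   ocr.mark_data_region_dirty(4096, 8192);   // bytes [4096, 12288) dirty
 *   bool head_clean = ocr.is_clean_region(0, 4096);        // true
 *   interval_set<uint64_t> dirty = ocr.get_dirty_regions();
 */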
4079
4080
4081 struct OSDOp {
4082 ceph_osd_op op;
4083 sobject_t soid;
4084
4085 ceph::buffer::list indata, outdata;
4086 errorcode32_t rval;
4087
4088 OSDOp() {
4089 // FIPS zeroization audit 20191115: this memset clean for security
4090 memset(&op, 0, sizeof(ceph_osd_op));
4091 }
4092
4093 OSDOp(const int op_code) {
4094 // FIPS zeroization audit 20191115: this memset clean for security
4095 memset(&op, 0, sizeof(ceph_osd_op));
4096 op.op = op_code;
4097 }
4098
4099 /**
4100 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
4101 *
4102 * @param ops [out] vector of OSDOps
4103 * @param in [in] combined data buffer
4104 */
4105 template<typename V>
4106 static void split_osd_op_vector_in_data(V& ops,
4107 ceph::buffer::list& in) {
4108 ceph::buffer::list::iterator datap = in.begin();
4109 for (unsigned i = 0; i < ops.size(); i++) {
4110 if (ops[i].op.payload_len) {
4111 datap.copy(ops[i].op.payload_len, ops[i].indata);
4112 }
4113 }
4114 }
4115
4116 /**
4117 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
4118 *
4119 * Notably this also encodes certain other OSDOp data into the data
4120 * buffer, including the sobject_t soid.
4121 *
4122 * @param ops [in] vector of OSDOps
4123 * @param out [out] combined data buffer
4124 */
4125 template<typename V>
4126 static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) {
4127 for (unsigned i = 0; i < ops.size(); i++) {
4128 if (ops[i].indata.length()) {
4129 ops[i].op.payload_len = ops[i].indata.length();
4130 out.append(ops[i].indata);
4131 }
4132 }
4133 }
4134
4135 /**
4136 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
4137 *
4138 * @param ops [out] vector of OSDOps
4139 * @param in [in] combined data buffer
4140 */
4141 static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
4142
4143 /**
4144 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
4145 *
4146 * @param ops [in] vector of OSDOps
4147 * @param out [out] combined data buffer
4148 */
4149 static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
4150
4151 /**
4152 * Clear data as much as possible, leave minimal data for historical op dump
4153 *
4154 * @param ops [in] vector of OSDOps
4155 */
4156 template<typename V>
4157 static void clear_data(V& ops) {
4158 for (unsigned i = 0; i < ops.size(); i++) {
4159 OSDOp& op = ops[i];
4160 op.outdata.clear();
4161 if (ceph_osd_op_type_attr(op.op.op) &&
4162 op.op.xattr.name_len &&
4163 op.indata.length() >= op.op.xattr.name_len) {
4164 ceph::buffer::list bl;
4165 bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len));
4166 bl.begin().copy_in(op.op.xattr.name_len, op.indata);
4167 op.indata = std::move(bl);
4168 } else if (ceph_osd_op_type_exec(op.op.op) &&
4169 op.op.cls.class_len &&
4170 op.indata.length() >
4171 (op.op.cls.class_len + op.op.cls.method_len)) {
4172 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
4173 ceph::buffer::list bl;
4174 bl.push_back(ceph::buffer::ptr_node::create(len));
4175 bl.begin().copy_in(len, op.indata);
4176 op.indata = std::move(bl);
4177 } else {
4178 op.indata.clear();
4179 }
4180 }
4181 }
4182 };
4183 std::ostream& operator<<(std::ostream& out, const OSDOp& op);
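/* Editorial note: a round-trip sketch for the indata helpers above. The
 * sender merges per-op payloads into one buffer, stamping each op's
 * payload_len; a receiver holding an op vector with the same payload_len
 * values splits the buffer back out:
 *
 *   std::vector<OSDOp> ops(2);
 *   ops[0].indata.append("alpha");
 *   ops[1].indata.append("beta");
 *   ceph::buffer::list combined;
 *   OSDOp::merge_osd_op_vector_in_data(ops, combined);
 *
 *   ops[0].indata.clear();
 *   ops[1].indata.clear();
 *   OSDOp::split_osd_op_vector_in_data(ops, combined);  // indata refilled
 */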
4184
4185 struct pg_log_op_return_item_t {
4186 int32_t rval;
4187 ceph::buffer::list bl;
4188 void encode(ceph::buffer::list& p) const {
4189 using ceph::encode;
4190 encode(rval, p);
4191 encode(bl, p);
4192 }
4193 void decode(ceph::buffer::list::const_iterator& p) {
4194 using ceph::decode;
4195 decode(rval, p);
4196 decode(bl, p);
4197 }
4198 void dump(ceph::Formatter *f) const {
4199 f->dump_int("rval", rval);
4200 f->dump_unsigned("bl_length", bl.length());
4201 }
4202 friend bool operator==(const pg_log_op_return_item_t& lhs,
4203 const pg_log_op_return_item_t& rhs) {
4204 return lhs.rval == rhs.rval &&
4205 lhs.bl.contents_equal(rhs.bl);
4206 }
4207 friend bool operator!=(const pg_log_op_return_item_t& lhs,
4208 const pg_log_op_return_item_t& rhs) {
4209 return !(lhs == rhs);
4210 }
4211 friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) {
4212 return out << "r=" << i.rval << "+" << i.bl.length() << "b";
4213 }
4214 };
4215 WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
4216
4217 /**
4218 * pg_log_entry_t - single entry/event in pg log
4219 *
4220 */
4221 struct pg_log_entry_t {
4222 enum {
4223 MODIFY = 1, // some unspecified modification (but not *all* modifications)
4224 CLONE = 2, // cloned object from head
4225 DELETE = 3, // deleted object
4226 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
4227 LOST_REVERT = 5, // lost new version, revert to an older version.
4228 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
4229 LOST_MARK = 7, // lost new version, now EIO
4230 PROMOTE = 8, // promoted object from another tier
4231 CLEAN = 9, // mark an object clean
4232 ERROR = 10, // write that returned an error
4233 };
4234 static const char *get_op_name(int op) {
4235 switch (op) {
4236 case MODIFY:
4237 return "modify";
4238 case PROMOTE:
4239 return "promote";
4240 case CLONE:
4241 return "clone";
4242 case DELETE:
4243 return "delete";
4244 case LOST_REVERT:
4245 return "l_revert";
4246 case LOST_DELETE:
4247 return "l_delete";
4248 case LOST_MARK:
4249 return "l_mark";
4250 case CLEAN:
4251 return "clean";
4252 case ERROR:
4253 return "error";
4254 default:
4255 return "unknown";
4256 }
4257 }
4258 const char *get_op_name() const {
4259 return get_op_name(op);
4260 }
4261
4262 // describes state for a locally-rollbackable entry
4263 ObjectModDesc mod_desc;
4264 ceph::buffer::list snaps; // only for clone entries
4265 hobject_t soid;
4266 osd_reqid_t reqid; // caller+tid to uniquely identify request
4267 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
4268
4269 /// map extra_reqids by index to error return code (if any)
4270 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
4271
4272 eversion_t version, prior_version, reverting_to;
4273 version_t user_version; // the user version for this entry
4274 utime_t mtime; // this is the _user_ mtime, mind you
4275 int32_t return_code; // only stored for ERRORs for dup detection
4276
4277 std::vector<pg_log_op_return_item_t> op_returns;
4278
4279 __s32 op;
4280 bool invalid_hash; // only when decoding sobject_t based entries
4281 bool invalid_pool; // only when decoding pool-less hobject based entries
4282 ObjectCleanRegions clean_regions;
4283
4284 pg_log_entry_t()
4285 : user_version(0), return_code(0), op(0),
4286 invalid_hash(false), invalid_pool(false) {
4287 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4288 }
4289 pg_log_entry_t(int _op, const hobject_t& _soid,
4290 const eversion_t& v, const eversion_t& pv,
4291 version_t uv,
4292 const osd_reqid_t& rid, const utime_t& mt,
4293 int return_code)
4294 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
4295 mtime(mt), return_code(return_code), op(_op),
4296 invalid_hash(false), invalid_pool(false) {
4297 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4298 }
4299
4300 bool is_clone() const { return op == CLONE; }
4301 bool is_modify() const { return op == MODIFY; }
4302 bool is_promote() const { return op == PROMOTE; }
4303 bool is_clean() const { return op == CLEAN; }
4304 bool is_lost_revert() const { return op == LOST_REVERT; }
4305 bool is_lost_delete() const { return op == LOST_DELETE; }
4306 bool is_lost_mark() const { return op == LOST_MARK; }
4307 bool is_error() const { return op == ERROR; }
4308
4309 bool is_update() const {
4310 return
4311 is_clone() || is_modify() || is_promote() || is_clean() ||
4312 is_lost_revert() || is_lost_mark();
4313 }
4314 bool is_delete() const {
4315 return op == DELETE || op == LOST_DELETE;
4316 }
4317
4318 bool can_rollback() const {
4319 return mod_desc.can_rollback();
4320 }
4321
4322 void mark_unrollbackable() {
4323 mod_desc.mark_unrollbackable();
4324 }
4325
4326 bool requires_kraken() const {
4327 return mod_desc.requires_kraken();
4328 }
4329
4330 // Errors are only used for dup detection, whereas
4331 // the index by objects is used by recovery, copy_get,
4332 // and other facilities that don't expect or need to
4333 // be aware of error entries.
4334 bool object_is_indexed() const {
4335 return !is_error();
4336 }
4337
4338 bool reqid_is_indexed() const {
4339 return reqid != osd_reqid_t() &&
4340 (op == MODIFY || op == DELETE || op == ERROR);
4341 }
4342
4343 void set_op_returns(const std::vector<OSDOp>& ops) {
4344 op_returns.resize(ops.size());
4345 for (unsigned i = 0; i < ops.size(); ++i) {
4346 op_returns[i].rval = ops[i].rval;
4347 op_returns[i].bl = ops[i].outdata;
4348 }
4349 }
4350
4351 std::string get_key_name() const;
4352 void encode_with_checksum(ceph::buffer::list& bl) const;
4353 void decode_with_checksum(ceph::buffer::list::const_iterator& p);
4354
4355 void encode(ceph::buffer::list &bl) const;
4356 void decode(ceph::buffer::list::const_iterator &bl);
4357 void dump(ceph::Formatter *f) const;
4358 static void generate_test_instances(std::list<pg_log_entry_t*>& o);
4359
4360 };
4361 WRITE_CLASS_ENCODER(pg_log_entry_t)
4362
4363 std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
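/* Editorial note: construction sketch (oid, reqid and mtime assumed):
 *
 *   pg_log_entry_t e(pg_log_entry_t::MODIFY, oid,
 *                    eversion_t(8, 30),  // version of this event
 *                    eversion_t(8, 29),  // prior_version of the object
 *                    30,                 // user_version
 *                    reqid, mtime,
 *                    0);                 // return_code (ERROR entries only)
 *   e.clean_regions.mark_data_region_dirty(0, 4096);
 */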
4364
4365 struct pg_log_dup_t {
4366 osd_reqid_t reqid; // caller+tid to uniquely identify request
4367 eversion_t version;
4368 version_t user_version; // the user version for this entry
4369 int32_t return_code; // only stored for ERRORs for dup detection
4370
4371 std::vector<pg_log_op_return_item_t> op_returns;
4372
4373 pg_log_dup_t()
4374 : user_version(0), return_code(0)
4375 {}
4376 explicit pg_log_dup_t(const pg_log_entry_t& entry)
4377 : reqid(entry.reqid), version(entry.version),
4378 user_version(entry.user_version),
4379 return_code(entry.return_code),
4380 op_returns(entry.op_returns)
4381 {}
4382 pg_log_dup_t(const eversion_t& v, version_t uv,
4383 const osd_reqid_t& rid, int return_code)
4384 : reqid(rid), version(v), user_version(uv),
4385 return_code(return_code)
4386 {}
4387
4388 std::string get_key_name() const;
4389 void encode(ceph::buffer::list &bl) const;
4390 void decode(ceph::buffer::list::const_iterator &bl);
4391 void dump(ceph::Formatter *f) const;
4392 static void generate_test_instances(std::list<pg_log_dup_t*>& o);
4393
4394 bool operator==(const pg_log_dup_t &rhs) const {
4395 return reqid == rhs.reqid &&
4396 version == rhs.version &&
4397 user_version == rhs.user_version &&
4398 return_code == rhs.return_code &&
4399 op_returns == rhs.op_returns;
4400 }
4401 bool operator!=(const pg_log_dup_t &rhs) const {
4402 return !(*this == rhs);
4403 }
4404
4405 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4406 };
4407 WRITE_CLASS_ENCODER(pg_log_dup_t)
4408
4409 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4410
4411 /**
4412 * pg_log_t - incremental log of recent pg changes.
4413 *
4414 * serves as a recovery queue for recent changes.
4415 */
4416 struct pg_log_t {
4417 /*
4418 * head - newest entry (update|delete)
4419 * tail - entry previous to oldest (update|delete) for which we have
4420 * complete negative information.
4421 * i.e. we can infer pg contents for any store whose last_update >= tail.
4422 */
4423 eversion_t head; // newest entry
4424 eversion_t tail; // version prior to oldest
4425
4426 protected:
4427 // We can rollback rollback-able entries > can_rollback_to
4428 eversion_t can_rollback_to;
4429
4430 // always <= can_rollback_to, indicates how far stashed rollback
4431 // data can be found
4432 eversion_t rollback_info_trimmed_to;
4433
4434 public:
4435 // the actual log
4436 mempool::osd_pglog::list<pg_log_entry_t> log;
4437
4438 // entries just for dup op detection, ordered oldest to newest
4439 mempool::osd_pglog::list<pg_log_dup_t> dups;
4440
4441 pg_log_t() = default;
4442 pg_log_t(const eversion_t &last_update,
4443 const eversion_t &log_tail,
4444 const eversion_t &can_rollback_to,
4445 const eversion_t &rollback_info_trimmed_to,
4446 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
4447 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
4448 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4449 rollback_info_trimmed_to(rollback_info_trimmed_to),
4450 log(std::move(entries)), dups(std::move(dup_entries)) {}
4451 pg_log_t(const eversion_t &last_update,
4452 const eversion_t &log_tail,
4453 const eversion_t &can_rollback_to,
4454 const eversion_t &rollback_info_trimmed_to,
4455 const std::list<pg_log_entry_t> &entries,
4456 const std::list<pg_log_dup_t> &dup_entries)
4457 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4458 rollback_info_trimmed_to(rollback_info_trimmed_to) {
4459 for (auto &&entry: entries) {
4460 log.push_back(entry);
4461 }
4462 for (auto &&entry: dup_entries) {
4463 dups.push_back(entry);
4464 }
4465 }
4466
4467 void clear() {
4468 eversion_t z;
4469 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
4470 log.clear();
4471 dups.clear();
4472 }
4473
4474 eversion_t get_rollback_info_trimmed_to() const {
4475 return rollback_info_trimmed_to;
4476 }
4477 eversion_t get_can_rollback_to() const {
4478 return can_rollback_to;
4479 }
4480
4481
4482 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
4483 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
4484 oldlog.swap(log);
4485
4486 eversion_t old_tail;
4487 unsigned mask = ~((~0)<<split_bits);
4488 for (auto i = oldlog.begin();
4489 i != oldlog.end();
4490 ) {
4491 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4492 childlog.push_back(*i);
4493 } else {
4494 log.push_back(*i);
4495 }
4496 oldlog.erase(i++);
4497 }
4498
4499 // osd_reqid is unique, so it doesn't matter if there are extra
4500 // dup entries in each pg. To avoid storing oid with the dup
4501 // entries, just copy the whole list.
4502 auto childdups(dups);
4503
4504 return pg_log_t(
4505 head,
4506 tail,
4507 can_rollback_to,
4508 rollback_info_trimmed_to,
4509 std::move(childlog),
4510 std::move(childdups));
4511 }
4512
4513 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
4514 ceph_assert(newhead >= tail);
4515
4516 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4517 mempool::osd_pglog::list<pg_log_entry_t> divergent;
4518 while (true) {
4519 if (p == log.begin()) {
4520 // yikes, the whole thing is divergent!
4521 using std::swap;
4522 swap(divergent, log);
4523 break;
4524 }
4525 --p;
4526 if (p->version.version <= newhead.version) {
4527 /*
4528 * look at eversion.version here. we want to avoid a situation like:
4529 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4530 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4531 * lower_bound = 100'9
4532 * i.e., same request, different version. If the eversion.version is > the
4533 * lower_bound, we know it is divergent.
4534 */
4535 ++p;
4536 divergent.splice(divergent.begin(), log, p, log.end());
4537 break;
4538 }
4539 ceph_assert(p->version > newhead);
4540 }
4541 head = newhead;
4542
4543 if (can_rollback_to > newhead)
4544 can_rollback_to = newhead;
4545
4546 if (rollback_info_trimmed_to > newhead)
4547 rollback_info_trimmed_to = newhead;
4548
4549 return divergent;
4550 }
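/* Editorial note, a worked example: with log entries at versions
 * 4'10..4'15 and newhead = 4'12, rewind_from_head(newhead) truncates the
 * log after 4'12, lowers head (and, if necessary, can_rollback_to and
 * rollback_info_trimmed_to) to 4'12, and returns entries 4'13..4'15 as
 * the divergent list. */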
4551
4552 void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
4553 log.clear();
4554
4555 // sort and merge dups
4556 std::multimap<eversion_t,pg_log_dup_t> sorted;
4557 for (auto& d : dups) {
4558 sorted.emplace(d.version, d);
4559 }
4560 for (auto l : slogs) {
4561 for (auto& d : l->dups) {
4562 sorted.emplace(d.version, d);
4563 }
4564 }
4565 dups.clear();
4566 for (auto& i : sorted) {
4567 dups.push_back(i.second);
4568 }
4569
4570 head = last_update;
4571 tail = last_update;
4572 can_rollback_to = last_update;
4573 rollback_info_trimmed_to = last_update;
4574 }
4575
4576 bool empty() const {
4577 return log.empty();
4578 }
4579
4580 bool null() const {
4581 return head.version == 0 && head.epoch == 0;
4582 }
4583
4584 uint64_t approx_size() const {
4585 return head.version - tail.version;
4586 }
4587
4588 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4589 const std::string &hit_set_namespace, const pg_log_t &in,
4590 pg_log_t &out, pg_log_t &reject);
4591
4592 /**
4593 * copy entries from the tail of another pg_log_t
4594 *
4595 * @param other pg_log_t to copy from
4596 * @param from copy entries after this version
4597 */
4598 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
4599
4600 /**
4601 * copy up to N entries
4602 *
4603 * @param other source log
4604 * @param max max number of entries to copy
4605 */
4606 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
4607
4608 std::ostream& print(std::ostream& out) const;
4609
4610 void encode(ceph::buffer::list &bl) const;
4611 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
4612 void dump(ceph::Formatter *f) const;
4613 static void generate_test_instances(std::list<pg_log_t*>& o);
4614 };
4615 WRITE_CLASS_ENCODER(pg_log_t)
4616
4617 inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
4618 {
4619 out << "log((" << log.tail << "," << log.head << "], crt="
4620 << log.get_can_rollback_to() << ")";
4621 return out;
4622 }
4623
4624
4625 /**
4626 * pg_missing_t - summary of missing objects.
4627 *
4628 * kept in memory, as a supplement to pg_log_t
4629 * also used to pass missing info in messages.
4630 */
4631 struct pg_missing_item {
4632 eversion_t need, have;
4633 ObjectCleanRegions clean_regions;
4634 enum missing_flags_t {
4635 FLAG_NONE = 0,
4636 FLAG_DELETE = 1,
4637 } flags;
4638 pg_missing_item() : flags(FLAG_NONE) {}
4639 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
4640 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
4641 need(n), have(h) {
4642 set_delete(is_delete);
4643 if (old_style)
4644 clean_regions.mark_fully_dirty();
4645 }
4646
4647 void encode(ceph::buffer::list& bl, uint64_t features) const {
4648 using ceph::encode;
4649 if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
4650 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
4651 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
4652 // possible. This can be replaced with the legacy encoding
4653 encode(eversion_t(), bl);
4654 encode(eversion_t(-1, -1), bl);
4655 encode(need, bl);
4656 encode(have, bl);
4657 encode(static_cast<uint8_t>(flags), bl);
4658 encode(clean_regions, bl);
4659 } else {
4660 encode(eversion_t(), bl);
4661 encode(need, bl);
4662 encode(have, bl);
4663 encode(static_cast<uint8_t>(flags), bl);
4664 }
4665 }
4666 void decode(ceph::buffer::list::const_iterator& bl) {
4667 using ceph::decode;
4668 eversion_t e, l;
4669 decode(e, bl);
4670 decode(l, bl);
4671 if (l == eversion_t(-1, -1)) {
4672 // full (SERVER_OCTOPUS) encoding: all fields follow
4673 decode(need, bl);
4674 decode(have, bl);
4675 uint8_t f;
4676 decode(f, bl);
4677 flags = static_cast<missing_flags_t>(f);
4678 decode(clean_regions, bl);
4679 } else {
4680 // support OSD_RECOVERY_DELETES
4681 need = l;
4682 decode(have, bl);
4683 uint8_t f;
4684 decode(f, bl);
4685 flags = static_cast<missing_flags_t>(f);
4686 clean_regions.mark_fully_dirty();
4687 }
4688 }
4689
4690 void set_delete(bool is_delete) {
4691 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4692 }
4693
4694 bool is_delete() const {
4695 return (flags & FLAG_DELETE) == FLAG_DELETE;
4696 }
4697
4698 std::string flag_str() const {
4699 if (flags == FLAG_NONE) {
4700 return "none";
4701 } else {
4702 return "delete";
4703 }
4704 }
4705
4706 void dump(ceph::Formatter *f) const {
4707 f->dump_stream("need") << need;
4708 f->dump_stream("have") << have;
4709 f->dump_stream("flags") << flag_str();
4710 f->dump_stream("clean_regions") << clean_regions;
4711 }
4712 static void generate_test_instances(std::list<pg_missing_item*>& o) {
4713 o.push_back(new pg_missing_item);
4714 o.push_back(new pg_missing_item);
4715 o.back()->need = eversion_t(1, 2);
4716 o.back()->have = eversion_t(1, 1);
4717 o.push_back(new pg_missing_item);
4718 o.back()->need = eversion_t(3, 5);
4719 o.back()->have = eversion_t(3, 4);
4720 o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
4721 o.back()->clean_regions.mark_omap_dirty();
4722 o.back()->flags = FLAG_DELETE;
4723 }
4724 bool operator==(const pg_missing_item &rhs) const {
4725 return need == rhs.need && have == rhs.have && flags == rhs.flags;
4726 }
4727 bool operator!=(const pg_missing_item &rhs) const {
4728 return !(*this == rhs);
4729 }
4730 };
4731 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
4732 std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
4733 #if FMT_VERSION >= 90000
4734 template <> struct fmt::formatter<pg_missing_item> : fmt::ostream_formatter {};
4735 #endif
4736
4737 class pg_missing_const_i {
4738 public:
4739 virtual const std::map<hobject_t, pg_missing_item> &
4740 get_items() const = 0;
4741 virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
4742 virtual bool get_may_include_deletes() const = 0;
4743 virtual unsigned int num_missing() const = 0;
4744 virtual bool have_missing() const = 0;
4745 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4746 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
4747 virtual ~pg_missing_const_i() {}
4748 };
4749
4750
4751 template <bool Track>
4752 class ChangeTracker {
4753 public:
4754 void changed(const hobject_t &obj) {}
4755 template <typename F>
4756 void get_changed(F &&f) const {}
4757 void flush() {}
4758 bool is_clean() const {
4759 return true;
4760 }
4761 };
4762 template <>
4763 class ChangeTracker<true> {
4764 std::set<hobject_t> _changed;
4765 public:
4766 void changed(const hobject_t &obj) {
4767 _changed.insert(obj);
4768 }
4769 template <typename F>
4770 void get_changed(F &&f) const {
4771 for (auto const &i: _changed) {
4772 f(i);
4773 }
4774 }
4775 void flush() {
4776 _changed.clear();
4777 }
4778 bool is_clean() const {
4779 return _changed.empty();
4780 }
4781 };
4782
4783 template <bool TrackChanges>
4784 class pg_missing_set : public pg_missing_const_i {
4785 using item = pg_missing_item;
4786 std::map<hobject_t, item> missing; // oid -> (need v, have v)
4787 std::map<version_t, hobject_t> rmissing; // v -> oid
4788 ChangeTracker<TrackChanges> tracker;
4789
4790 public:
4791 pg_missing_set() = default;
4792
4793 template <typename missing_type>
4794 pg_missing_set(const missing_type &m) {
4795 missing = m.get_items();
4796 rmissing = m.get_rmissing();
4797 may_include_deletes = m.get_may_include_deletes();
4798 for (auto &&i: missing)
4799 tracker.changed(i.first);
4800 }
4801
4802 bool may_include_deletes = false;
4803
4804 const std::map<hobject_t, item> &get_items() const override {
4805 return missing;
4806 }
4807 const std::map<version_t, hobject_t> &get_rmissing() const override {
4808 return rmissing;
4809 }
4810 bool get_may_include_deletes() const override {
4811 return may_include_deletes;
4812 }
4813 unsigned int num_missing() const override {
4814 return missing.size();
4815 }
4816 bool have_missing() const override {
4817 return !missing.empty();
4818 }
4819 void merge(const pg_log_entry_t& e) {
4820 auto miter = missing.find(e.soid);
4821 if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
4822 miter->second.clean_regions.merge(e.clean_regions);
4823 }
4824 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4825 auto iter = missing.find(oid);
4826 if (iter == missing.end())
4827 return false;
4828 if (out)
4829 *out = iter->second;
4830 return true;
4831 }
4832 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4833 std::map<hobject_t, item>::const_iterator m =
4834 missing.find(oid);
4835 if (m == missing.end())
4836 return false;
4837 const item &item(m->second);
4838 if (item.need > v)
4839 return false;
4840 return true;
4841 }
4842 eversion_t get_oldest_need() const {
4843 if (missing.empty()) {
4844 return eversion_t();
4845 }
4846 auto it = missing.find(rmissing.begin()->second);
4847 ceph_assert(it != missing.end());
4848 return it->second.need;
4849 }
4850
4851 void claim(pg_missing_set&& o) {
4852 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4853 missing = std::move(o.missing);
4854 rmissing = std::move(o.rmissing);
4855 }
4856
4857 /*
4858 * this needs to be called in log order as we extend the log. it
4859 * assumes missing is accurate up through the previous log entry.
4860 */
4861 void add_next_event(const pg_log_entry_t& e) {
4862 std::map<hobject_t, item>::iterator missing_it;
4863 missing_it = missing.find(e.soid);
4864 bool is_missing_divergent_item = missing_it != missing.end();
4865 if (e.prior_version == eversion_t() || e.is_clone()) {
4866 // new object.
4867 if (is_missing_divergent_item) { // use iterator
4868 rmissing.erase(missing_it->second.need.version);
4869 // .have = nil
4870 missing_it->second = item(e.version, eversion_t(), e.is_delete());
4871 missing_it->second.clean_regions.mark_fully_dirty();
4872 } else {
4873 // create new element in missing map
4874 // .have = nil
4875 missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
4876 missing[e.soid].clean_regions.mark_fully_dirty();
4877 }
4878 } else if (is_missing_divergent_item) {
4879 // already missing (prior).
4880 rmissing.erase((missing_it->second).need.version);
4881 missing_it->second.need = e.version; // leave .have unchanged.
4882 missing_it->second.set_delete(e.is_delete());
4883 if (e.is_lost_revert())
4884 missing_it->second.clean_regions.mark_fully_dirty();
4885 else
4886 missing_it->second.clean_regions.merge(e.clean_regions);
4887 } else {
4888 // not missing, we must have prior_version (if any)
4889 ceph_assert(!is_missing_divergent_item);
4890 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
4891 if (e.is_lost_revert())
4892 missing[e.soid].clean_regions.mark_fully_dirty();
4893 else
4894 missing[e.soid].clean_regions = e.clean_regions;
4895 }
4896 rmissing[e.version.version] = e.soid;
4897 tracker.changed(e.soid);
4898 }
4899
4900 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
4901 auto p = missing.find(oid);
4902 if (p != missing.end()) {
4903 rmissing.erase((p->second).need.version);
4904 p->second.need = need; // do not adjust .have
4905 p->second.set_delete(is_delete);
4906 p->second.clean_regions.mark_fully_dirty();
4907 } else {
4908 missing[oid] = item(need, eversion_t(), is_delete);
4909 missing[oid].clean_regions.mark_fully_dirty();
4910 }
4911 rmissing[need.version] = oid;
4912
4913 tracker.changed(oid);
4914 }
4915
4916 void revise_have(hobject_t oid, eversion_t have) {
4917 auto p = missing.find(oid);
4918 if (p != missing.end()) {
4919 tracker.changed(oid);
4920 (p->second).have = have;
4921 }
4922 }
4923
4924 void mark_fully_dirty(const hobject_t& oid) {
4925 auto p = missing.find(oid);
4926 if (p != missing.end()) {
4927 tracker.changed(oid);
4928 (p->second).clean_regions.mark_fully_dirty();
4929 }
4930 }
4931
4932 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4933 bool is_delete) {
4934 missing[oid] = item(need, have, is_delete, true);
4935 rmissing[need.version] = oid;
4936 tracker.changed(oid);
4937 }
4938
4939 void add(const hobject_t& oid, pg_missing_item&& item) {
4940 rmissing[item.need.version] = oid;
4941 missing.insert({oid, std::move(item)});
4942 tracker.changed(oid);
4943 }
4944
4945 void rm(const hobject_t& oid, eversion_t v) {
4946 std::map<hobject_t, item>::iterator p = missing.find(oid);
4947 if (p != missing.end() && p->second.need <= v)
4948 rm(p);
4949 }
4950
4951 void rm(std::map<hobject_t, item>::const_iterator m) {
4952 tracker.changed(m->first);
4953 rmissing.erase(m->second.need.version);
4954 missing.erase(m);
4955 }
4956
4957 void got(const hobject_t& oid, eversion_t v) {
4958 std::map<hobject_t, item>::iterator p = missing.find(oid);
4959 ceph_assert(p != missing.end());
4960 ceph_assert(p->second.need <= v || p->second.is_delete());
4961 got(p);
4962 }
4963
4964 void got(std::map<hobject_t, item>::const_iterator m) {
4965 tracker.changed(m->first);
4966 rmissing.erase(m->second.need.version);
4967 missing.erase(m);
4968 }
4969
4970 void split_into(
4971 pg_t child_pgid,
4972 unsigned split_bits,
4973 pg_missing_set *omissing) {
4974 omissing->may_include_deletes = may_include_deletes;
4975 unsigned mask = ~((~0)<<split_bits);
4976 for (std::map<hobject_t, item>::iterator i = missing.begin();
4977 i != missing.end();
4978 ) {
4979 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
4980 omissing->add(i->first, i->second.need, i->second.have,
4981 i->second.is_delete());
4982 rm(i++);
4983 } else {
4984 ++i;
4985 }
4986 }
4987 }
4988
4989 void clear() {
4990 for (auto const &i: missing)
4991 tracker.changed(i.first);
4992 missing.clear();
4993 rmissing.clear();
4994 }
4995
4996 void encode(ceph::buffer::list &bl, uint64_t features) const {
4997 ENCODE_START(5, 2, bl);
4998 encode(missing, bl, features);
4999 encode(may_include_deletes, bl);
5000 ENCODE_FINISH(bl);
5001 }
5002 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
5003 for (auto const &i: missing)
5004 tracker.changed(i.first);
5005 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
5006 decode(missing, bl);
5007 if (struct_v >= 4) {
5008 decode(may_include_deletes, bl);
5009 }
5010 DECODE_FINISH(bl);
5011
5012 if (struct_v < 3) {
5013 // Handle hobject_t upgrade
5014 std::map<hobject_t, item> tmp;
5015 for (std::map<hobject_t, item>::iterator i =
5016 missing.begin();
5017 i != missing.end();
5018 ) {
5019 if (!i->first.is_max() && i->first.pool == -1) {
5020 hobject_t to_insert(i->first);
5021 to_insert.pool = pool;
5022 tmp[to_insert] = i->second;
5023 missing.erase(i++);
5024 } else {
5025 ++i;
5026 }
5027 }
5028 missing.insert(tmp.begin(), tmp.end());
5029 }
5030
5031 for (std::map<hobject_t,item>::iterator it =
5032 missing.begin();
5033 it != missing.end();
5034 ++it)
5035 rmissing[it->second.need.version] = it->first;
5036 for (auto const &i: missing)
5037 tracker.changed(i.first);
5038 }
5039 void dump(ceph::Formatter *f) const {
5040 f->open_array_section("missing");
5041 for (std::map<hobject_t,item>::const_iterator p =
5042 missing.begin(); p != missing.end(); ++p) {
5043 f->open_object_section("item");
5044 f->dump_stream("object") << p->first;
5045 p->second.dump(f);
5046 f->close_section();
5047 }
5048 f->close_section();
5049 f->dump_bool("may_include_deletes", may_include_deletes);
5050 }
5051 template <typename F>
5052 void filter_objects(F &&f) {
5053 for (auto i = missing.begin(); i != missing.end();) {
5054 if (f(i->first)) {
5055 rm(i++);
5056 } else {
5057 ++i;
5058 }
5059 }
5060 }
5061 static void generate_test_instances(std::list<pg_missing_set*>& o) {
5062 o.push_back(new pg_missing_set);
5063 o.back()->may_include_deletes = true;
5064 o.push_back(new pg_missing_set);
5065 o.back()->add(
5066 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5067 eversion_t(5, 6), eversion_t(5, 1), false);
5068 o.back()->may_include_deletes = true;
5069 o.push_back(new pg_missing_set);
5070 o.back()->add(
5071 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5072 eversion_t(5, 6), eversion_t(5, 1), true);
5073 o.back()->may_include_deletes = true;
5074 }
5075 template <typename F>
5076 void get_changed(F &&f) const {
5077 tracker.get_changed(f);
5078 }
5079 void flush() {
5080 tracker.flush();
5081 }
5082 bool is_clean() const {
5083 return tracker.is_clean();
5084 }
5085 template <typename missing_t>
5086 bool debug_verify_from_init(
5087 const missing_t &init_missing,
5088 std::ostream *oss) const {
5089 if (!TrackChanges)
5090 return true;
5091 auto check_missing(init_missing.get_items());
5092 tracker.get_changed([&](const hobject_t &hoid) {
5093 check_missing.erase(hoid);
5094 if (missing.count(hoid)) {
5095 check_missing.insert(*(missing.find(hoid)));
5096 }
5097 });
5098 bool ok = true;
5099 if (check_missing.size() != missing.size()) {
5100 if (oss) {
5101 *oss << "Size mismatch, check: " << check_missing.size()
5102 << ", actual: " << missing.size() << "\n";
5103 }
5104 ok = false;
5105 }
5106 for (auto &i: missing) {
5107 if (!check_missing.count(i.first)) {
5108 if (oss)
5109 *oss << "check_missing missing " << i.first << "\n";
5110 ok = false;
5111 } else if (check_missing[i.first] != i.second) {
5112 if (oss)
5113 *oss << "check_missing missing item mismatch on " << i.first
5114 << ", check: " << check_missing[i.first]
5115 << ", actual: " << i.second << "\n";
5116 ok = false;
5117 }
5118 }
5119 if (oss && !ok) {
5120 *oss << "check_missing: " << check_missing << "\n";
5121 std::set<hobject_t> changed;
5122 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
5123 *oss << "changed: " << changed << "\n";
5124 }
5125 return ok;
5126 }
5127 };
5128 template <bool TrackChanges>
5129 void encode(
5130 const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
5131 ENCODE_DUMP_PRE();
5132 c.encode(bl, features);
5133 ENCODE_DUMP_POST(cl);
5134 }
5135 template <bool TrackChanges>
5136 void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
5137 c.decode(p);
5138 }
5139 template <bool TrackChanges>
5140 std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
5141 {
5142 out << "missing(" << missing.num_missing()
5143 << " may_include_deletes = " << missing.may_include_deletes;
5144 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
5145 out << ")";
5146 return out;
5147 }
5148
5149 using pg_missing_t = pg_missing_set<false>;
5150 using pg_missing_tracker_t = pg_missing_set<true>;
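// Illustrative sketch (not part of this header): the TrackChanges=true
// variant additionally records which objects' missing entries changed,
// so callers can persist only the delta. 'hoid' is a hypothetical
// hobject_t; add()/get_changed()/flush() are the real API above.
//
//   pg_missing_tracker_t missing;
//   missing.add(hoid, eversion_t(5, 6), eversion_t(5, 1), false);
//   missing.get_changed([&](const hobject_t &h) {
//     /* persist h's missing entry (e.g. to the pgmeta omap) */
//   });
//   missing.flush();   // mark the recorded changes as persisted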
5151
5152
5153
5154
5155 /**
5156 * pg list objects response format
5157 *
5158 */
5159
5160 template<typename T>
5161 struct pg_nls_response_template {
5162 collection_list_handle_t handle;
5163 std::vector<T> entries;
5164
5165 void encode(ceph::buffer::list& bl) const {
5166 ENCODE_START(1, 1, bl);
5167 encode(handle, bl);
5168 __u32 n = (__u32)entries.size();
5169 encode(n, bl);
5170 for (auto i = entries.begin(); i != entries.end(); ++i) {
5171 encode(i->nspace, bl);
5172 encode(i->oid, bl);
5173 encode(i->locator, bl);
5174 }
5175 ENCODE_FINISH(bl);
5176 }
5177 void decode(ceph::buffer::list::const_iterator& bl) {
5178 DECODE_START(1, bl);
5179 decode(handle, bl);
5180 __u32 n;
5181 decode(n, bl);
5182 entries.clear();
5183 while (n--) {
5184 T i;
5185 decode(i.nspace, bl);
5186 decode(i.oid, bl);
5187 decode(i.locator, bl);
5188 entries.push_back(i);
5189 }
5190 DECODE_FINISH(bl);
5191 }
5192 void dump(ceph::Formatter *f) const {
5193 f->dump_stream("handle") << handle;
5194 f->open_array_section("entries");
5195 for (auto p = entries.begin(); p != entries.end(); ++p) {
5196 f->open_object_section("object");
5197 f->dump_string("namespace", p->nspace);
5198 f->dump_string("object", p->oid);
5199 f->dump_string("key", p->locator);
5200 f->close_section();
5201 }
5202 f->close_section();
5203 }
5204 static void generate_test_instances(std::list<pg_nls_response_template<T>*>& o) {
5205 o.push_back(new pg_nls_response_template<T>);
5206 o.push_back(new pg_nls_response_template<T>);
5207 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5208 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5209 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5210 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5211 o.push_back(new pg_nls_response_template<T>);
5212 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5213 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5214 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5215 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5216 o.push_back(new pg_nls_response_template<T>);
5217 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5218 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5219 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5220 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5221 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5222 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5223 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5224 }
5225 };
5226
5227 using pg_nls_response_t = pg_nls_response_template<librados::ListObjectImpl>;
5228
5229 WRITE_CLASS_ENCODER(pg_nls_response_t)
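// Illustrative sketch (not part of this header): round-tripping a
// pg_nls_response_t through the free encode()/decode() generated by
// WRITE_CLASS_ENCODER; 'resp' and 'bl' are hypothetical locals.
//
//   pg_nls_response_t resp;
//   resp.entries.push_back(librados::ListObjectImpl("ns", "obj", "key"));
//   ceph::buffer::list bl;
//   encode(resp, bl);
//   pg_nls_response_t decoded;
//   auto it = bl.cbegin();
//   decode(decoded, it);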
5230
5231 // For backwards compatibility with older OSD requests
5232 struct pg_ls_response_t {
5233 collection_list_handle_t handle;
5234 std::list<std::pair<object_t, std::string> > entries;
5235
5236 void encode(ceph::buffer::list& bl) const {
5237 using ceph::encode;
5238 __u8 v = 1;
5239 encode(v, bl);
5240 encode(handle, bl);
5241 encode(entries, bl);
5242 }
5243 void decode(ceph::buffer::list::const_iterator& bl) {
5244 using ceph::decode;
5245 __u8 v;
5246 decode(v, bl);
5247 ceph_assert(v == 1);
5248 decode(handle, bl);
5249 decode(entries, bl);
5250 }
5251 void dump(ceph::Formatter *f) const {
5252 f->dump_stream("handle") << handle;
5253 f->open_array_section("entries");
5254 for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5255 f->open_object_section("object");
5256 f->dump_stream("object") << p->first;
5257 f->dump_string("key", p->second);
5258 f->close_section();
5259 }
5260 f->close_section();
5261 }
5262 static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
5263 o.push_back(new pg_ls_response_t);
5264 o.push_back(new pg_ls_response_t);
5265 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5266 o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
5267 o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5268 }
5269 };
5270
5271 WRITE_CLASS_ENCODER(pg_ls_response_t)
5272
5273 /**
5274 * object_copy_cursor_t
5275 */
5276 struct object_copy_cursor_t {
5277 uint64_t data_offset;
5278 std::string omap_offset;
5279 bool attr_complete;
5280 bool data_complete;
5281 bool omap_complete;
5282
5283 object_copy_cursor_t()
5284 : data_offset(0),
5285 attr_complete(false),
5286 data_complete(false),
5287 omap_complete(false)
5288 {}
5289
5290 bool is_initial() const {
5291 return !attr_complete && data_offset == 0 && omap_offset.empty();
5292 }
5293 bool is_complete() const {
5294 return attr_complete && data_complete && omap_complete;
5295 }
5296
5297 static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
5298 void encode(ceph::buffer::list& bl) const;
5299 void decode(ceph::buffer::list::const_iterator &bl);
5300 void dump(ceph::Formatter *f) const;
5301 };
5302 WRITE_CLASS_ENCODER(object_copy_cursor_t)
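// Illustrative sketch (not part of this header): a copy driver reissues
// copy-get requests until the cursor reports completion; 'issue_copy_get'
// is a hypothetical helper that fills in attrs/data/omap and advances
// the cursor it is given.
//
//   object_copy_cursor_t cursor;        // is_initial() == true
//   while (!cursor.is_complete()) {
//     issue_copy_get(cursor);
//   }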
5303
5304 /**
5305 * object_copy_data_t
5306 *
5307 * Return data from a copy request. The semantics are a little strange
5308 * as a result of the encoding's heritage.
5309 *
5310 * In particular, the sender unconditionally fills in the cursor (from what
5311 * it receives and sends), the size, and the mtime, but is responsible for
5312 * figuring out whether it should put any data in the attrs, data, or
5313 * omap members (corresponding to xattrs, object data, and the omap entries)
5314 * based on external data (the client includes a max amount to return with
5315 * the copy request). The client then looks into the attrs, data, and/or omap
5316 * based on the contents of the cursor.
5317 */
5318 struct object_copy_data_t {
5319 enum {
5320 FLAG_DATA_DIGEST = 1<<0,
5321 FLAG_OMAP_DIGEST = 1<<1,
5322 };
5323 object_copy_cursor_t cursor;
5324 uint64_t size;
5325 utime_t mtime;
5326 uint32_t data_digest, omap_digest;
5327 uint32_t flags;
5328 std::map<std::string, ceph::buffer::list, std::less<>> attrs;
5329 ceph::buffer::list data;
5330 ceph::buffer::list omap_header;
5331 ceph::buffer::list omap_data;
5332
5333 /// which snaps we are defined for (if a snap and not the head)
5334 std::vector<snapid_t> snaps;
5335 /// latest snap seq for the object (if head)
5336 snapid_t snap_seq;
5337
5338 /// recent reqids on this object
5339 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
5340
5341 /// map reqids by index to error return code (if any)
5342 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
5343
5344 uint64_t truncate_seq;
5345 uint64_t truncate_size;
5346
5347 public:
5348 object_copy_data_t() :
5349 size((uint64_t)-1), data_digest(-1),
5350 omap_digest(-1), flags(0),
5351 truncate_seq(0),
5352 truncate_size(0) {}
5353
5354 static void generate_test_instances(std::list<object_copy_data_t*>& o);
5355 void encode(ceph::buffer::list& bl, uint64_t features) const;
5356 void decode(ceph::buffer::list::const_iterator& bl);
5357 void dump(ceph::Formatter *f) const;
5358 };
5359 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
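// Illustrative sketch (not part of this header): per the comment above,
// the data/omap members are only conditionally filled in, and the digest
// fields are only meaningful when the matching flag is set; 'reply' is a
// hypothetical decoded object_copy_data_t.
//
//   if (reply.flags & object_copy_data_t::FLAG_DATA_DIGEST) {
//     /* compare reply.data_digest against a locally computed crc32c */
//   }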
5360
5361 /**
5362 * pg creation info
5363 */
5364 struct pg_create_t {
5365 epoch_t created; // epoch pg created
5366 pg_t parent; // split from parent (if != pg_t())
5367 __s32 split_bits;
5368
5369 pg_create_t()
5370 : created(0), split_bits(0) {}
5371 pg_create_t(unsigned c, pg_t p, int s)
5372 : created(c), parent(p), split_bits(s) {}
5373
5374 void encode(ceph::buffer::list &bl) const;
5375 void decode(ceph::buffer::list::const_iterator &bl);
5376 void dump(ceph::Formatter *f) const;
5377 static void generate_test_instances(std::list<pg_create_t*>& o);
5378 };
5379 WRITE_CLASS_ENCODER(pg_create_t)
5380
5381 // -----------------------------------------
5382
5383 class ObjectExtent {
5384 /**
5385 * ObjectExtents are used for specifying IO behavior against RADOS
5386 * objects when one is using the ObjectCacher.
5387 *
5388 * To use this in a real system, *every member* must be filled
5389 * out correctly. In particular, make sure to initialize the
5390 * oloc correctly, as its default values are deliberate poison
5391 * and will cause internal ObjectCacher asserts.
5392 *
5393 * Similarly, your buffer_extents vector *must* specify a total
5394 * size equal to your length. If the buffer_extents inadvertently
5395 * contain less space than the length member specifies, you
5396 * will get unintelligible asserts deep in the ObjectCacher.
5397 *
5398 * If you are trying to do testing and don't care about actual
5399 * RADOS function, the simplest thing to do is to initialize
5400 * the ObjectExtent (truncate_size can be 0), create a single entry
5401 * in buffer_extents matching the length, and set oloc.pool to 0.
5402 */
5403 public:
5404 object_t oid; // object id
5405 uint64_t objectno;
5406 uint64_t offset; // in object
5407 uint64_t length; // in object
5408 uint64_t truncate_size; // in object
5409
5410 object_locator_t oloc; // object locator (pool etc)
5411
5412 std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented because of striping!)
5413
5414 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5415 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
5416 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
5417 };
5418
5419 inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
5420 {
5421 return out << "extent("
5422 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
5423 << " " << ex.offset << "~" << ex.length
5424 << " -> " << ex.buffer_extents
5425 << ")";
5426 }
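// Illustrative sketch (not part of this header): the minimal test setup
// described in the class comment -- a single buffer extent covering the
// whole length, and a non-poison oloc.
//
//   ObjectExtent ex(object_t("obj"), 0 /*objectno*/, 0 /*offset*/,
//                   4096 /*length*/, 0 /*truncate_size*/);
//   ex.buffer_extents.push_back({0, 4096});  // must sum to 'length'
//   ex.oloc.pool = 0;                        // default pool is poison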
5427
5428
5429 // ---------------------------------------
5430
5431 class OSDSuperblock {
5432 public:
5433 uuid_d cluster_fsid, osd_fsid;
5434 int32_t whoami = -1; // my role in this fs.
5435 epoch_t current_epoch = 0; // most recent epoch
5436 epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
5437 double weight = 0.0;
5438
5439 CompatSet compat_features;
5440
5441 // last interval over which i mounted and was then active
5442 epoch_t mounted = 0; // last epoch i mounted
5443 epoch_t clean_thru = 0; // epoch i was active and clean thru
5444
5445 epoch_t purged_snaps_last = 0;
5446 utime_t last_purged_snaps_scrub;
5447
5448 epoch_t cluster_osdmap_trim_lower_bound = 0;
5449
5450 void encode(ceph::buffer::list &bl) const;
5451 void decode(ceph::buffer::list::const_iterator &bl);
5452 void dump(ceph::Formatter *f) const;
5453 static void generate_test_instances(std::list<OSDSuperblock*>& o);
5454 };
5455 WRITE_CLASS_ENCODER(OSDSuperblock)
5456
5457 inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
5458 {
5459 return out << "sb(" << sb.cluster_fsid
5460 << " osd." << sb.whoami
5461 << " " << sb.osd_fsid
5462 << " e" << sb.current_epoch
5463 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
5464 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
5465 << " tlb=" << sb.cluster_osdmap_trim_lower_bound
5466 << ")";
5467 }
5468
5469
5470 // -------
5471
5472
5473
5474
5475
5476
5477 /*
5478 * attached to object head. describes most recent snap context, and
5479 * set of existing clones.
5480 */
5481 struct SnapSet {
5482 snapid_t seq;
5483 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5484 std::vector<snapid_t> snaps; // descending
5485 std::vector<snapid_t> clones; // ascending
5486 std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
5487 std::map<snapid_t, uint64_t> clone_size;
5488 std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
5489
5490 SnapSet() : seq(0) {}
5491 explicit SnapSet(ceph::buffer::list& bl) {
5492 auto p = std::cbegin(bl);
5493 decode(p);
5494 }
5495
5496 /// populate SnapSet from a librados::snap_set_t
5497 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
5498
5499 /// get space accounted to clone
5500 uint64_t get_clone_bytes(snapid_t clone) const;
5501
5502 void encode(ceph::buffer::list& bl) const;
5503 void decode(ceph::buffer::list::const_iterator& bl);
5504 void dump(ceph::Formatter *f) const;
5505 static void generate_test_instances(std::list<SnapSet*>& o);
5506
5507 SnapContext get_ssc_as_of(snapid_t as_of) const {
5508 SnapContext out;
5509 out.seq = as_of;
5510 for (auto p = clone_snaps.rbegin();
5511 p != clone_snaps.rend();
5512 ++p) {
5513 for (auto snap : p->second) {
5514 if (snap <= as_of) {
5515 out.snaps.push_back(snap);
5516 }
5517 }
5518 }
5519 return out;
5520 }
5521
5522
5523 SnapSet get_filtered(const pg_pool_t &pinfo) const;
5524 void filter(const pg_pool_t &pinfo);
5525 };
5526 WRITE_CLASS_ENCODER(SnapSet)
5527
5528 std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
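// Illustrative sketch (not part of this header): building the snap
// context visible as of a given snap; 'ss' is a hypothetical SnapSet
// whose clone 1 covers snaps 2 and 1.
//
//   SnapSet ss;
//   ss.clone_snaps[1] = {2, 1};               // descending, as above
//   SnapContext ctx = ss.get_ssc_as_of(2);
//   // ctx.seq == 2; ctx.snaps holds every covered snapid <= 2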
5529
5530
5531
5532 #define OI_ATTR "_"
5533 #define SS_ATTR "snapset"
5534
5535 struct watch_info_t {
5536 uint64_t cookie;
5537 uint32_t timeout_seconds;
5538 entity_addr_t addr;
5539
5540 watch_info_t() : cookie(0), timeout_seconds(0) { }
5541 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
5542
5543 void encode(ceph::buffer::list& bl, uint64_t features) const;
5544 void decode(ceph::buffer::list::const_iterator& bl);
5545 void dump(ceph::Formatter *f) const;
5546 static void generate_test_instances(std::list<watch_info_t*>& o);
5547 };
5548 WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
5549
5550 static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
5551 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
5552 && l.addr == r.addr;
5553 }
5554
5555 static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
5556 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5557 << " " << w.addr << ")";
5558 }
5559
5560 struct notify_info_t {
5561 uint64_t cookie;
5562 uint64_t notify_id;
5563 uint32_t timeout;
5564 ceph::buffer::list bl;
5565 };
5566
5567 static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
5568 return out << "notify(cookie " << n.cookie
5569 << " notify" << n.notify_id
5570 << " " << n.timeout << "s)";
5571 }
5572
5573 class object_ref_delta_t {
5574 std::map<hobject_t, int> ref_delta;
5575
5576 public:
5577 object_ref_delta_t() = default;
5578 object_ref_delta_t(const object_ref_delta_t &) = default;
5579 object_ref_delta_t(object_ref_delta_t &&) = default;
5580
5581 object_ref_delta_t(decltype(ref_delta) &&ref_delta)
5582 : ref_delta(std::move(ref_delta)) {}
5583 object_ref_delta_t(const decltype(ref_delta) &ref_delta)
5584 : ref_delta(ref_delta) {}
5585
5586 object_ref_delta_t &operator=(const object_ref_delta_t &) = default;
5587 object_ref_delta_t &operator=(object_ref_delta_t &&) = default;
5588
5589 void dec_ref(const hobject_t &hoid, unsigned num=1) {
5590 mut_ref(hoid, -num);
5591 }
5592 void inc_ref(const hobject_t &hoid, unsigned num=1) {
5593 mut_ref(hoid, num);
5594 }
5595 void mut_ref(const hobject_t &hoid, int num) {
5596 [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0);
5597 iter->second += num;
5598 if (iter->second == 0)
5599 ref_delta.erase(iter);
5600 }
5601
5602 auto begin() const { return ref_delta.begin(); }
5603 auto end() const { return ref_delta.end(); }
5604 auto find(const hobject_t &key) const { return ref_delta.find(key); }
5605
5606 bool operator==(const object_ref_delta_t &rhs) const {
5607 return ref_delta == rhs.ref_delta;
5608 }
5609 bool operator!=(const object_ref_delta_t &rhs) const {
5610 return !(*this == rhs);
5611 }
5612 bool is_empty() const {
5613 return ref_delta.empty();
5614 }
5615 uint64_t size() const {
5616 return ref_delta.size();
5617 }
5618 friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci);
5619 };
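// Illustrative sketch (not part of this header): deltas that cancel are
// erased rather than stored as zero, so a matched inc/dec pair leaves
// the delta empty; 'hoid' is a hypothetical hobject_t.
//
//   object_ref_delta_t delta;
//   delta.inc_ref(hoid);             // hoid -> +1
//   delta.dec_ref(hoid);             // entry erased, not left at 0
//   ceph_assert(delta.is_empty());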
5620
5621 struct chunk_info_t {
5622 typedef enum {
5623 FLAG_DIRTY = 1,
5624 FLAG_MISSING = 2,
5625 FLAG_HAS_REFERENCE = 4,
5626 FLAG_HAS_FINGERPRINT = 8,
5627 } cflag_t;
5628 uint32_t offset;
5629 uint32_t length;
5630 hobject_t oid;
5631 cflag_t flags; // FLAG_*
5632
5633 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5634 chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) :
5635 offset(offset), length(length), oid(oid), flags((cflag_t)0) { }
5636
5637 static std::string get_flag_string(uint64_t flags) {
5638 std::string r;
5639 if (flags & FLAG_DIRTY) {
5640 r += "|dirty";
5641 }
5642 if (flags & FLAG_MISSING) {
5643 r += "|missing";
5644 }
5645 if (flags & FLAG_HAS_REFERENCE) {
5646 r += "|has_reference";
5647 }
5648 if (flags & FLAG_HAS_FINGERPRINT) {
5649 r += "|has_fingerprint";
5650 }
5651 if (r.length())
5652 return r.substr(1);
5653 return r;
5654 }
5655 bool test_flag(cflag_t f) const {
5656 return (flags & f) == f;
5657 }
5658 void set_flag(cflag_t f) {
5659 flags = (cflag_t)(flags | f);
5660 }
5661 void set_flags(cflag_t f) {
5662 flags = f;
5663 }
5664 void clear_flag(cflag_t f) {
5665 flags = (cflag_t)(flags & ~f);
5666 }
5667 void clear_flags() {
5668 flags = (cflag_t)0;
5669 }
5670 bool is_dirty() const {
5671 return test_flag(FLAG_DIRTY);
5672 }
5673 bool is_missing() const {
5674 return test_flag(FLAG_MISSING);
5675 }
5676 bool has_reference() const {
5677 return test_flag(FLAG_HAS_REFERENCE);
5678 }
5679 bool has_fingerprint() const {
5680 return test_flag(FLAG_HAS_FINGERPRINT);
5681 }
5682 void encode(ceph::buffer::list &bl) const;
5683 void decode(ceph::buffer::list::const_iterator &bl);
5684 void dump(ceph::Formatter *f) const;
5685 friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5686 bool operator==(const chunk_info_t& cit) const;
5687 bool operator!=(const chunk_info_t& cit) const {
5688 return !(cit == *this);
5689 }
5690 };
5691 WRITE_CLASS_ENCODER(chunk_info_t)
5692 std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
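// Illustrative sketch (not part of this header): flag handling on a
// chunk entry; 'fp_oid' is a hypothetical fingerprint-object hobject_t.
//
//   chunk_info_t ci(0 /*offset*/, 4096 /*length*/, fp_oid);
//   ci.set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
//   ci.set_flag(chunk_info_t::FLAG_DIRTY);
//   ci.clear_flag(chunk_info_t::FLAG_DIRTY);
//   ceph_assert(ci.has_reference() && !ci.is_dirty());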
5693
5694 struct object_info_t;
5695 struct object_manifest_t {
5696 enum {
5697 TYPE_NONE = 0,
5698 TYPE_REDIRECT = 1,
5699 TYPE_CHUNKED = 2,
5700 };
5701 uint8_t type; // redirect, chunked, ...
5702 hobject_t redirect_target;
5703 std::map<uint64_t, chunk_info_t> chunk_map;
5704
5705 object_manifest_t() : type(0) { }
5706 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
5707 : type(type), redirect_target(redirect_target) { }
5708
5709 bool is_empty() const {
5710 return type == TYPE_NONE;
5711 }
5712 bool is_redirect() const {
5713 return type == TYPE_REDIRECT;
5714 }
5715 bool is_chunked() const {
5716 return type == TYPE_CHUNKED;
5717 }
5718 static std::string_view get_type_name(uint8_t m) {
5719 switch (m) {
5720 case TYPE_NONE: return "none";
5721 case TYPE_REDIRECT: return "redirect";
5722 case TYPE_CHUNKED: return "chunked";
5723 default: return "unknown";
5724 }
5725 }
5726 std::string_view get_type_name() const {
5727 return get_type_name(type);
5728 }
5729 void clear() {
5730 type = 0;
5731 redirect_target = hobject_t();
5732 chunk_map.clear();
5733 }
5734
5735 /**
5736 * calc_refs_to_inc_on_set
5737 *
5738 * Takes a manifest and returns the set of refs to
5739 * increment upon set-chunk
5740 *
5741 * l and g may each be nullptr if the corresponding adjacent clone
5742 * does not exist; both should be nullptr if there are no clones at all.
5743 * *this contains the set of new references to set
5744 *
5745 */
5746 void calc_refs_to_inc_on_set(
5747 const object_manifest_t* g, ///< [in] manifest for clone > *this
5748 const object_manifest_t* l, ///< [in] manifest for clone < *this
5749 object_ref_delta_t &delta ///< [out] set of refs to increment
5750 ) const;
5751
5752 /**
5753 * calc_refs_to_drop_on_modify
5754 *
5755 * Takes a manifest and returns the set of refs to
5756 * drop upon modification
5757 *
5758 * l should be nullptr if there are no clones, or if the
5759 * corresponding previous clone does not exist.
5760 *
5761 */
5762 void calc_refs_to_drop_on_modify(
5763 const object_manifest_t* l, ///< [in] manifest for previous clone
5764 const ObjectCleanRegions& clean_regions, ///< [in] clean regions
5765 object_ref_delta_t &delta ///< [out] set of refs to drop
5766 ) const;
5767
5768 /**
5769 * calc_refs_to_drop_on_removal
5770 *
5771 * Takes the two adjacent manifests and returns the set of refs to
5772 * drop upon removal of the clone containing *this.
5773 *
5774 * g should be nullptr if *this is on HEAD, l should be nullptr if
5775 * *this is on the oldest clone (or head if there are no clones).
5776 */
5777 void calc_refs_to_drop_on_removal(
5778 const object_manifest_t* g, ///< [in] manifest for clone > *this
5779 const object_manifest_t* l, ///< [in] manifest for clone < *this
5780 object_ref_delta_t &delta ///< [out] set of refs to drop
5781 ) const;
5782
5783 static void generate_test_instances(std::list<object_manifest_t*>& o);
5784 void encode(ceph::buffer::list &bl) const;
5785 void decode(ceph::buffer::list::const_iterator &bl);
5786 void dump(ceph::Formatter *f) const;
5787 friend std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
5788 };
5789 WRITE_CLASS_ENCODER(object_manifest_t)
5790 std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
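// Illustrative sketch (not part of this header): describing an object as
// two dedup chunks keyed by logical offset; 'fp1' and 'fp2' are
// hypothetical fingerprint-object ids.
//
//   object_manifest_t m;
//   m.type = object_manifest_t::TYPE_CHUNKED;
//   m.chunk_map[0]    = chunk_info_t(0, 4096, fp1);
//   m.chunk_map[4096] = chunk_info_t(0, 4096, fp2);
//   ceph_assert(m.is_chunked());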
5791
5792 struct object_info_t {
5793 hobject_t soid;
5794 eversion_t version, prior_version;
5795 version_t user_version;
5796 osd_reqid_t last_reqid;
5797
5798 uint64_t size;
5799 utime_t mtime;
5800 utime_t local_mtime; // local mtime
5801
5802 // note: these are currently encoded into a total 16 bits; see
5803 // encode()/decode() for the weirdness.
5804 typedef enum {
5805 FLAG_LOST = 1<<0,
5806 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5807 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5808 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5809 FLAG_DATA_DIGEST = 1<<4, // has data crc
5810 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5811 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5812 FLAG_MANIFEST = 1<<7, // has manifest
5813 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5814 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
5815 } flag_t;
5816
5817 flag_t flags;
5818
5819 static std::string get_flag_string(flag_t flags) {
5820 std::string s;
5821 std::vector<std::string> sv = get_flag_vector(flags);
5822 for (const auto& ss : sv) {
5823 s += std::string("|") + ss;
5824 }
5825 if (s.length())
5826 return s.substr(1);
5827 return s;
5828 }
5829 static std::vector<std::string> get_flag_vector(flag_t flags) {
5830 std::vector<std::string> sv;
5831 if (flags & FLAG_LOST)
5832 sv.insert(sv.end(), "lost");
5833 if (flags & FLAG_WHITEOUT)
5834 sv.insert(sv.end(), "whiteout");
5835 if (flags & FLAG_DIRTY)
5836 sv.insert(sv.end(), "dirty");
5837 if (flags & FLAG_USES_TMAP)
5838 sv.insert(sv.end(), "uses_tmap");
5839 if (flags & FLAG_OMAP)
5840 sv.insert(sv.end(), "omap");
5841 if (flags & FLAG_DATA_DIGEST)
5842 sv.insert(sv.end(), "data_digest");
5843 if (flags & FLAG_OMAP_DIGEST)
5844 sv.insert(sv.end(), "omap_digest");
5845 if (flags & FLAG_CACHE_PIN)
5846 sv.insert(sv.end(), "cache_pin");
5847 if (flags & FLAG_MANIFEST)
5848 sv.insert(sv.end(), "manifest");
5849 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5850 sv.insert(sv.end(), "redirect_has_reference");
5851 return sv;
5852 }
5853 std::string get_flag_string() const {
5854 return get_flag_string(flags);
5855 }
5856
5857 uint64_t truncate_seq, truncate_size;
5858
5859 std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
5860
5861 // opportunistic checksums; may or may not be present
5862 __u32 data_digest; ///< data crc32c
5863 __u32 omap_digest; ///< omap crc32c
5864
5865 // alloc hint attribute
5866 uint64_t expected_object_size, expected_write_size;
5867 uint32_t alloc_hint_flags;
5868
5869 struct object_manifest_t manifest;
5870
5871 void copy_user_bits(const object_info_t& other);
5872
5873 bool test_flag(flag_t f) const {
5874 return (flags & f) == f;
5875 }
5876 void set_flag(flag_t f) {
5877 flags = (flag_t)(flags | f);
5878 }
5879 void clear_flag(flag_t f) {
5880 flags = (flag_t)(flags & ~f);
5881 }
5882 bool is_lost() const {
5883 return test_flag(FLAG_LOST);
5884 }
5885 bool is_whiteout() const {
5886 return test_flag(FLAG_WHITEOUT);
5887 }
5888 bool is_dirty() const {
5889 return test_flag(FLAG_DIRTY);
5890 }
5891 bool is_omap() const {
5892 return test_flag(FLAG_OMAP);
5893 }
5894 bool is_data_digest() const {
5895 return test_flag(FLAG_DATA_DIGEST);
5896 }
5897 bool is_omap_digest() const {
5898 return test_flag(FLAG_OMAP_DIGEST);
5899 }
5900 bool is_cache_pinned() const {
5901 return test_flag(FLAG_CACHE_PIN);
5902 }
5903 bool has_manifest() const {
5904 return test_flag(FLAG_MANIFEST);
5905 }
5906 void set_data_digest(__u32 d) {
5907 set_flag(FLAG_DATA_DIGEST);
5908 data_digest = d;
5909 }
5910 void set_omap_digest(__u32 d) {
5911 set_flag(FLAG_OMAP_DIGEST);
5912 omap_digest = d;
5913 }
5914 void clear_data_digest() {
5915 clear_flag(FLAG_DATA_DIGEST);
5916 data_digest = -1;
5917 }
5918 void clear_omap_digest() {
5919 clear_flag(FLAG_OMAP_DIGEST);
5920 omap_digest = -1;
5921 }
5922 void new_object() {
5923 clear_data_digest();
5924 clear_omap_digest();
5925 }
5926
5927 void encode(ceph::buffer::list& bl, uint64_t features) const;
5928 void decode(ceph::buffer::list::const_iterator& bl);
5929 void decode(const ceph::buffer::list& bl) {
5930 auto p = std::cbegin(bl);
5931 decode(p);
5932 }
5933
5934 void encode_no_oid(ceph::buffer::list& bl, uint64_t features) {
5935 // TODO: drop soid field and remove the denc no_oid methods
5936 auto tmp_oid = hobject_t(hobject_t::get_max());
5937 tmp_oid.swap(soid);
5938 encode(bl, features);
5939 soid = tmp_oid;
5940 }
5941 void decode_no_oid(ceph::buffer::list::const_iterator& bl) {
5942 decode(bl);
5943 ceph_assert(soid.is_max());
5944 }
5945 void decode_no_oid(const ceph::buffer::list& bl) {
5946 auto p = std::cbegin(bl);
5947 decode_no_oid(p);
5948 }
5949 void decode_no_oid(const ceph::buffer::list& bl, const hobject_t& _soid) {
5950 auto p = std::cbegin(bl);
5951 decode_no_oid(p);
5952 soid = _soid;
5953 }
5954
5955 void dump(ceph::Formatter *f) const;
5956 static void generate_test_instances(std::list<object_info_t*>& o);
5957
5958 explicit object_info_t()
5959 : user_version(0), size(0), flags((flag_t)0),
5960 truncate_seq(0), truncate_size(0),
5961 data_digest(-1), omap_digest(-1),
5962 expected_object_size(0), expected_write_size(0),
5963 alloc_hint_flags(0)
5964 {}
5965
5966 explicit object_info_t(const hobject_t& s)
5967 : soid(s),
5968 user_version(0), size(0), flags((flag_t)0),
5969 truncate_seq(0), truncate_size(0),
5970 data_digest(-1), omap_digest(-1),
5971 expected_object_size(0), expected_write_size(0),
5972 alloc_hint_flags(0)
5973 {}
5974
5975 explicit object_info_t(const ceph::buffer::list& bl) {
5976 decode(bl);
5977 }
5978
5979 explicit object_info_t(const ceph::buffer::list& bl, const hobject_t& _soid) {
5980 decode_no_oid(bl);
5981 soid = _soid;
5982 }
5983 };
5984 WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5985
5986 std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
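// Illustrative sketch (not part of this header): digest values are only
// valid together with their flags, and both are cleared on a fresh
// object; 'soid' is a hypothetical hobject_t.
//
//   object_info_t oi(soid);
//   oi.set_data_digest(0x12345678);   // also sets FLAG_DATA_DIGEST
//   ceph_assert(oi.is_data_digest());
//   oi.new_object();                  // clears both digests + flags
//   ceph_assert(!oi.is_data_digest() && !oi.is_omap_digest());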
5987
5988
5989
5990 // Object recovery
5991 struct ObjectRecoveryInfo {
5992 hobject_t soid;
5993 eversion_t version;
5994 uint64_t size;
5995 object_info_t oi;
5996 SnapSet ss; // only populated if soid is_snap()
5997 interval_set<uint64_t> copy_subset;
5998 std::map<hobject_t, interval_set<uint64_t>> clone_subset;
5999 bool object_exist;
6000
6001 ObjectRecoveryInfo() : size(0), object_exist(true) { }
6002
6003 static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
6004 void encode(ceph::buffer::list &bl, uint64_t features) const;
6005 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
6006 std::ostream &print(std::ostream &out) const;
6007 void dump(ceph::Formatter *f) const;
6008 };
6009 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
6010 std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
6011
6012 struct ObjectRecoveryProgress {
6013 uint64_t data_recovered_to;
6014 std::string omap_recovered_to;
6015 bool first;
6016 bool data_complete;
6017 bool omap_complete;
6018 bool error = false;
6019
6020 ObjectRecoveryProgress()
6021 : data_recovered_to(0),
6022 first(true),
6023 data_complete(false), omap_complete(false) { }
6024
6025 bool is_complete(const ObjectRecoveryInfo& info) const {
6026 return (data_recovered_to >= (
6027 info.copy_subset.empty() ?
6028 0 : info.copy_subset.range_end())) &&
6029 omap_complete;
6030 }
6031
6032 uint64_t estimate_remaining_data_to_recover(const ObjectRecoveryInfo& info) const {
6033 // Overestimates in case of clones, but avoids traversing copy_subset
6034 return info.size - data_recovered_to;
6035 }
6036
6037 static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
6038 void encode(ceph::buffer::list &bl) const;
6039 void decode(ceph::buffer::list::const_iterator &bl);
6040 std::ostream &print(std::ostream &out) const;
6041 void dump(ceph::Formatter *f) const;
6042 };
6043 WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
6044 std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
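// Illustrative sketch (not part of this header): a recovery op is
// complete once data has been recovered to the end of copy_subset and
// the omap pass has finished; 'info' is a hypothetical ObjectRecoveryInfo.
//
//   ObjectRecoveryProgress prog;
//   prog.data_recovered_to = info.copy_subset.empty()
//     ? 0 : info.copy_subset.range_end();
//   prog.omap_complete = true;
//   ceph_assert(prog.is_complete(info));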
6045
6046 struct PushReplyOp {
6047 hobject_t soid;
6048
6049 static void generate_test_instances(std::list<PushReplyOp*>& o);
6050 void encode(ceph::buffer::list &bl) const;
6051 void decode(ceph::buffer::list::const_iterator &bl);
6052 std::ostream &print(std::ostream &out) const;
6053 void dump(ceph::Formatter *f) const;
6054
6055 uint64_t cost(CephContext *cct) const;
6056 };
6057 WRITE_CLASS_ENCODER(PushReplyOp)
6058 std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
6059
6060 struct PullOp {
6061 hobject_t soid;
6062
6063 ObjectRecoveryInfo recovery_info;
6064 ObjectRecoveryProgress recovery_progress;
6065
6066 static void generate_test_instances(std::list<PullOp*>& o);
6067 void encode(ceph::buffer::list &bl, uint64_t features) const;
6068 void decode(ceph::buffer::list::const_iterator &bl);
6069 std::ostream &print(std::ostream &out) const;
6070 void dump(ceph::Formatter *f) const;
6071
6072 uint64_t cost(CephContext *cct) const;
6073 };
6074 WRITE_CLASS_ENCODER_FEATURES(PullOp)
6075 std::ostream& operator<<(std::ostream& out, const PullOp &op);
6076
6077 struct PushOp {
6078 hobject_t soid;
6079 eversion_t version;
6080 ceph::buffer::list data;
6081 interval_set<uint64_t> data_included;
6082 ceph::buffer::list omap_header;
6083 std::map<std::string, ceph::buffer::list> omap_entries;
6084 std::map<std::string, ceph::buffer::list, std::less<>> attrset;
6085
6086 ObjectRecoveryInfo recovery_info;
6087 ObjectRecoveryProgress before_progress;
6088 ObjectRecoveryProgress after_progress;
6089
6090 static void generate_test_instances(std::list<PushOp*>& o);
6091 void encode(ceph::buffer::list &bl, uint64_t features) const;
6092 void decode(ceph::buffer::list::const_iterator &bl);
6093 std::ostream &print(std::ostream &out) const;
6094 void dump(ceph::Formatter *f) const;
6095
6096 uint64_t cost(CephContext *cct) const;
6097 };
6098 WRITE_CLASS_ENCODER_FEATURES(PushOp)
6099 std::ostream& operator<<(std::ostream& out, const PushOp &op);
6100
6101 /*
6102 * summarize pg contents for purposes of a scrub
6103 *
6104 * If members are added to ScrubMap, make sure to modify swap().
6105 */
6106 struct ScrubMap {
6107 struct object {
6108 std::map<std::string, ceph::buffer::ptr, std::less<>> attrs;
6109 uint64_t size;
6110 __u32 omap_digest; ///< omap crc32c
6111 __u32 digest; ///< data crc32c
6112 bool negative:1;
6113 bool digest_present:1;
6114 bool omap_digest_present:1;
6115 bool read_error:1;
6116 bool stat_error:1;
6117 bool ec_hash_mismatch:1;
6118 bool ec_size_mismatch:1;
6119 bool large_omap_object_found:1;
6120 uint64_t large_omap_object_key_count = 0;
6121 uint64_t large_omap_object_value_size = 0;
6122 uint64_t object_omap_bytes = 0;
6123 uint64_t object_omap_keys = 0;
6124
6125 object() :
6126 // Init invalid size so it won't match if we get a stat EIO error
6127 size(-1), omap_digest(0), digest(0),
6128 negative(false), digest_present(false), omap_digest_present(false),
6129 read_error(false), stat_error(false), ec_hash_mismatch(false),
6130 ec_size_mismatch(false), large_omap_object_found(false) {}
6131
6132 void encode(ceph::buffer::list& bl) const;
6133 void decode(ceph::buffer::list::const_iterator& bl);
6134 void dump(ceph::Formatter *f) const;
6135 static void generate_test_instances(std::list<object*>& o);
6136 };
6137 WRITE_CLASS_ENCODER(object)
6138
6139 std::map<hobject_t,object> objects;
6140 eversion_t valid_through;
6141 eversion_t incr_since;
6142 bool has_large_omap_object_errors{false};
6143 bool has_omap_keys{false};
6144
6145 void merge_incr(const ScrubMap &l);
6146 void clear_from(const hobject_t& start) {
6147 objects.erase(objects.lower_bound(start), objects.end());
6148 }
6149 void insert(const ScrubMap &r) {
6150 objects.insert(r.objects.begin(), r.objects.end());
6151 }
6152 void swap(ScrubMap &r) {
6153 using std::swap;
6154 swap(objects, r.objects);
6155 swap(valid_through, r.valid_through);
6156 swap(incr_since, r.incr_since);
6157 swap(has_large_omap_object_errors, r.has_large_omap_object_errors);
6158 swap(has_omap_keys, r.has_omap_keys);
6159 }
6160
6161 void encode(ceph::buffer::list& bl) const;
6162 void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
6163 void dump(ceph::Formatter *f) const;
6164 static void generate_test_instances(std::list<ScrubMap*>& o);
6165 };
6166 WRITE_CLASS_ENCODER(ScrubMap::object)
6167 WRITE_CLASS_ENCODER(ScrubMap)
6168
6169 struct ScrubMapBuilder {
6170 bool deep = false;
6171 std::vector<hobject_t> ls;
6172 size_t pos = 0;
6173 int64_t data_pos = 0;
6174 std::string omap_pos;
6175 int ret = 0;
6176 ceph::buffer::hash data_hash, omap_hash; ///< accumulating hash value
6177 uint64_t omap_keys = 0;
6178 uint64_t omap_bytes = 0;
6179
6180 bool empty() {
6181 return ls.empty();
6182 }
6183 bool done() {
6184 return pos >= ls.size();
6185 }
6186 void reset() {
6187 *this = ScrubMapBuilder();
6188 }
6189
6190 bool data_done() {
6191 return data_pos < 0;
6192 }
6193
6194 void next_object() {
6195 ++pos;
6196 data_pos = 0;
6197 omap_pos.clear();
6198 omap_keys = 0;
6199 omap_bytes = 0;
6200 }
6201
6202 friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
6203 out << "(" << pos.pos << "/" << pos.ls.size();
6204 if (pos.pos < pos.ls.size()) {
6205 out << " " << pos.ls[pos.pos];
6206 }
6207 if (pos.data_pos < 0) {
6208 out << " byte " << pos.data_pos;
6209 }
6210 if (!pos.omap_pos.empty()) {
6211 out << " key " << pos.omap_pos;
6212 }
6213 if (pos.deep) {
6214 out << " deep";
6215 }
6216 if (pos.ret) {
6217 out << " ret " << pos.ret;
6218 }
6219 return out << ")";
6220 }
6221 };
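// Illustrative sketch (not part of this header): the scrubber walks 'ls'
// one object at a time, resuming each object at data_pos/omap_pos;
// 'scan_object' and 'objects_to_scrub' are hypothetical.
//
//   ScrubMapBuilder pos;
//   pos.ls = objects_to_scrub;              // std::vector<hobject_t>
//   while (!pos.done()) {
//     scan_object(pos.ls[pos.pos], pos);    // updates data_pos/omap_pos
//     pos.next_object();
//   }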
6222
6223 struct watch_item_t {
6224 entity_name_t name;
6225 uint64_t cookie;
6226 uint32_t timeout_seconds;
6227 entity_addr_t addr;
6228
6229 watch_item_t() : cookie(0), timeout_seconds(0) { }
6230 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
6231 const entity_addr_t& addr)
6232 : name(name), cookie(cookie), timeout_seconds(timeout),
6233 addr(addr) { }
6234
6235 void encode(ceph::buffer::list &bl, uint64_t features) const {
6236 ENCODE_START(2, 1, bl);
6237 encode(name, bl);
6238 encode(cookie, bl);
6239 encode(timeout_seconds, bl);
6240 encode(addr, bl, features);
6241 ENCODE_FINISH(bl);
6242 }
6243 void decode(ceph::buffer::list::const_iterator &bl) {
6244 DECODE_START(2, bl);
6245 decode(name, bl);
6246 decode(cookie, bl);
6247 decode(timeout_seconds, bl);
6248 if (struct_v >= 2) {
6249 decode(addr, bl);
6250 }
6251 DECODE_FINISH(bl);
6252 }
6253 void dump(ceph::Formatter *f) const {
6254 f->dump_stream("watcher") << name;
6255 f->dump_int("cookie", cookie);
6256 f->dump_int("timeout", timeout_seconds);
6257 f->open_object_section("addr");
6258 addr.dump(f);
6259 f->close_section();
6260 }
6261 static void generate_test_instances(std::list<watch_item_t*>& o) {
6262 entity_addr_t ea;
6263 ea.set_type(entity_addr_t::TYPE_LEGACY);
6264 ea.set_nonce(1000);
6265 ea.set_family(AF_INET);
6266 ea.set_in4_quad(0, 127);
6267 ea.set_in4_quad(1, 0);
6268 ea.set_in4_quad(2, 0);
6269 ea.set_in4_quad(3, 1);
6270 ea.set_port(1024);
6271 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
6272 ea.set_nonce(1001);
6273 ea.set_in4_quad(3, 2);
6274 ea.set_port(1025);
6275 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
6276 }
6277 };
6278 WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
6279
6280 struct obj_watch_item_t {
6281 hobject_t obj;
6282 watch_item_t wi;
6283 };
6284
6285 /**
6286 * obj list watch response format
6287 *
6288 */
6289 struct obj_list_watch_response_t {
6290 std::list<watch_item_t> entries;
6291
6292 void encode(ceph::buffer::list& bl, uint64_t features) const {
6293 ENCODE_START(1, 1, bl);
6294 encode(entries, bl, features);
6295 ENCODE_FINISH(bl);
6296 }
6297 void decode(ceph::buffer::list::const_iterator& bl) {
6298 DECODE_START(1, bl);
6299 decode(entries, bl);
6300 DECODE_FINISH(bl);
6301 }
6302 void dump(ceph::Formatter *f) const {
6303 f->open_array_section("entries");
6304 for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
6305 f->open_object_section("watch");
6306 p->dump(f);
6307 f->close_section();
6308 }
6309 f->close_section();
6310 }
6311 static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
6313 o.push_back(new obj_list_watch_response_t);
6314 o.push_back(new obj_list_watch_response_t);
6315 std::list<watch_item_t*> test_watchers;
6316 watch_item_t::generate_test_instances(test_watchers);
6317 for (auto &e : test_watchers) {
6318 o.back()->entries.push_back(*e);
6319 delete e;
6320 }
6321 }
6322 };
6323 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
6324
6325 struct clone_info {
6326 snapid_t cloneid;
6327 std::vector<snapid_t> snaps; // ascending
6328 std::vector< std::pair<uint64_t,uint64_t> > overlap;
6329 uint64_t size;
6330
6331 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
6332
6333 void encode(ceph::buffer::list& bl) const {
6334 ENCODE_START(1, 1, bl);
6335 encode(cloneid, bl);
6336 encode(snaps, bl);
6337 encode(overlap, bl);
6338 encode(size, bl);
6339 ENCODE_FINISH(bl);
6340 }
6341 void decode(ceph::buffer::list::const_iterator& bl) {
6342 DECODE_START(1, bl);
6343 decode(cloneid, bl);
6344 decode(snaps, bl);
6345 decode(overlap, bl);
6346 decode(size, bl);
6347 DECODE_FINISH(bl);
6348 }
6349 void dump(ceph::Formatter *f) const {
6350 if (cloneid == CEPH_NOSNAP)
6351 f->dump_string("cloneid", "HEAD");
6352 else
6353 f->dump_unsigned("cloneid", cloneid.val);
6354 f->open_array_section("snapshots");
6355 for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
6356 f->open_object_section("snap");
6357 f->dump_unsigned("id", p->val);
6358 f->close_section();
6359 }
6360 f->close_section();
6361 f->open_array_section("overlaps");
6362 for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
6363 q != overlap.end(); ++q) {
6364 f->open_object_section("overlap");
6365 f->dump_unsigned("offset", q->first);
6366 f->dump_unsigned("length", q->second);
6367 f->close_section();
6368 }
6369 f->close_section();
6370 f->dump_unsigned("size", size);
6371 }
6372 static void generate_test_instances(std::list<clone_info*>& o) {
6373 o.push_back(new clone_info);
6374 o.push_back(new clone_info);
6375 o.back()->cloneid = 1;
6376 o.back()->snaps.push_back(1);
6377 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6378 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6379 o.back()->size = 16384;
6380 o.push_back(new clone_info);
6381 o.back()->cloneid = CEPH_NOSNAP;
6382 o.back()->size = 32768;
6383 }
6384 };
6385 WRITE_CLASS_ENCODER(clone_info)
6386
6387 /**
6388 * obj list snaps response format
6389 *
6390 */
6391 struct obj_list_snap_response_t {
6392 std::vector<clone_info> clones; // ascending
6393 snapid_t seq;
6394
6395 void encode(ceph::buffer::list& bl) const {
6396 ENCODE_START(2, 1, bl);
6397 encode(clones, bl);
6398 encode(seq, bl);
6399 ENCODE_FINISH(bl);
6400 }
6401 void decode(ceph::buffer::list::const_iterator& bl) {
6402 DECODE_START(2, bl);
6403 decode(clones, bl);
6404 if (struct_v >= 2)
6405 decode(seq, bl);
6406 else
6407 seq = CEPH_NOSNAP;
6408 DECODE_FINISH(bl);
6409 }
6410 void dump(ceph::Formatter *f) const {
6411 f->open_array_section("clones");
6412 for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
6413 f->open_object_section("clone");
6414 p->dump(f);
6415 f->close_section();
6416 }
6417 f->close_section();
6418 f->dump_unsigned("seq", seq);
6419 }
6420 static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
6421 o.push_back(new obj_list_snap_response_t);
6422 o.push_back(new obj_list_snap_response_t);
6423 clone_info cl;
6424 cl.cloneid = 1;
6425 cl.snaps.push_back(1);
6426 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6427 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6428 cl.size = 16384;
6429 o.back()->clones.push_back(cl);
6430 cl.cloneid = CEPH_NOSNAP;
6431 cl.snaps.clear();
6432 cl.overlap.clear();
6433 cl.size = 32768;
6434 o.back()->clones.push_back(cl);
6435 o.back()->seq = 123;
6436 }
6437 };
6438
6439 WRITE_CLASS_ENCODER(obj_list_snap_response_t)
6440
6441 // PromoteCounter
6442
6443 struct PromoteCounter {
6444 std::atomic<unsigned long long> attempts{0};
6445 std::atomic<unsigned long long> objects{0};
6446 std::atomic<unsigned long long> bytes{0};
6447
6448 void attempt() {
6449 attempts++;
6450 }
6451
6452 void finish(uint64_t size) {
6453 objects++;
6454 bytes += size;
6455 }
6456
6457 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
6458 *a = attempts;
6459 *o = objects;
6460 *b = bytes;
6461 attempts = *a / 2;
6462 objects = *o / 2;
6463 bytes = *b / 2;
6464 }
6465 };
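// Illustrative sketch (not part of this header): sampling halves the
// counters, so the tiering agent sees a decaying estimate of recent
// promote activity.
//
//   PromoteCounter pc;
//   pc.attempt();
//   pc.finish(4096);
//   uint64_t a, o, b;
//   pc.sample_and_attenuate(&a, &o, &b);  // a==1, o==1, b==4096
//   // pc now holds a/2, o/2, b/2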
6466
6467 struct pool_pg_num_history_t {
6468 /// last epoch updated
6469 epoch_t epoch = 0;
6470 /// poolid -> epoch -> pg_num
6471 std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
6472 /// pair(epoch, poolid)
6473 std::set<std::pair<epoch_t,int64_t>> deleted_pools;
6474
6475 void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
6476 pg_nums[pool][epoch] = pg_num;
6477 }
6478 void log_pool_delete(epoch_t epoch, int64_t pool) {
6479 deleted_pools.insert(std::make_pair(epoch, pool));
6480 }
6481
6482 /// prune history based on oldest osdmap epoch in the cluster
6483 void prune(epoch_t oldest_epoch) {
6484 auto i = deleted_pools.begin();
6485 while (i != deleted_pools.end()) {
6486 if (i->first >= oldest_epoch) {
6487 break;
6488 }
6489 pg_nums.erase(i->second);
6490 i = deleted_pools.erase(i);
6491 }
6492 for (auto& j : pg_nums) {
6493 auto k = j.second.lower_bound(oldest_epoch);
6494 // keep this and the entry before it (just to be paranoid)
6495 if (k != j.second.begin()) {
6496 --k;
6497 j.second.erase(j.second.begin(), k);
6498 }
6499 }
6500 }
6501
6502 void encode(ceph::buffer::list& bl) const {
6503 ENCODE_START(1, 1, bl);
6504 encode(epoch, bl);
6505 encode(pg_nums, bl);
6506 encode(deleted_pools, bl);
6507 ENCODE_FINISH(bl);
6508 }
6509 void decode(ceph::buffer::list::const_iterator& p) {
6510 DECODE_START(1, p);
6511 decode(epoch, p);
6512 decode(pg_nums, p);
6513 decode(deleted_pools, p);
6514 DECODE_FINISH(p);
6515 }
6516 void dump(ceph::Formatter *f) const {
6517 f->dump_unsigned("epoch", epoch);
6518 f->open_object_section("pools");
6519 for (auto& i : pg_nums) {
6520 f->open_object_section("pool");
6521 f->dump_unsigned("pool_id", i.first);
6522 f->open_array_section("changes");
6523 for (auto& j : i.second) {
6524 f->open_object_section("change");
6525 f->dump_unsigned("epoch", j.first);
6526 f->dump_unsigned("pg_num", j.second);
6527 f->close_section();
6528 }
6529 f->close_section();
6530 f->close_section();
6531 }
6532 f->close_section();
6533 f->open_array_section("deleted_pools");
6534 for (auto& i : deleted_pools) {
6535 f->open_object_section("deletion");
6536 f->dump_unsigned("pool_id", i.second);
6537 f->dump_unsigned("epoch", i.first);
6538 f->close_section();
6539 }
6540 f->close_section();
6541 }
6542 static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
6543 ls.push_back(new pool_pg_num_history_t);
6544 }
6545 friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
6546 return out << "pg_num_history(e" << h.epoch
6547 << " pg_nums " << h.pg_nums
6548 << " deleted_pools " << h.deleted_pools
6549 << ")";
6550 }
6551 };
6552 WRITE_CLASS_ENCODER(pool_pg_num_history_t)
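// Illustrative sketch (not part of this header): recording pg_num
// changes and pruning once the cluster's oldest osdmap has advanced.
//
//   pool_pg_num_history_t h;
//   h.log_pg_num_change(100 /*epoch*/, 1 /*pool*/, 64 /*pg_num*/);
//   h.log_pg_num_change(200, 1, 128);
//   h.prune(250);  // drops the epoch-100 entry; epoch 200 survives as
//                  // the last change at or before the prune bound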
6553
6554 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6555 // easily skip them
6556 static const std::string_view infover_key = "_infover";
6557 static const std::string_view info_key = "_info";
6558 static const std::string_view biginfo_key = "_biginfo";
6559 static const std::string_view epoch_key = "_epoch";
6560 static const std::string_view fastinfo_key = "_fastinfo";
6561
6562 static const __u8 pg_latest_struct_v = 10;
6563 // v10 is the new past_intervals encoding
6564 // v9 was fastinfo_key addition
6565 // v8 was the move to a per-pg pgmeta object
6566 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6567 // (first appeared in cuttlefish).
6568 static const __u8 pg_compat_struct_v = 10;
6569
6570 int prepare_info_keymap(
6571 CephContext* cct,
6572 std::map<std::string,ceph::buffer::list> *km,
6573 std::string *key_to_remove,
6574 epoch_t epoch,
6575 pg_info_t &info,
6576 pg_info_t &last_written_info,
6577 PastIntervals &past_intervals,
6578 bool dirty_big_info,
6579 bool dirty_epoch,
6580 bool try_fast_info,
6581 PerfCounters *logger = nullptr,
6582 DoutPrefixProvider *dpp = nullptr);
6583
6584 namespace ceph::os {
6585 class Transaction;
6586 }
6587
6588 void create_pg_collection(
6589 ceph::os::Transaction& t, spg_t pgid, int bits);
6590
6591 void init_pg_ondisk(
6592 ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
6593
6594 // filter for pg listings
6595 class PGLSFilter {
6596 CephContext* cct;
6597 protected:
6598 std::string xattr;
6599 public:
6600 PGLSFilter();
6601 virtual ~PGLSFilter();
6602 virtual bool filter(const hobject_t &obj,
6603 const ceph::buffer::list& xattr_data) const = 0;
6604
6605 /**
6606 * Arguments passed from the RADOS client. Implementations must
6607 * handle any encoding errors, and return an appropriate error code,
6608 * or 0 on valid input.
6609 */
6610 virtual int init(ceph::buffer::list::const_iterator &params) = 0;
6611
6612 /**
6613 * xattr key, or empty string. If non-empty, this xattr will be fetched
6614 * and the value passed into ::filter
6615 */
6616 virtual const std::string& get_xattr() const { return xattr; }
6617
6618 /**
6619 * If true, objects without the named xattr (if xattr name is not empty)
6620 * will be rejected without calling ::filter
6621 */
6622 virtual bool reject_empty_xattr() const { return true; }
6623 };
6624
6625 class PGLSPlainFilter : public PGLSFilter {
6626 std::string val;
6627 public:
6628 int init(ceph::buffer::list::const_iterator &params) override;
6629 ~PGLSPlainFilter() override {}
6630 bool filter(const hobject_t& obj,
6631 const ceph::buffer::list& xattr_data) const override;
6632 };
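// Illustrative sketch (not part of this header): the smallest possible
// filter -- accepts every object and fetches no xattr; 'AcceptAllFilter'
// is hypothetical, not a real Ceph class.
//
//   class AcceptAllFilter : public PGLSFilter {
//   public:
//     int init(ceph::buffer::list::const_iterator &) override { return 0; }
//     bool filter(const hobject_t &,
//                 const ceph::buffer::list &) const override { return true; }
//   };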
6633
6634 // alias name for this structure:
6635 using missing_map_t = std::map<hobject_t,
6636 std::pair<std::optional<uint32_t>,
6637 std::optional<uint32_t>>>;
6638
6639 #endif