7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
21#include <sstream>
22#include <stdio.h>
23#include <memory>
11fdf7f2 24#include <string_view>
7c673cae
FG
25#include <boost/scoped_ptr.hpp>
26#include <boost/optional/optional_io.hpp>
27#include <boost/variant.hpp>
28
29#include "include/rados/rados_types.hpp"
30#include "include/mempool.h"
31
32#include "msg/msg_types.h"
33#include "include/types.h"
34#include "include/utime.h"
35#include "include/CompatSet.h"
36#include "common/histogram.h"
37#include "include/interval_set.h"
38#include "include/inline_memory.h"
39#include "common/Formatter.h"
40#include "common/bloom_filter.hpp"
41#include "common/hobject.h"
42#include "common/snap_types.h"
43#include "HitSet.h"
44#include "Watch.h"
45#include "include/cmp.h"
46#include "librados/ListObjectImpl.h"
47#include "compressor/Compressor.h"
48#include <atomic>
49
50#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
51
52#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
53#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
54#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
55#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
56#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
57#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
58#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
59#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
60#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
61#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
62#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
63#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
64#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
65#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
66#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
c07f9fc5 67#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
7c673cae
FG
68
69
81eedcae
TL
70/// pool priority range set by user
71#define OSD_POOL_PRIORITY_MAX 10
72#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
73
7c673cae
FG
74/// min recovery priority for MBackfillReserve
75#define OSD_RECOVERY_PRIORITY_MIN 0
76
77/// base backfill priority for MBackfillReserve
78#define OSD_BACKFILL_PRIORITY_BASE 100
79
80/// base backfill priority for MBackfillReserve (degraded PG)
81#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
82
83/// base recovery priority for MBackfillReserve
84#define OSD_RECOVERY_PRIORITY_BASE 180
85
86/// base backfill priority for MBackfillReserve (inactive PG)
87#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
88
81eedcae
TL
89/// base recovery priority for MRecoveryReserve (inactive PG)
90#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
91
c07f9fc5 92/// max manually/automatically set recovery priority for MBackfillReserve
a8e16298 93#define OSD_RECOVERY_PRIORITY_MAX 253
c07f9fc5 94
a8e16298
TL
95/// backfill priority for MBackfillReserve, when forced manually
96#define OSD_BACKFILL_PRIORITY_FORCED 254
97
98/// recovery priority for MRecoveryReserve, when forced manually
c07f9fc5 99#define OSD_RECOVERY_PRIORITY_FORCED 255
7c673cae 100
11fdf7f2
TL
101/// priority for pg deletion when osd is not fullish
102#define OSD_DELETE_PRIORITY_NORMAL 179
103
104/// priority for pg deletion when osd is approaching full
105#define OSD_DELETE_PRIORITY_FULLISH 219
106
 107/// priority for pg deletion when osd is even more full
108#define OSD_DELETE_PRIORITY_FULL 255
109
81eedcae
TL
110static std::map<int, int> max_prio_map = {
111 {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
112 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
113 {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
114 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
115 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
116};
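// Illustrative sketch (editor's note, not part of the upstream header): a
// reservation priority is normally built from one of the *_PRIORITY_BASE
// values above and then clamped with max_prio_map so it cannot spill into the
// next band; pool_adjust below is a hypothetical pool-specific tweak:
//
//   int base = OSD_BACKFILL_PRIORITY_BASE;              // 100
//   int prio = base + pool_adjust;
//   prio = std::max(prio, (int)OSD_RECOVERY_PRIORITY_MIN);
//   prio = std::min(prio, max_prio_map[base]);          // at most 139 for this band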
7c673cae
FG
117
118typedef hobject_t collection_list_handle_t;
119
 120/// convert a single CEPH_OSD_FLAG_* to a string
121const char *ceph_osd_flag_name(unsigned flag);
 122/// convert a single CEPH_OSD_OP_FLAG_* to a string
123const char *ceph_osd_op_flag_name(unsigned flag);
124
125/// convert CEPH_OSD_FLAG_* op flags to a string
126string ceph_osd_flag_string(unsigned flags);
 127/// convert CEPH_OSD_OP_FLAG_* op flags to a string
128string ceph_osd_op_flag_string(unsigned flags);
 129/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
130string ceph_osd_alloc_hint_flag_string(unsigned flags);
131
11fdf7f2
TL
132typedef map<string,string> osd_alert_list_t;
133/// map osd id -> alert_list_t
134typedef map<int, osd_alert_list_t> osd_alerts_t;
135void dump(Formatter* f, const osd_alerts_t& alerts);
7c673cae
FG
136
137/**
138 * osd request identifier
139 *
 140 * caller name + incarnation# + tid to uniquely identify this request.
141 */
142struct osd_reqid_t {
143 entity_name_t name; // who
c07f9fc5 144 ceph_tid_t tid;
7c673cae
FG
145 int32_t inc; // incarnation
146
147 osd_reqid_t()
c07f9fc5
FG
148 : tid(0), inc(0)
149 {}
150 osd_reqid_t(const osd_reqid_t& other)
151 : name(other.name), tid(other.tid), inc(other.inc)
152 {}
7c673cae 153 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
c07f9fc5
FG
154 : name(a), tid(t), inc(i)
155 {}
7c673cae
FG
156
157 DENC(osd_reqid_t, v, p) {
158 DENC_START(2, 2, p);
159 denc(v.name, p);
160 denc(v.tid, p);
161 denc(v.inc, p);
162 DENC_FINISH(p);
163 }
164 void dump(Formatter *f) const;
165 static void generate_test_instances(list<osd_reqid_t*>& o);
166};
167WRITE_CLASS_DENC(osd_reqid_t)
168
169
170
171struct pg_shard_t {
b32b8144 172 static const int32_t NO_OSD = 0x7fffffff;
7c673cae
FG
173 int32_t osd;
174 shard_id_t shard;
175 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
176 explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
177 pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
178 bool is_undefined() const {
179 return osd == -1;
180 }
b32b8144 181 string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
7c673cae 182 void encode(bufferlist &bl) const;
11fdf7f2 183 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
184 void dump(Formatter *f) const {
185 f->dump_unsigned("osd", osd);
186 if (shard != shard_id_t::NO_SHARD) {
187 f->dump_unsigned("shard", shard);
188 }
189 }
190};
191WRITE_CLASS_ENCODER(pg_shard_t)
192WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
193WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
194ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
195
196class IsPGRecoverablePredicate {
197public:
198 /**
199 * have encodes the shards available
200 */
201 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
202 virtual ~IsPGRecoverablePredicate() {}
203};
204
205class IsPGReadablePredicate {
206public:
207 /**
208 * have encodes the shards available
209 */
210 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
211 virtual ~IsPGReadablePredicate() {}
212};
213
214inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
215 return out << r.name << "." << r.inc << ":" << r.tid;
216}
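// Illustrative example (editor's note, not upstream): with the stream operator
// above, an osd_reqid_t built as osd_reqid_t(entity_name_t::CLIENT(4123), 0, 5678)
// renders as "client.4123.0:5678", i.e. <caller>.<incarnation>:<tid>.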
217
218inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
219 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
220}
221inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
222 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
223}
224inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
225 return (l.name < r.name) || (l.inc < r.inc) ||
226 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
227}
228inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
229 return (l.name < r.name) || (l.inc < r.inc) ||
230 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
231}
232inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
233inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
234
235namespace std {
236 template<> struct hash<osd_reqid_t> {
237 size_t operator()(const osd_reqid_t &r) const {
238 static hash<uint64_t> H;
239 return H(r.name.num() ^ r.tid ^ r.inc);
240 }
241 };
242} // namespace std
243
244
245// -----
246
247// a locator constrains the placement of an object. mainly, which pool
248// does it go in.
249struct object_locator_t {
250 // You specify either the hash or the key -- not both
251 int64_t pool; ///< pool id
252 string key; ///< key string (if non-empty)
253 string nspace; ///< namespace
254 int64_t hash; ///< hash position (if >= 0)
255
256 explicit object_locator_t()
257 : pool(-1), hash(-1) {}
258 explicit object_locator_t(int64_t po)
259 : pool(po), hash(-1) {}
260 explicit object_locator_t(int64_t po, int64_t ps)
261 : pool(po), hash(ps) {}
262 explicit object_locator_t(int64_t po, string ns)
263 : pool(po), nspace(ns), hash(-1) {}
264 explicit object_locator_t(int64_t po, string ns, int64_t ps)
265 : pool(po), nspace(ns), hash(ps) {}
266 explicit object_locator_t(int64_t po, string ns, string s)
267 : pool(po), key(s), nspace(ns), hash(-1) {}
268 explicit object_locator_t(const hobject_t& soid)
269 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
270
271 int64_t get_pool() const {
272 return pool;
273 }
274
275 void clear() {
276 pool = -1;
277 key = "";
278 nspace = "";
279 hash = -1;
280 }
281
282 bool empty() const {
283 return pool == -1;
284 }
285
286 void encode(bufferlist& bl) const;
11fdf7f2 287 void decode(bufferlist::const_iterator& p);
7c673cae
FG
288 void dump(Formatter *f) const;
289 static void generate_test_instances(list<object_locator_t*>& o);
290};
291WRITE_CLASS_ENCODER(object_locator_t)
292
293inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
294 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
295}
296inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
297 return !(l == r);
298}
299
300inline ostream& operator<<(ostream& out, const object_locator_t& loc)
301{
302 out << "@" << loc.pool;
303 if (loc.nspace.length())
304 out << ";" << loc.nspace;
305 if (loc.key.length())
306 out << ":" << loc.key;
307 return out;
308}
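// Illustrative example (editor's note, not upstream): with the operator above,
// object_locator_t(3, "ns", "key") prints as "@3;ns:key"; the namespace and key
// parts are omitted when empty, so a bare pool locator is just "@3".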
309
310struct request_redirect_t {
311private:
312 object_locator_t redirect_locator; ///< this is authoritative
313 string redirect_object; ///< If non-empty, the request goes to this object name
7c673cae
FG
314
315 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
316public:
317
318 request_redirect_t() {}
319 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
320 redirect_locator(orig) { redirect_locator.pool = rpool; }
321 explicit request_redirect_t(const object_locator_t& rloc) :
322 redirect_locator(rloc) {}
323 explicit request_redirect_t(const object_locator_t& orig,
324 const string& robj) :
325 redirect_locator(orig), redirect_object(robj) {}
326
7c673cae
FG
327 bool empty() const { return redirect_locator.empty() &&
328 redirect_object.empty(); }
329
330 void combine_with_locator(object_locator_t& orig, string& obj) const {
331 orig = redirect_locator;
332 if (!redirect_object.empty())
333 obj = redirect_object;
334 }
335
336 void encode(bufferlist& bl) const;
11fdf7f2 337 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
338 void dump(Formatter *f) const;
339 static void generate_test_instances(list<request_redirect_t*>& o);
340};
341WRITE_CLASS_ENCODER(request_redirect_t)
342
343inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
344 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
345 return out;
346}
347
348// Internal OSD op flags - set by the OSD based on the op types
349enum {
350 CEPH_OSD_RMW_FLAG_READ = (1 << 1),
351 CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
352 CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
353 CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
354 CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
355 CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
356 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
357 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
358 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
359 CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
360};
361
362
363// pg stuff
364
365#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
366
367// placement seed (a hash value)
368typedef uint32_t ps_t;
369
370// old (v1) pg_t encoding (wrap old struct ceph_pg)
371struct old_pg_t {
372 ceph_pg v;
373 void encode(bufferlist& bl) const {
374 ::encode_raw(v, bl);
375 }
11fdf7f2 376 void decode(bufferlist::const_iterator& bl) {
7c673cae
FG
377 ::decode_raw(v, bl);
378 }
379};
380WRITE_CLASS_ENCODER(old_pg_t)
381
382// placement group id
383struct pg_t {
384 uint64_t m_pool;
385 uint32_t m_seed;
7c673cae 386
11fdf7f2
TL
387 pg_t() : m_pool(0), m_seed(0) {}
388 pg_t(ps_t seed, uint64_t pool) :
389 m_pool(pool), m_seed(seed) {}
7c673cae
FG
390 // cppcheck-suppress noExplicitConstructor
391 pg_t(const ceph_pg& cpg) :
11fdf7f2 392 m_pool(cpg.pool), m_seed(cpg.ps) {}
7c673cae
FG
393
394 // cppcheck-suppress noExplicitConstructor
395 pg_t(const old_pg_t& opg) {
396 *this = opg.v;
397 }
398
399 old_pg_t get_old_pg() const {
400 old_pg_t o;
11fdf7f2 401 ceph_assert(m_pool < 0xffffffffull);
7c673cae
FG
402 o.v.pool = m_pool;
403 o.v.ps = m_seed;
11fdf7f2 404 o.v.preferred = (__s16)-1;
7c673cae
FG
405 return o;
406 }
407
408 ps_t ps() const {
409 return m_seed;
410 }
11fdf7f2 411 int64_t pool() const {
7c673cae
FG
412 return m_pool;
413 }
7c673cae
FG
414
415 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
416 char *calc_name(char *buf, const char *suffix_backwords) const;
417
418 void set_ps(ps_t p) {
419 m_seed = p;
420 }
421 void set_pool(uint64_t p) {
422 m_pool = p;
423 }
7c673cae
FG
424
425 pg_t get_parent() const;
426 pg_t get_ancestor(unsigned old_pg_num) const;
427
428 int print(char *o, int maxlen) const;
429 bool parse(const char *s);
430
431 bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;
432
11fdf7f2
TL
433 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
434 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
435 return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
436 }
437
7c673cae
FG
438 /**
 439 * Returns b such that for all objects o:
 440 * o is in the pg for *this iff (o.hash & ~((~0u)<<b)) == (ps() & ~((~0u)<<b))
441 */
442 unsigned get_split_bits(unsigned pg_num) const;
443
444 bool contains(int bits, const ghobject_t& oid) {
f64942e4
AA
445 return
446 (int64_t)m_pool == oid.hobj.get_logical_pool() &&
447 oid.match(bits, ps());
7c673cae
FG
448 }
449 bool contains(int bits, const hobject_t& oid) {
f64942e4
AA
450 return
451 (int64_t)m_pool == oid.get_logical_pool() &&
452 oid.match(bits, ps());
7c673cae
FG
453 }
454
455 hobject_t get_hobj_start() const;
456 hobject_t get_hobj_end(unsigned pg_num) const;
457
458 void encode(bufferlist& bl) const {
11fdf7f2 459 using ceph::encode;
7c673cae 460 __u8 v = 1;
11fdf7f2
TL
461 encode(v, bl);
462 encode(m_pool, bl);
463 encode(m_seed, bl);
464 encode((int32_t)-1, bl); // was preferred
7c673cae 465 }
11fdf7f2
TL
466 void decode(bufferlist::const_iterator& bl) {
467 using ceph::decode;
7c673cae 468 __u8 v;
11fdf7f2
TL
469 decode(v, bl);
470 decode(m_pool, bl);
471 decode(m_seed, bl);
472 bl.advance(sizeof(int32_t)); // was preferred
7c673cae 473 }
11fdf7f2
TL
474 void decode_old(bufferlist::const_iterator& bl) {
475 using ceph::decode;
7c673cae 476 old_pg_t opg;
11fdf7f2 477 decode(opg, bl);
7c673cae
FG
478 *this = opg;
479 }
480 void dump(Formatter *f) const;
481 static void generate_test_instances(list<pg_t*>& o);
482};
483WRITE_CLASS_ENCODER(pg_t)
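// Illustrative example (editor's note, not upstream): when a pool's pg_num is
// raised, a pg's children keep its seed in their low bits.  For instance, going
// from pg_num 4 to 8, the pg with seed 1 splits and is_split() reports one new
// child with seed 5 (binary 101, low two bits still 01); poolid is hypothetical:
//
//   set<pg_t> children;
//   bool split = pg_t(1, poolid).is_split(4, 8, &children);  // true, children = {5}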
484
485inline bool operator<(const pg_t& l, const pg_t& r) {
486 return l.pool() < r.pool() ||
11fdf7f2 487 (l.pool() == r.pool() && (l.ps() < r.ps()));
7c673cae
FG
488}
489inline bool operator<=(const pg_t& l, const pg_t& r) {
490 return l.pool() < r.pool() ||
11fdf7f2 491 (l.pool() == r.pool() && (l.ps() <= r.ps()));
7c673cae
FG
492}
493inline bool operator==(const pg_t& l, const pg_t& r) {
494 return l.pool() == r.pool() &&
7c673cae
FG
495 l.ps() == r.ps();
496}
497inline bool operator!=(const pg_t& l, const pg_t& r) {
498 return l.pool() != r.pool() ||
7c673cae
FG
499 l.ps() != r.ps();
500}
501inline bool operator>(const pg_t& l, const pg_t& r) {
502 return l.pool() > r.pool() ||
11fdf7f2 503 (l.pool() == r.pool() && (l.ps() > r.ps()));
7c673cae
FG
504}
505inline bool operator>=(const pg_t& l, const pg_t& r) {
506 return l.pool() > r.pool() ||
11fdf7f2 507 (l.pool() == r.pool() && (l.ps() >= r.ps()));
7c673cae
FG
508}
509
510ostream& operator<<(ostream& out, const pg_t &pg);
511
512namespace std {
513 template<> struct hash< pg_t >
514 {
515 size_t operator()( const pg_t& x ) const
516 {
517 static hash<uint32_t> H;
11fdf7f2
TL
518 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
519 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
7c673cae
FG
520 }
521 };
522} // namespace std
523
524struct spg_t {
525 pg_t pgid;
526 shard_id_t shard;
527 spg_t() : shard(shard_id_t::NO_SHARD) {}
528 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
529 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
530 unsigned get_split_bits(unsigned pg_num) const {
531 return pgid.get_split_bits(pg_num);
532 }
533 spg_t get_parent() const {
534 return spg_t(pgid.get_parent(), shard);
535 }
536 ps_t ps() const {
537 return pgid.ps();
538 }
539 uint64_t pool() const {
540 return pgid.pool();
541 }
7c673cae
FG
542
543 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
544 char *calc_name(char *buf, const char *suffix_backwords) const;
545
546 bool parse(const char *s);
547 bool parse(const std::string& s) {
548 return parse(s.c_str());
549 }
11fdf7f2
TL
550
551 spg_t get_ancestor(unsigned old_pg_num) const {
552 return spg_t(pgid.get_ancestor(old_pg_num), shard);
553 }
554
7c673cae
FG
555 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
556 set<spg_t> *pchildren) const {
557 set<pg_t> _children;
558 set<pg_t> *children = pchildren ? &_children : NULL;
559 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
560 if (pchildren && is_split) {
561 for (set<pg_t>::iterator i = _children.begin();
562 i != _children.end();
563 ++i) {
564 pchildren->insert(spg_t(*i, shard));
565 }
566 }
567 return is_split;
568 }
11fdf7f2
TL
569 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
570 return pgid.is_merge_target(old_pg_num, new_pg_num);
571 }
572 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
573 spg_t *parent) const {
574 spg_t out = *this;
575 bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
576 if (r && parent) {
577 *parent = out;
578 }
579 return r;
580 }
581
7c673cae
FG
582 bool is_no_shard() const {
583 return shard == shard_id_t::NO_SHARD;
584 }
585
586 ghobject_t make_pgmeta_oid() const {
587 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
588 }
589
590 void encode(bufferlist &bl) const {
591 ENCODE_START(1, 1, bl);
11fdf7f2
TL
592 encode(pgid, bl);
593 encode(shard, bl);
7c673cae
FG
594 ENCODE_FINISH(bl);
595 }
11fdf7f2 596 void decode(bufferlist::const_iterator& bl) {
7c673cae 597 DECODE_START(1, bl);
11fdf7f2
TL
598 decode(pgid, bl);
599 decode(shard, bl);
7c673cae
FG
600 DECODE_FINISH(bl);
601 }
602
603 ghobject_t make_temp_ghobject(const string& name) const {
604 return ghobject_t(
605 hobject_t(object_t(name), "", CEPH_NOSNAP,
606 pgid.ps(),
f64942e4
AA
607 hobject_t::get_temp_pool(pgid.pool()),
608 ""),
7c673cae
FG
609 ghobject_t::NO_GEN,
610 shard);
611 }
612
613 unsigned hash_to_shard(unsigned num_shards) const {
614 return ps() % num_shards;
615 }
616};
617WRITE_CLASS_ENCODER(spg_t)
618WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
619WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
620
621namespace std {
622 template<> struct hash< spg_t >
623 {
624 size_t operator()( const spg_t& x ) const
625 {
626 static hash<uint32_t> H;
627 return H(hash<pg_t>()(x.pgid) ^ x.shard);
628 }
629 };
630} // namespace std
631
632ostream& operator<<(ostream& out, const spg_t &pg);
633
634// ----------------------
635
636class coll_t {
637 enum type_t {
638 TYPE_META = 0,
639 TYPE_LEGACY_TEMP = 1, /* no longer used */
640 TYPE_PG = 2,
641 TYPE_PG_TEMP = 3,
642 };
643 type_t type;
644 spg_t pgid;
645 uint64_t removal_seq; // note: deprecated, not encoded
646
647 char _str_buff[spg_t::calc_name_buf_size];
648 char *_str;
649
650 void calc_str();
651
652 coll_t(type_t t, spg_t p, uint64_t r)
653 : type(t), pgid(p), removal_seq(r) {
654 calc_str();
655 }
656
657public:
658 coll_t() : type(TYPE_META), removal_seq(0)
659 {
660 calc_str();
661 }
662
663 coll_t(const coll_t& other)
664 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
665 calc_str();
666 }
667
668 explicit coll_t(spg_t pgid)
669 : type(TYPE_PG), pgid(pgid), removal_seq(0)
670 {
671 calc_str();
672 }
673
674 coll_t& operator=(const coll_t& rhs)
675 {
676 this->type = rhs.type;
677 this->pgid = rhs.pgid;
678 this->removal_seq = rhs.removal_seq;
679 this->calc_str();
680 return *this;
681 }
682
683 // named constructors
684 static coll_t meta() {
685 return coll_t();
686 }
687 static coll_t pg(spg_t p) {
688 return coll_t(p);
689 }
690
691 const std::string to_str() const {
692 return string(_str);
693 }
694 const char *c_str() const {
695 return _str;
696 }
697
698 bool parse(const std::string& s);
699
700 int operator<(const coll_t &rhs) const {
701 return type < rhs.type ||
702 (type == rhs.type && pgid < rhs.pgid);
703 }
704
705 bool is_meta() const {
706 return type == TYPE_META;
707 }
708 bool is_pg_prefix(spg_t *pgid_) const {
709 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
710 *pgid_ = pgid;
711 return true;
712 }
713 return false;
714 }
715 bool is_pg() const {
716 return type == TYPE_PG;
717 }
718 bool is_pg(spg_t *pgid_) const {
719 if (type == TYPE_PG) {
720 *pgid_ = pgid;
721 return true;
722 }
723 return false;
724 }
725 bool is_temp() const {
726 return type == TYPE_PG_TEMP;
727 }
728 bool is_temp(spg_t *pgid_) const {
729 if (type == TYPE_PG_TEMP) {
730 *pgid_ = pgid;
731 return true;
732 }
733 return false;
734 }
735
736 void encode(bufferlist& bl) const;
11fdf7f2 737 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
738 size_t encoded_size() const;
739
740 inline bool operator==(const coll_t& rhs) const {
 741 // for meta collections only the type matters; otherwise compare pgid too
742 if (type != rhs.type)
743 return false;
744 if (type == TYPE_META)
745 return true;
746 return type == rhs.type && pgid == rhs.pgid;
747 }
748 inline bool operator!=(const coll_t& rhs) const {
749 return !(*this == rhs);
750 }
751
752 // get a TEMP collection that corresponds to the current collection,
753 // which we presume is a pg collection.
754 coll_t get_temp() const {
11fdf7f2 755 ceph_assert(type == TYPE_PG);
7c673cae
FG
756 return coll_t(TYPE_PG_TEMP, pgid, 0);
757 }
758
759 ghobject_t get_min_hobj() const {
760 ghobject_t o;
761 switch (type) {
762 case TYPE_PG:
763 o.hobj.pool = pgid.pool();
764 o.set_shard(pgid.shard);
765 break;
766 case TYPE_META:
767 o.hobj.pool = -1;
768 break;
769 default:
770 break;
771 }
772 return o;
773 }
774
775 unsigned hash_to_shard(unsigned num_shards) const {
776 if (type == TYPE_PG)
777 return pgid.hash_to_shard(num_shards);
778 return 0; // whatever.
779 }
780
781 void dump(Formatter *f) const;
782 static void generate_test_instances(list<coll_t*>& o);
783};
784
785WRITE_CLASS_ENCODER(coll_t)
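// Illustrative example (editor's note, not upstream): the string produced by
// calc_str() is the collection name seen in the object store, e.g.
// coll_t::meta() -> "meta", coll_t(spg_t(pg_t(0, 1))) -> "1.0_head", and the
// matching get_temp() collection -> "1.0_TEMP"; a sharded pg would read "1.0s2_head".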
786
787inline ostream& operator<<(ostream& out, const coll_t& c) {
788 out << c.to_str();
789 return out;
790}
791
792namespace std {
793 template<> struct hash<coll_t> {
794 size_t operator()(const coll_t &c) const {
795 size_t h = 0;
796 string str(c.to_str());
797 std::string::const_iterator end(str.end());
798 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
799 h += *s;
800 h += (h << 10);
801 h ^= (h >> 6);
802 }
803 h += (h << 3);
804 h ^= (h >> 11);
805 h += (h << 15);
806 return h;
807 }
808 };
809} // namespace std
810
811inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
812{
813 out << pg_t(ol.ol_pgid);
814 int su = ol.ol_stripe_unit;
815 if (su)
816 out << ".su=" << su;
817 return out;
818}
819
820
821
822// compound rados version type
 823/* WARNING: if you add a member to eversion_t, please make sure the encode/decode
 824 * functions still work. On little-endian machines we rely on there being no
 825 * padding, on both 32-bit and 64-bit machines.
826 */
827class eversion_t {
828public:
829 version_t version;
830 epoch_t epoch;
831 __u32 __pad;
832 eversion_t() : version(0), epoch(0), __pad(0) {}
833 eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
834
835 // cppcheck-suppress noExplicitConstructor
c07f9fc5 836 eversion_t(const ceph_eversion& ce) :
7c673cae
FG
837 version(ce.version),
838 epoch(ce.epoch),
839 __pad(0) { }
840
841 explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }
842
11fdf7f2
TL
843 static const eversion_t& max() {
844 static const eversion_t max(-1,-1);
7c673cae
FG
845 return max;
846 }
847
848 operator ceph_eversion() {
849 ceph_eversion c;
850 c.epoch = epoch;
851 c.version = version;
852 return c;
853 }
854
855 string get_key_name() const;
856
11fdf7f2
TL
857 // key must point to the beginning of a block of 32 chars
858 inline void get_key_name(char* key) const {
859 // Below is equivalent of sprintf("%010u.%020llu");
860 key[31] = 0;
861 ritoa<uint64_t, 10, 20>(version, key + 31);
862 key[10] = '.';
863 ritoa<uint32_t, 10, 10>(epoch, key + 10);
864 }
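  // Illustrative example (editor's note, not upstream): eversion_t(5, 42) fills
  // key with "0000000005.00000000000000000042" (10 epoch digits, '.', 20 version
  // digits, NUL), which sorts lexicographically in (epoch, version) order.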
865
7c673cae
FG
866 void encode(bufferlist &bl) const {
867#if defined(CEPH_LITTLE_ENDIAN)
868 bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
869#else
11fdf7f2
TL
870 using ceph::encode;
871 encode(version, bl);
872 encode(epoch, bl);
7c673cae
FG
873#endif
874 }
11fdf7f2 875 void decode(bufferlist::const_iterator &bl) {
7c673cae
FG
876#if defined(CEPH_LITTLE_ENDIAN)
877 bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
878#else
11fdf7f2
TL
879 using ceph::decode;
880 decode(version, bl);
881 decode(epoch, bl);
7c673cae
FG
882#endif
883 }
884 void decode(bufferlist& bl) {
11fdf7f2 885 auto p = std::cbegin(bl);
7c673cae
FG
886 decode(p);
887 }
888};
889WRITE_CLASS_ENCODER(eversion_t)
890
891inline bool operator==(const eversion_t& l, const eversion_t& r) {
892 return (l.epoch == r.epoch) && (l.version == r.version);
893}
894inline bool operator!=(const eversion_t& l, const eversion_t& r) {
895 return (l.epoch != r.epoch) || (l.version != r.version);
896}
897inline bool operator<(const eversion_t& l, const eversion_t& r) {
898 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
899}
900inline bool operator<=(const eversion_t& l, const eversion_t& r) {
901 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
902}
903inline bool operator>(const eversion_t& l, const eversion_t& r) {
904 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
905}
906inline bool operator>=(const eversion_t& l, const eversion_t& r) {
907 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
908}
909inline ostream& operator<<(ostream& out, const eversion_t& e) {
910 return out << e.epoch << "'" << e.version;
911}
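// Illustrative note (editor's addition, not upstream): the same eversion_t(5, 42)
// prints as "5'42" via the operator above, the epoch'version notation used in
// pg logs and status output.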
912
913/**
914 * objectstore_perf_stat_t
915 *
916 * current perf information about the osd
917 */
918struct objectstore_perf_stat_t {
11fdf7f2
TL
919 // cur_op_latency is in ns since double add/sub are not associative
920 uint64_t os_commit_latency_ns;
921 uint64_t os_apply_latency_ns;
7c673cae
FG
922
923 objectstore_perf_stat_t() :
11fdf7f2 924 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
7c673cae
FG
925
926 bool operator==(const objectstore_perf_stat_t &r) const {
11fdf7f2
TL
927 return os_commit_latency_ns == r.os_commit_latency_ns &&
928 os_apply_latency_ns == r.os_apply_latency_ns;
7c673cae
FG
929 }
930
931 void add(const objectstore_perf_stat_t &o) {
11fdf7f2
TL
932 os_commit_latency_ns += o.os_commit_latency_ns;
933 os_apply_latency_ns += o.os_apply_latency_ns;
7c673cae
FG
934 }
935 void sub(const objectstore_perf_stat_t &o) {
11fdf7f2
TL
936 os_commit_latency_ns -= o.os_commit_latency_ns;
937 os_apply_latency_ns -= o.os_apply_latency_ns;
7c673cae
FG
938 }
939 void dump(Formatter *f) const;
11fdf7f2
TL
940 void encode(bufferlist &bl, uint64_t features) const;
941 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
942 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
943};
11fdf7f2 944WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
7c673cae
FG
945
946/*
947 * pg states
948 */
11fdf7f2
TL
949#define PG_STATE_CREATING (1ULL << 0) // creating
950#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
951#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
952#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
953#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
954#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
 955#define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
956#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
957//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
958#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
959#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
960#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
961#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
962#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
963#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
964#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
965#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
 966#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to OSDs other than those CRUSH would choose
967#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
968#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
969#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
970#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
971#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
972#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
973#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
974#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
975#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
976#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
977#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
978#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
979#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
980#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
981
982std::string pg_state_string(uint64_t state);
7c673cae 983std::string pg_vector_string(const vector<int32_t> &a);
3efd9988 984boost::optional<uint64_t> pg_string_state(const std::string& state);
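// Illustrative example (editor's note, not upstream): the PG_STATE_* values are
// bits in a single mask, so a pg that is active, clean and deep-scrubbing carries
//
//   uint64_t s = PG_STATE_ACTIVE | PG_STATE_CLEAN | PG_STATE_SCRUBBING | PG_STATE_DEEP_SCRUB;
//
// and pg_state_string(s) joins the individual state names with '+', while
// pg_string_state("active") maps one name back to its bit (boost::none if unknown).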
7c673cae
FG
985
986
987/*
988 * pool_snap_info_t
989 *
990 * attributes for a single pool snapshot.
991 */
992struct pool_snap_info_t {
993 snapid_t snapid;
994 utime_t stamp;
995 string name;
996
997 void dump(Formatter *f) const;
998 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 999 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1000 static void generate_test_instances(list<pool_snap_info_t*>& o);
1001};
1002WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1003
1004inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
1005 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1006}
1007
1008
1009/*
1010 * pool_opts_t
1011 *
1012 * pool options.
1013 */
1014
1015class pool_opts_t {
1016public:
1017 enum key_t {
1018 SCRUB_MIN_INTERVAL,
1019 SCRUB_MAX_INTERVAL,
1020 DEEP_SCRUB_INTERVAL,
1021 RECOVERY_PRIORITY,
1022 RECOVERY_OP_PRIORITY,
1023 SCRUB_PRIORITY,
1024 COMPRESSION_MODE,
1025 COMPRESSION_ALGORITHM,
1026 COMPRESSION_REQUIRED_RATIO,
1027 COMPRESSION_MAX_BLOB_SIZE,
1028 COMPRESSION_MIN_BLOB_SIZE,
1029 CSUM_TYPE,
1030 CSUM_MAX_BLOCK,
1031 CSUM_MIN_BLOCK,
11fdf7f2
TL
1032 FINGERPRINT_ALGORITHM,
1033 PG_NUM_MIN, // min pg_num
1034 TARGET_SIZE_BYTES, // total bytes in pool
1035 TARGET_SIZE_RATIO, // fraction of total cluster
1036 PG_AUTOSCALE_BIAS,
7c673cae
FG
1037 };
1038
1039 enum type_t {
1040 STR,
1041 INT,
1042 DOUBLE,
1043 };
1044
1045 struct opt_desc_t {
1046 key_t key;
1047 type_t type;
1048
1049 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1050
1051 bool operator==(const opt_desc_t& rhs) const {
1052 return key == rhs.key && type == rhs.type;
1053 }
1054 };
1055
11fdf7f2 1056 typedef boost::variant<std::string,int64_t,double> value_t;
7c673cae
FG
1057
1058 static bool is_opt_name(const std::string& name);
1059 static opt_desc_t get_opt_desc(const std::string& name);
1060
1061 pool_opts_t() : opts() {}
1062
1063 bool is_set(key_t key) const;
1064
1065 template<typename T>
1066 void set(key_t key, const T &val) {
1067 value_t value = val;
1068 opts[key] = value;
1069 }
1070
1071 template<typename T>
1072 bool get(key_t key, T *val) const {
1073 opts_t::const_iterator i = opts.find(key);
1074 if (i == opts.end()) {
1075 return false;
1076 }
1077 *val = boost::get<T>(i->second);
1078 return true;
1079 }
1080
1081 const value_t& get(key_t key) const;
1082
1083 bool unset(key_t key);
1084
1085 void dump(const std::string& name, Formatter *f) const;
1086
1087 void dump(Formatter *f) const;
11fdf7f2
TL
1088 void encode(bufferlist &bl, uint64_t features) const;
1089 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
1090
1091private:
1092 typedef std::map<key_t, value_t> opts_t;
1093 opts_t opts;
1094
1095 friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
1096};
11fdf7f2
TL
1097WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
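// Illustrative example (editor's note, not upstream): every key has a fixed
// value type (STR, INT or DOUBLE) and the typed get() must be called with the
// matching C++ type, since values sit in a boost::variant:
//
//   pool_opts_t opts;
//   opts.set(pool_opts_t::RECOVERY_PRIORITY, static_cast<int64_t>(5));
//   int64_t prio = 0;
//   if (opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio)) {
//     // prio == 5; asking with the wrong type would throw boost::bad_get
//   }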
1098
1099struct pg_merge_meta_t {
1100 pg_t source_pgid;
1101 epoch_t ready_epoch = 0;
1102 epoch_t last_epoch_started = 0;
1103 epoch_t last_epoch_clean = 0;
1104 eversion_t source_version;
1105 eversion_t target_version;
1106
1107 void encode(bufferlist& bl) const {
1108 ENCODE_START(1, 1, bl);
1109 encode(source_pgid, bl);
1110 encode(ready_epoch, bl);
1111 encode(last_epoch_started, bl);
1112 encode(last_epoch_clean, bl);
1113 encode(source_version, bl);
1114 encode(target_version, bl);
1115 ENCODE_FINISH(bl);
1116 }
1117 void decode(bufferlist::const_iterator& p) {
1118 DECODE_START(1, p);
1119 decode(source_pgid, p);
1120 decode(ready_epoch, p);
1121 decode(last_epoch_started, p);
1122 decode(last_epoch_clean, p);
1123 decode(source_version, p);
1124 decode(target_version, p);
1125 DECODE_FINISH(p);
1126 }
1127 void dump(Formatter *f) const {
1128 f->dump_stream("source_pgid") << source_pgid;
1129 f->dump_unsigned("ready_epoch", ready_epoch);
1130 f->dump_unsigned("last_epoch_started", last_epoch_started);
1131 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
1132 f->dump_stream("source_version") << source_version;
1133 f->dump_stream("target_version") << target_version;
1134 }
1135};
1136WRITE_CLASS_ENCODER(pg_merge_meta_t)
7c673cae
FG
1137
1138/*
1139 * pg_pool
1140 */
1141struct pg_pool_t {
c07f9fc5
FG
1142 static const char *APPLICATION_NAME_CEPHFS;
1143 static const char *APPLICATION_NAME_RBD;
1144 static const char *APPLICATION_NAME_RGW;
1145
7c673cae
FG
1146 enum {
1147 TYPE_REPLICATED = 1, // replication
1148 //TYPE_RAID4 = 2, // raid4 (never implemented)
1149 TYPE_ERASURE = 3, // erasure-coded
1150 };
11fdf7f2 1151 static std::string_view get_type_name(int t) {
7c673cae
FG
1152 switch (t) {
1153 case TYPE_REPLICATED: return "replicated";
1154 //case TYPE_RAID4: return "raid4";
1155 case TYPE_ERASURE: return "erasure";
1156 default: return "???";
1157 }
1158 }
11fdf7f2 1159 std::string_view get_type_name() const {
7c673cae
FG
1160 return get_type_name(type);
1161 }
7c673cae
FG
1162
1163 enum {
1164 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1165 FLAG_FULL = 1<<1, // pool is full
1166 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1167 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1168 FLAG_NODELETE = 1<<4, // pool can't be deleted
1169 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1170 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1171 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1172 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1173 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
11fdf7f2 1174 FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
3efd9988
FG
1175 FLAG_NEARFULL = 1<<11, // pool is nearfull
1176 FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
11fdf7f2
TL
1177 FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
1178 FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
1179 FLAG_CREATING = 1<<15, // initial pool PGs are being created
7c673cae
FG
1180 };
1181
1182 static const char *get_flag_name(int f) {
1183 switch (f) {
1184 case FLAG_HASHPSPOOL: return "hashpspool";
1185 case FLAG_FULL: return "full";
1186 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1187 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1188 case FLAG_NODELETE: return "nodelete";
1189 case FLAG_NOPGCHANGE: return "nopgchange";
1190 case FLAG_NOSIZECHANGE: return "nosizechange";
1191 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1192 case FLAG_NOSCRUB: return "noscrub";
1193 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
11fdf7f2 1194 case FLAG_FULL_QUOTA: return "full_quota";
3efd9988
FG
1195 case FLAG_NEARFULL: return "nearfull";
1196 case FLAG_BACKFILLFULL: return "backfillfull";
11fdf7f2
TL
1197 case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
1198 case FLAG_POOL_SNAPS: return "pool_snaps";
1199 case FLAG_CREATING: return "creating";
7c673cae
FG
1200 default: return "???";
1201 }
1202 }
1203 static string get_flags_string(uint64_t f) {
1204 string s;
1205 for (unsigned n=0; f && n<64; ++n) {
1206 if (f & (1ull << n)) {
1207 if (s.length())
1208 s += ",";
1209 s += get_flag_name(1ull << n);
1210 }
1211 }
1212 return s;
1213 }
1214 string get_flags_string() const {
1215 return get_flags_string(flags);
1216 }
1217 static uint64_t get_flag_by_name(const string& name) {
1218 if (name == "hashpspool")
1219 return FLAG_HASHPSPOOL;
1220 if (name == "full")
1221 return FLAG_FULL;
1222 if (name == "ec_overwrites")
1223 return FLAG_EC_OVERWRITES;
1224 if (name == "incomplete_clones")
1225 return FLAG_INCOMPLETE_CLONES;
1226 if (name == "nodelete")
1227 return FLAG_NODELETE;
1228 if (name == "nopgchange")
1229 return FLAG_NOPGCHANGE;
1230 if (name == "nosizechange")
1231 return FLAG_NOSIZECHANGE;
1232 if (name == "write_fadvise_dontneed")
1233 return FLAG_WRITE_FADVISE_DONTNEED;
1234 if (name == "noscrub")
1235 return FLAG_NOSCRUB;
1236 if (name == "nodeep-scrub")
1237 return FLAG_NODEEP_SCRUB;
11fdf7f2
TL
1238 if (name == "full_quota")
1239 return FLAG_FULL_QUOTA;
3efd9988
FG
1240 if (name == "nearfull")
1241 return FLAG_NEARFULL;
1242 if (name == "backfillfull")
1243 return FLAG_BACKFILLFULL;
11fdf7f2
TL
1244 if (name == "selfmanaged_snaps")
1245 return FLAG_SELFMANAGED_SNAPS;
1246 if (name == "pool_snaps")
1247 return FLAG_POOL_SNAPS;
1248 if (name == "creating")
1249 return FLAG_CREATING;
7c673cae
FG
1250 return 0;
1251 }
1252
1253 /// converts the acting/up vector to a set of pg shards
1254 void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
1255
1256 typedef enum {
1257 CACHEMODE_NONE = 0, ///< no caching
1258 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1259 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1260 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1261 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
1262 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
1263 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1264 } cache_mode_t;
1265 static const char *get_cache_mode_name(cache_mode_t m) {
1266 switch (m) {
1267 case CACHEMODE_NONE: return "none";
1268 case CACHEMODE_WRITEBACK: return "writeback";
1269 case CACHEMODE_FORWARD: return "forward";
1270 case CACHEMODE_READONLY: return "readonly";
1271 case CACHEMODE_READFORWARD: return "readforward";
1272 case CACHEMODE_READPROXY: return "readproxy";
1273 case CACHEMODE_PROXY: return "proxy";
1274 default: return "unknown";
1275 }
1276 }
1277 static cache_mode_t get_cache_mode_from_str(const string& s) {
1278 if (s == "none")
1279 return CACHEMODE_NONE;
1280 if (s == "writeback")
1281 return CACHEMODE_WRITEBACK;
1282 if (s == "forward")
1283 return CACHEMODE_FORWARD;
1284 if (s == "readonly")
1285 return CACHEMODE_READONLY;
1286 if (s == "readforward")
1287 return CACHEMODE_READFORWARD;
1288 if (s == "readproxy")
1289 return CACHEMODE_READPROXY;
1290 if (s == "proxy")
1291 return CACHEMODE_PROXY;
1292 return (cache_mode_t)-1;
1293 }
1294 const char *get_cache_mode_name() const {
1295 return get_cache_mode_name(cache_mode);
1296 }
1297 bool cache_mode_requires_hit_set() const {
1298 switch (cache_mode) {
1299 case CACHEMODE_NONE:
1300 case CACHEMODE_FORWARD:
1301 case CACHEMODE_READONLY:
1302 case CACHEMODE_PROXY:
1303 return false;
1304 case CACHEMODE_WRITEBACK:
1305 case CACHEMODE_READFORWARD:
1306 case CACHEMODE_READPROXY:
1307 return true;
1308 default:
11fdf7f2
TL
1309 ceph_abort_msg("implement me");
1310 }
1311 }
1312
1313 enum {
1314 PG_AUTOSCALE_MODE_OFF = 0,
1315 PG_AUTOSCALE_MODE_WARN = 1,
1316 PG_AUTOSCALE_MODE_ON = 2,
1317 };
1318 static const char *get_pg_autoscale_mode_name(int m) {
1319 switch (m) {
1320 case PG_AUTOSCALE_MODE_OFF: return "off";
1321 case PG_AUTOSCALE_MODE_ON: return "on";
1322 case PG_AUTOSCALE_MODE_WARN: return "warn";
1323 default: return "???";
1324 }
1325 }
1326 static int get_pg_autoscale_mode_by_name(const string& m) {
1327 if (m == "off") {
1328 return PG_AUTOSCALE_MODE_OFF;
1329 }
1330 if (m == "warn") {
1331 return PG_AUTOSCALE_MODE_WARN;
1332 }
1333 if (m == "on") {
1334 return PG_AUTOSCALE_MODE_ON;
7c673cae 1335 }
11fdf7f2 1336 return -1;
7c673cae
FG
1337 }
1338
11fdf7f2 1339 utime_t create_time;
7c673cae
FG
1340 uint64_t flags; ///< FLAG_*
1341 __u8 type; ///< TYPE_*
1342 __u8 size, min_size; ///< number of osds in each pg
31f18b77 1343 __u8 crush_rule; ///< crush placement rule
7c673cae 1344 __u8 object_hash; ///< hash mapping object name to ps
11fdf7f2 1345 __u8 pg_autoscale_mode; ///< PG_AUTOSCALE_MODE_
7c673cae 1346private:
11fdf7f2
TL
1347 __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
1348 __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
1349 __u32 pg_num_target = 0; ///< pg_num we should converge toward
1350 __u32 pgp_num_target = 0; ///< pgp_num we should converge toward
7c673cae
FG
1351
1352public:
1353 map<string,string> properties; ///< OBSOLETE
1354 string erasure_code_profile; ///< name of the erasure code profile in OSDMap
 1355 epoch_t last_change; ///< most recent epoch changed, excluding snapshot changes
11fdf7f2
TL
1356
1357 /// last epoch that forced clients to resend
1358 epoch_t last_force_op_resend = 0;
1359 /// last epoch that forced clients to resend (pre-nautilus clients only)
1360 epoch_t last_force_op_resend_prenautilus = 0;
7c673cae 1361 /// last epoch that forced clients to resend (pre-luminous clients only)
11fdf7f2
TL
1362 epoch_t last_force_op_resend_preluminous = 0;
1363
1364 /// metadata for the most recent PG merge
1365 pg_merge_meta_t last_pg_merge_meta;
1366
7c673cae
FG
1367 snapid_t snap_seq; ///< seq for per-pool snapshot
1368 epoch_t snap_epoch; ///< osdmap epoch of last snap
1369 uint64_t auid; ///< who owns the pg
7c673cae
FG
1370
1371 uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
1372 uint64_t quota_max_objects; ///< maximum number of objects for this pool
1373
1374 /*
1375 * Pool snaps (global to this pool). These define a SnapContext for
1376 * the pool, unless the client manually specifies an alternate
1377 * context.
1378 */
1379 map<snapid_t, pool_snap_info_t> snaps;
1380 /*
1381 * Alternatively, if we are defining non-pool snaps (e.g. via the
1382 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1383 * used). Snaps and removed_snaps are to be used exclusive of each
1384 * other!
1385 */
1386 interval_set<snapid_t> removed_snaps;
1387
1388 unsigned pg_num_mask, pgp_num_mask;
1389
1390 set<uint64_t> tiers; ///< pools that are tiers of us
1391 int64_t tier_of; ///< pool for which we are a tier
1392 // Note that write wins for read+write ops
1393 int64_t read_tier; ///< pool/tier for objecter to direct reads to
1394 int64_t write_tier; ///< pool/tier for objecter to direct writes to
1395 cache_mode_t cache_mode; ///< cache pool mode
1396
1397 bool is_tier() const { return tier_of >= 0; }
1398 bool has_tiers() const { return !tiers.empty(); }
1399 void clear_tier() {
1400 tier_of = -1;
1401 clear_read_tier();
1402 clear_write_tier();
1403 clear_tier_tunables();
1404 }
1405 bool has_read_tier() const { return read_tier >= 0; }
1406 void clear_read_tier() { read_tier = -1; }
1407 bool has_write_tier() const { return write_tier >= 0; }
1408 void clear_write_tier() { write_tier = -1; }
1409 void clear_tier_tunables() {
1410 if (cache_mode != CACHEMODE_NONE)
1411 flags |= FLAG_INCOMPLETE_CLONES;
1412 cache_mode = CACHEMODE_NONE;
1413
1414 target_max_bytes = 0;
1415 target_max_objects = 0;
1416 cache_target_dirty_ratio_micro = 0;
1417 cache_target_dirty_high_ratio_micro = 0;
1418 cache_target_full_ratio_micro = 0;
1419 hit_set_params = HitSet::Params();
1420 hit_set_period = 0;
1421 hit_set_count = 0;
1422 hit_set_grade_decay_rate = 0;
1423 hit_set_search_last_n = 0;
1424 grade_table.resize(0);
1425 }
1426
1427 uint64_t target_max_bytes; ///< tiering: target max pool size
1428 uint64_t target_max_objects; ///< tiering: target max pool size
1429
1430 uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
11fdf7f2 1431 uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush with high speed
7c673cae
FG
1432 uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest
1433
1434 uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush
1435 uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict
1436
1437 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1438 uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
1439 uint32_t hit_set_count; ///< number of periods to retain
1440 bool use_gmt_hitset; ///< use gmt to name the hitset archive object
1441 uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
1442 uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
1443 uint32_t hit_set_grade_decay_rate; ///< current hit_set has highest priority on objects
11fdf7f2
TL
 1444 ///< temperature count; each subsequent (older) hit_set's priority
 1445 ///< decays by this percentage relative to the previous one
 1446 uint32_t hit_set_search_last_n; ///< accumulate at most N hit_sets for temperature
7c673cae
FG
1447
1448 uint32_t stripe_width; ///< erasure coded stripe size in bytes
1449
 1450 uint64_t expected_num_objects; ///< expected number of objects for this pool; a value of 0
 1451 ///< means the user did not specify an expected value
1452 bool fast_read; ///< whether turn on fast read on the pool or not
1453
1454 pool_opts_t opts; ///< options
1455
11fdf7f2
TL
1456 typedef enum {
1457 TYPE_FINGERPRINT_NONE = 0,
1458 TYPE_FINGERPRINT_SHA1 = 1,
1459 } fingerprint_t;
1460 static fingerprint_t get_fingerprint_from_str(const string& s) {
1461 if (s == "none")
1462 return TYPE_FINGERPRINT_NONE;
1463 if (s == "sha1")
1464 return TYPE_FINGERPRINT_SHA1;
1465 return (fingerprint_t)-1;
1466 }
1467 const fingerprint_t get_fingerprint_type() const {
1468 string fp_str;
1469 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1470 return get_fingerprint_from_str(fp_str);
1471 }
1472 const char *get_fingerprint_name() const {
1473 string fp_str;
1474 fingerprint_t fp_t;
1475 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1476 fp_t = get_fingerprint_from_str(fp_str);
1477 return get_fingerprint_name(fp_t);
1478 }
1479 static const char *get_fingerprint_name(fingerprint_t m) {
1480 switch (m) {
1481 case TYPE_FINGERPRINT_NONE: return "none";
1482 case TYPE_FINGERPRINT_SHA1: return "sha1";
1483 default: return "unknown";
1484 }
1485 }
1486
c07f9fc5
FG
1487 /// application -> key/value metadata
1488 map<string, std::map<string, string>> application_metadata;
1489
7c673cae
FG
1490private:
1491 vector<uint32_t> grade_table;
1492
1493public:
1494 uint32_t get_grade(unsigned i) const {
1495 if (grade_table.size() <= i)
1496 return 0;
1497 return grade_table[i];
1498 }
1499 void calc_grade_table() {
1500 unsigned v = 1000000;
1501 grade_table.resize(hit_set_count);
1502 for (unsigned i = 0; i < hit_set_count; i++) {
1503 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1504 grade_table[i] = v;
1505 }
1506 }
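  // Illustrative example (editor's note, not upstream): with
  // hit_set_grade_decay_rate = 20 and hit_set_count = 3 the loop above yields
  // grade_table = {800000, 640000, 512000}; each older HitSet is weighted at
  // 80% of the one before it, starting from 1000000.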
1507
1508 pg_pool_t()
1509 : flags(0), type(0), size(0), min_size(0),
31f18b77 1510 crush_rule(0), object_hash(0),
7c673cae 1511 last_change(0),
7c673cae
FG
1512 snap_seq(0), snap_epoch(0),
1513 auid(0),
7c673cae
FG
1514 quota_max_bytes(0), quota_max_objects(0),
1515 pg_num_mask(0), pgp_num_mask(0),
1516 tier_of(-1), read_tier(-1), write_tier(-1),
1517 cache_mode(CACHEMODE_NONE),
1518 target_max_bytes(0), target_max_objects(0),
1519 cache_target_dirty_ratio_micro(0),
1520 cache_target_dirty_high_ratio_micro(0),
1521 cache_target_full_ratio_micro(0),
1522 cache_min_flush_age(0),
1523 cache_min_evict_age(0),
1524 hit_set_params(),
1525 hit_set_period(0),
1526 hit_set_count(0),
1527 use_gmt_hitset(true),
1528 min_read_recency_for_promote(0),
1529 min_write_recency_for_promote(0),
1530 hit_set_grade_decay_rate(0),
1531 hit_set_search_last_n(0),
1532 stripe_width(0),
1533 expected_num_objects(0),
1534 fast_read(false),
1535 opts()
1536 { }
1537
1538 void dump(Formatter *f) const;
1539
11fdf7f2 1540 const utime_t &get_create_time() const { return create_time; }
7c673cae
FG
1541 uint64_t get_flags() const { return flags; }
1542 bool has_flag(uint64_t f) const { return flags & f; }
1543 void set_flag(uint64_t f) { flags |= f; }
1544 void unset_flag(uint64_t f) { flags &= ~f; }
1545
7c673cae 1546 bool require_rollback() const {
11fdf7f2 1547 return is_erasure();
7c673cae
FG
1548 }
1549
1550 /// true if incomplete clones may be present
1551 bool allow_incomplete_clones() const {
1552 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1553 }
1554
1555 unsigned get_type() const { return type; }
1556 unsigned get_size() const { return size; }
1557 unsigned get_min_size() const { return min_size; }
31f18b77 1558 int get_crush_rule() const { return crush_rule; }
7c673cae
FG
1559 int get_object_hash() const { return object_hash; }
1560 const char *get_object_hash_name() const {
1561 return ceph_str_hash_name(get_object_hash());
1562 }
1563 epoch_t get_last_change() const { return last_change; }
1564 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
11fdf7f2
TL
1565 epoch_t get_last_force_op_resend_prenautilus() const {
1566 return last_force_op_resend_prenautilus;
1567 }
7c673cae
FG
1568 epoch_t get_last_force_op_resend_preluminous() const {
1569 return last_force_op_resend_preluminous;
1570 }
1571 epoch_t get_snap_epoch() const { return snap_epoch; }
1572 snapid_t get_snap_seq() const { return snap_seq; }
1573 uint64_t get_auid() const { return auid; }
7c673cae
FG
1574
1575 void set_snap_seq(snapid_t s) { snap_seq = s; }
1576 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1577
1578 void set_stripe_width(uint32_t s) { stripe_width = s; }
1579 uint32_t get_stripe_width() const { return stripe_width; }
1580
1581 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1582 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1583
1584 bool supports_omap() const {
1585 return !(get_type() == TYPE_ERASURE);
1586 }
1587
1588 bool requires_aligned_append() const {
1589 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1590 }
1591 uint64_t required_alignment() const { return stripe_width; }
1592
1593 bool allows_ecoverwrites() const {
1594 return has_flag(FLAG_EC_OVERWRITES);
1595 }
1596
1597 bool can_shift_osds() const {
1598 switch (get_type()) {
1599 case TYPE_REPLICATED:
1600 return true;
1601 case TYPE_ERASURE:
1602 return false;
1603 default:
11fdf7f2 1604 ceph_abort_msg("unhandled pool type");
7c673cae
FG
1605 }
1606 }
1607
1608 unsigned get_pg_num() const { return pg_num; }
1609 unsigned get_pgp_num() const { return pgp_num; }
11fdf7f2
TL
1610 unsigned get_pg_num_target() const { return pg_num_target; }
1611 unsigned get_pgp_num_target() const { return pgp_num_target; }
1612 unsigned get_pg_num_pending() const { return pg_num_pending; }
7c673cae
FG
1613
1614 unsigned get_pg_num_mask() const { return pg_num_mask; }
1615 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1616
 1617 // if pg_num is not a power of two, pgs are not equally sized.
1618 // return, for a given pg, the fraction (denominator) of the total
1619 // pool size that it represents.
1620 unsigned get_pg_num_divisor(pg_t pgid) const;
1621
11fdf7f2
TL
1622 bool is_pending_merge(pg_t pgid, bool *target) const;
1623
7c673cae
FG
1624 void set_pg_num(int p) {
1625 pg_num = p;
11fdf7f2 1626 pg_num_pending = p;
7c673cae
FG
1627 calc_pg_masks();
1628 }
1629 void set_pgp_num(int p) {
1630 pgp_num = p;
1631 calc_pg_masks();
1632 }
11fdf7f2
TL
1633 void set_pg_num_pending(int p) {
1634 pg_num_pending = p;
1635 calc_pg_masks();
1636 }
1637 void set_pg_num_target(int p) {
1638 pg_num_target = p;
1639 }
1640 void set_pgp_num_target(int p) {
1641 pgp_num_target = p;
1642 }
1643 void dec_pg_num(pg_t source_pgid,
1644 epoch_t ready_epoch,
1645 eversion_t source_version,
1646 eversion_t target_version,
1647 epoch_t last_epoch_started,
1648 epoch_t last_epoch_clean) {
1649 --pg_num;
1650 last_pg_merge_meta.source_pgid = source_pgid;
1651 last_pg_merge_meta.ready_epoch = ready_epoch;
1652 last_pg_merge_meta.source_version = source_version;
1653 last_pg_merge_meta.target_version = target_version;
1654 last_pg_merge_meta.last_epoch_started = last_epoch_started;
1655 last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
1656 calc_pg_masks();
1657 }
7c673cae
FG
1658
1659 void set_quota_max_bytes(uint64_t m) {
1660 quota_max_bytes = m;
1661 }
1662 uint64_t get_quota_max_bytes() {
1663 return quota_max_bytes;
1664 }
1665
1666 void set_quota_max_objects(uint64_t m) {
1667 quota_max_objects = m;
1668 }
1669 uint64_t get_quota_max_objects() {
1670 return quota_max_objects;
1671 }
1672
1673 void set_last_force_op_resend(uint64_t t) {
1674 last_force_op_resend = t;
11fdf7f2 1675 last_force_op_resend_prenautilus = t;
1676 last_force_op_resend_preluminous = t;
1677 }
1678
1679 void calc_pg_masks();
1680
1681 /*
1682 * we have two snap modes:
1683 * - pool global snaps
1684 * - snap existence/non-existence defined by snaps[] and snap_seq
1685 * - user managed snaps
1686 * - removal governed by removed_snaps
1687 *
1688 * we know which mode we're using based on whether removed_snaps is empty.
1689 * If nothing has been created, both functions report false.
1690 */
1691 bool is_pool_snaps_mode() const;
1692 bool is_unmanaged_snaps_mode() const;
1693 bool is_removed_snap(snapid_t s) const;
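  // Illustrative sketch (editorial addition): the two modes are mutually
  // exclusive for a given pool.  'p' is a hypothetical pg_pool_t.
  //
  //   pg_pool_t p;
  //   p.add_snap("nightly", ceph_clock_now());   // -> pool snaps mode
  //   // ... or, for self-managed (e.g. RBD) snapshots instead:
  //   // uint64_t snapid; p.add_unmanaged_snap(snapid);
  //   bool pool_mode = p.is_pool_snaps_mode();       // true in the first case
  //   bool unmanaged = p.is_unmanaged_snaps_mode();  // true in the second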
1694
1695 /*
1696 * build set of known-removed sets from either pool snaps or
1697 * explicit removed_snaps set.
1698 */
1699 void build_removed_snaps(interval_set<snapid_t>& rs) const;
91327a77 1700 bool maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const;
1701 snapid_t snap_exists(const char *s) const;
1702 void add_snap(const char *n, utime_t stamp);
1703 void add_unmanaged_snap(uint64_t& snapid);
1704 void remove_snap(snapid_t s);
1705 void remove_unmanaged_snap(snapid_t s);
1706
1707 SnapContext get_snap_context() const;
1708
 1709 /// hash an object name+namespace key to a hash position
1710 uint32_t hash_key(const string& key, const string& ns) const;
1711
1712 /// round a hash position down to a pg num
1713 uint32_t raw_hash_to_pg(uint32_t v) const;
1714
1715 /*
1716 * map a raw pg (with full precision ps) into an actual pg, for storage
1717 */
1718 pg_t raw_pg_to_pg(pg_t pg) const;
1719
1720 /*
1721 * map raw pg (full precision ps) into a placement seed. include
1722 * pool id in that value so that different pools don't use the same
1723 * seeds.
1724 */
1725 ps_t raw_pg_to_pps(pg_t pg) const;
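  // Illustrative sketch (editorial addition): how these helpers fold an
  // object name down to a concrete PG and a placement seed.  'p' and 'poolid'
  // are hypothetical.
  //
  //   uint32_t h = p.hash_key("rbd_data.1234", "");  // key+namespace -> hash
  //   pg_t raw(h, poolid);                           // raw pg, full-precision ps
  //   pg_t pg  = p.raw_pg_to_pg(raw);                // ps folded down to pg_num
  //   ps_t pps = p.raw_pg_to_pps(raw);               // pool-unique placement seed
  //   // raw_hash_to_pg(h) folds a bare hash position the same way.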
1726
1727 /// choose a random hash position within a pg
1728 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1729
1730 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 1731 void decode(bufferlist::const_iterator& bl);
1732
1733 static void generate_test_instances(list<pg_pool_t*>& o);
1734};
1735WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1736
1737ostream& operator<<(ostream& out, const pg_pool_t& p);
1738
1739
1740/**
1741 * a summation of object stats
1742 *
1743 * This is just a container for object stats; we don't know what for.
1744 *
 1745 * If you add members to object_stat_sum_t, make sure there is no
 1746 * padding among them.
 1747 * You should also update the padding_check function.
 1748 *
1749 */
1750struct object_stat_sum_t {
1751 /**************************************************************************
1752 * WARNING: be sure to update operator==, floor, and split when
1753 * adding/removing fields!
1754 **************************************************************************/
1755 int64_t num_bytes; // in bytes
1756 int64_t num_objects;
1757 int64_t num_object_clones;
1758 int64_t num_object_copies; // num_objects * num_replicas
1759 int64_t num_objects_missing_on_primary;
1760 int64_t num_objects_degraded;
1761 int64_t num_objects_unfound;
1762 int64_t num_rd;
1763 int64_t num_rd_kb;
1764 int64_t num_wr;
1765 int64_t num_wr_kb;
1766 int64_t num_scrub_errors; // total deep and shallow scrub errors
1767 int64_t num_objects_recovered;
1768 int64_t num_bytes_recovered;
1769 int64_t num_keys_recovered;
1770 int64_t num_shallow_scrub_errors;
1771 int64_t num_deep_scrub_errors;
1772 int64_t num_objects_dirty;
1773 int64_t num_whiteouts;
1774 int64_t num_objects_omap;
1775 int64_t num_objects_hit_set_archive;
1776 int64_t num_objects_misplaced;
1777 int64_t num_bytes_hit_set_archive;
1778 int64_t num_flush;
1779 int64_t num_flush_kb;
1780 int64_t num_evict;
1781 int64_t num_evict_kb;
1782 int64_t num_promote;
1783 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1784 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1785 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1786 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1787 int64_t num_objects_pinned;
1788 int64_t num_objects_missing;
1789 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
28e407b8 1790 int64_t num_large_omap_objects = 0;
1791 int64_t num_objects_manifest = 0;
1792 int64_t num_omap_bytes = 0;
1793 int64_t num_omap_keys = 0;
1794 int64_t num_objects_repaired = 0;
1795
1796 object_stat_sum_t()
1797 : num_bytes(0),
1798 num_objects(0), num_object_clones(0), num_object_copies(0),
1799 num_objects_missing_on_primary(0), num_objects_degraded(0),
1800 num_objects_unfound(0),
1801 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1802 num_scrub_errors(0),
1803 num_objects_recovered(0),
1804 num_bytes_recovered(0),
1805 num_keys_recovered(0),
1806 num_shallow_scrub_errors(0),
1807 num_deep_scrub_errors(0),
1808 num_objects_dirty(0),
1809 num_whiteouts(0),
1810 num_objects_omap(0),
1811 num_objects_hit_set_archive(0),
1812 num_objects_misplaced(0),
1813 num_bytes_hit_set_archive(0),
1814 num_flush(0),
1815 num_flush_kb(0),
1816 num_evict(0),
1817 num_evict_kb(0),
1818 num_promote(0),
1819 num_flush_mode_high(0), num_flush_mode_low(0),
1820 num_evict_mode_some(0), num_evict_mode_full(0),
1821 num_objects_pinned(0),
1822 num_objects_missing(0),
1823 num_legacy_snapsets(0)
1824 {}
1825
1826 void floor(int64_t f) {
1827#define FLOOR(x) if (x < f) x = f
1828 FLOOR(num_bytes);
1829 FLOOR(num_objects);
1830 FLOOR(num_object_clones);
1831 FLOOR(num_object_copies);
1832 FLOOR(num_objects_missing_on_primary);
1833 FLOOR(num_objects_missing);
1834 FLOOR(num_objects_degraded);
1835 FLOOR(num_objects_misplaced);
1836 FLOOR(num_objects_unfound);
1837 FLOOR(num_rd);
1838 FLOOR(num_rd_kb);
1839 FLOOR(num_wr);
1840 FLOOR(num_wr_kb);
28e407b8 1841 FLOOR(num_large_omap_objects);
1842 FLOOR(num_objects_manifest);
1843 FLOOR(num_omap_bytes);
1844 FLOOR(num_omap_keys);
1845 FLOOR(num_shallow_scrub_errors);
1846 FLOOR(num_deep_scrub_errors);
94b18763 1847 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
1848 FLOOR(num_objects_recovered);
1849 FLOOR(num_bytes_recovered);
1850 FLOOR(num_keys_recovered);
1851 FLOOR(num_objects_dirty);
1852 FLOOR(num_whiteouts);
1853 FLOOR(num_objects_omap);
1854 FLOOR(num_objects_hit_set_archive);
1855 FLOOR(num_bytes_hit_set_archive);
1856 FLOOR(num_flush);
1857 FLOOR(num_flush_kb);
1858 FLOOR(num_evict);
1859 FLOOR(num_evict_kb);
1860 FLOOR(num_promote);
1861 FLOOR(num_flush_mode_high);
1862 FLOOR(num_flush_mode_low);
1863 FLOOR(num_evict_mode_some);
1864 FLOOR(num_evict_mode_full);
1865 FLOOR(num_objects_pinned);
1866 FLOOR(num_legacy_snapsets);
11fdf7f2 1867 FLOOR(num_objects_repaired);
1868#undef FLOOR
1869 }
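  // Illustrative sketch (editorial addition): one plausible use of floor() is
  // clamping a delta that may transiently go negative.  'new_sum' and
  // 'old_sum' are hypothetical.
  //
  //   object_stat_sum_t delta = new_sum;
  //   delta.sub(old_sum);   // individual counters may now be negative
  //   delta.floor(0);       // clamp them back to zero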
1870
1871 void split(vector<object_stat_sum_t> &out) const {
1872#define SPLIT(PARAM) \
1873 for (unsigned i = 0; i < out.size(); ++i) { \
1874 out[i].PARAM = PARAM / out.size(); \
1875 if (i < (PARAM % out.size())) { \
1876 out[i].PARAM++; \
1877 } \
1878 }
1879#define SPLIT_PRESERVE_NONZERO(PARAM) \
1880 for (unsigned i = 0; i < out.size(); ++i) { \
1881 if (PARAM) \
1882 out[i].PARAM = 1 + PARAM / out.size(); \
1883 else \
1884 out[i].PARAM = 0; \
1885 }
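  // Editorial note (illustrative, not part of the upstream header): SPLIT
  // spreads a counter as evenly as possible over the children and hands the
  // remainder to the first few, e.g. splitting 10 across 4 children yields
  // 3, 3, 2, 2.  SPLIT_PRESERVE_NONZERO instead gives every child at least 1
  // so a non-zero counter (such as scrub errors) never disappears on split.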
1886
1887 SPLIT(num_bytes);
1888 SPLIT(num_objects);
1889 SPLIT(num_object_clones);
1890 SPLIT(num_object_copies);
1891 SPLIT(num_objects_missing_on_primary);
1892 SPLIT(num_objects_missing);
1893 SPLIT(num_objects_degraded);
1894 SPLIT(num_objects_misplaced);
1895 SPLIT(num_objects_unfound);
1896 SPLIT(num_rd);
1897 SPLIT(num_rd_kb);
1898 SPLIT(num_wr);
1899 SPLIT(num_wr_kb);
1900 SPLIT(num_large_omap_objects);
1901 SPLIT(num_objects_manifest);
1902 SPLIT(num_omap_bytes);
1903 SPLIT(num_omap_keys);
1904 SPLIT(num_objects_repaired);
1905 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1906 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1907 for (unsigned i = 0; i < out.size(); ++i) {
1908 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1909 out[i].num_deep_scrub_errors;
1910 }
1911 SPLIT(num_objects_recovered);
1912 SPLIT(num_bytes_recovered);
1913 SPLIT(num_keys_recovered);
1914 SPLIT(num_objects_dirty);
1915 SPLIT(num_whiteouts);
1916 SPLIT(num_objects_omap);
1917 SPLIT(num_objects_hit_set_archive);
1918 SPLIT(num_bytes_hit_set_archive);
1919 SPLIT(num_flush);
1920 SPLIT(num_flush_kb);
1921 SPLIT(num_evict);
1922 SPLIT(num_evict_kb);
1923 SPLIT(num_promote);
1924 SPLIT(num_flush_mode_high);
1925 SPLIT(num_flush_mode_low);
1926 SPLIT(num_evict_mode_some);
1927 SPLIT(num_evict_mode_full);
1928 SPLIT(num_objects_pinned);
1929 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1930#undef SPLIT
1931#undef SPLIT_PRESERVE_NONZERO
1932 }
1933
1934 void clear() {
92f5a8d4 1935 // FIPS zeroization audit 20191117: this memset is not security related.
1936 memset(this, 0, sizeof(*this));
1937 }
1938
1939 void calc_copies(int nrep) {
1940 num_object_copies = nrep * num_objects;
1941 }
1942
1943 bool is_zero() const {
1944 return mem_is_zero((char*)this, sizeof(*this));
1945 }
1946
1947 void add(const object_stat_sum_t& o);
1948 void sub(const object_stat_sum_t& o);
1949
1950 void dump(Formatter *f) const;
1951 void padding_check() {
1952 static_assert(
1953 sizeof(object_stat_sum_t) ==
1954 sizeof(num_bytes) +
1955 sizeof(num_objects) +
1956 sizeof(num_object_clones) +
1957 sizeof(num_object_copies) +
1958 sizeof(num_objects_missing_on_primary) +
1959 sizeof(num_objects_degraded) +
1960 sizeof(num_objects_unfound) +
1961 sizeof(num_rd) +
1962 sizeof(num_rd_kb) +
1963 sizeof(num_wr) +
1964 sizeof(num_wr_kb) +
1965 sizeof(num_scrub_errors) +
28e407b8 1966 sizeof(num_large_omap_objects) +
1967 sizeof(num_objects_manifest) +
1968 sizeof(num_omap_bytes) +
1969 sizeof(num_omap_keys) +
1970 sizeof(num_objects_repaired) +
1971 sizeof(num_objects_recovered) +
1972 sizeof(num_bytes_recovered) +
1973 sizeof(num_keys_recovered) +
1974 sizeof(num_shallow_scrub_errors) +
1975 sizeof(num_deep_scrub_errors) +
1976 sizeof(num_objects_dirty) +
1977 sizeof(num_whiteouts) +
1978 sizeof(num_objects_omap) +
1979 sizeof(num_objects_hit_set_archive) +
1980 sizeof(num_objects_misplaced) +
1981 sizeof(num_bytes_hit_set_archive) +
1982 sizeof(num_flush) +
1983 sizeof(num_flush_kb) +
1984 sizeof(num_evict) +
1985 sizeof(num_evict_kb) +
1986 sizeof(num_promote) +
1987 sizeof(num_flush_mode_high) +
1988 sizeof(num_flush_mode_low) +
1989 sizeof(num_evict_mode_some) +
1990 sizeof(num_evict_mode_full) +
1991 sizeof(num_objects_pinned) +
1992 sizeof(num_objects_missing) +
1993 sizeof(num_legacy_snapsets)
1994 ,
 1995 "object_stat_sum_t has padding");
1996 }
1997 void encode(bufferlist& bl) const;
11fdf7f2 1998 void decode(bufferlist::const_iterator& bl);
1999 static void generate_test_instances(list<object_stat_sum_t*>& o);
2000};
2001WRITE_CLASS_ENCODER(object_stat_sum_t)
2002
2003bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
2004
2005/**
2006 * a collection of object stat sums
2007 *
2008 * This is a collection of stat sums over different categories.
2009 */
2010struct object_stat_collection_t {
2011 /**************************************************************************
2012 * WARNING: be sure to update the operator== when adding/removing fields! *
2013 **************************************************************************/
2014 object_stat_sum_t sum;
2015
2016 void calc_copies(int nrep) {
2017 sum.calc_copies(nrep);
2018 }
2019
2020 void dump(Formatter *f) const;
2021 void encode(bufferlist& bl) const;
11fdf7f2 2022 void decode(bufferlist::const_iterator& bl);
2023 static void generate_test_instances(list<object_stat_collection_t*>& o);
2024
2025 bool is_zero() const {
2026 return sum.is_zero();
2027 }
2028
2029 void clear() {
2030 sum.clear();
2031 }
2032
2033 void floor(int64_t f) {
2034 sum.floor(f);
2035 }
2036
2037 void add(const object_stat_sum_t& o) {
2038 sum.add(o);
2039 }
2040
2041 void add(const object_stat_collection_t& o) {
2042 sum.add(o.sum);
2043 }
2044 void sub(const object_stat_collection_t& o) {
2045 sum.sub(o.sum);
2046 }
2047};
2048WRITE_CLASS_ENCODER(object_stat_collection_t)
2049
2050inline bool operator==(const object_stat_collection_t& l,
2051 const object_stat_collection_t& r) {
2052 return l.sum == r.sum;
2053}
2054
2055
2056/** pg_stat
2057 * aggregate stats for a single PG.
2058 */
2059struct pg_stat_t {
2060 /**************************************************************************
2061 * WARNING: be sure to update the operator== when adding/removing fields! *
2062 **************************************************************************/
2063 eversion_t version;
2064 version_t reported_seq; // sequence number
2065 epoch_t reported_epoch; // epoch of this report
11fdf7f2 2066 uint64_t state;
2067 utime_t last_fresh; // last reported
2068 utime_t last_change; // new state != previous state
2069 utime_t last_active; // state & PG_STATE_ACTIVE
2070 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2071 utime_t last_clean; // state & PG_STATE_CLEAN
2072 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2073 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2074 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2075
2076 eversion_t log_start; // (log_start,version]
2077 eversion_t ondisk_log_start; // there may be more on disk
2078
2079 epoch_t created;
2080 epoch_t last_epoch_clean;
2081 pg_t parent;
2082 __u32 parent_split_bits;
2083
2084 eversion_t last_scrub;
2085 eversion_t last_deep_scrub;
2086 utime_t last_scrub_stamp;
2087 utime_t last_deep_scrub_stamp;
2088 utime_t last_clean_scrub_stamp;
2089
2090 object_stat_collection_t stats;
2091
2092 int64_t log_size;
2093 int64_t ondisk_log_size; // >= active_log_size
2094
2095 vector<int32_t> up, acting;
2096 vector<pg_shard_t> avail_no_missing;
2097 map< std::set<pg_shard_t>, int32_t > object_location_counts;
2098 epoch_t mapping_epoch;
2099
2100 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
2101
2102 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2103
2104 utime_t last_became_active;
2105 utime_t last_became_peered;
2106
2107 /// up, acting primaries
2108 int32_t up_primary;
2109 int32_t acting_primary;
2110
2111 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2112 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2113 uint32_t snaptrimq_len;
2114
2115 bool stats_invalid:1;
2116 /// true if num_objects_dirty is not accurate (because it was not
2117 /// maintained starting from pool creation)
2118 bool dirty_stats_invalid:1;
2119 bool omap_stats_invalid:1;
2120 bool hitset_stats_invalid:1;
2121 bool hitset_bytes_stats_invalid:1;
2122 bool pin_stats_invalid:1;
11fdf7f2 2123 bool manifest_stats_invalid:1;
2124
2125 pg_stat_t()
2126 : reported_seq(0),
2127 reported_epoch(0),
2128 state(0),
2129 created(0), last_epoch_clean(0),
2130 parent_split_bits(0),
2131 log_size(0), ondisk_log_size(0),
2132 mapping_epoch(0),
2133 up_primary(-1),
2134 acting_primary(-1),
b32b8144 2135 snaptrimq_len(0),
2136 stats_invalid(false),
2137 dirty_stats_invalid(false),
2138 omap_stats_invalid(false),
2139 hitset_stats_invalid(false),
2140 hitset_bytes_stats_invalid(false),
2141 pin_stats_invalid(false),
2142 manifest_stats_invalid(false)
2143 { }
2144
2145 epoch_t get_effective_last_epoch_clean() const {
2146 if (state & PG_STATE_CLEAN) {
2147 // we are clean as of this report, and should thus take the
2148 // reported epoch
2149 return reported_epoch;
2150 } else {
2151 return last_epoch_clean;
2152 }
2153 }
2154
2155 pair<epoch_t, version_t> get_version_pair() const {
2156 return make_pair(reported_epoch, reported_seq);
2157 }
2158
2159 void floor(int64_t f) {
2160 stats.floor(f);
2161 if (log_size < f)
2162 log_size = f;
2163 if (ondisk_log_size < f)
2164 ondisk_log_size = f;
2165 if (snaptrimq_len < f)
2166 snaptrimq_len = f;
2167 }
2168
2169 void add_sub_invalid_flags(const pg_stat_t& o) {
 2170 // adding (or subtracting!) invalid stats renders our stats invalid too
2171 stats_invalid |= o.stats_invalid;
2172 dirty_stats_invalid |= o.dirty_stats_invalid;
eafe8130 2173 omap_stats_invalid |= o.omap_stats_invalid;
11fdf7f2 2174 hitset_stats_invalid |= o.hitset_stats_invalid;
eafe8130 2175 hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
2176 pin_stats_invalid |= o.pin_stats_invalid;
2177 manifest_stats_invalid |= o.manifest_stats_invalid;
2178 }
2179 void add(const pg_stat_t& o) {
2180 stats.add(o.stats);
2181 log_size += o.log_size;
2182 ondisk_log_size += o.ondisk_log_size;
2183 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2184 (uint64_t)(1ull << 31));
2185 add_sub_invalid_flags(o);
2186 }
2187 void sub(const pg_stat_t& o) {
2188 stats.sub(o.stats);
2189 log_size -= o.log_size;
2190 ondisk_log_size -= o.ondisk_log_size;
2191 if (o.snaptrimq_len < snaptrimq_len) {
2192 snaptrimq_len -= o.snaptrimq_len;
2193 } else {
2194 snaptrimq_len = 0;
2195 }
11fdf7f2 2196 add_sub_invalid_flags(o);
2197 }
2198
2199 bool is_acting_osd(int32_t osd, bool primary) const;
2200 void dump(Formatter *f) const;
2201 void dump_brief(Formatter *f) const;
2202 void encode(bufferlist &bl) const;
11fdf7f2 2203 void decode(bufferlist::const_iterator &bl);
2204 static void generate_test_instances(list<pg_stat_t*>& o);
2205};
2206WRITE_CLASS_ENCODER(pg_stat_t)
2207
2208bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2209
2210/** store_statfs_t
2211 * ObjectStore full statfs information
2212 */
2213struct store_statfs_t
2214{
2215 uint64_t total = 0; ///< Total bytes
2216 uint64_t available = 0; ///< Free bytes available
2217 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2218
2219 int64_t allocated = 0; ///< Bytes allocated by the store
2220
2221 int64_t data_stored = 0; ///< Bytes actually stored by the user
2222 int64_t data_compressed = 0; ///< Bytes stored after compression
2223 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2224 int64_t data_compressed_original = 0; ///< Bytes that were compressed
2225
2226 int64_t omap_allocated = 0; ///< approx usage of omap data
2227 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2228
2229 void reset() {
2230 *this = store_statfs_t();
2231 }
2232 void floor(int64_t f) {
2233#define FLOOR(x) if (int64_t(x) < f) x = f
2234 FLOOR(total);
2235 FLOOR(available);
2236 FLOOR(internally_reserved);
2237 FLOOR(allocated);
2238 FLOOR(data_stored);
2239 FLOOR(data_compressed);
2240 FLOOR(data_compressed_allocated);
2241 FLOOR(data_compressed_original);
2242
2243 FLOOR(omap_allocated);
2244 FLOOR(internal_metadata);
2245#undef FLOOR
2246 }
2247
2248 bool operator ==(const store_statfs_t& other) const;
2249 bool is_zero() const {
2250 return *this == store_statfs_t();
2251 }
2252
2253 uint64_t get_used() const {
2254 return total - available - internally_reserved;
2255 }
2256
2257 // this accumulates both actually used and statfs's internally_reserved
2258 uint64_t get_used_raw() const {
2259 return total - available;
2260 }
2261
2262 float get_used_raw_ratio() const {
2263 if (total) {
2264 return (float)get_used_raw() / (float)total;
2265 } else {
2266 return 0.0;
2267 }
2268 }
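  // Illustrative sketch (editorial addition): with total = 100 GiB,
  // available = 40 GiB and internally_reserved = 10 GiB,
  // get_used() == 50 GiB, get_used_raw() == 60 GiB and
  // get_used_raw_ratio() == 0.6.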
2269
2270 // helpers to ease legacy code porting
2271 uint64_t kb_avail() const {
2272 return available >> 10;
2273 }
2274 uint64_t kb() const {
2275 return total >> 10;
2276 }
2277 uint64_t kb_used() const {
2278 return (total - available - internally_reserved) >> 10;
2279 }
2280 uint64_t kb_used_raw() const {
2281 return get_used_raw() >> 10;
2282 }
2283
2284 uint64_t kb_used_data() const {
2285 return allocated >> 10;
2286 }
2287 uint64_t kb_used_omap() const {
2288 return omap_allocated >> 10;
2289 }
2290
2291 uint64_t kb_used_internal_metadata() const {
2292 return internal_metadata >> 10;
2293 }
2294
2295 void add(const store_statfs_t& o) {
2296 total += o.total;
2297 available += o.available;
2298 internally_reserved += o.internally_reserved;
2299 allocated += o.allocated;
2300 data_stored += o.data_stored;
2301 data_compressed += o.data_compressed;
2302 data_compressed_allocated += o.data_compressed_allocated;
2303 data_compressed_original += o.data_compressed_original;
2304 omap_allocated += o.omap_allocated;
2305 internal_metadata += o.internal_metadata;
2306 }
2307 void sub(const store_statfs_t& o) {
2308 total -= o.total;
2309 available -= o.available;
2310 internally_reserved -= o.internally_reserved;
2311 allocated -= o.allocated;
2312 data_stored -= o.data_stored;
2313 data_compressed -= o.data_compressed;
2314 data_compressed_allocated -= o.data_compressed_allocated;
2315 data_compressed_original -= o.data_compressed_original;
2316 omap_allocated -= o.omap_allocated;
2317 internal_metadata -= o.internal_metadata;
2318 }
2319 void dump(Formatter *f) const;
2320 DENC(store_statfs_t, v, p) {
2321 DENC_START(1, 1, p);
2322 denc(v.total, p);
2323 denc(v.available, p);
2324 denc(v.internally_reserved, p);
2325 denc(v.allocated, p);
2326 denc(v.data_stored, p);
2327 denc(v.data_compressed, p);
2328 denc(v.data_compressed_allocated, p);
2329 denc(v.data_compressed_original, p);
2330 denc(v.omap_allocated, p);
2331 denc(v.internal_metadata, p);
2332 DENC_FINISH(p);
2333 }
2334 static void generate_test_instances(list<store_statfs_t*>& o);
2335};
2336WRITE_CLASS_DENC(store_statfs_t)
2337
2338ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
2339
2340/** osd_stat
2341 * aggregate stats for an osd
2342 */
2343struct osd_stat_t {
2344 store_statfs_t statfs;
2345 vector<int> hb_peers;
2346 int32_t snap_trim_queue_len, num_snap_trimming;
2347 uint64_t num_shards_repaired;
2348
2349 pow2_hist_t op_queue_age_hist;
2350
2351 objectstore_perf_stat_t os_perf_stat;
2352 osd_alerts_t os_alerts;
2353
2354 epoch_t up_from = 0;
2355 uint64_t seq = 0;
2356
2357 uint32_t num_pgs = 0;
2358
2359 uint32_t num_osds = 0;
2360 uint32_t num_per_pool_osds = 0;
2361
2362 struct Interfaces {
2363 uint32_t last_update; // in seconds
2364 uint32_t back_pingtime[3];
2365 uint32_t back_min[3];
2366 uint32_t back_max[3];
2367 uint32_t back_last;
2368 uint32_t front_pingtime[3];
2369 uint32_t front_min[3];
2370 uint32_t front_max[3];
2371 uint32_t front_last;
2372 };
2373 map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
2374
2375 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2376 num_shards_repaired(0) {}
2377
2378 void add(const osd_stat_t& o) {
2379 statfs.add(o.statfs);
2380 snap_trim_queue_len += o.snap_trim_queue_len;
2381 num_snap_trimming += o.num_snap_trimming;
2382 num_shards_repaired += o.num_shards_repaired;
2383 op_queue_age_hist.add(o.op_queue_age_hist);
2384 os_perf_stat.add(o.os_perf_stat);
2385 num_pgs += o.num_pgs;
2386 num_osds += o.num_osds;
2387 num_per_pool_osds += o.num_per_pool_osds;
2388 for (const auto& a : o.os_alerts) {
2389 auto& target = os_alerts[a.first];
2390 for (auto& i : a.second) {
2391 target.emplace(i.first, i.second);
2392 }
2393 }
2394 }
2395 void sub(const osd_stat_t& o) {
2396 statfs.sub(o.statfs);
2397 snap_trim_queue_len -= o.snap_trim_queue_len;
2398 num_snap_trimming -= o.num_snap_trimming;
2399 num_shards_repaired -= o.num_shards_repaired;
2400 op_queue_age_hist.sub(o.op_queue_age_hist);
2401 os_perf_stat.sub(o.os_perf_stat);
2402 num_pgs -= o.num_pgs;
2403 num_osds -= o.num_osds;
2404 num_per_pool_osds -= o.num_per_pool_osds;
2405 for (const auto& a : o.os_alerts) {
2406 auto& target = os_alerts[a.first];
2407 for (auto& i : a.second) {
2408 target.erase(i.first);
2409 }
2410 if (target.empty()) {
2411 os_alerts.erase(a.first);
2412 }
2413 }
2414 }
ded94939 2415 void dump(Formatter *f, bool with_net = true) const;
2416 void encode(bufferlist &bl, uint64_t features) const;
2417 void decode(bufferlist::const_iterator &bl);
2418 static void generate_test_instances(std::list<osd_stat_t*>& o);
2419};
2420WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2421
2422inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2423 return l.statfs == r.statfs &&
2424 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2425 l.num_snap_trimming == r.num_snap_trimming &&
2426 l.num_shards_repaired == r.num_shards_repaired &&
2427 l.hb_peers == r.hb_peers &&
2428 l.op_queue_age_hist == r.op_queue_age_hist &&
2429 l.os_perf_stat == r.os_perf_stat &&
2430 l.num_pgs == r.num_pgs &&
2431 l.num_osds == r.num_osds &&
2432 l.num_per_pool_osds == r.num_per_pool_osds;
2433}
2434inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2435 return !(l == r);
2436}
2437
2438inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
2439 return out << "osd_stat(" << s.statfs << ", "
2440 << "peers " << s.hb_peers
2441 << " op hist " << s.op_queue_age_hist.h
2442 << ")";
2443}
2444
2445/*
2446 * summation over an entire pool
2447 */
2448struct pool_stat_t {
2449 object_stat_collection_t stats;
11fdf7f2 2450 store_statfs_t store_stats;
2451 int64_t log_size;
2452 int64_t ondisk_log_size; // >= active_log_size
2453 int32_t up; ///< number of up replicas or shards
2454 int32_t acting; ///< number of acting replicas or shards
11fdf7f2 2455 int32_t num_store_stats; ///< amount of store_stats accumulated
7c673cae 2456
2457 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2458 num_store_stats(0)
2459 { }
2460
2461 void floor(int64_t f) {
2462 stats.floor(f);
11fdf7f2 2463 store_stats.floor(f);
2464 if (log_size < f)
2465 log_size = f;
2466 if (ondisk_log_size < f)
2467 ondisk_log_size = f;
2468 if (up < f)
2469 up = f;
2470 if (acting < f)
2471 acting = f;
2472 if (num_store_stats < f)
2473 num_store_stats = f;
2474 }
2475
2476 void add(const store_statfs_t& o) {
2477 store_stats.add(o);
2478 ++num_store_stats;
2479 }
2480 void sub(const store_statfs_t& o) {
2481 store_stats.sub(o);
2482 --num_store_stats;
2483 }
2484
2485 void add(const pg_stat_t& o) {
2486 stats.add(o.stats);
2487 log_size += o.log_size;
2488 ondisk_log_size += o.ondisk_log_size;
2489 up += o.up.size();
2490 acting += o.acting.size();
2491 }
2492 void sub(const pg_stat_t& o) {
2493 stats.sub(o.stats);
2494 log_size -= o.log_size;
2495 ondisk_log_size -= o.ondisk_log_size;
2496 up -= o.up.size();
2497 acting -= o.acting.size();
2498 }
2499
2500 bool is_zero() const {
2501 return (stats.is_zero() &&
11fdf7f2 2502 store_stats.is_zero() &&
2503 log_size == 0 &&
2504 ondisk_log_size == 0 &&
2505 up == 0 &&
2506 acting == 0 &&
2507 num_store_stats == 0);
2508 }
2509
 2510 // helper accessors to retrieve used/net bytes depending on the
 2511 // collection method: the new per-pool objectstore report or the legacy
 2512 // PG summation at the OSD.
 2513 // In legacy mode the used and net values are the same. With the new
 2514 // per-pool collection, 'used' is the amount of space ALLOCATED on all
 2515 // related OSDs and 'net' is the amount of user data stored.
81eedcae 2516 uint64_t get_allocated_bytes(bool per_pool) const {
11fdf7f2 2517 uint64_t allocated_bytes;
81eedcae 2518 if (per_pool) {
2519 allocated_bytes = store_stats.allocated;
2520 } else {
2521 // legacy mode, use numbers from 'stats'
2522 allocated_bytes = stats.sum.num_bytes +
2523 stats.sum.num_bytes_hit_set_archive;
2524 }
 2525 // omap is not broken out per pool by nautilus bluestore
2526 allocated_bytes += stats.sum.num_omap_bytes;
2527 return allocated_bytes;
2528 }
81eedcae 2529 uint64_t get_user_bytes(float raw_used_rate, bool per_pool) const {
11fdf7f2 2530 uint64_t user_bytes;
81eedcae 2531 if (per_pool) {
2532 user_bytes = raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2533 } else {
2534 // legacy mode, use numbers from 'stats'
2535 user_bytes = stats.sum.num_bytes +
2536 stats.sum.num_bytes_hit_set_archive;
2537 }
 2538 // omap is not broken out per pool by nautilus bluestore
2539 user_bytes += stats.sum.num_omap_bytes;
2540 return user_bytes;
2541 }
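  // Illustrative sketch (editorial addition): for a 3x replicated pool that
  // reports per-pool stats with store_stats.data_stored == 300 MiB and no
  // omap data, get_user_bytes(3.0, true) should come out at roughly 100 MiB,
  // while get_allocated_bytes(true) reflects the raw space allocated across
  // the OSDs.  In legacy mode (per_pool == false) both fall back to the
  // PG-summed stats.sum counters.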
2542
2543 void dump(Formatter *f) const;
2544 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 2545 void decode(bufferlist::const_iterator &bl);
2546 static void generate_test_instances(list<pool_stat_t*>& o);
2547};
2548WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2549
2550
2551// -----------------------------------------
2552
2553/**
2554 * pg_hit_set_info_t - information about a single recorded HitSet
2555 *
11fdf7f2 2556 * Track basic metadata about a HitSet, like the number of insertions
2557 * and the time range it covers.
2558 */
2559struct pg_hit_set_info_t {
2560 utime_t begin, end; ///< time interval
2561 eversion_t version; ///< version this HitSet object was written
2562 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2563
2564 friend bool operator==(const pg_hit_set_info_t& l,
2565 const pg_hit_set_info_t& r) {
2566 return
2567 l.begin == r.begin &&
2568 l.end == r.end &&
2569 l.version == r.version &&
2570 l.using_gmt == r.using_gmt;
2571 }
2572
2573 explicit pg_hit_set_info_t(bool using_gmt = true)
2574 : using_gmt(using_gmt) {}
2575
2576 void encode(bufferlist &bl) const;
11fdf7f2 2577 void decode(bufferlist::const_iterator &bl);
2578 void dump(Formatter *f) const;
2579 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2580};
2581WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2582
2583/**
2584 * pg_hit_set_history_t - information about a history of hitsets
2585 *
2586 * Include information about the currently accumulating hit set as well
2587 * as archived/historical ones.
2588 */
2589struct pg_hit_set_history_t {
2590 eversion_t current_last_update; ///< last version inserted into current set
2591 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2592
2593 friend bool operator==(const pg_hit_set_history_t& l,
2594 const pg_hit_set_history_t& r) {
2595 return
2596 l.current_last_update == r.current_last_update &&
2597 l.history == r.history;
2598 }
2599
2600 void encode(bufferlist &bl) const;
11fdf7f2 2601 void decode(bufferlist::const_iterator &bl);
2602 void dump(Formatter *f) const;
2603 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2604};
2605WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2606
2607
2608// -----------------------------------------
2609
2610/**
2611 * pg_history_t - information about recent pg peering/mapping history
2612 *
2613 * This is aggressively shared between OSDs to bound the amount of past
2614 * history they need to worry about.
2615 */
2616struct pg_history_t {
2617 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2618 epoch_t epoch_pool_created; // epoch in which *pool* was created
2619 // (note: may be pg creation epoch for
2620 // pre-luminous clusters)
2621 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2622 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2623 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2624 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2625 epoch_t last_epoch_split; // as parent or child
2626 epoch_t last_epoch_marked_full; // pool or cluster
2627
2628 /**
2629 * In the event of a map discontinuity, same_*_since may reflect the first
2630 * map the osd has seen in the new map sequence rather than the actual start
2631 * of the interval. This is ok since a discontinuity at epoch e means there
2632 * must have been a clean interval between e and now and that we cannot be
2633 * in the active set during the interval containing e.
2634 */
2635 epoch_t same_up_since; // same acting set since
2636 epoch_t same_interval_since; // same acting AND up set since
2637 epoch_t same_primary_since; // same primary at least back through this epoch.
2638
2639 eversion_t last_scrub;
2640 eversion_t last_deep_scrub;
2641 utime_t last_scrub_stamp;
2642 utime_t last_deep_scrub_stamp;
2643 utime_t last_clean_scrub_stamp;
2644
2645 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2646 return
2647 l.epoch_created == r.epoch_created &&
31f18b77 2648 l.epoch_pool_created == r.epoch_pool_created &&
2649 l.last_epoch_started == r.last_epoch_started &&
2650 l.last_interval_started == r.last_interval_started &&
2651 l.last_epoch_clean == r.last_epoch_clean &&
2652 l.last_interval_clean == r.last_interval_clean &&
2653 l.last_epoch_split == r.last_epoch_split &&
2654 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2655 l.same_up_since == r.same_up_since &&
2656 l.same_interval_since == r.same_interval_since &&
2657 l.same_primary_since == r.same_primary_since &&
2658 l.last_scrub == r.last_scrub &&
2659 l.last_deep_scrub == r.last_deep_scrub &&
2660 l.last_scrub_stamp == r.last_scrub_stamp &&
2661 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2662 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2663 }
2664
2665 pg_history_t()
2666 : epoch_created(0),
31f18b77 2667 epoch_pool_created(0),
2668 last_epoch_started(0),
2669 last_interval_started(0),
2670 last_epoch_clean(0),
2671 last_interval_clean(0),
2672 last_epoch_split(0),
2673 last_epoch_marked_full(0),
2674 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2675
2676 bool merge(const pg_history_t &other) {
2677 // Here, we only update the fields which cannot be calculated from the OSDmap.
2678 bool modified = false;
2679 if (epoch_created < other.epoch_created) {
2680 epoch_created = other.epoch_created;
2681 modified = true;
2682 }
2683 if (epoch_pool_created < other.epoch_pool_created) {
2684 // FIXME: for jewel compat only; this should either be 0 or always the
2685 // same value across all pg instances.
2686 epoch_pool_created = other.epoch_pool_created;
2687 modified = true;
2688 }
2689 if (last_epoch_started < other.last_epoch_started) {
2690 last_epoch_started = other.last_epoch_started;
2691 modified = true;
2692 }
2693 if (last_interval_started < other.last_interval_started) {
2694 last_interval_started = other.last_interval_started;
2695 modified = true;
2696 }
2697 if (last_epoch_clean < other.last_epoch_clean) {
2698 last_epoch_clean = other.last_epoch_clean;
2699 modified = true;
2700 }
2701 if (last_interval_clean < other.last_interval_clean) {
2702 last_interval_clean = other.last_interval_clean;
2703 modified = true;
2704 }
2705 if (last_epoch_split < other.last_epoch_split) {
2706 last_epoch_split = other.last_epoch_split;
2707 modified = true;
2708 }
2709 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2710 last_epoch_marked_full = other.last_epoch_marked_full;
2711 modified = true;
2712 }
2713 if (other.last_scrub > last_scrub) {
2714 last_scrub = other.last_scrub;
2715 modified = true;
2716 }
2717 if (other.last_scrub_stamp > last_scrub_stamp) {
2718 last_scrub_stamp = other.last_scrub_stamp;
2719 modified = true;
2720 }
2721 if (other.last_deep_scrub > last_deep_scrub) {
2722 last_deep_scrub = other.last_deep_scrub;
2723 modified = true;
2724 }
2725 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2726 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2727 modified = true;
2728 }
2729 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2730 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2731 modified = true;
2732 }
2733 return modified;
2734 }
2735
2736 void encode(bufferlist& bl) const;
11fdf7f2 2737 void decode(bufferlist::const_iterator& p);
2738 void dump(Formatter *f) const;
2739 static void generate_test_instances(list<pg_history_t*>& o);
2740};
2741WRITE_CLASS_ENCODER(pg_history_t)
2742
2743inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2744 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2745 << " lis/c " << h.last_interval_started
2746 << "/" << h.last_interval_clean
2747 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2748 << "/" << h.last_epoch_marked_full
2749 << " " << h.same_up_since
2750 << "/" << h.same_interval_since
2751 << "/" << h.same_primary_since;
2752}
2753
2754
2755/**
2756 * pg_info_t - summary of PG statistics.
2757 *
2758 * some notes:
2759 * - last_complete implies we have all objects that existed as of that
2760 * stamp, OR a newer object, OR have already applied a later delete.
2761 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2762 * otherwise, we have no idea what the pg is supposed to contain.
2763 */
2764struct pg_info_t {
2765 spg_t pgid;
2766 eversion_t last_update; ///< last object version applied to store.
2767 eversion_t last_complete; ///< last version pg was complete through.
2768 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2769 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2770
2771 version_t last_user_version; ///< last user object version applied to store
2772
2773 eversion_t log_tail; ///< oldest log entry.
2774
2775 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2776 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2777
2778 interval_set<snapid_t> purged_snaps;
2779
2780 pg_stat_t stats;
2781
2782 pg_history_t history;
2783 pg_hit_set_history_t hit_set;
2784
2785 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2786 return
2787 l.pgid == r.pgid &&
2788 l.last_update == r.last_update &&
2789 l.last_complete == r.last_complete &&
2790 l.last_epoch_started == r.last_epoch_started &&
2791 l.last_interval_started == r.last_interval_started &&
2792 l.last_user_version == r.last_user_version &&
2793 l.log_tail == r.log_tail &&
2794 l.last_backfill == r.last_backfill &&
2795 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2796 l.purged_snaps == r.purged_snaps &&
2797 l.stats == r.stats &&
2798 l.history == r.history &&
2799 l.hit_set == r.hit_set;
2800 }
2801
2802 pg_info_t()
2803 : last_epoch_started(0),
2804 last_interval_started(0),
2805 last_user_version(0),
2806 last_backfill(hobject_t::get_max()),
2807 last_backfill_bitwise(false)
2808 { }
2809 // cppcheck-suppress noExplicitConstructor
2810 pg_info_t(spg_t p)
2811 : pgid(p),
2812 last_epoch_started(0),
2813 last_interval_started(0),
2814 last_user_version(0),
2815 last_backfill(hobject_t::get_max()),
2816 last_backfill_bitwise(false)
2817 { }
2818
2819 void set_last_backfill(hobject_t pos) {
2820 last_backfill = pos;
2821 last_backfill_bitwise = true;
2822 }
2823
2824 bool is_empty() const { return last_update.version == 0; }
2825 bool dne() const { return history.epoch_created == 0; }
2826
11fdf7f2 2827 bool has_missing() const { return last_complete != last_update; }
2828 bool is_incomplete() const { return !last_backfill.is_max(); }
2829
2830 void encode(bufferlist& bl) const;
11fdf7f2 2831 void decode(bufferlist::const_iterator& p);
7c673cae 2832 void dump(Formatter *f) const;
2833 static void generate_test_instances(list<pg_info_t*>& o);
2834};
2835WRITE_CLASS_ENCODER(pg_info_t)
2836
2837inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2838{
2839 out << pgi.pgid << "(";
2840 if (pgi.dne())
2841 out << " DNE";
2842 if (pgi.is_empty())
2843 out << " empty";
2844 else {
2845 out << " v " << pgi.last_update;
2846 if (pgi.last_complete != pgi.last_update)
2847 out << " lc " << pgi.last_complete;
2848 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2849 }
2850 if (pgi.is_incomplete())
2851 out << " lb " << pgi.last_backfill
2852 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2853 //out << " c " << pgi.epoch_created;
2854 out << " local-lis/les=" << pgi.last_interval_started
2855 << "/" << pgi.last_epoch_started;
2856 out << " n=" << pgi.stats.stats.sum.num_objects;
2857 out << " " << pgi.history
2858 << ")";
2859 return out;
2860}
2861
2862/**
2863 * pg_fast_info_t - common pg_info_t fields
2864 *
2865 * These are the fields of pg_info_t (and children) that are updated for
2866 * most IO operations.
2867 *
2868 * ** WARNING **
2869 * Because we rely on these fields to be applied to the normal
2870 * info struct, adding a new field here that is not also new in info
2871 * means that we must set an incompat OSD feature bit!
2872 */
2873struct pg_fast_info_t {
2874 eversion_t last_update;
2875 eversion_t last_complete;
2876 version_t last_user_version;
2877 struct { // pg_stat_t stats
2878 eversion_t version;
2879 version_t reported_seq;
2880 utime_t last_fresh;
2881 utime_t last_active;
2882 utime_t last_peered;
2883 utime_t last_clean;
2884 utime_t last_unstale;
2885 utime_t last_undegraded;
2886 utime_t last_fullsized;
2887 int64_t log_size; // (also ondisk_log_size, which has the same value)
2888 struct { // object_stat_collection_t stats;
 2889 struct { // object_stat_sum_t sum
2890 int64_t num_bytes; // in bytes
2891 int64_t num_objects;
2892 int64_t num_object_copies;
2893 int64_t num_rd;
2894 int64_t num_rd_kb;
2895 int64_t num_wr;
2896 int64_t num_wr_kb;
2897 int64_t num_objects_dirty;
2898 } sum;
2899 } stats;
2900 } stats;
2901
2902 void populate_from(const pg_info_t& info) {
2903 last_update = info.last_update;
2904 last_complete = info.last_complete;
2905 last_user_version = info.last_user_version;
2906 stats.version = info.stats.version;
2907 stats.reported_seq = info.stats.reported_seq;
2908 stats.last_fresh = info.stats.last_fresh;
2909 stats.last_active = info.stats.last_active;
2910 stats.last_peered = info.stats.last_peered;
2911 stats.last_clean = info.stats.last_clean;
2912 stats.last_unstale = info.stats.last_unstale;
2913 stats.last_undegraded = info.stats.last_undegraded;
2914 stats.last_fullsized = info.stats.last_fullsized;
2915 stats.log_size = info.stats.log_size;
2916 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2917 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2918 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2919 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2920 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2921 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2922 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2923 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2924 }
2925
2926 bool try_apply_to(pg_info_t* info) {
2927 if (last_update <= info->last_update)
2928 return false;
2929 info->last_update = last_update;
2930 info->last_complete = last_complete;
2931 info->last_user_version = last_user_version;
2932 info->stats.version = stats.version;
2933 info->stats.reported_seq = stats.reported_seq;
2934 info->stats.last_fresh = stats.last_fresh;
2935 info->stats.last_active = stats.last_active;
2936 info->stats.last_peered = stats.last_peered;
2937 info->stats.last_clean = stats.last_clean;
2938 info->stats.last_unstale = stats.last_unstale;
2939 info->stats.last_undegraded = stats.last_undegraded;
2940 info->stats.last_fullsized = stats.last_fullsized;
2941 info->stats.log_size = stats.log_size;
2942 info->stats.ondisk_log_size = stats.log_size;
2943 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2944 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2945 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2946 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2947 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2948 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2949 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2950 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2951 return true;
2952 }
2953
2954 void encode(bufferlist& bl) const {
2955 ENCODE_START(1, 1, bl);
2956 encode(last_update, bl);
2957 encode(last_complete, bl);
2958 encode(last_user_version, bl);
2959 encode(stats.version, bl);
2960 encode(stats.reported_seq, bl);
2961 encode(stats.last_fresh, bl);
2962 encode(stats.last_active, bl);
2963 encode(stats.last_peered, bl);
2964 encode(stats.last_clean, bl);
2965 encode(stats.last_unstale, bl);
2966 encode(stats.last_undegraded, bl);
2967 encode(stats.last_fullsized, bl);
2968 encode(stats.log_size, bl);
2969 encode(stats.stats.sum.num_bytes, bl);
2970 encode(stats.stats.sum.num_objects, bl);
2971 encode(stats.stats.sum.num_object_copies, bl);
2972 encode(stats.stats.sum.num_rd, bl);
2973 encode(stats.stats.sum.num_rd_kb, bl);
2974 encode(stats.stats.sum.num_wr, bl);
2975 encode(stats.stats.sum.num_wr_kb, bl);
2976 encode(stats.stats.sum.num_objects_dirty, bl);
2977 ENCODE_FINISH(bl);
2978 }
11fdf7f2 2979 void decode(bufferlist::const_iterator& p) {
7c673cae 2980 DECODE_START(1, p);
2981 decode(last_update, p);
2982 decode(last_complete, p);
2983 decode(last_user_version, p);
2984 decode(stats.version, p);
2985 decode(stats.reported_seq, p);
2986 decode(stats.last_fresh, p);
2987 decode(stats.last_active, p);
2988 decode(stats.last_peered, p);
2989 decode(stats.last_clean, p);
2990 decode(stats.last_unstale, p);
2991 decode(stats.last_undegraded, p);
2992 decode(stats.last_fullsized, p);
2993 decode(stats.log_size, p);
2994 decode(stats.stats.sum.num_bytes, p);
2995 decode(stats.stats.sum.num_objects, p);
2996 decode(stats.stats.sum.num_object_copies, p);
2997 decode(stats.stats.sum.num_rd, p);
2998 decode(stats.stats.sum.num_rd_kb, p);
2999 decode(stats.stats.sum.num_wr, p);
3000 decode(stats.stats.sum.num_wr_kb, p);
3001 decode(stats.stats.sum.num_objects_dirty, p);
3002 DECODE_FINISH(p);
3003 }
3004};
3005WRITE_CLASS_ENCODER(pg_fast_info_t)
3006
3007
3008struct pg_notify_t {
3009 epoch_t query_epoch;
3010 epoch_t epoch_sent;
3011 pg_info_t info;
3012 shard_id_t to;
3013 shard_id_t from;
3014 pg_notify_t() :
3015 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3016 from(shard_id_t::NO_SHARD) {}
3017 pg_notify_t(
3018 shard_id_t to,
3019 shard_id_t from,
3020 epoch_t query_epoch,
3021 epoch_t epoch_sent,
3022 const pg_info_t &info)
3023 : query_epoch(query_epoch),
3024 epoch_sent(epoch_sent),
3025 info(info), to(to), from(from) {
11fdf7f2 3026 ceph_assert(from == info.pgid.shard);
3027 }
3028 void encode(bufferlist &bl) const;
11fdf7f2 3029 void decode(bufferlist::const_iterator &p);
3030 void dump(Formatter *f) const;
3031 static void generate_test_instances(list<pg_notify_t*> &o);
3032};
3033WRITE_CLASS_ENCODER(pg_notify_t)
3034ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
3035
3036
3037class OSDMap;
3038/**
3039 * PastIntervals -- information needed to determine the PriorSet and
3040 * the might_have_unfound set
3041 */
3042class PastIntervals {
3043public:
3044 struct pg_interval_t {
3045 vector<int32_t> up, acting;
3046 epoch_t first, last;
3047 bool maybe_went_rw;
3048 int32_t primary;
3049 int32_t up_primary;
3050
3051 pg_interval_t()
3052 : first(0), last(0),
3053 maybe_went_rw(false),
3054 primary(-1),
3055 up_primary(-1)
3056 {}
3057
3058 pg_interval_t(
3059 vector<int32_t> &&up,
3060 vector<int32_t> &&acting,
3061 epoch_t first,
3062 epoch_t last,
3063 bool maybe_went_rw,
3064 int32_t primary,
3065 int32_t up_primary)
3066 : up(up), acting(acting), first(first), last(last),
3067 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3068 {}
3069
3070 void encode(bufferlist& bl) const;
11fdf7f2 3071 void decode(bufferlist::const_iterator& bl);
3072 void dump(Formatter *f) const;
3073 static void generate_test_instances(list<pg_interval_t*>& o);
3074 };
3075
11fdf7f2 3076 PastIntervals();
3077 PastIntervals(PastIntervals &&rhs) = default;
3078 PastIntervals &operator=(PastIntervals &&rhs) = default;
3079
3080 PastIntervals(const PastIntervals &rhs);
3081 PastIntervals &operator=(const PastIntervals &rhs);
3082
3083 class interval_rep {
3084 public:
3085 virtual size_t size() const = 0;
3086 virtual bool empty() const = 0;
3087 virtual void clear() = 0;
3088 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
3089 virtual set<pg_shard_t> get_all_participants(
3090 bool ec_pool) const = 0;
3091 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3092 virtual unique_ptr<interval_rep> clone() const = 0;
3093 virtual ostream &print(ostream &out) const = 0;
3094 virtual void encode(bufferlist &bl) const = 0;
11fdf7f2 3095 virtual void decode(bufferlist::const_iterator &bl) = 0;
7c673cae 3096 virtual void dump(Formatter *f) const = 0;
7c673cae 3097 virtual void iterate_mayberw_back_to(
3098 epoch_t les,
3099 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
3100
3101 virtual bool has_full_intervals() const { return false; }
3102 virtual void iterate_all_intervals(
3103 std::function<void(const pg_interval_t &)> &&f) const {
3104 ceph_assert(!has_full_intervals());
3105 ceph_abort_msg("not valid for this implementation");
7c673cae 3106 }
11fdf7f2 3107 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
3108
3109 virtual ~interval_rep() {}
3110 };
3111 friend class pi_compact_rep;
3112private:
3113
3114 unique_ptr<interval_rep> past_intervals;
3115
11fdf7f2 3116 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
3117
3118public:
3119 void add_interval(bool ec_pool, const pg_interval_t &interval) {
11fdf7f2 3120 ceph_assert(past_intervals);
3121 return past_intervals->add_interval(ec_pool, interval);
3122 }
3123
3124 void encode(bufferlist &bl) const {
3125 ENCODE_START(1, 1, bl);
3126 if (past_intervals) {
3127 __u8 type = 2;
3128 encode(type, bl);
3129 past_intervals->encode(bl);
3130 } else {
11fdf7f2 3131 encode((__u8)0, bl);
3132 }
3133 ENCODE_FINISH(bl);
3134 }
7c673cae 3135
11fdf7f2 3136 void decode(bufferlist::const_iterator &bl);
3137
3138 void dump(Formatter *f) const {
11fdf7f2 3139 ceph_assert(past_intervals);
3140 past_intervals->dump(f);
3141 }
3142 static void generate_test_instances(list<PastIntervals *> & o);
3143
3144 /**
3145 * Determines whether there is an interval change
3146 */
3147 static bool is_new_interval(
3148 int old_acting_primary,
3149 int new_acting_primary,
3150 const vector<int> &old_acting,
3151 const vector<int> &new_acting,
3152 int old_up_primary,
3153 int new_up_primary,
3154 const vector<int> &old_up,
3155 const vector<int> &new_up,
3156 int old_size,
3157 int new_size,
3158 int old_min_size,
3159 int new_min_size,
3160 unsigned old_pg_num,
3161 unsigned new_pg_num,
3162 unsigned old_pg_num_pending,
3163 unsigned new_pg_num_pending,
3164 bool old_sort_bitwise,
3165 bool new_sort_bitwise,
3166 bool old_recovery_deletes,
3167 bool new_recovery_deletes,
3168 pg_t pgid
3169 );
3170
3171 /**
3172 * Determines whether there is an interval change
3173 */
3174 static bool is_new_interval(
3175 int old_acting_primary, ///< [in] primary as of lastmap
3176 int new_acting_primary, ///< [in] primary as of lastmap
3177 const vector<int> &old_acting, ///< [in] acting as of lastmap
3178 const vector<int> &new_acting, ///< [in] acting as of osdmap
3179 int old_up_primary, ///< [in] up primary of lastmap
3180 int new_up_primary, ///< [in] up primary of osdmap
3181 const vector<int> &old_up, ///< [in] up as of lastmap
3182 const vector<int> &new_up, ///< [in] up as of osdmap
3183 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3184 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
3185 pg_t pgid ///< [in] pgid for pg
3186 );
3187
3188 /**
3189 * Integrates a new map into *past_intervals, returns true
3190 * if an interval was closed out.
3191 */
3192 static bool check_new_interval(
3193 int old_acting_primary, ///< [in] primary as of lastmap
3194 int new_acting_primary, ///< [in] primary as of osdmap
3195 const vector<int> &old_acting, ///< [in] acting as of lastmap
3196 const vector<int> &new_acting, ///< [in] acting as of osdmap
3197 int old_up_primary, ///< [in] up primary of lastmap
3198 int new_up_primary, ///< [in] up primary of osdmap
3199 const vector<int> &old_up, ///< [in] up as of lastmap
3200 const vector<int> &new_up, ///< [in] up as of osdmap
3201 epoch_t same_interval_since, ///< [in] as of osdmap
3202 epoch_t last_epoch_clean, ///< [in] current
3203 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3204 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
7c673cae 3205 pg_t pgid, ///< [in] pgid for pg
11fdf7f2 3206 IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate whether the pg can be active
3207 PastIntervals *past_intervals, ///< [out] intervals
3208 ostream *out = 0 ///< [out] debug ostream
3209 );
c07f9fc5 3210
3211 friend ostream& operator<<(ostream& out, const PastIntervals &i);
3212
3213 template <typename F>
3214 void iterate_mayberw_back_to(
3215 epoch_t les,
3216 F &&f) const {
3217 ceph_assert(past_intervals);
3218 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
3219 }
3220 void clear() {
11fdf7f2 3221 ceph_assert(past_intervals);
3222 past_intervals->clear();
3223 }
3224
3225 /**
3226 * Should return a value which gives an indication of the amount
3227 * of state contained
3228 */
3229 size_t size() const {
11fdf7f2 3230 ceph_assert(past_intervals);
3231 return past_intervals->size();
3232 }
3233
3234 bool empty() const {
11fdf7f2 3235 ceph_assert(past_intervals);
3236 return past_intervals->empty();
3237 }
3238
3239 void swap(PastIntervals &other) {
3240 using std::swap;
3241 swap(other.past_intervals, past_intervals);
3242 }
3243
3244 /**
3245 * Return all shards which have been in the acting set back to the
3246 * latest epoch to which we have trimmed except for pg_whoami
3247 */
3248 set<pg_shard_t> get_might_have_unfound(
3249 pg_shard_t pg_whoami,
3250 bool ec_pool) const {
11fdf7f2 3251 ceph_assert(past_intervals);
3252 auto ret = past_intervals->get_all_participants(ec_pool);
3253 ret.erase(pg_whoami);
3254 return ret;
3255 }
3256
3257 /**
3258 * Return all shards which we might want to talk to for peering
3259 */
3260 set<pg_shard_t> get_all_probe(
3261 bool ec_pool) const {
11fdf7f2 3262 ceph_assert(past_intervals);
3263 return past_intervals->get_all_participants(ec_pool);
3264 }
3265
3266 /* Return the set of epochs [start, end) represented by the
3267 * past_interval set.
3268 */
3269 pair<epoch_t, epoch_t> get_bounds() const {
11fdf7f2 3270 ceph_assert(past_intervals);
3271 return past_intervals->get_bounds();
3272 }
3273
3274 void adjust_start_backwards(epoch_t last_epoch_clean) {
3275 ceph_assert(past_intervals);
3276 past_intervals->adjust_start_backwards(last_epoch_clean);
3277 }
3278
3279 enum osd_state_t {
3280 UP,
3281 DOWN,
3282 DNE,
3283 LOST
3284 };
3285 struct PriorSet {
3286 bool ec_pool = false;
3287 set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3288 set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3289 map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
7c673cae 3290
11fdf7f2 3291 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3292 unique_ptr<IsPGRecoverablePredicate> pcontdec;
3293
3294 PriorSet() = default;
3295 PriorSet(PriorSet &&) = default;
3296 PriorSet &operator=(PriorSet &&) = default;
3297
3298 PriorSet &operator=(const PriorSet &) = delete;
3299 PriorSet(const PriorSet &) = delete;
3300
3301 bool operator==(const PriorSet &rhs) const {
3302 return (ec_pool == rhs.ec_pool) &&
3303 (probe == rhs.probe) &&
3304 (down == rhs.down) &&
3305 (blocked_by == rhs.blocked_by) &&
3306 (pg_down == rhs.pg_down);
3307 }
3308
3309 bool affected_by_map(
3310 const OSDMap &osdmap,
3311 const DoutPrefixProvider *dpp) const;
3312
3313 // For verifying tests
3314 PriorSet(
3315 bool ec_pool,
3316 set<pg_shard_t> probe,
3317 set<int> down,
3318 map<int, epoch_t> blocked_by,
3319 bool pg_down,
3320 IsPGRecoverablePredicate *pcontdec)
3321 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3322 pg_down(pg_down), pcontdec(pcontdec) {}
3323
3324 private:
3325 template <typename F>
3326 PriorSet(
3327 const PastIntervals &past_intervals,
3328 bool ec_pool,
3329 epoch_t last_epoch_started,
3330 IsPGRecoverablePredicate *c,
3331 F f,
3332 const vector<int> &up,
3333 const vector<int> &acting,
3334 const DoutPrefixProvider *dpp);
3335
3336 friend class PastIntervals;
3337 };
3338
7c673cae
FG
3339 template <typename... Args>
3340 PriorSet get_prior_set(Args&&... args) const {
3341 return PriorSet(*this, std::forward<Args>(args)...);
3342 }
3343};
3344WRITE_CLASS_ENCODER(PastIntervals)
3345
3346ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
3347ostream& operator<<(ostream& out, const PastIntervals &i);
3348ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
3349
3350template <typename F>
3351PastIntervals::PriorSet::PriorSet(
3352 const PastIntervals &past_intervals,
3353 bool ec_pool,
3354 epoch_t last_epoch_started,
3355 IsPGRecoverablePredicate *c,
3356 F f,
3357 const vector<int> &up,
3358 const vector<int> &acting,
3359 const DoutPrefixProvider *dpp)
3360 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3361{
3362 /*
3363 * We have to be careful to gracefully deal with situations like
3364 * so. Say we have a power outage or something that takes out both
3365 * OSDs, but the monitor doesn't mark them down in the same epoch.
3366 * The history may look like
3367 *
3368 * 1: A B
3369 * 2: B
3370 * 3: let's say B dies for good, too (say, from the power spike)
3371 * 4: A
3372 *
3373 * which makes it look like B may have applied updates to the PG
3374 * that we need in order to proceed. This sucks...
3375 *
3376 * To minimize the risk of this happening, we CANNOT go active if
3377 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3378 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3379 * Then, we have something like
3380 *
3381 * 1: A B
3382 * 2: B up_thru[B]=0
3383 * 3:
3384 * 4: A
3385 *
3386 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3387 * still 0).
3388 *
3389 * or,
3390 *
3391 * 1: A B
3392 * 2: B up_thru[B]=0
3393 * 3: B up_thru[B]=2
3394 * 4:
3395 * 5: A
3396 *
3397 * -> we must wait for B, bc it was alive through 2, and could have
3398 * written to the pg.
3399 *
3400 * If B is really dead, then an administrator will need to manually
3401 * intervene by marking the OSD as "lost."
3402 */
3403
3404 // Include current acting and up nodes... not because they may
3405 // contain old data (this interval hasn't gone active, obviously),
3406 // but because we want their pg_info to inform choose_acting(), and
3407 // so that we know what they do/do not have explicitly before
3408 // sending them any new info/logs/whatever.
3409 for (unsigned i = 0; i < acting.size(); i++) {
3410 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3411 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3412 }
3413 // It may be possible to exclude the up nodes, but let's keep them in
3414 // there for now.
3415 for (unsigned i = 0; i < up.size(); i++) {
3416 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3417 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3418 }
3419
3420 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3421 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3422 for (auto &&i: all_probe) {
3423 switch (f(0, i.osd, nullptr)) {
3424 case UP: {
3425 probe.insert(i);
3426 break;
3427 }
3428 case DNE:
3429 case LOST:
3430 case DOWN: {
3431 down.insert(i.osd);
3432 break;
3433 }
3434 }
3435 }
3436
3437 past_intervals.iterate_mayberw_back_to(
7c673cae
FG
3438 last_epoch_started,
3439 [&](epoch_t start, const set<pg_shard_t> &acting) {
3440 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3441 << ", acting: " << acting << dendl;
3442
3443 // look at candidate osds during this interval. each falls into
3444 // one of three categories: up, down (but potentially
3445 // interesting), or lost (down, but we won't wait for it).
3446 set<pg_shard_t> up_now;
3447 map<int, epoch_t> candidate_blocked_by;
3448 // any candidates down now (that might have useful data)
3449 bool any_down_now = false;
3450
3451 // consider ACTING osds
3452 for (auto &&so: acting) {
3453 epoch_t lost_at = 0;
3454 switch (f(start, so.osd, &lost_at)) {
3455 case UP: {
3456 // include past acting osds if they are up.
3457 up_now.insert(so);
3458 break;
3459 }
3460 case DNE: {
3461 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3462 << " no longer exists" << dendl;
3463 break;
3464 }
3465 case LOST: {
3466 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3467 << " is down, but lost_at " << lost_at << dendl;
3468 up_now.insert(so);
3469 break;
3470 }
3471 case DOWN: {
3472 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3473 << " is down" << dendl;
3474 candidate_blocked_by[so.osd] = lost_at;
3475 any_down_now = true;
3476 break;
3477 }
3478 }
3479 }
3480
3481 // if not enough osds survived this interval, and we may have gone rw,
3482 // then we need to wait for one of those osds to recover to
3483 // ensure that we haven't lost any information.
3484 if (!(*pcontdec)(up_now) && any_down_now) {
3485 // fixme: how do we identify a "clean" shutdown anyway?
3486 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3487 << " insufficient up; including down osds" << dendl;
11fdf7f2 3488 ceph_assert(!candidate_blocked_by.empty());
7c673cae
FG
3489 pg_down = true;
3490 blocked_by.insert(
3491 candidate_blocked_by.begin(),
3492 candidate_blocked_by.end());
3493 }
3494 });
3495
3496 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3497 << " down " << down
3498 << " blocked_by " << blocked_by
3499 << (pg_down ? " pg_down":"")
3500 << dendl;
3501}
3502
3503/**
3504 * pg_query_t - used to ask a peer for information about a pg.
3505 *
3506 * note: if version=0, type=LOG, then we just provide our full log.
3507 */
3508struct pg_query_t {
3509 enum {
3510 INFO = 0,
3511 LOG = 1,
3512 MISSING = 4,
3513 FULLLOG = 5,
3514 };
11fdf7f2 3515 std::string_view get_type_name() const {
7c673cae
FG
3516 switch (type) {
3517 case INFO: return "info";
3518 case LOG: return "log";
3519 case MISSING: return "missing";
3520 case FULLLOG: return "fulllog";
3521 default: return "???";
3522 }
3523 }
3524
3525 __s32 type;
3526 eversion_t since;
3527 pg_history_t history;
3528 epoch_t epoch_sent;
3529 shard_id_t to;
3530 shard_id_t from;
3531
3532 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3533 from(shard_id_t::NO_SHARD) {}
3534 pg_query_t(
3535 int t,
3536 shard_id_t to,
3537 shard_id_t from,
3538 const pg_history_t& h,
3539 epoch_t epoch_sent)
3540 : type(t),
3541 history(h),
3542 epoch_sent(epoch_sent),
3543 to(to), from(from) {
11fdf7f2 3544 ceph_assert(t != LOG);
7c673cae
FG
3545 }
3546 pg_query_t(
3547 int t,
3548 shard_id_t to,
3549 shard_id_t from,
3550 eversion_t s,
3551 const pg_history_t& h,
3552 epoch_t epoch_sent)
3553 : type(t), since(s), history(h),
3554 epoch_sent(epoch_sent), to(to), from(from) {
11fdf7f2 3555 ceph_assert(t == LOG);
7c673cae
FG
3556 }
3557
3558 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 3559 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3560
3561 void dump(Formatter *f) const;
3562 static void generate_test_instances(list<pg_query_t*>& o);
3563};
3564WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3565
3566inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3567 out << "query(" << q.get_type_name() << " " << q.since;
3568 if (q.type == pg_query_t::LOG)
3569 out << " " << q.history;
11fdf7f2 3570 out << " epoch_sent " << q.epoch_sent;
7c673cae
FG
3571 out << ")";
3572 return out;
3573}
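/*
 * Illustrative usage sketch (not upstream code; the shard/history/epoch
 * names below are hypothetical): the two constructors split by query type.
 *
 *   // INFO/MISSING/FULLLOG queries carry no "since" version
 *   pg_query_t info_q(pg_query_t::INFO, to_shard, from_shard,
 *                     history, cur_epoch);                    // asserts type != LOG
 *
 *   // LOG queries ask for entries since a given version
 *   pg_query_t log_q(pg_query_t::LOG, to_shard, from_shard,
 *                    eversion_t(5, 42), history, cur_epoch);  // asserts type == LOG
 */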
3574
3575class PGBackend;
3576class ObjectModDesc {
3577 bool can_local_rollback;
3578 bool rollback_info_completed;
3579
3580 // version required to decode, reflected in encode/decode version
3581 __u8 max_required_version = 1;
3582public:
3583 class Visitor {
3584 public:
3585 virtual void append(uint64_t old_offset) {}
3586 virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
3587 virtual void rmobject(version_t old_version) {}
3588 /**
3589 * Used to support the unfound_lost_delete log event: if the stashed
3590 * version exists, we unstash it, otherwise, we do nothing. This way
3591 * each replica rolls back to whatever state it had prior to the attempt
3592 * to mark the unfound object as lost and deleted.
3593 */
3594 virtual void try_rmobject(version_t old_version) {
3595 rmobject(old_version);
3596 }
3597 virtual void create() {}
3598 virtual void update_snaps(const set<snapid_t> &old_snaps) {}
3599 virtual void rollback_extents(
3600 version_t gen,
3601 const vector<pair<uint64_t, uint64_t> > &extents) {}
3602 virtual ~Visitor() {}
3603 };
3604 void visit(Visitor *visitor) const;
3605 mutable bufferlist bl;
3606 enum ModID {
3607 APPEND = 1,
3608 SETATTRS = 2,
3609 DELETE = 3,
3610 CREATE = 4,
3611 UPDATE_SNAPS = 5,
3612 TRY_DELETE = 6,
3613 ROLLBACK_EXTENTS = 7
3614 };
31f18b77
FG
3615 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3616 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3617 }
7c673cae
FG
3618 void claim(ObjectModDesc &other) {
3619 bl.clear();
3620 bl.claim(other.bl);
3621 can_local_rollback = other.can_local_rollback;
3622 rollback_info_completed = other.rollback_info_completed;
3623 }
3624 void claim_append(ObjectModDesc &other) {
3625 if (!can_local_rollback || rollback_info_completed)
3626 return;
3627 if (!other.can_local_rollback) {
3628 mark_unrollbackable();
3629 return;
3630 }
3631 bl.claim_append(other.bl);
3632 rollback_info_completed = other.rollback_info_completed;
3633 }
3634 void swap(ObjectModDesc &other) {
3635 bl.swap(other.bl);
3636
31f18b77
FG
3637 using std::swap;
3638 swap(other.can_local_rollback, can_local_rollback);
3639 swap(other.rollback_info_completed, rollback_info_completed);
3640 swap(other.max_required_version, max_required_version);
7c673cae
FG
3641 }
3642 void append_id(ModID id) {
11fdf7f2 3643 using ceph::encode;
7c673cae 3644 uint8_t _id(id);
11fdf7f2 3645 encode(_id, bl);
7c673cae
FG
3646 }
3647 void append(uint64_t old_size) {
3648 if (!can_local_rollback || rollback_info_completed)
3649 return;
3650 ENCODE_START(1, 1, bl);
3651 append_id(APPEND);
11fdf7f2 3652 encode(old_size, bl);
7c673cae
FG
3653 ENCODE_FINISH(bl);
3654 }
3655 void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
3656 if (!can_local_rollback || rollback_info_completed)
3657 return;
3658 ENCODE_START(1, 1, bl);
3659 append_id(SETATTRS);
11fdf7f2 3660 encode(old_attrs, bl);
7c673cae
FG
3661 ENCODE_FINISH(bl);
3662 }
3663 bool rmobject(version_t deletion_version) {
3664 if (!can_local_rollback || rollback_info_completed)
3665 return false;
3666 ENCODE_START(1, 1, bl);
3667 append_id(DELETE);
11fdf7f2 3668 encode(deletion_version, bl);
7c673cae
FG
3669 ENCODE_FINISH(bl);
3670 rollback_info_completed = true;
3671 return true;
3672 }
3673 bool try_rmobject(version_t deletion_version) {
3674 if (!can_local_rollback || rollback_info_completed)
3675 return false;
3676 ENCODE_START(1, 1, bl);
3677 append_id(TRY_DELETE);
11fdf7f2 3678 encode(deletion_version, bl);
7c673cae
FG
3679 ENCODE_FINISH(bl);
3680 rollback_info_completed = true;
3681 return true;
3682 }
3683 void create() {
3684 if (!can_local_rollback || rollback_info_completed)
3685 return;
3686 rollback_info_completed = true;
3687 ENCODE_START(1, 1, bl);
3688 append_id(CREATE);
3689 ENCODE_FINISH(bl);
3690 }
3691 void update_snaps(const set<snapid_t> &old_snaps) {
3692 if (!can_local_rollback || rollback_info_completed)
3693 return;
3694 ENCODE_START(1, 1, bl);
3695 append_id(UPDATE_SNAPS);
11fdf7f2 3696 encode(old_snaps, bl);
7c673cae
FG
3697 ENCODE_FINISH(bl);
3698 }
3699 void rollback_extents(
3700 version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
11fdf7f2
TL
3701 ceph_assert(can_local_rollback);
3702 ceph_assert(!rollback_info_completed);
7c673cae
FG
3703 if (max_required_version < 2)
3704 max_required_version = 2;
3705 ENCODE_START(2, 2, bl);
3706 append_id(ROLLBACK_EXTENTS);
11fdf7f2
TL
3707 encode(gen, bl);
3708 encode(extents, bl);
7c673cae
FG
3709 ENCODE_FINISH(bl);
3710 }
3711
3712 // cannot be rolled back
3713 void mark_unrollbackable() {
3714 can_local_rollback = false;
3715 bl.clear();
3716 }
3717 bool can_rollback() const {
3718 return can_local_rollback;
3719 }
3720 bool empty() const {
3721 return can_local_rollback && (bl.length() == 0);
3722 }
3723
3724 bool requires_kraken() const {
3725 return max_required_version >= 2;
3726 }
3727
3728 /**
3729 * Create fresh copy of bl bytes to avoid keeping large buffers around
3730 * in the case that bl contains ptrs which point into a much larger
3731 * message buffer
3732 */
31f18b77 3733 void trim_bl() const {
7c673cae
FG
3734 if (bl.length() > 0)
3735 bl.rebuild();
3736 }
3737 void encode(bufferlist &bl) const;
11fdf7f2 3738 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3739 void dump(Formatter *f) const;
3740 static void generate_test_instances(list<ObjectModDesc*>& o);
3741};
3742WRITE_CLASS_ENCODER(ObjectModDesc)
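/*
 * Illustrative usage sketch (assumption, not upstream code): accumulating
 * rollback state for a write; old_size and old_attrs are hypothetical values
 * captured before the op is applied.
 *
 *   ObjectModDesc desc;
 *   desc.append(old_size);        // remember the pre-append object size
 *   desc.setattrs(old_attrs);     // remember prior xattr values (boost::optional per key)
 *   // rmobject()/try_rmobject()/create() mark the rollback info complete,
 *   // after which further append()/setattrs() calls are ignored; if the
 *   // change cannot be described locally, mark_unrollbackable() clears the
 *   // buffer and can_rollback() returns false.
 */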
3743
3744
3745/**
3746 * pg_log_entry_t - single entry/event in pg log
3747 *
3748 */
3749struct pg_log_entry_t {
3750 enum {
3751 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3752 CLONE = 2, // cloned object from head
3753 DELETE = 3, // deleted object
11fdf7f2 3754 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
7c673cae
FG
3755 LOST_REVERT = 5, // lost new version, revert to an older version.
3756 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3757 LOST_MARK = 7, // lost new version, now EIO
3758 PROMOTE = 8, // promoted object from another tier
3759 CLEAN = 9, // mark an object clean
3760 ERROR = 10, // write that returned an error
3761 };
3762 static const char *get_op_name(int op) {
3763 switch (op) {
3764 case MODIFY:
3765 return "modify";
3766 case PROMOTE:
3767 return "promote";
3768 case CLONE:
3769 return "clone";
3770 case DELETE:
3771 return "delete";
7c673cae
FG
3772 case LOST_REVERT:
3773 return "l_revert";
3774 case LOST_DELETE:
3775 return "l_delete";
3776 case LOST_MARK:
3777 return "l_mark";
3778 case CLEAN:
3779 return "clean";
3780 case ERROR:
3781 return "error";
3782 default:
3783 return "unknown";
3784 }
3785 }
3786 const char *get_op_name() const {
3787 return get_op_name(op);
3788 }
3789
3790 // describes state for a locally-rollbackable entry
3791 ObjectModDesc mod_desc;
3792 bufferlist snaps; // only for clone entries
3793 hobject_t soid;
3794 osd_reqid_t reqid; // caller+tid to uniquely identify request
31f18b77 3795 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
11fdf7f2
TL
3796
3797 /// map extra_reqids by index to error return code (if any)
3798 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
3799
7c673cae
FG
3800 eversion_t version, prior_version, reverting_to;
3801 version_t user_version; // the user version for this entry
3802 utime_t mtime; // this is the _user_ mtime, mind you
3803 int32_t return_code; // only stored for ERRORs for dup detection
3804
3805 __s32 op;
3806 bool invalid_hash; // only when decoding sobject_t based entries
3807 bool invalid_pool; // only when decoding pool-less hobject based entries
3808
3809 pg_log_entry_t()
3810 : user_version(0), return_code(0), op(0),
31f18b77
FG
3811 invalid_hash(false), invalid_pool(false) {
3812 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3813 }
7c673cae
FG
3814 pg_log_entry_t(int _op, const hobject_t& _soid,
3815 const eversion_t& v, const eversion_t& pv,
3816 version_t uv,
3817 const osd_reqid_t& rid, const utime_t& mt,
3818 int return_code)
3819 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3820 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
3821 invalid_hash(false), invalid_pool(false) {
3822 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3823 }
7c673cae
FG
3824
3825 bool is_clone() const { return op == CLONE; }
3826 bool is_modify() const { return op == MODIFY; }
3827 bool is_promote() const { return op == PROMOTE; }
3828 bool is_clean() const { return op == CLEAN; }
7c673cae
FG
3829 bool is_lost_revert() const { return op == LOST_REVERT; }
3830 bool is_lost_delete() const { return op == LOST_DELETE; }
3831 bool is_lost_mark() const { return op == LOST_MARK; }
3832 bool is_error() const { return op == ERROR; }
3833
3834 bool is_update() const {
3835 return
3836 is_clone() || is_modify() || is_promote() || is_clean() ||
11fdf7f2 3837 is_lost_revert() || is_lost_mark();
7c673cae
FG
3838 }
3839 bool is_delete() const {
3840 return op == DELETE || op == LOST_DELETE;
3841 }
3842
3843 bool can_rollback() const {
3844 return mod_desc.can_rollback();
3845 }
3846
3847 void mark_unrollbackable() {
3848 mod_desc.mark_unrollbackable();
3849 }
3850
3851 bool requires_kraken() const {
3852 return mod_desc.requires_kraken();
3853 }
3854
3855 // Errors are only used for dup detection, whereas
3856 // the index by objects is used by recovery, copy_get,
3857 // and other facilities that don't expect or need to
3858 // be aware of error entries.
3859 bool object_is_indexed() const {
3860 return !is_error();
3861 }
3862
3863 bool reqid_is_indexed() const {
3864 return reqid != osd_reqid_t() &&
3865 (op == MODIFY || op == DELETE || op == ERROR);
3866 }
3867
3868 string get_key_name() const;
3869 void encode_with_checksum(bufferlist& bl) const;
11fdf7f2 3870 void decode_with_checksum(bufferlist::const_iterator& p);
7c673cae
FG
3871
3872 void encode(bufferlist &bl) const;
11fdf7f2 3873 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3874 void dump(Formatter *f) const;
3875 static void generate_test_instances(list<pg_log_entry_t*>& o);
3876
3877};
3878WRITE_CLASS_ENCODER(pg_log_entry_t)
3879
3880ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3881
c07f9fc5
FG
3882struct pg_log_dup_t {
3883 osd_reqid_t reqid; // caller+tid to uniquely identify request
3884 eversion_t version;
3885 version_t user_version; // the user version for this entry
3886 int32_t return_code; // only stored for ERRORs for dup detection
7c673cae 3887
c07f9fc5
FG
3888 pg_log_dup_t()
3889 : user_version(0), return_code(0)
3890 {}
3891 explicit pg_log_dup_t(const pg_log_entry_t& entry)
3892 : reqid(entry.reqid), version(entry.version),
3893 user_version(entry.user_version), return_code(entry.return_code)
3894 {}
3895 pg_log_dup_t(const eversion_t& v, version_t uv,
3896 const osd_reqid_t& rid, int return_code)
3897 : reqid(rid), version(v), user_version(uv),
3898 return_code(return_code)
3899 {}
3900
3901 string get_key_name() const;
3902 void encode(bufferlist &bl) const;
11fdf7f2 3903 void decode(bufferlist::const_iterator &bl);
c07f9fc5
FG
3904 void dump(Formatter *f) const;
3905 static void generate_test_instances(list<pg_log_dup_t*>& o);
3906
181888fb
FG
3907 bool operator==(const pg_log_dup_t &rhs) const {
3908 return reqid == rhs.reqid &&
3909 version == rhs.version &&
3910 user_version == rhs.user_version &&
3911 return_code == rhs.return_code;
3912 }
3913 bool operator!=(const pg_log_dup_t &rhs) const {
3914 return !(*this == rhs);
3915 }
3916
c07f9fc5
FG
3917 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3918};
3919WRITE_CLASS_ENCODER(pg_log_dup_t)
3920
3921std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
3922
3923/**
3924 * pg_log_t - incremental log of recent pg changes.
3925 *
3926 * serves as a recovery queue for recent changes.
3927 */
3928struct pg_log_t {
3929 /*
3930 * head - newest entry (update|delete)
3931 * tail - entry previous to oldest (update|delete) for which we have
3932 * complete negative information.
3933 * i.e. we can infer pg contents for any store whose last_update >= tail.
3934 */
3935 eversion_t head; // newest entry
3936 eversion_t tail; // version prior to oldest
3937
3938protected:
3939 // We can rollback rollback-able entries > can_rollback_to
3940 eversion_t can_rollback_to;
3941
3942 // always <= can_rollback_to, indicates how far stashed rollback
3943 // data can be found
3944 eversion_t rollback_info_trimmed_to;
3945
3946public:
c07f9fc5
FG
3947 // the actual log
3948 mempool::osd_pglog::list<pg_log_entry_t> log;
3949
3950 // entries just for dup op detection ordered oldest to newest
3951 mempool::osd_pglog::list<pg_log_dup_t> dups;
3952
7c673cae
FG
3953 pg_log_t() = default;
3954 pg_log_t(const eversion_t &last_update,
3955 const eversion_t &log_tail,
3956 const eversion_t &can_rollback_to,
3957 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3958 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3959 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
3960 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3961 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 3962 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
3963 pg_log_t(const eversion_t &last_update,
3964 const eversion_t &log_tail,
3965 const eversion_t &can_rollback_to,
3966 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3967 const std::list<pg_log_entry_t> &entries,
3968 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
3969 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3970 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3971 for (auto &&entry: entries) {
3972 log.push_back(entry);
3973 }
c07f9fc5
FG
3974 for (auto &&entry: dup_entries) {
3975 dups.push_back(entry);
3976 }
7c673cae
FG
3977 }
3978
3979 void clear() {
3980 eversion_t z;
3981 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3982 log.clear();
c07f9fc5 3983 dups.clear();
7c673cae
FG
3984 }
3985
3986 eversion_t get_rollback_info_trimmed_to() const {
3987 return rollback_info_trimmed_to;
3988 }
3989 eversion_t get_can_rollback_to() const {
3990 return can_rollback_to;
3991 }
3992
3993
3994 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 3995 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
3996 oldlog.swap(log);
3997
3998 eversion_t old_tail;
3999 unsigned mask = ~((~0)<<split_bits);
4000 for (auto i = oldlog.begin();
4001 i != oldlog.end();
4002 ) {
4003 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4004 childlog.push_back(*i);
4005 } else {
4006 log.push_back(*i);
4007 }
4008 oldlog.erase(i++);
4009 }
4010
c07f9fc5
FG
4011 // osd_reqid is unique, so it doesn't matter if there are extra
4012 // dup entries in each pg. To avoid storing oid with the dup
4013 // entries, just copy the whole list.
4014 auto childdups(dups);
4015
7c673cae
FG
4016 return pg_log_t(
4017 head,
4018 tail,
4019 can_rollback_to,
4020 rollback_info_trimmed_to,
c07f9fc5
FG
4021 std::move(childlog),
4022 std::move(childdups));
4023 }
7c673cae 4024
31f18b77 4025 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
11fdf7f2 4026 ceph_assert(newhead >= tail);
7c673cae 4027
31f18b77
FG
4028 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4029 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
4030 while (true) {
4031 if (p == log.begin()) {
4032 // yikes, the whole thing is divergent!
31f18b77
FG
4033 using std::swap;
4034 swap(divergent, log);
7c673cae
FG
4035 break;
4036 }
4037 --p;
4038 if (p->version.version <= newhead.version) {
4039 /*
4040 * look at eversion.version here. we want to avoid a situation like:
4041 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4042 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4043 * lower_bound = 100'9
4044 * i.e., same request, different version. If the eversion.version is > the
4045 * lower_bound, it is divergent.
4046 */
4047 ++p;
4048 divergent.splice(divergent.begin(), log, p, log.end());
4049 break;
4050 }
11fdf7f2 4051 ceph_assert(p->version > newhead);
7c673cae
FG
4052 }
4053 head = newhead;
4054
4055 if (can_rollback_to > newhead)
4056 can_rollback_to = newhead;
4057
4058 if (rollback_info_trimmed_to > newhead)
4059 rollback_info_trimmed_to = newhead;
4060
4061 return divergent;
4062 }
4063
11fdf7f2
TL
4064 void merge_from(const vector<pg_log_t*>& slogs, eversion_t last_update) {
4065 log.clear();
4066
4067 // sort and merge dups
4068 multimap<eversion_t,pg_log_dup_t> sorted;
4069 for (auto& d : dups) {
4070 sorted.emplace(d.version, d);
4071 }
4072 for (auto l : slogs) {
4073 for (auto& d : l->dups) {
4074 sorted.emplace(d.version, d);
4075 }
4076 }
4077 dups.clear();
4078 for (auto& i : sorted) {
4079 dups.push_back(i.second);
4080 }
4081
4082 head = last_update;
4083 tail = last_update;
4084 can_rollback_to = last_update;
4085 rollback_info_trimmed_to = last_update;
4086 }
4087
7c673cae
FG
4088 bool empty() const {
4089 return log.empty();
4090 }
4091
4092 bool null() const {
4093 return head.version == 0 && head.epoch == 0;
4094 }
4095
4096 size_t approx_size() const {
4097 return head.version - tail.version;
4098 }
4099
4100 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4101 const string &hit_set_namespace, const pg_log_t &in,
4102 pg_log_t &out, pg_log_t &reject);
4103
4104 /**
4105 * copy entries from the tail of another pg_log_t
4106 *
4107 * @param other pg_log_t to copy from
4108 * @param from copy entries after this version
4109 */
81eedcae 4110 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
7c673cae
FG
4111
4112 /**
4113 * copy up to N entries
4114 *
4115 * @param other source log
4116 * @param max max number of entries to copy
4117 */
81eedcae 4118 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
7c673cae
FG
4119
4120 ostream& print(ostream& out) const;
4121
4122 void encode(bufferlist &bl) const;
11fdf7f2 4123 void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
7c673cae
FG
4124 void dump(Formatter *f) const;
4125 static void generate_test_instances(list<pg_log_t*>& o);
4126};
4127WRITE_CLASS_ENCODER(pg_log_t)
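/*
 * Illustrative sketch (assumption; the versions are made up): the (tail, head]
 * interval and rewind_from_head().
 *
 *   pg_log_t log;   // suppose tail == 10'5 and head == 12'9
 *   // Rewinding to 11'7 sets head = 11'7, clamps can_rollback_to and
 *   // rollback_info_trimmed_to to the new head, and returns every entry
 *   // whose version is newer than 11'7 as the divergent list.
 *   auto divergent = log.rewind_from_head(eversion_t(11, 7));
 */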
4128
c07f9fc5 4129inline ostream& operator<<(ostream& out, const pg_log_t& log)
7c673cae
FG
4130{
4131 out << "log((" << log.tail << "," << log.head << "], crt="
4132 << log.get_can_rollback_to() << ")";
4133 return out;
4134}
4135
4136
4137/**
4138 * pg_missing_t - summary of missing objects.
4139 *
4140 * kept in memory, as a supplement to pg_log_t
4141 * also used to pass missing info in messages.
4142 */
4143struct pg_missing_item {
4144 eversion_t need, have;
c07f9fc5
FG
4145 enum missing_flags_t {
4146 FLAG_NONE = 0,
4147 FLAG_DELETE = 1,
4148 } flags;
4149 pg_missing_item() : flags(FLAG_NONE) {}
4150 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
4151 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
4152 set_delete(is_delete);
4153 }
4154
4155 void encode(bufferlist& bl, uint64_t features) const {
11fdf7f2 4156 using ceph::encode;
c07f9fc5
FG
4157 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
4158 // encoding a zeroed eversion_t to differentiate between this and
4159 // legacy unversioned encoding - a need value of 0'0 is not
4160 // possible. This can be replaced with the legacy encoding
4161 // macros post-luminous.
4162 eversion_t e;
11fdf7f2
TL
4163 encode(e, bl);
4164 encode(need, bl);
4165 encode(have, bl);
4166 encode(static_cast<uint8_t>(flags), bl);
c07f9fc5
FG
4167 } else {
4168 // legacy unversioned encoding
11fdf7f2
TL
4169 encode(need, bl);
4170 encode(have, bl);
c07f9fc5 4171 }
7c673cae 4172 }
11fdf7f2
TL
4173 void decode(bufferlist::const_iterator& bl) {
4174 using ceph::decode;
c07f9fc5 4175 eversion_t e;
11fdf7f2 4176 decode(e, bl);
c07f9fc5
FG
4177 if (e != eversion_t()) {
4178 // legacy encoding, this is the need value
4179 need = e;
11fdf7f2 4180 decode(have, bl);
c07f9fc5 4181 } else {
11fdf7f2
TL
4182 decode(need, bl);
4183 decode(have, bl);
c07f9fc5 4184 uint8_t f;
11fdf7f2 4185 decode(f, bl);
c07f9fc5
FG
4186 flags = static_cast<missing_flags_t>(f);
4187 }
4188 }
4189
4190 void set_delete(bool is_delete) {
4191 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4192 }
4193
4194 bool is_delete() const {
4195 return (flags & FLAG_DELETE) == FLAG_DELETE;
4196 }
4197
4198 string flag_str() const {
4199 if (flags == FLAG_NONE) {
4200 return "none";
4201 } else {
4202 return "delete";
4203 }
7c673cae 4204 }
c07f9fc5 4205
7c673cae
FG
4206 void dump(Formatter *f) const {
4207 f->dump_stream("need") << need;
4208 f->dump_stream("have") << have;
c07f9fc5 4209 f->dump_stream("flags") << flag_str();
7c673cae
FG
4210 }
4211 static void generate_test_instances(list<pg_missing_item*>& o) {
4212 o.push_back(new pg_missing_item);
4213 o.push_back(new pg_missing_item);
4214 o.back()->need = eversion_t(1, 2);
4215 o.back()->have = eversion_t(1, 1);
c07f9fc5
FG
4216 o.push_back(new pg_missing_item);
4217 o.back()->need = eversion_t(3, 5);
4218 o.back()->have = eversion_t(3, 4);
4219 o.back()->flags = FLAG_DELETE;
7c673cae
FG
4220 }
4221 bool operator==(const pg_missing_item &rhs) const {
c07f9fc5 4222 return need == rhs.need && have == rhs.have && flags == rhs.flags;
7c673cae
FG
4223 }
4224 bool operator!=(const pg_missing_item &rhs) const {
4225 return !(*this == rhs);
4226 }
4227};
c07f9fc5 4228WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
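/*
 * Illustrative round-trip sketch (assumption): with OSD_RECOVERY_DELETES the
 * encoder writes a zeroed eversion_t sentinel followed by need/have/flags;
 * without it only need/have are written, which the decoder recognizes as the
 * legacy format via the "e != eversion_t()" branch.
 *
 *   bufferlist bl;
 *   pg_missing_item in(eversion_t(3, 5), eversion_t(3, 4), true);
 *   in.encode(bl, CEPH_FEATURE_OSD_RECOVERY_DELETES);
 *   pg_missing_item out;
 *   auto p = std::cbegin(bl);
 *   out.decode(p);                // out.is_delete() == true, need/have preserved
 */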
7c673cae
FG
4229ostream& operator<<(ostream& out, const pg_missing_item &item);
4230
4231class pg_missing_const_i {
4232public:
4233 virtual const map<hobject_t, pg_missing_item> &
4234 get_items() const = 0;
4235 virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
c07f9fc5 4236 virtual bool get_may_include_deletes() const = 0;
7c673cae
FG
4237 virtual unsigned int num_missing() const = 0;
4238 virtual bool have_missing() const = 0;
4239 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4240 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
7c673cae
FG
4241 virtual ~pg_missing_const_i() {}
4242};
4243
4244
4245template <bool Track>
4246class ChangeTracker {
4247public:
4248 void changed(const hobject_t &obj) {}
4249 template <typename F>
4250 void get_changed(F &&f) const {}
4251 void flush() {}
4252 bool is_clean() const {
4253 return true;
4254 }
4255};
4256template <>
4257class ChangeTracker<true> {
4258 set<hobject_t> _changed;
4259public:
4260 void changed(const hobject_t &obj) {
4261 _changed.insert(obj);
4262 }
4263 template <typename F>
4264 void get_changed(F &&f) const {
4265 for (auto const &i: _changed) {
4266 f(i);
4267 }
4268 }
4269 void flush() {
4270 _changed.clear();
4271 }
4272 bool is_clean() const {
4273 return _changed.empty();
4274 }
4275};
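/*
 * Illustrative sketch (assumption): ChangeTracker<true> records each object
 * touched until flush(), so pg_missing_set<true> can enumerate its deltas;
 * the <false> specialization compiles all of this away.
 *
 *   ChangeTracker<true> tracker;
 *   tracker.changed(oid);                          // oid is hypothetical
 *   tracker.get_changed([](const hobject_t &o) {
 *     // persist or verify the delta for o
 *   });
 *   tracker.flush();                               // is_clean() == true again
 */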
4276
4277template <bool TrackChanges>
4278class pg_missing_set : public pg_missing_const_i {
4279 using item = pg_missing_item;
4280 map<hobject_t, item> missing; // oid -> (need v, have v)
4281 map<version_t, hobject_t> rmissing; // v -> oid
4282 ChangeTracker<TrackChanges> tracker;
4283
4284public:
4285 pg_missing_set() = default;
4286
4287 template <typename missing_type>
4288 pg_missing_set(const missing_type &m) {
7c673cae
FG
4289 missing = m.get_items();
4290 rmissing = m.get_rmissing();
c07f9fc5 4291 may_include_deletes = m.get_may_include_deletes();
7c673cae
FG
4292 for (auto &&i: missing)
4293 tracker.changed(i.first);
4294 }
4295
c07f9fc5
FG
4296 bool may_include_deletes = false;
4297
7c673cae
FG
4298 const map<hobject_t, item> &get_items() const override {
4299 return missing;
4300 }
4301 const map<version_t, hobject_t> &get_rmissing() const override {
4302 return rmissing;
4303 }
c07f9fc5
FG
4304 bool get_may_include_deletes() const override {
4305 return may_include_deletes;
4306 }
7c673cae
FG
4307 unsigned int num_missing() const override {
4308 return missing.size();
4309 }
4310 bool have_missing() const override {
4311 return !missing.empty();
4312 }
4313 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4314 auto iter = missing.find(oid);
4315 if (iter == missing.end())
4316 return false;
4317 if (out)
4318 *out = iter->second;
4319 return true;
4320 }
4321 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4322 map<hobject_t, item>::const_iterator m =
4323 missing.find(oid);
4324 if (m == missing.end())
4325 return false;
4326 const item &item(m->second);
4327 if (item.need > v)
4328 return false;
4329 return true;
4330 }
11fdf7f2
TL
4331 eversion_t get_oldest_need() const {
4332 if (missing.empty()) {
7c673cae 4333 return eversion_t();
11fdf7f2
TL
4334 }
4335 auto it = missing.find(rmissing.begin()->second);
4336 ceph_assert(it != missing.end());
4337 return it->second.need;
7c673cae
FG
4338 }
4339
4340 void claim(pg_missing_set& o) {
4341 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4342 missing.swap(o.missing);
4343 rmissing.swap(o.rmissing);
4344 }
4345
4346 /*
4347 * this needs to be called in log order as we extend the log. it
4348 * assumes missing is accurate up through the previous log entry.
4349 */
4350 void add_next_event(const pg_log_entry_t& e) {
c07f9fc5
FG
4351 map<hobject_t, item>::iterator missing_it;
4352 missing_it = missing.find(e.soid);
4353 bool is_missing_divergent_item = missing_it != missing.end();
4354 if (e.prior_version == eversion_t() || e.is_clone()) {
4355 // new object.
4356 if (is_missing_divergent_item) { // use iterator
7c673cae 4357 rmissing.erase((missing_it->second).need.version);
c07f9fc5
FG
4358 missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4359 } else // create new element in missing map
4360 missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4361 } else if (is_missing_divergent_item) {
4362 // already missing (prior).
4363 rmissing.erase((missing_it->second).need.version);
4364 (missing_it->second).need = e.version; // leave .have unchanged.
4365 missing_it->second.set_delete(e.is_delete());
c07f9fc5
FG
4366 } else {
4367 // not missing, we must have prior_version (if any)
11fdf7f2 4368 ceph_assert(!is_missing_divergent_item);
c07f9fc5 4369 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
7c673cae 4370 }
c07f9fc5 4371 rmissing[e.version.version] = e.soid;
7c673cae
FG
4372 tracker.changed(e.soid);
4373 }
4374
c07f9fc5 4375 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
7c673cae
FG
4376 if (missing.count(oid)) {
4377 rmissing.erase(missing[oid].need.version);
4378 missing[oid].need = need; // do not adjust .have
c07f9fc5 4379 missing[oid].set_delete(is_delete);
7c673cae 4380 } else {
c07f9fc5 4381 missing[oid] = item(need, eversion_t(), is_delete);
7c673cae
FG
4382 }
4383 rmissing[need.version] = oid;
4384
4385 tracker.changed(oid);
4386 }
4387
4388 void revise_have(hobject_t oid, eversion_t have) {
4389 if (missing.count(oid)) {
4390 tracker.changed(oid);
4391 missing[oid].have = have;
4392 }
4393 }
4394
c07f9fc5
FG
4395 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4396 bool is_delete) {
4397 missing[oid] = item(need, have, is_delete);
7c673cae
FG
4398 rmissing[need.version] = oid;
4399 tracker.changed(oid);
4400 }
4401
4402 void rm(const hobject_t& oid, eversion_t v) {
4403 std::map<hobject_t, item>::iterator p = missing.find(oid);
4404 if (p != missing.end() && p->second.need <= v)
4405 rm(p);
4406 }
4407
4408 void rm(std::map<hobject_t, item>::const_iterator m) {
4409 tracker.changed(m->first);
4410 rmissing.erase(m->second.need.version);
4411 missing.erase(m);
4412 }
4413
4414 void got(const hobject_t& oid, eversion_t v) {
4415 std::map<hobject_t, item>::iterator p = missing.find(oid);
11fdf7f2
TL
4416 ceph_assert(p != missing.end());
4417 ceph_assert(p->second.need <= v || p->second.is_delete());
7c673cae
FG
4418 got(p);
4419 }
4420
4421 void got(std::map<hobject_t, item>::const_iterator m) {
4422 tracker.changed(m->first);
4423 rmissing.erase(m->second.need.version);
4424 missing.erase(m);
4425 }
4426
4427 void split_into(
4428 pg_t child_pgid,
4429 unsigned split_bits,
4430 pg_missing_set *omissing) {
c07f9fc5 4431 omissing->may_include_deletes = may_include_deletes;
7c673cae
FG
4432 unsigned mask = ~((~0)<<split_bits);
4433 for (map<hobject_t, item>::iterator i = missing.begin();
4434 i != missing.end();
4435 ) {
4436 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
c07f9fc5
FG
4437 omissing->add(i->first, i->second.need, i->second.have,
4438 i->second.is_delete());
7c673cae
FG
4439 rm(i++);
4440 } else {
4441 ++i;
4442 }
4443 }
4444 }
4445
4446 void clear() {
4447 for (auto const &i: missing)
4448 tracker.changed(i.first);
4449 missing.clear();
4450 rmissing.clear();
4451 }
4452
4453 void encode(bufferlist &bl) const {
c07f9fc5 4454 ENCODE_START(4, 2, bl);
11fdf7f2
TL
4455 encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
4456 encode(may_include_deletes, bl);
7c673cae
FG
4457 ENCODE_FINISH(bl);
4458 }
11fdf7f2 4459 void decode(bufferlist::const_iterator &bl, int64_t pool = -1) {
7c673cae
FG
4460 for (auto const &i: missing)
4461 tracker.changed(i.first);
c07f9fc5 4462 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
11fdf7f2 4463 decode(missing, bl);
c07f9fc5 4464 if (struct_v >= 4) {
11fdf7f2 4465 decode(may_include_deletes, bl);
c07f9fc5 4466 }
7c673cae
FG
4467 DECODE_FINISH(bl);
4468
4469 if (struct_v < 3) {
4470 // Handle hobject_t upgrade
4471 map<hobject_t, item> tmp;
4472 for (map<hobject_t, item>::iterator i =
4473 missing.begin();
4474 i != missing.end();
4475 ) {
4476 if (!i->first.is_max() && i->first.pool == -1) {
4477 hobject_t to_insert(i->first);
4478 to_insert.pool = pool;
4479 tmp[to_insert] = i->second;
4480 missing.erase(i++);
4481 } else {
4482 ++i;
4483 }
4484 }
4485 missing.insert(tmp.begin(), tmp.end());
4486 }
4487
4488 for (map<hobject_t,item>::iterator it =
4489 missing.begin();
4490 it != missing.end();
4491 ++it)
4492 rmissing[it->second.need.version] = it->first;
4493 for (auto const &i: missing)
4494 tracker.changed(i.first);
4495 }
4496 void dump(Formatter *f) const {
4497 f->open_array_section("missing");
4498 for (map<hobject_t,item>::const_iterator p =
4499 missing.begin(); p != missing.end(); ++p) {
4500 f->open_object_section("item");
4501 f->dump_stream("object") << p->first;
4502 p->second.dump(f);
4503 f->close_section();
4504 }
4505 f->close_section();
c07f9fc5 4506 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4507 }
4508 template <typename F>
4509 void filter_objects(F &&f) {
4510 for (auto i = missing.begin(); i != missing.end();) {
4511 if (f(i->first)) {
4512 rm(i++);
4513 } else {
4514 ++i;
4515 }
4516 }
4517 }
4518 static void generate_test_instances(list<pg_missing_set*>& o) {
4519 o.push_back(new pg_missing_set);
4520 o.push_back(new pg_missing_set);
4521 o.back()->add(
4522 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
c07f9fc5
FG
4523 eversion_t(5, 6), eversion_t(5, 1), false);
4524 o.push_back(new pg_missing_set);
4525 o.back()->add(
4526 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4527 eversion_t(5, 6), eversion_t(5, 1), true);
4528 o.back()->may_include_deletes = true;
7c673cae
FG
4529 }
4530 template <typename F>
4531 void get_changed(F &&f) const {
4532 tracker.get_changed(f);
4533 }
4534 void flush() {
4535 tracker.flush();
4536 }
4537 bool is_clean() const {
4538 return tracker.is_clean();
4539 }
4540 template <typename missing_t>
4541 bool debug_verify_from_init(
4542 const missing_t &init_missing,
4543 ostream *oss) const {
4544 if (!TrackChanges)
4545 return true;
4546 auto check_missing(init_missing.get_items());
4547 tracker.get_changed([&](const hobject_t &hoid) {
4548 check_missing.erase(hoid);
4549 if (missing.count(hoid)) {
4550 check_missing.insert(*(missing.find(hoid)));
4551 }
4552 });
4553 bool ok = true;
4554 if (check_missing.size() != missing.size()) {
4555 if (oss) {
4556 *oss << "Size mismatch, check: " << check_missing.size()
4557 << ", actual: " << missing.size() << "\n";
4558 }
4559 ok = false;
4560 }
4561 for (auto &i: missing) {
4562 if (!check_missing.count(i.first)) {
4563 if (oss)
4564 *oss << "check_missing missing " << i.first << "\n";
4565 ok = false;
4566 } else if (check_missing[i.first] != i.second) {
4567 if (oss)
4568 *oss << "check_missing missing item mismatch on " << i.first
4569 << ", check: " << check_missing[i.first]
4570 << ", actual: " << i.second << "\n";
4571 ok = false;
4572 }
4573 }
4574 if (oss && !ok) {
4575 *oss << "check_missing: " << check_missing << "\n";
4576 set<hobject_t> changed;
4577 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4578 *oss << "changed: " << changed << "\n";
4579 }
4580 return ok;
4581 }
4582};
4583template <bool TrackChanges>
4584void encode(
4585 const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
4586 ENCODE_DUMP_PRE();
4587 c.encode(bl);
4588 ENCODE_DUMP_POST(cl);
4589}
4590template <bool TrackChanges>
11fdf7f2 4591void decode(pg_missing_set<TrackChanges> &c, bufferlist::const_iterator &p) {
7c673cae
FG
4592 c.decode(p);
4593}
4594template <bool TrackChanges>
4595ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
4596{
c07f9fc5
FG
4597 out << "missing(" << missing.num_missing()
4598 << " may_include_deletes = " << missing.may_include_deletes;
7c673cae
FG
4599 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4600 out << ")";
4601 return out;
4602}
4603
4604using pg_missing_t = pg_missing_set<false>;
4605using pg_missing_tracker_t = pg_missing_set<true>;
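/*
 * Illustrative sketch (assumption; "entry" is a hypothetical pg_log_entry_t):
 * typical use of the tracked variant while applying the log and recovering.
 *
 *   pg_missing_tracker_t missing;
 *   missing.add_next_event(entry);                 // must be called in log order
 *   if (missing.is_missing(entry.soid))
 *     missing.got(entry.soid, entry.version);      // object (or delete) recovered
 *   missing.get_changed([](const hobject_t &o) {
 *     // persist the per-object delta for o
 *   });
 *   missing.flush();
 */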
4606
4607
4608/**
4609 * pg list objects response format
4610 *
4611 */
4612struct pg_nls_response_t {
4613 collection_list_handle_t handle;
4614 list<librados::ListObjectImpl> entries;
4615
4616 void encode(bufferlist& bl) const {
4617 ENCODE_START(1, 1, bl);
11fdf7f2 4618 encode(handle, bl);
7c673cae 4619 __u32 n = (__u32)entries.size();
11fdf7f2 4620 encode(n, bl);
7c673cae 4621 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
11fdf7f2
TL
4622 encode(i->nspace, bl);
4623 encode(i->oid, bl);
4624 encode(i->locator, bl);
7c673cae
FG
4625 }
4626 ENCODE_FINISH(bl);
4627 }
11fdf7f2 4628 void decode(bufferlist::const_iterator& bl) {
7c673cae 4629 DECODE_START(1, bl);
11fdf7f2 4630 decode(handle, bl);
7c673cae 4631 __u32 n;
11fdf7f2 4632 decode(n, bl);
7c673cae
FG
4633 entries.clear();
4634 while (n--) {
4635 librados::ListObjectImpl i;
11fdf7f2
TL
4636 decode(i.nspace, bl);
4637 decode(i.oid, bl);
4638 decode(i.locator, bl);
7c673cae
FG
4639 entries.push_back(i);
4640 }
4641 DECODE_FINISH(bl);
4642 }
4643 void dump(Formatter *f) const {
4644 f->dump_stream("handle") << handle;
4645 f->open_array_section("entries");
4646 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4647 f->open_object_section("object");
4648 f->dump_string("namespace", p->nspace);
4649 f->dump_string("object", p->oid);
4650 f->dump_string("key", p->locator);
4651 f->close_section();
4652 }
4653 f->close_section();
4654 }
4655 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4656 o.push_back(new pg_nls_response_t);
4657 o.push_back(new pg_nls_response_t);
4658 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4659 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4660 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4661 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4662 o.push_back(new pg_nls_response_t);
4663 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4664 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4665 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4666 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4667 o.push_back(new pg_nls_response_t);
4668 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4669 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4670 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4671 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4672 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4673 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4674 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4675 }
4676};
4677
4678WRITE_CLASS_ENCODER(pg_nls_response_t)
4679
4680// For backwards compatibility with older OSD requests
4681struct pg_ls_response_t {
4682 collection_list_handle_t handle;
4683 list<pair<object_t, string> > entries;
4684
4685 void encode(bufferlist& bl) const {
11fdf7f2 4686 using ceph::encode;
7c673cae 4687 __u8 v = 1;
11fdf7f2
TL
4688 encode(v, bl);
4689 encode(handle, bl);
4690 encode(entries, bl);
7c673cae 4691 }
11fdf7f2
TL
4692 void decode(bufferlist::const_iterator& bl) {
4693 using ceph::decode;
7c673cae 4694 __u8 v;
11fdf7f2
TL
4695 decode(v, bl);
4696 ceph_assert(v == 1);
4697 decode(handle, bl);
4698 decode(entries, bl);
7c673cae
FG
4699 }
4700 void dump(Formatter *f) const {
4701 f->dump_stream("handle") << handle;
4702 f->open_array_section("entries");
4703 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4704 f->open_object_section("object");
4705 f->dump_stream("object") << p->first;
4706 f->dump_string("key", p->second);
4707 f->close_section();
4708 }
4709 f->close_section();
4710 }
4711 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4712 o.push_back(new pg_ls_response_t);
4713 o.push_back(new pg_ls_response_t);
4714 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4715 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4716 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4717 }
4718};
4719
4720WRITE_CLASS_ENCODER(pg_ls_response_t)
4721
4722/**
4723 * object_copy_cursor_t
4724 */
4725struct object_copy_cursor_t {
4726 uint64_t data_offset;
4727 string omap_offset;
4728 bool attr_complete;
4729 bool data_complete;
4730 bool omap_complete;
4731
4732 object_copy_cursor_t()
4733 : data_offset(0),
4734 attr_complete(false),
4735 data_complete(false),
4736 omap_complete(false)
4737 {}
4738
4739 bool is_initial() const {
4740 return !attr_complete && data_offset == 0 && omap_offset.empty();
4741 }
4742 bool is_complete() const {
4743 return attr_complete && data_complete && omap_complete;
4744 }
4745
4746 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4747 void encode(bufferlist& bl) const;
11fdf7f2 4748 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4749 void dump(Formatter *f) const;
4750};
4751WRITE_CLASS_ENCODER(object_copy_cursor_t)
4752
4753/**
4754 * object_copy_data_t
4755 *
4756 * Return data from a copy request. The semantics are a little strange
4757 * as a result of the encoding's heritage.
4758 *
4759 * In particular, the sender unconditionally fills in the cursor (from what
4760 * it receives and sends), the size, and the mtime, but is responsible for
4761 * figuring out whether it should put any data in the attrs, data, or
4762 * omap members (corresponding to xattrs, object data, and the omap entries)
4763 * based on external data (the client includes a max amount to return with
4764 * the copy request). The client then looks into the attrs, data, and/or omap
4765 * based on the contents of the cursor.
4766 */
4767struct object_copy_data_t {
4768 enum {
4769 FLAG_DATA_DIGEST = 1<<0,
4770 FLAG_OMAP_DIGEST = 1<<1,
4771 };
4772 object_copy_cursor_t cursor;
4773 uint64_t size;
4774 utime_t mtime;
4775 uint32_t data_digest, omap_digest;
4776 uint32_t flags;
4777 map<string, bufferlist> attrs;
4778 bufferlist data;
4779 bufferlist omap_header;
4780 bufferlist omap_data;
4781
4782 /// which snaps we are defined for (if a snap and not the head)
4783 vector<snapid_t> snaps;
11fdf7f2 4784 /// latest snap seq for the object (if head)
7c673cae
FG
4785 snapid_t snap_seq;
4786
11fdf7f2 4787 /// recent reqids on this object
31f18b77 4788 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
7c673cae 4789
11fdf7f2
TL
4790 /// map reqids by index to error return code (if any)
4791 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
4792
7c673cae
FG
4793 uint64_t truncate_seq;
4794 uint64_t truncate_size;
4795
4796public:
4797 object_copy_data_t() :
4798 size((uint64_t)-1), data_digest(-1),
4799 omap_digest(-1), flags(0),
4800 truncate_seq(0),
4801 truncate_size(0) {}
4802
4803 static void generate_test_instances(list<object_copy_data_t*>& o);
4804 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4805 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4806 void dump(Formatter *f) const;
4807};
4808WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
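/*
 * Illustrative sketch (assumption): driving the cursor across several
 * copy-get round trips; do_copy_get_round_trip() is a hypothetical stand-in
 * for the actual OSD op.
 *
 *   object_copy_cursor_t cursor;                   // is_initial() == true
 *   do {
 *     object_copy_data_t reply = do_copy_get_round_trip(cursor);
 *     // consume reply.attrs / reply.data / reply.omap_* according to how
 *     // far the returned cursor advanced
 *     cursor = reply.cursor;
 *   } while (!cursor.is_complete());
 */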
4809
4810/**
4811 * pg creation info
4812 */
4813struct pg_create_t {
4814 epoch_t created; // epoch pg created
4815 pg_t parent; // split from parent (if != pg_t())
4816 __s32 split_bits;
4817
4818 pg_create_t()
4819 : created(0), split_bits(0) {}
4820 pg_create_t(unsigned c, pg_t p, int s)
4821 : created(c), parent(p), split_bits(s) {}
4822
4823 void encode(bufferlist &bl) const;
11fdf7f2 4824 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4825 void dump(Formatter *f) const;
4826 static void generate_test_instances(list<pg_create_t*>& o);
4827};
4828WRITE_CLASS_ENCODER(pg_create_t)
4829
7c673cae
FG
4830// -----------------------------------------
4831
4832class ObjectExtent {
4833 /**
4834 * ObjectExtents are used for specifying IO behavior against RADOS
4835 * objects when one is using the ObjectCacher.
4836 *
4837 * To use this in a real system, *every member* must be filled
4838 * out correctly. In particular, make sure to initialize the
4839 * oloc correctly, as its default values are deliberate poison
4840 * and will cause internal ObjectCacher asserts.
4841 *
4842 * Similarly, your buffer_extents vector *must* specify a total
4843 * size equal to your length. If the buffer_extents inadvertently
4844 * contain less space than the length member specifies, you
4845 * will get unintelligible asserts deep in the ObjectCacher.
4846 *
4847 * If you are trying to do testing and don't care about actual
4848 * RADOS functionality, the simplest thing to do is to initialize
4849 * the ObjectExtent (truncate_size can be 0), create a single entry
4850 * in buffer_extents matching the length, and set oloc.pool to 0.
4851 */
4852 public:
4853 object_t oid; // object id
4854 uint64_t objectno;
4855 uint64_t offset; // in object
4856 uint64_t length; // in object
4857 uint64_t truncate_size; // in object
4858
4859 object_locator_t oloc; // object locator (pool etc)
4860
4861 vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4862
4863 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4864 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
4865 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
4866};
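/*
 * Illustrative sketch (assumption): the minimal test setup described in the
 * comment above: one buffer extent covering the whole length, pool 0.
 *
 *   // object_t, objectno, offset, length, truncate_size
 *   ObjectExtent ex(object_t("test-obj"), 0, 0, 4096, 0);
 *   ex.oloc.pool = 0;                         // override the poison default
 *   ex.buffer_extents.emplace_back(0, 4096);  // extents must sum to ex.length
 */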
4867
4868inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4869{
4870 return out << "extent("
4871 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4872 << " " << ex.offset << "~" << ex.length
4873 << " -> " << ex.buffer_extents
4874 << ")";
4875}
4876
4877
7c673cae
FG
4878// ---------------------------------------
4879
4880class OSDSuperblock {
4881public:
4882 uuid_d cluster_fsid, osd_fsid;
4883 int32_t whoami; // my role in this fs.
4884 epoch_t current_epoch; // most recent epoch
4885 epoch_t oldest_map, newest_map; // oldest/newest maps we have.
4886 double weight;
4887
4888 CompatSet compat_features;
4889
4890 // last interval over which i mounted and was then active
4891 epoch_t mounted; // last epoch i mounted
4892 epoch_t clean_thru; // epoch i was active and clean thru
4893
4894 OSDSuperblock() :
4895 whoami(-1),
4896 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4897 mounted(0), clean_thru(0) {
4898 }
4899
4900 void encode(bufferlist &bl) const;
11fdf7f2 4901 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4902 void dump(Formatter *f) const;
4903 static void generate_test_instances(list<OSDSuperblock*>& o);
4904};
4905WRITE_CLASS_ENCODER(OSDSuperblock)
4906
4907inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4908{
4909 return out << "sb(" << sb.cluster_fsid
4910 << " osd." << sb.whoami
4911 << " " << sb.osd_fsid
4912 << " e" << sb.current_epoch
4913 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4914 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4915 << ")";
4916}
4917
4918
4919// -------
4920
4921
4922
4923
4924
4925
4926/*
4927 * attached to object head. describes most recent snap context, and
4928 * set of existing clones.
4929 */
4930struct SnapSet {
4931 snapid_t seq;
7c673cae
FG
4932 vector<snapid_t> snaps; // descending
4933 vector<snapid_t> clones; // ascending
4934 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4935 map<snapid_t, uint64_t> clone_size;
4936 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4937
11fdf7f2 4938 SnapSet() : seq(0) {}
7c673cae 4939 explicit SnapSet(bufferlist& bl) {
11fdf7f2 4940 auto p = std::cbegin(bl);
7c673cae
FG
4941 decode(p);
4942 }
4943
7c673cae
FG
4944 /// populate SnapSet from a librados::snap_set_t
4945 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4946
4947 /// get space accounted to clone
4948 uint64_t get_clone_bytes(snapid_t clone) const;
4949
4950 void encode(bufferlist& bl) const;
11fdf7f2 4951 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4952 void dump(Formatter *f) const;
4953 static void generate_test_instances(list<SnapSet*>& o);
4954
4955 SnapContext get_ssc_as_of(snapid_t as_of) const {
4956 SnapContext out;
4957 out.seq = as_of;
4958 for (vector<snapid_t>::const_iterator i = snaps.begin();
4959 i != snaps.end();
4960 ++i) {
4961 if (*i <= as_of)
4962 out.snaps.push_back(*i);
4963 }
4964 return out;
4965 }
4966
7c673cae
FG
4967
4968 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4969 void filter(const pg_pool_t &pinfo);
4970};
4971WRITE_CLASS_ENCODER(SnapSet)
4972
4973ostream& operator<<(ostream& out, const SnapSet& cs);
4974
4975
4976
4977#define OI_ATTR "_"
4978#define SS_ATTR "snapset"
4979
4980struct watch_info_t {
4981 uint64_t cookie;
4982 uint32_t timeout_seconds;
4983 entity_addr_t addr;
4984
4985 watch_info_t() : cookie(0), timeout_seconds(0) { }
4986 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
4987
4988 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4989 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4990 void dump(Formatter *f) const;
4991 static void generate_test_instances(list<watch_info_t*>& o);
4992};
4993WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4994
4995static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4996 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4997 && l.addr == r.addr;
4998}
4999
5000static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
5001 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5002 << " " << w.addr << ")";
5003}
5004
5005struct notify_info_t {
5006 uint64_t cookie;
5007 uint64_t notify_id;
5008 uint32_t timeout;
5009 bufferlist bl;
5010};
5011
5012static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
5013 return out << "notify(cookie " << n.cookie
5014 << " notify" << n.notify_id
5015 << " " << n.timeout << "s)";
5016}
5017
11fdf7f2
TL
5018struct chunk_info_t {
5019 typedef enum {
5020 FLAG_DIRTY = 1,
5021 FLAG_MISSING = 2,
5022 FLAG_HAS_REFERENCE = 4,
5023 FLAG_HAS_FINGERPRINT = 8,
5024 } cflag_t;
5025 uint32_t offset;
5026 uint32_t length;
5027 hobject_t oid;
5028 cflag_t flags; // FLAG_*
5029
5030 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5031
5032 static string get_flag_string(uint64_t flags) {
5033 string r;
5034 if (flags & FLAG_DIRTY) {
5035 r += "|dirty";
5036 }
5037 if (flags & FLAG_MISSING) {
5038 r += "|missing";
5039 }
5040 if (flags & FLAG_HAS_REFERENCE) {
5041 r += "|has_reference";
5042 }
5043 if (flags & FLAG_HAS_FINGERPRINT) {
5044 r += "|has_fingerprint";
5045 }
5046 if (r.length())
5047 return r.substr(1);
5048 return r;
5049 }
5050 bool test_flag(cflag_t f) const {
5051 return (flags & f) == f;
5052 }
5053 void set_flag(cflag_t f) {
5054 flags = (cflag_t)(flags | f);
5055 }
5056 void set_flags(cflag_t f) {
5057 flags = f;
5058 }
5059 void clear_flag(cflag_t f) {
5060 flags = (cflag_t)(flags & ~f);
5061 }
5062 void clear_flags() {
5063 flags = (cflag_t)0;
5064 }
5065 bool is_dirty() const {
5066 return test_flag(FLAG_DIRTY);
5067 }
5068 bool is_missing() const {
5069 return test_flag(FLAG_MISSING);
5070 }
5071 bool has_reference() const {
5072 return test_flag(FLAG_HAS_REFERENCE);
5073 }
5074 bool has_fingerprint() const {
5075 return test_flag(FLAG_HAS_FINGERPRINT);
5076 }
5077 void encode(bufferlist &bl) const;
5078 void decode(bufferlist::const_iterator &bl);
5079 void dump(Formatter *f) const;
5080 friend ostream& operator<<(ostream& out, const chunk_info_t& ci);
5081};
5082WRITE_CLASS_ENCODER(chunk_info_t)
5083ostream& operator<<(ostream& out, const chunk_info_t& ci);
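// Usage sketch (illustrative only): flag handling on a chunk descriptor.
// get_flag_string() emits the set flags in a fixed order, '|'-separated.
//
//   chunk_info_t ci;
//   ci.offset = 0;
//   ci.length = 4096;
//   ci.set_flag(chunk_info_t::FLAG_DIRTY);
//   ci.set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
//   // ci.is_dirty() == true
//   // chunk_info_t::get_flag_string(ci.flags) == "dirty|has_fingerprint"
//   ci.clear_flag(chunk_info_t::FLAG_DIRTY);   // leaves only has_fingerprint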
5084
31f18b77
FG
5085struct object_info_t;
5086struct object_manifest_t {
5087 enum {
5088 TYPE_NONE = 0,
11fdf7f2
TL
5089 TYPE_REDIRECT = 1,
5090 TYPE_CHUNKED = 2,
31f18b77
FG
5091 };
5092 uint8_t type; // redirect, chunked, ...
5093 hobject_t redirect_target;
11fdf7f2 5094 map <uint64_t, chunk_info_t> chunk_map;
31f18b77
FG
5095
5096 object_manifest_t() : type(0) { }
5097 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
5098 : type(type), redirect_target(redirect_target) { }
5099
5100 bool is_empty() const {
5101 return type == TYPE_NONE;
5102 }
5103 bool is_redirect() const {
5104 return type == TYPE_REDIRECT;
5105 }
5106 bool is_chunked() const {
5107 return type == TYPE_CHUNKED;
5108 }
11fdf7f2 5109 static std::string_view get_type_name(uint8_t m) {
31f18b77
FG
5110 switch (m) {
5111 case TYPE_NONE: return "none";
5112 case TYPE_REDIRECT: return "redirect";
5113 case TYPE_CHUNKED: return "chunked";
5114 default: return "unknown";
5115 }
5116 }
11fdf7f2 5117 std::string_view get_type_name() const {
31f18b77
FG
5118 return get_type_name(type);
5119 }
11fdf7f2
TL
5120 void clear() {
5121 type = 0;
5122 redirect_target = hobject_t();
5123 chunk_map.clear();
5124 }
31f18b77
FG
5125 static void generate_test_instances(list<object_manifest_t*>& o);
5126 void encode(bufferlist &bl) const;
11fdf7f2 5127 void decode(bufferlist::const_iterator &bl);
31f18b77
FG
5128 void dump(Formatter *f) const;
5129 friend ostream& operator<<(ostream& out, const object_info_t& oi);
5130};
5131WRITE_CLASS_ENCODER(object_manifest_t)
5132ostream& operator<<(ostream& out, const object_manifest_t& oi);
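// Usage sketch (illustrative only): the two non-trivial manifest types.  The
// hobject_t below is default-constructed purely for illustration.
//
//   object_manifest_t redirect(object_manifest_t::TYPE_REDIRECT, hobject_t());
//   // redirect.is_redirect() == true, redirect.get_type_name() == "redirect"
//
//   object_manifest_t chunked;
//   chunked.type = object_manifest_t::TYPE_CHUNKED;
//   chunked.chunk_map[0] = chunk_info_t();     // chunk starting at offset 0
//   // chunked.is_chunked() == true; clear() returns it to TYPE_NONE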
7c673cae
FG
5133
5134struct object_info_t {
5135 hobject_t soid;
5136 eversion_t version, prior_version;
5137 version_t user_version;
5138 osd_reqid_t last_reqid;
5139
5140 uint64_t size;
5141 utime_t mtime;
5142 utime_t local_mtime; // mtime as recorded locally by this OSD
5143
5144 // note: these are currently encoded into a total 16 bits; see
5145 // encode()/decode() for the weirdness.
5146 typedef enum {
11fdf7f2
TL
5147 FLAG_LOST = 1<<0,
5148 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5149 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5150 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5151 FLAG_DATA_DIGEST = 1<<4, // has data crc
5152 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5153 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5154 FLAG_MANIFEST = 1<<7, // has manifest
5155 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5156 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
7c673cae
FG
5157 } flag_t;
5158
5159 flag_t flags;
5160
5161 static string get_flag_string(flag_t flags) {
5162 string s;
94b18763
FG
5163 vector<string> sv = get_flag_vector(flags);
5164 for (auto ss : sv) {
5165 s += string("|") + ss;
5166 }
5167 if (s.length())
5168 return s.substr(1);
5169 return s;
5170 }
5171 static vector<string> get_flag_vector(flag_t flags) {
5172 vector<string> sv;
7c673cae 5173 if (flags & FLAG_LOST)
94b18763 5174 sv.insert(sv.end(), "lost");
7c673cae 5175 if (flags & FLAG_WHITEOUT)
94b18763 5176 sv.insert(sv.end(), "whiteout");
7c673cae 5177 if (flags & FLAG_DIRTY)
94b18763 5178 sv.insert(sv.end(), "dirty");
7c673cae 5179 if (flags & FLAG_USES_TMAP)
94b18763 5180 sv.insert(sv.end(), "uses_tmap");
7c673cae 5181 if (flags & FLAG_OMAP)
94b18763 5182 sv.insert(sv.end(), "omap");
7c673cae 5183 if (flags & FLAG_DATA_DIGEST)
94b18763 5184 sv.insert(sv.end(), "data_digest");
7c673cae 5185 if (flags & FLAG_OMAP_DIGEST)
94b18763 5186 sv.insert(sv.end(), "omap_digest");
7c673cae 5187 if (flags & FLAG_CACHE_PIN)
94b18763 5188 sv.insert(sv.end(), "cache_pin");
31f18b77 5189 if (flags & FLAG_MANIFEST)
94b18763 5190 sv.insert(sv.end(), "manifest");
11fdf7f2
TL
5191 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5192 sv.insert(sv.end(), "redirect_has_reference");
94b18763 5193 return sv;
7c673cae
FG
5194 }
5195 string get_flag_string() const {
5196 return get_flag_string(flags);
5197 }
5198
7c673cae
FG
5199 uint64_t truncate_seq, truncate_size;
5200
5201 map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
5202
5203 // opportunistic checksums; may or may not be present
5204 __u32 data_digest; ///< data crc32c
5205 __u32 omap_digest; ///< omap crc32c
5206
5207 // alloc hint attribute
5208 uint64_t expected_object_size, expected_write_size;
5209 uint32_t alloc_hint_flags;
5210
31f18b77
FG
5211 struct object_manifest_t manifest;
5212
7c673cae
FG
5213 void copy_user_bits(const object_info_t& other);
5214
7c673cae
FG
5215 bool test_flag(flag_t f) const {
5216 return (flags & f) == f;
5217 }
5218 void set_flag(flag_t f) {
5219 flags = (flag_t)(flags | f);
5220 }
5221 void clear_flag(flag_t f) {
5222 flags = (flag_t)(flags & ~f);
5223 }
5224 bool is_lost() const {
5225 return test_flag(FLAG_LOST);
5226 }
5227 bool is_whiteout() const {
5228 return test_flag(FLAG_WHITEOUT);
5229 }
5230 bool is_dirty() const {
5231 return test_flag(FLAG_DIRTY);
5232 }
5233 bool is_omap() const {
5234 return test_flag(FLAG_OMAP);
5235 }
5236 bool is_data_digest() const {
5237 return test_flag(FLAG_DATA_DIGEST);
5238 }
5239 bool is_omap_digest() const {
5240 return test_flag(FLAG_OMAP_DIGEST);
5241 }
5242 bool is_cache_pinned() const {
5243 return test_flag(FLAG_CACHE_PIN);
5244 }
31f18b77
FG
5245 bool has_manifest() const {
5246 return test_flag(FLAG_MANIFEST);
5247 }
7c673cae
FG
5248 void set_data_digest(__u32 d) {
5249 set_flag(FLAG_DATA_DIGEST);
5250 data_digest = d;
5251 }
5252 void set_omap_digest(__u32 d) {
5253 set_flag(FLAG_OMAP_DIGEST);
5254 omap_digest = d;
5255 }
5256 void clear_data_digest() {
5257 clear_flag(FLAG_DATA_DIGEST);
5258 data_digest = -1;
5259 }
5260 void clear_omap_digest() {
5261 clear_flag(FLAG_OMAP_DIGEST);
5262 omap_digest = -1;
5263 }
5264 void new_object() {
28e407b8
AA
5265 clear_data_digest();
5266 clear_omap_digest();
7c673cae
FG
5267 }
5268
5269 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 5270 void decode(bufferlist::const_iterator& bl);
7c673cae 5271 void decode(bufferlist& bl) {
11fdf7f2 5272 auto p = std::cbegin(bl);
7c673cae
FG
5273 decode(p);
5274 }
5275 void dump(Formatter *f) const;
5276 static void generate_test_instances(list<object_info_t*>& o);
5277
5278 explicit object_info_t()
5279 : user_version(0), size(0), flags((flag_t)0),
5280 truncate_seq(0), truncate_size(0),
5281 data_digest(-1), omap_digest(-1),
5282 expected_object_size(0), expected_write_size(0),
5283 alloc_hint_flags(0)
5284 {}
5285
5286 explicit object_info_t(const hobject_t& s)
5287 : soid(s),
5288 user_version(0), size(0), flags((flag_t)0),
5289 truncate_seq(0), truncate_size(0),
5290 data_digest(-1), omap_digest(-1),
5291 expected_object_size(0), expected_write_size(0),
5292 alloc_hint_flags(0)
5293 {}
5294
5295 explicit object_info_t(bufferlist& bl) {
5296 decode(bl);
5297 }
5298};
5299WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5300
5301ostream& operator<<(ostream& out, const object_info_t& oi);
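// Usage sketch (illustrative only): typical flag and digest manipulation on a
// freshly constructed object_info_t (soid shown default-constructed here).
//
//   hobject_t soid;
//   object_info_t oi(soid);
//   oi.size = 4096;
//   oi.set_data_digest(0xdeadbeef);            // also sets FLAG_DATA_DIGEST
//   oi.set_flag(object_info_t::FLAG_DIRTY);
//   // oi.is_dirty() == true, oi.is_data_digest() == true
//   // oi.get_flag_string() == "dirty|data_digest"
//   oi.clear_data_digest();                    // clears the flag, digest := -1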
5302
5303
5304
5305// Object recovery
5306struct ObjectRecoveryInfo {
5307 hobject_t soid;
5308 eversion_t version;
5309 uint64_t size;
5310 object_info_t oi;
5311 SnapSet ss; // only populated if soid is_snap()
5312 interval_set<uint64_t> copy_subset;
5313 map<hobject_t, interval_set<uint64_t>> clone_subset;
5314
5315 ObjectRecoveryInfo() : size(0) { }
5316
5317 static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
5318 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5319 void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
7c673cae
FG
5320 ostream &print(ostream &out) const;
5321 void dump(Formatter *f) const;
5322};
5323WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
5324ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
5325
5326struct ObjectRecoveryProgress {
5327 uint64_t data_recovered_to;
5328 string omap_recovered_to;
5329 bool first;
5330 bool data_complete;
5331 bool omap_complete;
224ce89b 5332 bool error = false;
7c673cae
FG
5333
5334 ObjectRecoveryProgress()
5335 : data_recovered_to(0),
5336 first(true),
5337 data_complete(false), omap_complete(false) { }
5338
5339 bool is_complete(const ObjectRecoveryInfo& info) const {
5340 return (data_recovered_to >= (
5341 info.copy_subset.empty() ?
5342 0 : info.copy_subset.range_end())) &&
5343 omap_complete;
5344 }
5345
5346 static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
5347 void encode(bufferlist &bl) const;
11fdf7f2 5348 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5349 ostream &print(ostream &out) const;
5350 void dump(Formatter *f) const;
5351};
5352WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
5353ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
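// Usage sketch (illustrative only): is_complete() requires omap completion and
// data recovered at least to the end of the recovery info's copy_subset (an
// empty copy_subset means no data needs to be copied).
//
//   ObjectRecoveryInfo info;
//   info.copy_subset.insert(0, 8192);          // bytes [0, 8192) still to copy
//   ObjectRecoveryProgress prog;
//   prog.data_recovered_to = 8192;
//   prog.omap_complete = true;
//   // prog.is_complete(info) == true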
5354
5355struct PushReplyOp {
5356 hobject_t soid;
5357
5358 static void generate_test_instances(list<PushReplyOp*>& o);
5359 void encode(bufferlist &bl) const;
11fdf7f2 5360 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5361 ostream &print(ostream &out) const;
5362 void dump(Formatter *f) const;
5363
5364 uint64_t cost(CephContext *cct) const;
5365};
5366WRITE_CLASS_ENCODER(PushReplyOp)
5367ostream& operator<<(ostream& out, const PushReplyOp &op);
5368
5369struct PullOp {
5370 hobject_t soid;
5371
5372 ObjectRecoveryInfo recovery_info;
5373 ObjectRecoveryProgress recovery_progress;
5374
5375 static void generate_test_instances(list<PullOp*>& o);
5376 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5377 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5378 ostream &print(ostream &out) const;
5379 void dump(Formatter *f) const;
5380
5381 uint64_t cost(CephContext *cct) const;
5382};
5383WRITE_CLASS_ENCODER_FEATURES(PullOp)
5384ostream& operator<<(ostream& out, const PullOp &op);
5385
5386struct PushOp {
5387 hobject_t soid;
5388 eversion_t version;
5389 bufferlist data;
5390 interval_set<uint64_t> data_included;
5391 bufferlist omap_header;
5392 map<string, bufferlist> omap_entries;
5393 map<string, bufferlist> attrset;
5394
5395 ObjectRecoveryInfo recovery_info;
5396 ObjectRecoveryProgress before_progress;
5397 ObjectRecoveryProgress after_progress;
5398
5399 static void generate_test_instances(list<PushOp*>& o);
5400 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5401 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5402 ostream &print(ostream &out) const;
5403 void dump(Formatter *f) const;
5404
5405 uint64_t cost(CephContext *cct) const;
5406};
5407WRITE_CLASS_ENCODER_FEATURES(PushOp)
5408ostream& operator<<(ostream& out, const PushOp &op);
5409
5410
5411/*
5412 * summarize pg contents for purposes of a scrub
5413 */
5414struct ScrubMap {
5415 struct object {
5416 map<string,bufferptr> attrs;
5417 uint64_t size;
5418 __u32 omap_digest; ///< omap crc32c
5419 __u32 digest; ///< data crc32c
5420 bool negative:1;
5421 bool digest_present:1;
5422 bool omap_digest_present:1;
5423 bool read_error:1;
5424 bool stat_error:1;
5425 bool ec_hash_mismatch:1;
5426 bool ec_size_mismatch:1;
28e407b8
AA
5427 bool large_omap_object_found:1;
5428 uint64_t large_omap_object_key_count = 0;
5429 uint64_t large_omap_object_value_size = 0;
11fdf7f2
TL
5430 uint64_t object_omap_bytes = 0;
5431 uint64_t object_omap_keys = 0;
7c673cae
FG
5432
5433 object() :
5434 // Init invalid size so it won't match if we get a stat EIO error
5435 size(-1), omap_digest(0), digest(0),
28e407b8
AA
5436 negative(false), digest_present(false), omap_digest_present(false),
5437 read_error(false), stat_error(false), ec_hash_mismatch(false),
5438 ec_size_mismatch(false), large_omap_object_found(false) {}
7c673cae
FG
5439
5440 void encode(bufferlist& bl) const;
11fdf7f2 5441 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
5442 void dump(Formatter *f) const;
5443 static void generate_test_instances(list<object*>& o);
5444 };
5445 WRITE_CLASS_ENCODER(object)
5446
5447 map<hobject_t,object> objects;
5448 eversion_t valid_through;
5449 eversion_t incr_since;
28e407b8 5450 bool has_large_omap_object_errors:1;
11fdf7f2 5451 bool has_omap_keys:1;
7c673cae
FG
5452
5453 void merge_incr(const ScrubMap &l);
28e407b8
AA
5454 void clear_from(const hobject_t& start) {
5455 objects.erase(objects.lower_bound(start), objects.end());
5456 }
7c673cae
FG
5457 void insert(const ScrubMap &r) {
5458 objects.insert(r.objects.begin(), r.objects.end());
5459 }
5460 void swap(ScrubMap &r) {
31f18b77
FG
5461 using std::swap;
5462 swap(objects, r.objects);
5463 swap(valid_through, r.valid_through);
5464 swap(incr_since, r.incr_since);
7c673cae
FG
5465 }
5466
5467 void encode(bufferlist& bl) const;
11fdf7f2 5468 void decode(bufferlist::const_iterator& bl, int64_t pool=-1);
7c673cae
FG
5469 void dump(Formatter *f) const;
5470 static void generate_test_instances(list<ScrubMap*>& o);
5471};
5472WRITE_CLASS_ENCODER(ScrubMap::object)
5473WRITE_CLASS_ENCODER(ScrubMap)
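// Usage sketch (illustrative only): trimming a map at a chunk boundary and
// folding another map's objects into it (existing entries are kept).
//
//   ScrubMap m, other;
//   hobject_t start;                           // illustration: default boundary
//   m.clear_from(start);                       // drop every object >= start
//   m.insert(other);                           // merge in other's objects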
5474
28e407b8
AA
5475struct ScrubMapBuilder {
5476 bool deep = false;
5477 vector<hobject_t> ls;
5478 size_t pos = 0;
5479 int64_t data_pos = 0;
5480 string omap_pos;
5481 int ret = 0;
5482 bufferhash data_hash, omap_hash; ///< accumulating hash value
5483 uint64_t omap_keys = 0;
5484 uint64_t omap_bytes = 0;
5485
5486 bool empty() {
5487 return ls.empty();
5488 }
5489 bool done() {
5490 return pos >= ls.size();
5491 }
5492 void reset() {
5493 *this = ScrubMapBuilder();
5494 }
5495
5496 bool data_done() {
5497 return data_pos < 0;
5498 }
5499
5500 void next_object() {
5501 ++pos;
5502 data_pos = 0;
5503 omap_pos.clear();
5504 omap_keys = 0;
5505 omap_bytes = 0;
5506 }
5507
5508 friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
5509 out << "(" << pos.pos << "/" << pos.ls.size();
5510 if (pos.pos < pos.ls.size()) {
5511 out << " " << pos.ls[pos.pos];
5512 }
5513 if (pos.data_pos < 0) {
5514 out << " byte " << pos.data_pos;
5515 }
5516 if (!pos.omap_pos.empty()) {
5517 out << " key " << pos.omap_pos;
5518 }
5519 if (pos.deep) {
5520 out << " deep";
5521 }
5522 if (pos.ret) {
5523 out << " ret " << pos.ret;
5524 }
5525 return out << ")";
5526 }
5527};
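// Usage sketch (illustrative only): the builder is a cursor over the objects
// of one scrub chunk; next_object() advances it and resets the per-object
// data/omap positions.
//
//   ScrubMapBuilder pos;
//   pos.deep = true;
//   pos.ls.push_back(hobject_t());             // objects queued for this chunk
//   while (!pos.done()) {
//     // ... scan pos.ls[pos.pos], advancing pos.data_pos / pos.omap_pos ...
//     pos.next_object();
//   }
//   pos.reset();                               // back to a pristine builder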
5528
7c673cae
FG
5529struct OSDOp {
5530 ceph_osd_op op;
5531 sobject_t soid;
5532
5533 bufferlist indata, outdata;
224ce89b 5534 errorcode32_t rval;
7c673cae
FG
5535
5536 OSDOp() : rval(0) {
92f5a8d4 5537 // FIPS zeroization audit 20191115: this memset clean for security
7c673cae
FG
5538 memset(&op, 0, sizeof(ceph_osd_op));
5539 }
5540
5541 /**
5542 * split a bufferlist into constituent indata members of a vector of OSDOps
5543 *
5544 * @param ops [out] vector of OSDOps
5545 * @param in [in] combined data buffer
5546 */
5547 static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);
5548
5549 /**
5550 * merge indata members of a vector of OSDOp into a single bufferlist
5551 *
5552 * Notably this also encodes certain other OSDOp data into the data
5553 * buffer, including the sobject_t soid.
5554 *
5555 * @param ops [in] vector of OSDOps
5556 * @param out [out] combined data buffer
5557 */
5558 static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
5559
5560 /**
5561 * split a bufferlist into constituent outdata members of a vector of OSDOps
5562 *
5563 * @param ops [out] vector of OSDOps
5564 * @param in [in] combined data buffer
5565 */
5566 static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);
5567
5568 /**
5569 * merge outdata members of a vector of OSDOps into a single bufferlist
5570 *
5571 * @param ops [in] vector of OSDOps
5572 * @param out [out] combined data buffer
5573 */
5574 static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
224ce89b
WB
5575
5576 /**
5577 * Clear as much data as possible, leaving only the minimum needed for the historical op dump
5578 *
5579 * @param ops [in] vector of OSDOps
5580 */
5581 static void clear_data(vector<OSDOp>& ops);
7c673cae
FG
5582};
5583
5584ostream& operator<<(ostream& out, const OSDOp& op);
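// Usage sketch (illustrative only): the static helpers above pack the per-op
// indata/outdata buffers into a single bufferlist for the wire and unpack
// them again on the other side.
//
//   vector<OSDOp> ops(2);
//   ops[0].indata.append("payload", 7);
//   bufferlist combined;
//   OSDOp::merge_osd_op_vector_in_data(ops, combined);   // ops -> one buffer
//   OSDOp::split_osd_op_vector_in_data(ops, combined);   // one buffer -> ops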
5585
5586struct watch_item_t {
5587 entity_name_t name;
5588 uint64_t cookie;
5589 uint32_t timeout_seconds;
5590 entity_addr_t addr;
5591
5592 watch_item_t() : cookie(0), timeout_seconds(0) { }
5593 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
5594 const entity_addr_t& addr)
5595 : name(name), cookie(cookie), timeout_seconds(timeout),
5596 addr(addr) { }
5597
5598 void encode(bufferlist &bl, uint64_t features) const {
5599 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5600 encode(name, bl);
5601 encode(cookie, bl);
5602 encode(timeout_seconds, bl);
5603 encode(addr, bl, features);
7c673cae
FG
5604 ENCODE_FINISH(bl);
5605 }
11fdf7f2 5606 void decode(bufferlist::const_iterator &bl) {
7c673cae 5607 DECODE_START(2, bl);
11fdf7f2
TL
5608 decode(name, bl);
5609 decode(cookie, bl);
5610 decode(timeout_seconds, bl);
7c673cae 5611 if (struct_v >= 2) {
11fdf7f2 5612 decode(addr, bl);
7c673cae
FG
5613 }
5614 DECODE_FINISH(bl);
5615 }
5616};
5617WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5618
5619struct obj_watch_item_t {
5620 hobject_t obj;
5621 watch_item_t wi;
5622};
5623
5624/**
5625 * obj list watch response format
5626 *
5627 */
5628struct obj_list_watch_response_t {
5629 list<watch_item_t> entries;
5630
5631 void encode(bufferlist& bl, uint64_t features) const {
5632 ENCODE_START(1, 1, bl);
11fdf7f2 5633 encode(entries, bl, features);
7c673cae
FG
5634 ENCODE_FINISH(bl);
5635 }
11fdf7f2 5636 void decode(bufferlist::const_iterator& bl) {
7c673cae 5637 DECODE_START(1, bl);
11fdf7f2 5638 decode(entries, bl);
7c673cae
FG
5639 DECODE_FINISH(bl);
5640 }
5641 void dump(Formatter *f) const {
5642 f->open_array_section("entries");
5643 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5644 f->open_object_section("watch");
5645 f->dump_stream("watcher") << p->name;
5646 f->dump_int("cookie", p->cookie);
5647 f->dump_int("timeout", p->timeout_seconds);
5648 f->open_object_section("addr");
5649 p->addr.dump(f);
5650 f->close_section();
5651 f->close_section();
5652 }
5653 f->close_section();
5654 }
5655 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5656 entity_addr_t ea;
5657 o.push_back(new obj_list_watch_response_t);
5658 o.push_back(new obj_list_watch_response_t);
5659 ea.set_type(entity_addr_t::TYPE_LEGACY);
5660 ea.set_nonce(1000);
5661 ea.set_family(AF_INET);
5662 ea.set_in4_quad(0, 127);
5663 ea.set_in4_quad(1, 0);
5664 ea.set_in4_quad(2, 0);
5665 ea.set_in4_quad(3, 1);
5666 ea.set_port(1024);
5667 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5668 ea.set_nonce(1001);
5669 ea.set_in4_quad(3, 2);
5670 ea.set_port(1025);
5671 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5672 }
5673};
5674WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
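// Usage sketch (illustrative only): building a watch-list response and
// encoding it with the helper generated by WRITE_CLASS_ENCODER_FEATURES.
//
//   obj_list_watch_response_t resp;
//   entity_addr_t addr;                        // illustration: blank address
//   resp.entries.push_back(
//     watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, addr));
//   bufferlist bl;
//   encode(resp, bl, 0 /* features */);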
5675
5676struct clone_info {
5677 snapid_t cloneid;
5678 vector<snapid_t> snaps; // ascending
5679 vector< pair<uint64_t,uint64_t> > overlap;
5680 uint64_t size;
5681
5682 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5683
5684 void encode(bufferlist& bl) const {
5685 ENCODE_START(1, 1, bl);
11fdf7f2
TL
5686 encode(cloneid, bl);
5687 encode(snaps, bl);
5688 encode(overlap, bl);
5689 encode(size, bl);
7c673cae
FG
5690 ENCODE_FINISH(bl);
5691 }
11fdf7f2 5692 void decode(bufferlist::const_iterator& bl) {
7c673cae 5693 DECODE_START(1, bl);
11fdf7f2
TL
5694 decode(cloneid, bl);
5695 decode(snaps, bl);
5696 decode(overlap, bl);
5697 decode(size, bl);
7c673cae
FG
5698 DECODE_FINISH(bl);
5699 }
5700 void dump(Formatter *f) const {
5701 if (cloneid == CEPH_NOSNAP)
5702 f->dump_string("cloneid", "HEAD");
5703 else
5704 f->dump_unsigned("cloneid", cloneid.val);
5705 f->open_array_section("snapshots");
5706 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5707 f->open_object_section("snap");
5708 f->dump_unsigned("id", p->val);
5709 f->close_section();
5710 }
5711 f->close_section();
5712 f->open_array_section("overlaps");
5713 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5714 q != overlap.end(); ++q) {
5715 f->open_object_section("overlap");
5716 f->dump_unsigned("offset", q->first);
5717 f->dump_unsigned("length", q->second);
5718 f->close_section();
5719 }
5720 f->close_section();
5721 f->dump_unsigned("size", size);
5722 }
5723 static void generate_test_instances(list<clone_info*>& o) {
5724 o.push_back(new clone_info);
5725 o.push_back(new clone_info);
5726 o.back()->cloneid = 1;
5727 o.back()->snaps.push_back(1);
5728 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5729 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5730 o.back()->size = 16384;
5731 o.push_back(new clone_info);
5732 o.back()->cloneid = CEPH_NOSNAP;
5733 o.back()->size = 32768;
5734 }
5735};
5736WRITE_CLASS_ENCODER(clone_info)
5737
5738/**
5739 * obj list snaps response format
5740 *
5741 */
5742struct obj_list_snap_response_t {
5743 vector<clone_info> clones; // ascending
5744 snapid_t seq;
5745
5746 void encode(bufferlist& bl) const {
5747 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5748 encode(clones, bl);
5749 encode(seq, bl);
7c673cae
FG
5750 ENCODE_FINISH(bl);
5751 }
11fdf7f2 5752 void decode(bufferlist::const_iterator& bl) {
7c673cae 5753 DECODE_START(2, bl);
11fdf7f2 5754 decode(clones, bl);
7c673cae 5755 if (struct_v >= 2)
11fdf7f2 5756 decode(seq, bl);
7c673cae
FG
5757 else
5758 seq = CEPH_NOSNAP;
5759 DECODE_FINISH(bl);
5760 }
5761 void dump(Formatter *f) const {
5762 f->open_array_section("clones");
5763 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5764 f->open_object_section("clone");
5765 p->dump(f);
5766 f->close_section();
5767 }
5768 f->dump_unsigned("seq", seq);
5769 f->close_section();
5770 }
5771 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5772 o.push_back(new obj_list_snap_response_t);
5773 o.push_back(new obj_list_snap_response_t);
5774 clone_info cl;
5775 cl.cloneid = 1;
5776 cl.snaps.push_back(1);
5777 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5778 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5779 cl.size = 16384;
5780 o.back()->clones.push_back(cl);
5781 cl.cloneid = CEPH_NOSNAP;
5782 cl.snaps.clear();
5783 cl.overlap.clear();
5784 cl.size = 32768;
5785 o.back()->clones.push_back(cl);
5786 o.back()->seq = 123;
5787 }
5788};
5789
5790WRITE_CLASS_ENCODER(obj_list_snap_response_t)
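// Usage sketch (illustrative only): a round trip through the encoder; note
// that a v1 encoding (no seq field) decodes with seq defaulted to CEPH_NOSNAP,
// per the struct_v >= 2 check above.
//
//   obj_list_snap_response_t in, out;
//   in.seq = 123;
//   bufferlist bl;
//   encode(in, bl);                            // WRITE_CLASS_ENCODER helper
//   auto p = std::cbegin(bl);
//   decode(out, p);
//   // out.seq == 123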
5791
5792// PromoteCounter
5793
5794struct PromoteCounter {
11fdf7f2
TL
5795 std::atomic<unsigned long long> attempts{0};
5796 std::atomic<unsigned long long> objects{0};
5797 std::atomic<unsigned long long> bytes{0};
7c673cae
FG
5798
5799 void attempt() {
5800 attempts++;
5801 }
5802
5803 void finish(uint64_t size) {
5804 objects++;
5805 bytes += size;
5806 }
5807
5808 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
5809 *a = attempts;
5810 *o = objects;
5811 *b = bytes;
5812 attempts = *a / 2;
5813 objects = *o / 2;
5814 bytes = *b / 2;
5815 }
5816};
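// Usage sketch (illustrative only): counters accumulate promote activity and
// sample_and_attenuate() reads them while halving the stored values, giving a
// decaying view of recent promote traffic.
//
//   PromoteCounter pc;
//   pc.attempt();                              // a promote was started
//   pc.finish(4096);                           // ...and completed (4096 bytes)
//   uint64_t a, o, b;
//   pc.sample_and_attenuate(&a, &o, &b);
//   // a == 1, o == 1, b == 4096; the stored counters are now halved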
5817
11fdf7f2
TL
5818struct pool_pg_num_history_t {
5819 /// last epoch updated
5820 epoch_t epoch = 0;
5821 /// poolid -> epoch -> pg_num
5822 map<int64_t,map<epoch_t,uint32_t>> pg_nums;
5823 /// pair(epoch, poolid)
5824 set<pair<epoch_t,int64_t>> deleted_pools;
7c673cae 5825
11fdf7f2
TL
5826 void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
5827 pg_nums[pool][epoch] = pg_num;
5828 }
5829 void log_pool_delete(epoch_t epoch, int64_t pool) {
5830 deleted_pools.insert(make_pair(epoch, pool));
5831 }
7c673cae 5832
11fdf7f2
TL
5833 /// prune history based on oldest osdmap epoch in the cluster
5834 void prune(epoch_t oldest_epoch) {
5835 auto i = deleted_pools.begin();
5836 while (i != deleted_pools.end()) {
5837 if (i->first >= oldest_epoch) {
5838 break;
5839 }
5840 pg_nums.erase(i->second);
5841 i = deleted_pools.erase(i);
5842 }
5843 for (auto& j : pg_nums) {
5844 auto k = j.second.lower_bound(oldest_epoch);
5845 // keep this and the entry before it (just to be paranoid)
5846 if (k != j.second.begin()) {
5847 --k;
5848 j.second.erase(j.second.begin(), k);
5849 }
5850 }
5851 }
5852
5853 void encode(bufferlist& bl) const {
5854 ENCODE_START(1, 1, bl);
5855 encode(epoch, bl);
5856 encode(pg_nums, bl);
5857 encode(deleted_pools, bl);
5858 ENCODE_FINISH(bl);
5859 }
5860 void decode(bufferlist::const_iterator& p) {
5861 DECODE_START(1, p);
5862 decode(epoch, p);
5863 decode(pg_nums, p);
5864 decode(deleted_pools, p);
5865 DECODE_FINISH(p);
5866 }
5867 void dump(Formatter *f) const {
5868 f->dump_unsigned("epoch", epoch);
5869 f->open_object_section("pools");
5870 for (auto& i : pg_nums) {
5871 f->open_object_section("pool");
5872 f->dump_unsigned("pool_id", i.first);
5873 f->open_array_section("changes");
5874 for (auto& j : i.second) {
5875 f->open_object_section("change");
5876 f->dump_unsigned("epoch", j.first);
5877 f->dump_unsigned("pg_num", j.second);
5878 f->close_section();
5879 }
5880 f->close_section();
5881 f->close_section();
5882 }
5883 f->close_section();
5884 f->open_array_section("deleted_pools");
5885 for (auto& i : deleted_pools) {
5886 f->open_object_section("deletion");
5887 f->dump_unsigned("pool_id", i.second);
5888 f->dump_unsigned("epoch", i.first);
5889 f->close_section();
5890 }
5891 f->close_section();
5892 }
5893 static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {
5894 ls.push_back(new pool_pg_num_history_t);
5895 }
5896 friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {
5897 return out << "pg_num_history(e" << h.epoch
5898 << " pg_nums " << h.pg_nums
5899 << " deleted_pools " << h.deleted_pools
5900 << ")";
7c673cae 5901 }
7c673cae 5902};
11fdf7f2
TL
5903WRITE_CLASS_ENCODER(pool_pg_num_history_t)
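// Usage sketch (illustrative only): recording pg_num changes and pruning the
// history once every OSD has caught up past a given epoch.
//
//   pool_pg_num_history_t h;
//   h.log_pg_num_change(100, 1, 64);           // pool 1 -> 64 PGs at epoch 100
//   h.log_pg_num_change(110, 1, 128);          // pool 1 -> 128 PGs at epoch 110
//   h.log_pool_delete(120, 2);                 // pool 2 deleted at epoch 120
//   h.prune(115);
//   // pg_nums[1] now holds only the epoch-110 entry (the epoch-100 one is
//   // dropped), and the pool-2 deletion is kept because 120 >= 115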
5904
5905// omap-specific stats
5906struct omap_stat_t {
5907 int large_omap_objects;
5908 int64_t omap_bytes;
5909 int64_t omap_keys;
5910};
7c673cae
FG
5911
5912#endif