// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_TYPES_H
#define CEPH_OSD_TYPES_H

#include <sstream>
#include <stdio.h>
#include <memory>
#include <string_view>

#include <boost/scoped_ptr.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/variant.hpp>

#include "include/rados/rados_types.hpp"
#include "include/mempool.h"

#include "msg/msg_types.h"
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
#include "common/histogram.h"
#include "include/interval_set.h"
#include "include/inline_memory.h"
#include "common/Formatter.h"
#include "common/bloom_filter.hpp"
#include "common/hobject.h"
#include "common/snap_types.h"
#include "HitSet.h"
#include "Watch.h"
#include "include/cmp.h"
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
#include <atomic>
#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")

/// pool priority range set by user
#define OSD_POOL_PRIORITY_MAX 10
#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX

/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// base recovery priority for MRecoveryReserve (inactive PG)
#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 253

/// backfill priority for MBackfillReserve, when forced manually
#define OSD_BACKFILL_PRIORITY_FORCED 254

/// recovery priority for MRecoveryReserve, when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255

/// priority for pg deletion when osd is not fullish
#define OSD_DELETE_PRIORITY_NORMAL 179

/// priority for pg deletion when osd is approaching full
#define OSD_DELETE_PRIORITY_FULLISH 219

/// priority when more full
#define OSD_DELETE_PRIORITY_FULL 255

static std::map<int, int> max_prio_map = {
  {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
  {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
  {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
  {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
  {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
};
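
// Illustrative sketch (not part of the original header): how the priority
// bands above are meant to combine. Each entry of max_prio_map gives, for a
// band's base priority, the highest priority a reservation in that band may
// reach, so a per-pool bump cannot cross into the next band. The +3 below is
// a hypothetical adjustment; the real adjustment rules live in the OSD code.
//
//   int base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;    // 140
//   int prio = base + 3;                               // 143
//   prio = std::min(prio, max_prio_map[base]);         // capped at 179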

typedef hobject_t collection_list_handle_t;

/// convert a single CEPH_OSD_FLAG_* to a string
const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a string
const char *ceph_osd_op_flag_name(unsigned flag);

/// convert CEPH_OSD_FLAG_* op flags to a string
string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a string
string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
string ceph_osd_alloc_hint_flag_string(unsigned flags);

typedef map<string,string> osd_alert_list_t;
/// map osd id -> alert_list_t
typedef map<int, osd_alert_list_t> osd_alerts_t;
void dump(Formatter* f, const osd_alerts_t& alerts);

/**
 * osd request identifier
 *
 * caller name + incarnation# + tid to uniquely identify this request.
 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t    tid;
  int32_t       inc;  // incarnation

  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const osd_reqid_t& other)
    : name(other.name), tid(other.tid), inc(other.inc)
  {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)
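
// Illustrative sketch (not part of the original header): constructing a
// request id and the textual form produced by the operator<< defined below.
// The client id, incarnation and tid are hypothetical values.
//
//   osd_reqid_t rid(entity_name_t::CLIENT(4123), /*inc=*/0, /*tid=*/517);
//   std::cout << rid;   // prints "client.4123.0:517"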


struct pg_shard_t {
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;
  shard_id_t shard;
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  void dump(Formatter *f) const {
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);

class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};

class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};

inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
  return out << r.name << "." << r.inc << ":" << r.tid;
}

inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
}
inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
}
inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) || (l.inc < r.inc) ||
    (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
}
inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) || (l.inc < r.inc) ||
    (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
}
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }

namespace std {
  template<> struct hash<osd_reqid_t> {
    size_t operator()(const osd_reqid_t &r) const {
      static hash<uint64_t> H;
      return H(r.name.num() ^ r.tid ^ r.inc);
    }
  };
} // namespace std

// -----

// a locator constrains the placement of an object.  mainly, which pool
// does it go in.
struct object_locator_t {
  // You specify either the hash or the key -- not both
  int64_t pool;     ///< pool id
  string key;       ///< key string (if non-empty)
  string nspace;    ///< namespace
  int64_t hash;     ///< hash position (if >= 0)

  explicit object_locator_t()
    : pool(-1), hash(-1) {}
  explicit object_locator_t(int64_t po)
    : pool(po), hash(-1)  {}
  explicit object_locator_t(int64_t po, int64_t ps)
    : pool(po), hash(ps)  {}
  explicit object_locator_t(int64_t po, string ns)
    : pool(po), nspace(ns), hash(-1) {}
  explicit object_locator_t(int64_t po, string ns, int64_t ps)
    : pool(po), nspace(ns), hash(ps) {}
  explicit object_locator_t(int64_t po, string ns, string s)
    : pool(po), key(s), nspace(ns), hash(-1) {}
  explicit object_locator_t(const hobject_t& soid)
    : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}

  int64_t get_pool() const {
    return pool;
  }

  void clear() {
    pool = -1;
    key = "";
    nspace = "";
    hash = -1;
  }

  bool empty() const {
    return pool == -1;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<object_locator_t*>& o);
};
WRITE_CLASS_ENCODER(object_locator_t)

inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
  return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
}
inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
  return !(l == r);
}

inline ostream& operator<<(ostream& out, const object_locator_t& loc)
{
  out << "@" << loc.pool;
  if (loc.nspace.length())
    out << ";" << loc.nspace;
  if (loc.key.length())
    out << ":" << loc.key;
  return out;
}
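
// Illustrative sketch (not part of the original header): a locator that pins
// an object to pool 3 in namespace "ns1".  The values are hypothetical.
//
//   object_locator_t oloc(3, "ns1");
//   assert(!oloc.empty());
//   std::cout << oloc;        // prints "@3;ns1" via the operator<< above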

struct request_redirect_t {
private:
  object_locator_t redirect_locator; ///< this is authoritative
  string redirect_object; ///< If non-empty, the request goes to this object name

  friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
public:

  request_redirect_t() {}
  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
      redirect_locator(orig) { redirect_locator.pool = rpool; }
  explicit request_redirect_t(const object_locator_t& rloc) :
      redirect_locator(rloc) {}
  explicit request_redirect_t(const object_locator_t& orig,
                              const string& robj) :
      redirect_locator(orig), redirect_object(robj) {}

  bool empty() const { return redirect_locator.empty() &&
                              redirect_object.empty(); }

  void combine_with_locator(object_locator_t& orig, string& obj) const {
    orig = redirect_locator;
    if (!redirect_object.empty())
      obj = redirect_object;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<request_redirect_t*>& o);
};
WRITE_CLASS_ENCODER(request_redirect_t)

inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
  out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
  return out;
}

// Internal OSD op flags - set by the OSD based on the op types
enum {
  CEPH_OSD_RMW_FLAG_READ        = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE       = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ  = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE     = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED         = (1 << 10),
};


// pg stuff

#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
typedef uint32_t ps_t;

// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  void encode(bufferlist& bl) const {
    ::encode_raw(v, bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    ::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)

// placement group id
struct pg_t {
  uint64_t m_pool;
  uint32_t m_seed;

  pg_t() : m_pool(0), m_seed(0) {}
  pg_t(ps_t seed, uint64_t pool) :
    m_pool(pool), m_seed(seed) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  old_pg_t get_old_pg() const {
    old_pg_t o;
    ceph_assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)-1;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  int64_t pool() const {
    return m_pool;
  }

  static const uint8_t calc_name_buf_size = 36;  // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;

  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
  }

  /**
   * Returns b such that for all objects o:
   *   (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;

  bool contains(int bits, const ghobject_t& oid) {
    return
      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
      oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) {
    return
      (int64_t)m_pool == oid.get_logical_pool() &&
      oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  void encode(bufferlist& bl) const {
    using ceph::encode;
    __u8 v = 1;
    encode(v, bl);
    encode(m_pool, bl);
    encode(m_seed, bl);
    encode((int32_t)-1, bl); // was preferred
  }
  void decode(bufferlist::const_iterator& bl) {
    using ceph::decode;
    __u8 v;
    decode(v, bl);
    decode(m_pool, bl);
    decode(m_seed, bl);
    bl.advance(sizeof(int32_t)); // was preferred
  }
  void decode_old(bufferlist::const_iterator& bl) {
    using ceph::decode;
    old_pg_t opg;
    decode(opg, bl);
    *this = opg;
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)

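// Illustrative sketch (not part of the original header): how a PG id relates
// to its children when pg_num doubles.  With pg_num = 8 the seed lives in the
// low bits of the hash; growing the pool to pg_num = 16 splits pg 3.5, and
// its new child is the pg whose seed sets the next higher bit:
//
//   pg_t parent(5, 3);                   // seed 5, pool 3 -> "3.5"
//   std::set<pg_t> kids;
//   if (parent.is_split(8, 16, &kids)) {
//     // kids == { pg_t(13, 3) }          i.e. seed 5 + 8 = 13 -> "3.d"
//   }
//
// The concrete numbers are hypothetical but follow the split rule the member
// functions above implement.
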
inline bool operator<(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.ps() < r.ps()));
}
inline bool operator<=(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.ps() <= r.ps()));
}
inline bool operator==(const pg_t& l, const pg_t& r) {
  return l.pool() == r.pool() &&
    l.ps() == r.ps();
}
inline bool operator!=(const pg_t& l, const pg_t& r) {
  return l.pool() != r.pool() ||
    l.ps() != r.ps();
}
inline bool operator>(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.ps() > r.ps()));
}
inline bool operator>=(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.ps() >= r.ps()));
}

ostream& operator<<(ostream& out, const pg_t &pg);

namespace std {
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
    }
  };
} // namespace std

struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }

  spg_t get_ancestor(unsigned old_pg_num) const {
    return spg_t(pgid.get_ancestor(old_pg_num), shard);
  }

  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
                set<spg_t> *pchildren) const {
    set<pg_t> _children;
    set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (set<pg_t>::iterator i = _children.begin();
           i != _children.end();
           ++i) {
        pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return pgid.is_merge_target(old_pg_num, new_pg_num);
  }
  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
                       spg_t *parent) const {
    spg_t out = *this;
    bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
    if (r && parent) {
      *parent = out;
    }
    return r;
  }

  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    encode(pgid, bl);
    encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(pgid, bl);
    decode(shard, bl);
    DECODE_FINISH(bl);
  }

  ghobject_t make_temp_ghobject(const string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
                pgid.ps(),
                hobject_t::get_temp_pool(pgid.pool()),
                ""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)

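// Illustrative sketch (not part of the original header): the textual forms
// accepted by spg_t::parse().  The ids are hypothetical.
//
//   spg_t a, b;
//   a.parse("3.7");    // replicated pool: pool 3, seed 0x7, NO_SHARD
//   b.parse("3.7s1");  // erasure-coded pool: same pg, shard 1
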
namespace std {
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
    {
      static hash<uint32_t> H;
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std

ostream& operator<<(ostream& out, const spg_t &pg);

// ----------------------

class coll_t {
  enum type_t {
    TYPE_META = 0,
    TYPE_LEGACY_TEMP = 1,  /* no longer used */
    TYPE_PG = 2,
    TYPE_PG_TEMP = 3,
  };
  type_t type;
  spg_t pgid;
  uint64_t removal_seq;  // note: deprecated, not encoded

  char _str_buff[spg_t::calc_name_buf_size];
  char *_str;

  void calc_str();

  coll_t(type_t t, spg_t p, uint64_t r)
    : type(t), pgid(p), removal_seq(r) {
    calc_str();
  }

public:
  coll_t() : type(TYPE_META), removal_seq(0)
  {
    calc_str();
  }

  coll_t(const coll_t& other)
    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
    calc_str();
  }

  explicit coll_t(spg_t pgid)
    : type(TYPE_PG), pgid(pgid), removal_seq(0)
  {
    calc_str();
  }

  coll_t& operator=(const coll_t& rhs)
  {
    this->type = rhs.type;
    this->pgid = rhs.pgid;
    this->removal_seq = rhs.removal_seq;
    this->calc_str();
    return *this;
  }

  // named constructors
  static coll_t meta() {
    return coll_t();
  }
  static coll_t pg(spg_t p) {
    return coll_t(p);
  }

  const std::string to_str() const {
    return string(_str);
  }
  const char *c_str() const {
    return _str;
  }

  bool parse(const std::string& s);

  int operator<(const coll_t &rhs) const {
    return type < rhs.type ||
      (type == rhs.type && pgid < rhs.pgid);
  }

  bool is_meta() const {
    return type == TYPE_META;
  }
  bool is_pg_prefix(spg_t *pgid_) const {
    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_pg() const {
    return type == TYPE_PG;
  }
  bool is_pg(spg_t *pgid_) const {
    if (type == TYPE_PG) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_temp() const {
    return type == TYPE_PG_TEMP;
  }
  bool is_temp(spg_t *pgid_) const {
    if (type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);
  size_t encoded_size() const;

  inline bool operator==(const coll_t& rhs) const {
    // only compare type if meta
    if (type != rhs.type)
      return false;
    if (type == TYPE_META)
      return true;
    return type == rhs.type && pgid == rhs.pgid;
  }
  inline bool operator!=(const coll_t& rhs) const {
    return !(*this == rhs);
  }

  // get a TEMP collection that corresponds to the current collection,
  // which we presume is a pg collection.
  coll_t get_temp() const {
    ceph_assert(type == TYPE_PG);
    return coll_t(TYPE_PG_TEMP, pgid, 0);
  }

  ghobject_t get_min_hobj() const {
    ghobject_t o;
    switch (type) {
    case TYPE_PG:
      o.hobj.pool = pgid.pool();
      o.set_shard(pgid.shard);
      break;
    case TYPE_META:
      o.hobj.pool = -1;
      break;
    default:
      break;
    }
    return o;
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    if (type == TYPE_PG)
      return pgid.hash_to_shard(num_shards);
    return 0;  // whatever.
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<coll_t*>& o);
};

WRITE_CLASS_ENCODER(coll_t)

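// Illustrative sketch (not part of the original header): the interesting
// collections for a PG, built through the named constructors above.  The pg
// id string is hypothetical, and the name suffixes are only indicative of
// what calc_str() produces.
//
//   spg_t pgid;
//   if (pgid.parse("3.7")) {
//     coll_t data = coll_t::pg(pgid);   // the PG's data collection ("3.7_head"-style name)
//     coll_t temp = data.get_temp();    // matching TEMP collection
//     coll_t meta = coll_t::meta();     // the OSD-wide "meta" collection
//   }
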
inline ostream& operator<<(ostream& out, const coll_t& c) {
  out << c.to_str();
  return out;
}

namespace std {
  template<> struct hash<coll_t> {
    size_t operator()(const coll_t &c) const {
      size_t h = 0;
      string str(c.to_str());
      std::string::const_iterator end(str.end());
      for (std::string::const_iterator s = str.begin(); s != end; ++s) {
        h += *s;
        h += (h << 10);
        h ^= (h >> 6);
      }
      h += (h << 3);
      h ^= (h >> 11);
      h += (h << 15);
      return h;
    }
  };
} // namespace std

inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
{
  out << pg_t(ol.ol_pgid);
  int su = ol.ol_stripe_unit;
  if (su)
    out << ".su=" << su;
  return out;
}


// compound rados version type
/* WARNING: if you add a member to eversion_t, make sure the encode/decode
 * functions still work correctly.  On little-endian machines the raw
 * memory-copy encoding below relies on there being no padding between the
 * members, on both 32-bit and 64-bit builds.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }

  static const eversion_t& max() {
    static const eversion_t max(-1,-1);
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  string get_key_name() const;

  // key must point to the beginning of a block of 32 chars
  inline void get_key_name(char* key) const {
    // Below is equivalent of sprintf("%010u.%020llu");
    key[31] = 0;
    ritoa<uint64_t, 10, 20>(version, key + 31);
    key[10] = '.';
    ritoa<uint32_t, 10, 10>(epoch, key + 10);
  }

  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    using ceph::encode;
    encode(version, bl);
    encode(epoch, bl);
#endif
  }
  void decode(bufferlist::const_iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    using ceph::decode;
    decode(version, bl);
    decode(epoch, bl);
#endif
  }
  void decode(bufferlist& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)

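// Illustrative sketch (not part of the original header): the two textual
// forms of an eversion.  For eversion_t(21, 5) -- epoch 21, version 5 --
//
//   get_key_name() writes the fixed-width key "0000000021.00000000000000000005"
//   operator<< (below) prints the short form   "21'5"
//
// The values are hypothetical.
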
inline bool operator==(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) && (l.version == r.version);
}
inline bool operator!=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch != r.epoch) || (l.version != r.version);
}
inline bool operator<(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
}
inline bool operator<=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
}
inline bool operator>(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
}
inline bool operator>=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
}
inline ostream& operator<<(ostream& out, const eversion_t& e) {
  return out << e.epoch << "'" << e.version;
}

/**
 * objectstore_perf_stat_t
 *
 * current perf information about the osd
 */
struct objectstore_perf_stat_t {
  // cur_op_latency is in ns since double add/sub are not associative
  uint64_t os_commit_latency_ns;
  uint64_t os_apply_latency_ns;

  objectstore_perf_stat_t() :
    os_commit_latency_ns(0), os_apply_latency_ns(0) {}

  bool operator==(const objectstore_perf_stat_t &r) const {
    return os_commit_latency_ns == r.os_commit_latency_ns &&
      os_apply_latency_ns == r.os_apply_latency_ns;
  }

  void add(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns += o.os_commit_latency_ns;
    os_apply_latency_ns += o.os_apply_latency_ns;
  }
  void sub(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns -= o.os_commit_latency_ns;
    os_apply_latency_ns -= o.os_apply_latency_ns;
  }
  void dump(Formatter *f) const;
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator &bl);
  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)

/*
 * pg states
 */
#define PG_STATE_CREATING         (1ULL << 0)  // creating
#define PG_STATE_ACTIVE           (1ULL << 1)  // i am active.  (primary: replicas too)
#define PG_STATE_CLEAN            (1ULL << 2)  // peers are complete, clean of stray replicas.
#define PG_STATE_DOWN             (1ULL << 4)  // a needed replica is down, PG offline
#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5)  // recovery stopped due to unfound
#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6)  // backfill stopped due to unfound
#define PG_STATE_PREMERGE         (1ULL << 7)  // i am preparing to merge
#define PG_STATE_SCRUBBING        (1ULL << 8)  // scrubbing
//#define PG_STATE_SCRUBQ         (1ULL << 9)  // queued for scrub
#define PG_STATE_DEGRADED         (1ULL << 10) // pg contains objects with reduced redundancy
#define PG_STATE_INCONSISTENT     (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
#define PG_STATE_PEERING          (1ULL << 12) // pg is (re)peering
#define PG_STATE_REPAIR           (1ULL << 13) // pg should repair on next scrub
#define PG_STATE_RECOVERING       (1ULL << 14) // pg is recovering/migrating objects
#define PG_STATE_BACKFILL_WAIT    (1ULL << 15) // [active] reserving backfill
#define PG_STATE_INCOMPLETE       (1ULL << 16) // incomplete content, peering failed.
#define PG_STATE_STALE            (1ULL << 17) // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED         (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB       (1ULL << 19) // deep scrub: check CRC32 on files
#define PG_STATE_BACKFILLING      (1ULL << 20) // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT    (1ULL << 22) // waiting for recovery reservations
#define PG_STATE_UNDERSIZED       (1ULL << 23) // pg acting < pool size
#define PG_STATE_ACTIVATING       (1ULL << 24) // pg is peered but not yet active
#define PG_STATE_PEERED           (1ULL << 25) // peered, cannot go active, can recover
#define PG_STATE_SNAPTRIM         (1ULL << 26) // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT    (1ULL << 27) // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR   (1ULL << 29) // error stopped trimming snaps
#define PG_STATE_FORCED_RECOVERY  (1ULL << 30) // force recovery of this pg before any other
#define PG_STATE_FORCED_BACKFILL  (1ULL << 31) // force backfill of this pg before any other
#define PG_STATE_FAILED_REPAIR    (1ULL << 32) // a repair failed to fix all errors

std::string pg_state_string(uint64_t state);
std::string pg_vector_string(const vector<int32_t> &a);
boost::optional<uint64_t> pg_string_state(const std::string& state);

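// Illustrative sketch (not part of the original header): the states above are
// bits in a single uint64_t, so a PG that is active, clean and remapped would
// carry
//
//   uint64_t state = PG_STATE_ACTIVE | PG_STATE_CLEAN | PG_STATE_REMAPPED;
//   std::string s = pg_state_string(state);   // e.g. "active+clean+remapped"
//
// The "+"-joined string form is produced by pg_state_string(), whose
// definition lives alongside this header rather than in it.
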
/*
 * pool_snap_info_t
 *
 * attributes for a single pool snapshot.
 */
struct pool_snap_info_t {
  snapid_t snapid;
  utime_t stamp;
  string name;

  void dump(Formatter *f) const;
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::const_iterator& bl);
  static void generate_test_instances(list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)

inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
}


/*
 * pool_opts_t
 *
 * pool options.
 */

class pool_opts_t {
public:
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN,         // min pg_num
    TARGET_SIZE_BYTES,  // total bytes in pool
    TARGET_SIZE_RATIO,  // fraction of total cluster
    PG_AUTOSCALE_BIAS,
  };

  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int64_t,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, Formatter *f) const;

  void dump(Formatter *f) const;
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
};
WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)

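// Illustrative sketch (not part of the original header): the typed get/set
// interface above.  The requested C++ type must match the option's declared
// type_t (INT here), otherwise boost::get throws.  Values are hypothetical.
//
//   pool_opts_t opts;
//   opts.set(pool_opts_t::RECOVERY_PRIORITY, static_cast<int64_t>(5));
//
//   int64_t prio = 0;
//   if (opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio)) {
//     // prio == 5
//   }
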
struct pg_merge_meta_t {
  pg_t source_pgid;
  epoch_t ready_epoch = 0;
  epoch_t last_epoch_started = 0;
  epoch_t last_epoch_clean = 0;
  eversion_t source_version;
  eversion_t target_version;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& p) {
    DECODE_START(1, p);
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);
    DECODE_FINISH(p);
  }
  void dump(Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
  }
};
WRITE_CLASS_ENCODER(pg_merge_meta_t)

/*
 * pg_pool
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  enum {
    TYPE_REPLICATED = 1,     // replication
    //TYPE_RAID4 = 2,        // raid4 (never implemented)
    TYPE_ERASURE = 3,        // erasure-coded
  };
  static std::string_view get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
    //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }

  enum {
    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
    FLAG_FULL       = 1<<1, // pool is full
    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
    FLAG_NODELETE = 1<<4, // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8, // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
    FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11, // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
    FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
    FLAG_POOL_SNAPS = 1<<14,        // pool has pool snaps
    FLAG_CREATING = 1<<15,          // initial pool PGs are being created
  };

  static const char *get_flag_name(int f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_QUOTA: return "full_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
    case FLAG_POOL_SNAPS: return "pool_snaps";
    case FLAG_CREATING: return "creating";
    default: return "???";
    }
  }
  static string get_flags_string(uint64_t f) {
    string s;
    for (unsigned n=0; f && n<64; ++n) {
      if (f & (1ull << n)) {
        if (s.length())
          s += ",";
        s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }
  static uint64_t get_flag_by_name(const string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_quota")
      return FLAG_FULL_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    if (name == "selfmanaged_snaps")
      return FLAG_SELFMANAGED_SNAPS;
    if (name == "pool_snaps")
      return FLAG_POOL_SNAPS;
    if (name == "creating")
      return FLAG_CREATING;
    return 0;
  }

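  // Illustrative sketch (not part of the original header): round-tripping a
  // flag through the helpers above.
  //
  //   uint64_t f = pg_pool_t::get_flag_by_name("nodelete");    // FLAG_NODELETE
  //   pg_pool_t::get_flags_string(f | FLAG_NOSCRUB);           // "nodelete,noscrub"
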
  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;

  typedef enum {
    CACHEMODE_NONE = 0,                  ///< no caching
    CACHEMODE_WRITEBACK = 1,             ///< write to cache, flush later
    CACHEMODE_FORWARD = 2,               ///< forward if not in cache
    CACHEMODE_READONLY = 3,              ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4,           ///< forward reads, write to cache flush later
    CACHEMODE_READPROXY = 5,             ///< proxy reads, write to cache flush later
    CACHEMODE_PROXY = 6,                 ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  static cache_mode_t get_cache_mode_from_str(const string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      ceph_abort_msg("implement me");
    }
  }

  enum {
    PG_AUTOSCALE_MODE_OFF = 0,
    PG_AUTOSCALE_MODE_WARN = 1,
    PG_AUTOSCALE_MODE_ON = 2,
  };
  static const char *get_pg_autoscale_mode_name(int m) {
    switch (m) {
    case PG_AUTOSCALE_MODE_OFF: return "off";
    case PG_AUTOSCALE_MODE_ON: return "on";
    case PG_AUTOSCALE_MODE_WARN: return "warn";
    default: return "???";
    }
  }
  static int get_pg_autoscale_mode_by_name(const string& m) {
    if (m == "off") {
      return PG_AUTOSCALE_MODE_OFF;
    }
    if (m == "warn") {
      return PG_AUTOSCALE_MODE_WARN;
    }
    if (m == "on") {
      return PG_AUTOSCALE_MODE_ON;
    }
    return -1;
  }

  utime_t create_time;
  uint64_t flags;           ///< FLAG_*
  __u8 type;                ///< TYPE_*
  __u8 size, min_size;      ///< number of osds in each pg
  __u8 crush_rule;          ///< crush placement rule
  __u8 object_hash;         ///< hash mapping object name to ps
  __u8 pg_autoscale_mode;   ///< PG_AUTOSCALE_MODE_
private:
  __u32 pg_num = 0, pgp_num = 0;  ///< number of pgs
  __u32 pg_num_pending = 0;       ///< pg_num we are about to merge down to
  __u32 pg_num_target = 0;        ///< pg_num we should converge toward
  __u32 pgp_num_target = 0;       ///< pgp_num we should converge toward

public:
  map<string,string> properties;  ///< OBSOLETE
  string erasure_code_profile;    ///< name of the erasure code profile in OSDMap
  epoch_t last_change;            ///< most recent epoch changed, excluding snapshot changes

  /// last epoch that forced clients to resend
  epoch_t last_force_op_resend = 0;
  /// last epoch that forced clients to resend (pre-nautilus clients only)
  epoch_t last_force_op_resend_prenautilus = 0;
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous = 0;

  /// metadata for the most recent PG merge
  pg_merge_meta_t last_pg_merge_meta;

  snapid_t snap_seq;        ///< seq for per-pool snapshot
  epoch_t snap_epoch;       ///< osdmap epoch of last snap
  uint64_t auid;            ///< who owns the pg

  uint64_t quota_max_bytes;   ///< maximum number of bytes for this pool
  uint64_t quota_max_objects; ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool).  These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used).  Snaps and removed_snaps are to be used exclusive of each
   * other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask, pgp_num_mask;

  set<uint64_t> tiers;      ///< pools that are tiers of us
  int64_t tier_of;          ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier;        ///< pool/tier for objecter to direct reads to
  int64_t write_tier;       ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode;  ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  uint64_t target_max_bytes;   ///< tiering: target max pool size
  uint64_t target_max_objects; ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro;      ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush at high speed
  uint32_t cache_target_full_ratio_micro;       ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age;  ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age;  ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
  uint32_t hit_set_period;       ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count;        ///< number of periods to retain
  bool use_gmt_hitset;           ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote on read
  uint32_t min_write_recency_for_promote;  ///< minimum number of HitSet to check before promote on write
  uint32_t hit_set_grade_decay_rate;  ///< the current hit_set has the highest priority for object
                                      ///< temperature counting; each older hit_set's priority
                                      ///< decays by this rate relative to the previous one
  uint32_t hit_set_search_last_n;     ///< accumulate at most N hit_sets for temperature

  uint32_t stripe_width;         ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates
                                 ///< user does not specify any expected value
  bool fast_read;                ///< whether to turn on fast read on the pool or not

  pool_opts_t opts;              ///< options

  typedef enum {
    TYPE_FINGERPRINT_NONE = 0,
    TYPE_FINGERPRINT_SHA1 = 1,
  } fingerprint_t;
  static fingerprint_t get_fingerprint_from_str(const string& s) {
    if (s == "none")
      return TYPE_FINGERPRINT_NONE;
    if (s == "sha1")
      return TYPE_FINGERPRINT_SHA1;
    return (fingerprint_t)-1;
  }
  const fingerprint_t get_fingerprint_type() const {
    string fp_str;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    return get_fingerprint_from_str(fp_str);
  }
  const char *get_fingerprint_name() const {
    string fp_str;
    fingerprint_t fp_t;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    fp_t = get_fingerprint_from_str(fp_str);
    return get_fingerprint_name(fp_t);
  }
  static const char *get_fingerprint_name(fingerprint_t m) {
    switch (m) {
    case TYPE_FINGERPRINT_NONE: return "none";
    case TYPE_FINGERPRINT_SHA1: return "sha1";
    default: return "unknown";
    }
  }

  /// application -> key/value metadata
  map<string, std::map<string, string>> application_metadata;

private:
  vector<uint32_t> grade_table;

public:
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }

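  // Illustrative sketch (not part of the original header): with
  // hit_set_count = 3 and hit_set_grade_decay_rate = 20, calc_grade_table()
  // produces grades that shrink by 20% per (older) hit set:
  //
  //   grade_table[0] = 1000000 * 0.8 = 800000
  //   grade_table[1] =  800000 * 0.8 = 640000
  //   grade_table[2] =  640000 * 0.8 = 512000
  //
  // The parameter values are hypothetical.
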
  pg_pool_t()
    : flags(0), type(0), size(0), min_size(0),
      crush_rule(0), object_hash(0),
      last_change(0),
      snap_seq(0), snap_epoch(0),
      auid(0),
      quota_max_bytes(0), quota_max_objects(0),
      pg_num_mask(0), pgp_num_mask(0),
      tier_of(-1), read_tier(-1), write_tier(-1),
      cache_mode(CACHEMODE_NONE),
      target_max_bytes(0), target_max_objects(0),
      cache_target_dirty_ratio_micro(0),
      cache_target_dirty_high_ratio_micro(0),
      cache_target_full_ratio_micro(0),
      cache_min_flush_age(0),
      cache_min_evict_age(0),
      hit_set_params(),
      hit_set_period(0),
      hit_set_count(0),
      use_gmt_hitset(true),
      min_read_recency_for_promote(0),
      min_write_recency_for_promote(0),
      hit_set_grade_decay_rate(0),
      hit_set_search_last_n(0),
      stripe_width(0),
      expected_num_objects(0),
      fast_read(false),
      opts()
  { }

  void dump(Formatter *f) const;

  const utime_t &get_create_time() const { return create_time; }
  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool require_rollback() const {
    return is_erasure();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_prenautilus() const {
    return last_force_op_resend_prenautilus;
  }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return !(get_type() == TYPE_ERASURE);
  }

  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      ceph_abort_msg("unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }
  unsigned get_pg_num_target() const { return pg_num_target; }
  unsigned get_pgp_num_target() const { return pgp_num_target; }
  unsigned get_pg_num_pending() const { return pg_num_pending; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

  // if pg_num is not a power of two, pgs are not equally sized.
  // return, for a given pg, the fraction (denominator) of the total
  // pool size that it represents.
  unsigned get_pg_num_divisor(pg_t pgid) const;

  bool is_pending_merge(pg_t pgid, bool *target) const;

  void set_pg_num(int p) {
    pg_num = p;
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pgp_num(int p) {
    pgp_num = p;
    calc_pg_masks();
  }
  void set_pg_num_pending(int p) {
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pg_num_target(int p) {
    pg_num_target = p;
  }
  void set_pgp_num_target(int p) {
    pgp_num_target = p;
  }
  void dec_pg_num(pg_t source_pgid,
                  epoch_t ready_epoch,
                  eversion_t source_version,
                  eversion_t target_version,
                  epoch_t last_epoch_started,
                  epoch_t last_epoch_clean) {
    --pg_num;
    last_pg_merge_meta.source_pgid = source_pgid;
    last_pg_merge_meta.ready_epoch = ready_epoch;
    last_pg_merge_meta.source_version = source_version;
    last_pg_merge_meta.target_version = target_version;
    last_pg_merge_meta.last_epoch_started = last_epoch_started;
    last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
    calc_pg_masks();
  }

  void set_quota_max_bytes(uint64_t m) {
    quota_max_bytes = m;
  }
  uint64_t get_quota_max_bytes() {
    return quota_max_bytes;
  }

  void set_quota_max_objects(uint64_t m) {
    quota_max_objects = m;
  }
  uint64_t get_quota_max_objects() {
    return quota_max_objects;
  }

  void set_last_force_op_resend(uint64_t t) {
    last_force_op_resend = t;
    last_force_op_resend_prenautilus = t;
    last_force_op_resend_preluminous = t;
  }

  void calc_pg_masks();

1681 /*
1682 * we have two snap modes:
1683 * - pool global snaps
1684 * - snap existence/non-existence defined by snaps[] and snap_seq
1685 * - user managed snaps
1686 * - removal governed by removed_snaps
1687 *
1688 * we know which mode we're using based on whether removed_snaps is empty.
1689 * If nothing has been created, both functions report false.
1690 */
1691 bool is_pool_snaps_mode() const;
1692 bool is_unmanaged_snaps_mode() const;
1693 bool is_removed_snap(snapid_t s) const;
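  // Editor-added sketch (names and error code are hypothetical): callers
  // usually check the snap mode before touching snapshot state, e.g.
  //
  //   if (pool.is_unmanaged_snaps_mode())
  //     return -EINVAL;                    // pool uses self-managed snaps
  //   pool.add_snap("nightly", ceph_clock_now());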
1694
1695 /*
1696 * build set of known-removed sets from either pool snaps or
1697 * explicit removed_snaps set.
1698 */
1699 void build_removed_snaps(interval_set<snapid_t>& rs) const;
91327a77 1700 bool maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const;
1701 snapid_t snap_exists(const char *s) const;
1702 void add_snap(const char *n, utime_t stamp);
1703 void add_unmanaged_snap(uint64_t& snapid);
1704 void remove_snap(snapid_t s);
1705 void remove_unmanaged_snap(snapid_t s);
1706
1707 SnapContext get_snap_context() const;
1708
 1709 /// hash an object name+namespace key to a hash position
1710 uint32_t hash_key(const string& key, const string& ns) const;
1711
1712 /// round a hash position down to a pg num
1713 uint32_t raw_hash_to_pg(uint32_t v) const;
1714
1715 /*
1716 * map a raw pg (with full precision ps) into an actual pg, for storage
1717 */
1718 pg_t raw_pg_to_pg(pg_t pg) const;
1719
1720 /*
1721 * map raw pg (full precision ps) into a placement seed. include
1722 * pool id in that value so that different pools don't use the same
1723 * seeds.
1724 */
1725 ps_t raw_pg_to_pps(pg_t pg) const;
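  // Editor's sketch (illustrative, simplified): placement is derived roughly
  // as
  //
  //   uint32_t h = pool.hash_key(oid.name, oid.nspace);   // object -> hash
  //   pg_t raw(pool.raw_hash_to_pg(h), poolid);           // hash -> raw pg
  //   pg_t pg  = pool.raw_pg_to_pg(raw);                  // fold onto pg_num
  //   ps_t pps = pool.raw_pg_to_pps(raw);                 // seed fed to CRUSH
  //
  // raw_pg_to_pps() mixes the pool id into the seed so two pools with the
  // same pg_num do not produce identical CRUSH inputs.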
1726
1727 /// choose a random hash position within a pg
1728 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1729
1730 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 1731 void decode(bufferlist::const_iterator& bl);
1732
1733 static void generate_test_instances(list<pg_pool_t*>& o);
1734};
1735WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1736
1737ostream& operator<<(ostream& out, const pg_pool_t& p);
1738
1739
1740/**
1741 * a summation of object stats
1742 *
1743 * This is just a container for object stats; we don't know what for.
1744 *
 1745 * If you add members to object_stat_sum_t, make sure there is no
 1746 * padding among these members.
1747 * You should also modify the padding_check function.
1748
1749 */
1750struct object_stat_sum_t {
1751 /**************************************************************************
1752 * WARNING: be sure to update operator==, floor, and split when
1753 * adding/removing fields!
1754 **************************************************************************/
1755 int64_t num_bytes; // in bytes
1756 int64_t num_objects;
1757 int64_t num_object_clones;
1758 int64_t num_object_copies; // num_objects * num_replicas
1759 int64_t num_objects_missing_on_primary;
1760 int64_t num_objects_degraded;
1761 int64_t num_objects_unfound;
1762 int64_t num_rd;
1763 int64_t num_rd_kb;
1764 int64_t num_wr;
1765 int64_t num_wr_kb;
1766 int64_t num_scrub_errors; // total deep and shallow scrub errors
1767 int64_t num_objects_recovered;
1768 int64_t num_bytes_recovered;
1769 int64_t num_keys_recovered;
1770 int64_t num_shallow_scrub_errors;
1771 int64_t num_deep_scrub_errors;
1772 int64_t num_objects_dirty;
1773 int64_t num_whiteouts;
1774 int64_t num_objects_omap;
1775 int64_t num_objects_hit_set_archive;
1776 int64_t num_objects_misplaced;
1777 int64_t num_bytes_hit_set_archive;
1778 int64_t num_flush;
1779 int64_t num_flush_kb;
1780 int64_t num_evict;
1781 int64_t num_evict_kb;
1782 int64_t num_promote;
1783 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1784 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1785 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1786 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1787 int64_t num_objects_pinned;
1788 int64_t num_objects_missing;
1789 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
28e407b8 1790 int64_t num_large_omap_objects = 0;
1791 int64_t num_objects_manifest = 0;
1792 int64_t num_omap_bytes = 0;
1793 int64_t num_omap_keys = 0;
1794 int64_t num_objects_repaired = 0;
1795
1796 object_stat_sum_t()
1797 : num_bytes(0),
1798 num_objects(0), num_object_clones(0), num_object_copies(0),
1799 num_objects_missing_on_primary(0), num_objects_degraded(0),
1800 num_objects_unfound(0),
1801 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1802 num_scrub_errors(0),
1803 num_objects_recovered(0),
1804 num_bytes_recovered(0),
1805 num_keys_recovered(0),
1806 num_shallow_scrub_errors(0),
1807 num_deep_scrub_errors(0),
1808 num_objects_dirty(0),
1809 num_whiteouts(0),
1810 num_objects_omap(0),
1811 num_objects_hit_set_archive(0),
1812 num_objects_misplaced(0),
1813 num_bytes_hit_set_archive(0),
1814 num_flush(0),
1815 num_flush_kb(0),
1816 num_evict(0),
1817 num_evict_kb(0),
1818 num_promote(0),
1819 num_flush_mode_high(0), num_flush_mode_low(0),
1820 num_evict_mode_some(0), num_evict_mode_full(0),
1821 num_objects_pinned(0),
1822 num_objects_missing(0),
1823 num_legacy_snapsets(0)
1824 {}
1825
1826 void floor(int64_t f) {
1827#define FLOOR(x) if (x < f) x = f
1828 FLOOR(num_bytes);
1829 FLOOR(num_objects);
1830 FLOOR(num_object_clones);
1831 FLOOR(num_object_copies);
1832 FLOOR(num_objects_missing_on_primary);
1833 FLOOR(num_objects_missing);
1834 FLOOR(num_objects_degraded);
1835 FLOOR(num_objects_misplaced);
1836 FLOOR(num_objects_unfound);
1837 FLOOR(num_rd);
1838 FLOOR(num_rd_kb);
1839 FLOOR(num_wr);
1840 FLOOR(num_wr_kb);
28e407b8 1841 FLOOR(num_large_omap_objects);
1842 FLOOR(num_objects_manifest);
1843 FLOOR(num_omap_bytes);
1844 FLOOR(num_omap_keys);
1845 FLOOR(num_shallow_scrub_errors);
1846 FLOOR(num_deep_scrub_errors);
94b18763 1847 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
1848 FLOOR(num_objects_recovered);
1849 FLOOR(num_bytes_recovered);
1850 FLOOR(num_keys_recovered);
1851 FLOOR(num_objects_dirty);
1852 FLOOR(num_whiteouts);
1853 FLOOR(num_objects_omap);
1854 FLOOR(num_objects_hit_set_archive);
1855 FLOOR(num_bytes_hit_set_archive);
1856 FLOOR(num_flush);
1857 FLOOR(num_flush_kb);
1858 FLOOR(num_evict);
1859 FLOOR(num_evict_kb);
1860 FLOOR(num_promote);
1861 FLOOR(num_flush_mode_high);
1862 FLOOR(num_flush_mode_low);
1863 FLOOR(num_evict_mode_some);
1864 FLOOR(num_evict_mode_full);
1865 FLOOR(num_objects_pinned);
1866 FLOOR(num_legacy_snapsets);
11fdf7f2 1867 FLOOR(num_objects_repaired);
1868#undef FLOOR
1869 }
1870
1871 void split(vector<object_stat_sum_t> &out) const {
1872#define SPLIT(PARAM) \
1873 for (unsigned i = 0; i < out.size(); ++i) { \
1874 out[i].PARAM = PARAM / out.size(); \
1875 if (i < (PARAM % out.size())) { \
1876 out[i].PARAM++; \
1877 } \
1878 }
1879#define SPLIT_PRESERVE_NONZERO(PARAM) \
1880 for (unsigned i = 0; i < out.size(); ++i) { \
1881 if (PARAM) \
1882 out[i].PARAM = 1 + PARAM / out.size(); \
1883 else \
1884 out[i].PARAM = 0; \
1885 }
1886
1887 SPLIT(num_bytes);
1888 SPLIT(num_objects);
1889 SPLIT(num_object_clones);
1890 SPLIT(num_object_copies);
1891 SPLIT(num_objects_missing_on_primary);
1892 SPLIT(num_objects_missing);
1893 SPLIT(num_objects_degraded);
1894 SPLIT(num_objects_misplaced);
1895 SPLIT(num_objects_unfound);
1896 SPLIT(num_rd);
1897 SPLIT(num_rd_kb);
1898 SPLIT(num_wr);
1899 SPLIT(num_wr_kb);
1900 SPLIT(num_large_omap_objects);
1901 SPLIT(num_objects_manifest);
1902 SPLIT(num_omap_bytes);
1903 SPLIT(num_omap_keys);
1904 SPLIT(num_objects_repaired);
1905 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1906 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1907 for (unsigned i = 0; i < out.size(); ++i) {
1908 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1909 out[i].num_deep_scrub_errors;
1910 }
1911 SPLIT(num_objects_recovered);
1912 SPLIT(num_bytes_recovered);
1913 SPLIT(num_keys_recovered);
1914 SPLIT(num_objects_dirty);
1915 SPLIT(num_whiteouts);
1916 SPLIT(num_objects_omap);
1917 SPLIT(num_objects_hit_set_archive);
1918 SPLIT(num_bytes_hit_set_archive);
1919 SPLIT(num_flush);
1920 SPLIT(num_flush_kb);
1921 SPLIT(num_evict);
1922 SPLIT(num_evict_kb);
1923 SPLIT(num_promote);
1924 SPLIT(num_flush_mode_high);
1925 SPLIT(num_flush_mode_low);
1926 SPLIT(num_evict_mode_some);
1927 SPLIT(num_evict_mode_full);
1928 SPLIT(num_objects_pinned);
1929 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1930#undef SPLIT
1931#undef SPLIT_PRESERVE_NONZERO
1932 }
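  // Editor-added worked example: splitting a parent with num_objects == 10
  // across out.size() == 3 children yields 4, 3, 3; SPLIT() spreads the
  // remainder over the first children, while SPLIT_PRESERVE_NONZERO() keeps
  // small non-zero counters (e.g. a single scrub error) visible on every
  // child instead of rounding them down to zero.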
1933
1934 void clear() {
1935 memset(this, 0, sizeof(*this));
1936 }
1937
1938 void calc_copies(int nrep) {
1939 num_object_copies = nrep * num_objects;
1940 }
1941
1942 bool is_zero() const {
1943 return mem_is_zero((char*)this, sizeof(*this));
1944 }
1945
1946 void add(const object_stat_sum_t& o);
1947 void sub(const object_stat_sum_t& o);
1948
1949 void dump(Formatter *f) const;
1950 void padding_check() {
1951 static_assert(
1952 sizeof(object_stat_sum_t) ==
1953 sizeof(num_bytes) +
1954 sizeof(num_objects) +
1955 sizeof(num_object_clones) +
1956 sizeof(num_object_copies) +
1957 sizeof(num_objects_missing_on_primary) +
1958 sizeof(num_objects_degraded) +
1959 sizeof(num_objects_unfound) +
1960 sizeof(num_rd) +
1961 sizeof(num_rd_kb) +
1962 sizeof(num_wr) +
1963 sizeof(num_wr_kb) +
1964 sizeof(num_scrub_errors) +
28e407b8 1965 sizeof(num_large_omap_objects) +
1966 sizeof(num_objects_manifest) +
1967 sizeof(num_omap_bytes) +
1968 sizeof(num_omap_keys) +
1969 sizeof(num_objects_repaired) +
1970 sizeof(num_objects_recovered) +
1971 sizeof(num_bytes_recovered) +
1972 sizeof(num_keys_recovered) +
1973 sizeof(num_shallow_scrub_errors) +
1974 sizeof(num_deep_scrub_errors) +
1975 sizeof(num_objects_dirty) +
1976 sizeof(num_whiteouts) +
1977 sizeof(num_objects_omap) +
1978 sizeof(num_objects_hit_set_archive) +
1979 sizeof(num_objects_misplaced) +
1980 sizeof(num_bytes_hit_set_archive) +
1981 sizeof(num_flush) +
1982 sizeof(num_flush_kb) +
1983 sizeof(num_evict) +
1984 sizeof(num_evict_kb) +
1985 sizeof(num_promote) +
1986 sizeof(num_flush_mode_high) +
1987 sizeof(num_flush_mode_low) +
1988 sizeof(num_evict_mode_some) +
1989 sizeof(num_evict_mode_full) +
1990 sizeof(num_objects_pinned) +
1991 sizeof(num_objects_missing) +
1992 sizeof(num_legacy_snapsets)
1993 ,
1994 "object_stat_sum_t have padding");
1995 }
1996 void encode(bufferlist& bl) const;
11fdf7f2 1997 void decode(bufferlist::const_iterator& bl);
1998 static void generate_test_instances(list<object_stat_sum_t*>& o);
1999};
2000WRITE_CLASS_ENCODER(object_stat_sum_t)
2001
2002bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
2003
2004/**
2005 * a collection of object stat sums
2006 *
2007 * This is a collection of stat sums over different categories.
2008 */
2009struct object_stat_collection_t {
2010 /**************************************************************************
2011 * WARNING: be sure to update the operator== when adding/removing fields! *
2012 **************************************************************************/
2013 object_stat_sum_t sum;
2014
2015 void calc_copies(int nrep) {
2016 sum.calc_copies(nrep);
2017 }
2018
2019 void dump(Formatter *f) const;
2020 void encode(bufferlist& bl) const;
11fdf7f2 2021 void decode(bufferlist::const_iterator& bl);
2022 static void generate_test_instances(list<object_stat_collection_t*>& o);
2023
2024 bool is_zero() const {
2025 return sum.is_zero();
2026 }
2027
2028 void clear() {
2029 sum.clear();
2030 }
2031
2032 void floor(int64_t f) {
2033 sum.floor(f);
2034 }
2035
2036 void add(const object_stat_sum_t& o) {
2037 sum.add(o);
2038 }
2039
2040 void add(const object_stat_collection_t& o) {
2041 sum.add(o.sum);
2042 }
2043 void sub(const object_stat_collection_t& o) {
2044 sum.sub(o.sum);
2045 }
2046};
2047WRITE_CLASS_ENCODER(object_stat_collection_t)
2048
2049inline bool operator==(const object_stat_collection_t& l,
2050 const object_stat_collection_t& r) {
2051 return l.sum == r.sum;
2052}
2053
2054
2055/** pg_stat
2056 * aggregate stats for a single PG.
2057 */
2058struct pg_stat_t {
2059 /**************************************************************************
2060 * WARNING: be sure to update the operator== when adding/removing fields! *
2061 **************************************************************************/
2062 eversion_t version;
2063 version_t reported_seq; // sequence number
2064 epoch_t reported_epoch; // epoch of this report
11fdf7f2 2065 uint64_t state;
2066 utime_t last_fresh; // last reported
2067 utime_t last_change; // new state != previous state
2068 utime_t last_active; // state & PG_STATE_ACTIVE
2069 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2070 utime_t last_clean; // state & PG_STATE_CLEAN
2071 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2072 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2073 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2074
2075 eversion_t log_start; // (log_start,version]
2076 eversion_t ondisk_log_start; // there may be more on disk
2077
2078 epoch_t created;
2079 epoch_t last_epoch_clean;
2080 pg_t parent;
2081 __u32 parent_split_bits;
2082
2083 eversion_t last_scrub;
2084 eversion_t last_deep_scrub;
2085 utime_t last_scrub_stamp;
2086 utime_t last_deep_scrub_stamp;
2087 utime_t last_clean_scrub_stamp;
2088
2089 object_stat_collection_t stats;
2090
2091 int64_t log_size;
2092 int64_t ondisk_log_size; // >= active_log_size
2093
2094 vector<int32_t> up, acting;
2095 vector<pg_shard_t> avail_no_missing;
2096 map< std::set<pg_shard_t>, int32_t > object_location_counts;
2097 epoch_t mapping_epoch;
2098
2099 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
2100
2101 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2102
2103 utime_t last_became_active;
2104 utime_t last_became_peered;
2105
2106 /// up, acting primaries
2107 int32_t up_primary;
2108 int32_t acting_primary;
2109
2110 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2111 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2112 uint32_t snaptrimq_len;
2113
2114 bool stats_invalid:1;
2115 /// true if num_objects_dirty is not accurate (because it was not
2116 /// maintained starting from pool creation)
2117 bool dirty_stats_invalid:1;
2118 bool omap_stats_invalid:1;
2119 bool hitset_stats_invalid:1;
2120 bool hitset_bytes_stats_invalid:1;
2121 bool pin_stats_invalid:1;
11fdf7f2 2122 bool manifest_stats_invalid:1;
2123
2124 pg_stat_t()
2125 : reported_seq(0),
2126 reported_epoch(0),
2127 state(0),
2128 created(0), last_epoch_clean(0),
2129 parent_split_bits(0),
2130 log_size(0), ondisk_log_size(0),
2131 mapping_epoch(0),
2132 up_primary(-1),
2133 acting_primary(-1),
b32b8144 2134 snaptrimq_len(0),
2135 stats_invalid(false),
2136 dirty_stats_invalid(false),
2137 omap_stats_invalid(false),
2138 hitset_stats_invalid(false),
2139 hitset_bytes_stats_invalid(false),
2140 pin_stats_invalid(false),
2141 manifest_stats_invalid(false)
2142 { }
2143
2144 epoch_t get_effective_last_epoch_clean() const {
2145 if (state & PG_STATE_CLEAN) {
2146 // we are clean as of this report, and should thus take the
2147 // reported epoch
2148 return reported_epoch;
2149 } else {
2150 return last_epoch_clean;
2151 }
2152 }
2153
2154 pair<epoch_t, version_t> get_version_pair() const {
2155 return make_pair(reported_epoch, reported_seq);
2156 }
2157
2158 void floor(int64_t f) {
2159 stats.floor(f);
2160 if (log_size < f)
2161 log_size = f;
2162 if (ondisk_log_size < f)
2163 ondisk_log_size = f;
2164 if (snaptrimq_len < f)
2165 snaptrimq_len = f;
2166 }
2167
2168 void add_sub_invalid_flags(const pg_stat_t& o) {
 2169     // adding (or subtracting!) invalid stats renders our stats invalid too
2170 stats_invalid |= o.stats_invalid;
2171 dirty_stats_invalid |= o.dirty_stats_invalid;
2172 hitset_stats_invalid |= o.hitset_stats_invalid;
2173 pin_stats_invalid |= o.pin_stats_invalid;
2174 manifest_stats_invalid |= o.manifest_stats_invalid;
2175 }
2176 void add(const pg_stat_t& o) {
2177 stats.add(o.stats);
2178 log_size += o.log_size;
2179 ondisk_log_size += o.ondisk_log_size;
2180 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2181 (uint64_t)(1ull << 31));
2182 add_sub_invalid_flags(o);
2183 }
2184 void sub(const pg_stat_t& o) {
2185 stats.sub(o.stats);
2186 log_size -= o.log_size;
2187 ondisk_log_size -= o.ondisk_log_size;
2188 if (o.snaptrimq_len < snaptrimq_len) {
2189 snaptrimq_len -= o.snaptrimq_len;
2190 } else {
2191 snaptrimq_len = 0;
2192 }
11fdf7f2 2193 add_sub_invalid_flags(o);
2194 }
2195
2196 bool is_acting_osd(int32_t osd, bool primary) const;
2197 void dump(Formatter *f) const;
2198 void dump_brief(Formatter *f) const;
2199 void encode(bufferlist &bl) const;
11fdf7f2 2200 void decode(bufferlist::const_iterator &bl);
2201 static void generate_test_instances(list<pg_stat_t*>& o);
2202};
2203WRITE_CLASS_ENCODER(pg_stat_t)
2204
2205bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2206
2207/** store_statfs_t
2208 * ObjectStore full statfs information
2209 */
2210struct store_statfs_t
2211{
2212 uint64_t total = 0; ///< Total bytes
2213 uint64_t available = 0; ///< Free bytes available
2214 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2215
2216 int64_t allocated = 0; ///< Bytes allocated by the store
2217
2218 int64_t data_stored = 0; ///< Bytes actually stored by the user
2219 int64_t data_compressed = 0; ///< Bytes stored after compression
2220 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2221 int64_t data_compressed_original = 0; ///< Bytes that were compressed
2222
2223 int64_t omap_allocated = 0; ///< approx usage of omap data
2224 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2225
2226 void reset() {
2227 *this = store_statfs_t();
2228 }
2229 void floor(int64_t f) {
2230#define FLOOR(x) if (int64_t(x) < f) x = f
2231 FLOOR(total);
2232 FLOOR(available);
2233 FLOOR(internally_reserved);
2234 FLOOR(allocated);
2235 FLOOR(data_stored);
2236 FLOOR(data_compressed);
2237 FLOOR(data_compressed_allocated);
2238 FLOOR(data_compressed_original);
2239
2240 FLOOR(omap_allocated);
2241 FLOOR(internal_metadata);
2242#undef FLOOR
2243 }
2244
2245 bool operator ==(const store_statfs_t& other) const;
2246 bool is_zero() const {
2247 return *this == store_statfs_t();
2248 }
2249
2250 uint64_t get_used() const {
2251 return total - available - internally_reserved;
2252 }
2253
2254 // this accumulates both actually used and statfs's internally_reserved
2255 uint64_t get_used_raw() const {
2256 return total - available;
2257 }
2258
2259 float get_used_raw_ratio() const {
2260 if (total) {
2261 return (float)get_used_raw() / (float)total;
2262 } else {
2263 return 0.0;
2264 }
2265 }
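  // Editor-added example: for total = 1 TiB, available = 200 GiB and
  // internally_reserved = 10 GiB, get_used() reports 814 GiB (user-visible
  // usage) while get_used_raw() reports 824 GiB because it keeps the internal
  // reservation, giving a get_used_raw_ratio() of roughly 0.80.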
2266
2267 // helpers to ease legacy code porting
2268 uint64_t kb_avail() const {
2269 return available >> 10;
2270 }
2271 uint64_t kb() const {
2272 return total >> 10;
2273 }
2274 uint64_t kb_used() const {
2275 return (total - available - internally_reserved) >> 10;
2276 }
2277 uint64_t kb_used_raw() const {
2278 return get_used_raw() >> 10;
2279 }
2280
2281 uint64_t kb_used_data() const {
2282 return allocated >> 10;
2283 }
2284 uint64_t kb_used_omap() const {
2285 return omap_allocated >> 10;
2286 }
2287
2288 uint64_t kb_used_internal_metadata() const {
2289 return internal_metadata >> 10;
2290 }
2291
2292 void add(const store_statfs_t& o) {
2293 total += o.total;
2294 available += o.available;
2295 internally_reserved += o.internally_reserved;
2296 allocated += o.allocated;
2297 data_stored += o.data_stored;
2298 data_compressed += o.data_compressed;
2299 data_compressed_allocated += o.data_compressed_allocated;
2300 data_compressed_original += o.data_compressed_original;
2301 omap_allocated += o.omap_allocated;
2302 internal_metadata += o.internal_metadata;
2303 }
2304 void sub(const store_statfs_t& o) {
2305 total -= o.total;
2306 available -= o.available;
2307 internally_reserved -= o.internally_reserved;
2308 allocated -= o.allocated;
2309 data_stored -= o.data_stored;
2310 data_compressed -= o.data_compressed;
2311 data_compressed_allocated -= o.data_compressed_allocated;
2312 data_compressed_original -= o.data_compressed_original;
2313 omap_allocated -= o.omap_allocated;
2314 internal_metadata -= o.internal_metadata;
2315 }
2316 void dump(Formatter *f) const;
2317 DENC(store_statfs_t, v, p) {
2318 DENC_START(1, 1, p);
2319 denc(v.total, p);
2320 denc(v.available, p);
2321 denc(v.internally_reserved, p);
2322 denc(v.allocated, p);
2323 denc(v.data_stored, p);
2324 denc(v.data_compressed, p);
2325 denc(v.data_compressed_allocated, p);
2326 denc(v.data_compressed_original, p);
2327 denc(v.omap_allocated, p);
2328 denc(v.internal_metadata, p);
2329 DENC_FINISH(p);
2330 }
2331 static void generate_test_instances(list<store_statfs_t*>& o);
2332};
2333WRITE_CLASS_DENC(store_statfs_t)
2334
2335ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
2336
2337/** osd_stat
2338 * aggregate stats for an osd
2339 */
2340struct osd_stat_t {
2341 store_statfs_t statfs;
2342 vector<int> hb_peers;
2343 int32_t snap_trim_queue_len, num_snap_trimming;
2344 uint64_t num_shards_repaired;
2345
2346 pow2_hist_t op_queue_age_hist;
2347
2348 objectstore_perf_stat_t os_perf_stat;
2349 osd_alerts_t os_alerts;
2350
2351 epoch_t up_from = 0;
2352 uint64_t seq = 0;
2353
2354 uint32_t num_pgs = 0;
2355
2356 uint32_t num_osds = 0;
2357 uint32_t num_per_pool_osds = 0;
2358
2359 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2360 num_shards_repaired(0) {}
2361
2362 void add(const osd_stat_t& o) {
2363 statfs.add(o.statfs);
2364 snap_trim_queue_len += o.snap_trim_queue_len;
2365 num_snap_trimming += o.num_snap_trimming;
2366 num_shards_repaired += o.num_shards_repaired;
2367 op_queue_age_hist.add(o.op_queue_age_hist);
2368 os_perf_stat.add(o.os_perf_stat);
2369 num_pgs += o.num_pgs;
2370 num_osds += o.num_osds;
2371 num_per_pool_osds += o.num_per_pool_osds;
2372 for (const auto& a : o.os_alerts) {
2373 auto& target = os_alerts[a.first];
2374 for (auto& i : a.second) {
2375 target.emplace(i.first, i.second);
2376 }
2377 }
2378 }
2379 void sub(const osd_stat_t& o) {
2380 statfs.sub(o.statfs);
2381 snap_trim_queue_len -= o.snap_trim_queue_len;
2382 num_snap_trimming -= o.num_snap_trimming;
2383 num_shards_repaired -= o.num_shards_repaired;
2384 op_queue_age_hist.sub(o.op_queue_age_hist);
2385 os_perf_stat.sub(o.os_perf_stat);
2386 num_pgs -= o.num_pgs;
2387 num_osds -= o.num_osds;
2388 num_per_pool_osds -= o.num_per_pool_osds;
2389 for (const auto& a : o.os_alerts) {
2390 auto& target = os_alerts[a.first];
2391 for (auto& i : a.second) {
2392 target.erase(i.first);
2393 }
2394 if (target.empty()) {
2395 os_alerts.erase(a.first);
2396 }
2397 }
2398 }
2399 void dump(Formatter *f) const;
2400 void encode(bufferlist &bl, uint64_t features) const;
2401 void decode(bufferlist::const_iterator &bl);
2402 static void generate_test_instances(std::list<osd_stat_t*>& o);
2403};
2404WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2405
2406inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2407 return l.statfs == r.statfs &&
2408 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2409 l.num_snap_trimming == r.num_snap_trimming &&
2410 l.num_shards_repaired == r.num_shards_repaired &&
2411 l.hb_peers == r.hb_peers &&
2412 l.op_queue_age_hist == r.op_queue_age_hist &&
2413 l.os_perf_stat == r.os_perf_stat &&
2414 l.num_pgs == r.num_pgs &&
2415 l.num_osds == r.num_osds &&
2416 l.num_per_pool_osds == r.num_per_pool_osds;
2417}
2418inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2419 return !(l == r);
2420}
2421
2422inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
2423 return out << "osd_stat(" << s.statfs << ", "
2424 << "peers " << s.hb_peers
2425 << " op hist " << s.op_queue_age_hist.h
2426 << ")";
2427}
2428
2429/*
2430 * summation over an entire pool
2431 */
2432struct pool_stat_t {
2433 object_stat_collection_t stats;
11fdf7f2 2434 store_statfs_t store_stats;
2435 int64_t log_size;
2436 int64_t ondisk_log_size; // >= active_log_size
2437 int32_t up; ///< number of up replicas or shards
2438 int32_t acting; ///< number of acting replicas or shards
11fdf7f2 2439 int32_t num_store_stats; ///< amount of store_stats accumulated
7c673cae 2440
2441 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2442 num_store_stats(0)
2443 { }
2444
2445 void floor(int64_t f) {
2446 stats.floor(f);
11fdf7f2 2447 store_stats.floor(f);
2448 if (log_size < f)
2449 log_size = f;
2450 if (ondisk_log_size < f)
2451 ondisk_log_size = f;
2452 if (up < f)
2453 up = f;
2454 if (acting < f)
2455 acting = f;
2456 if (num_store_stats < f)
2457 num_store_stats = f;
2458 }
2459
2460 void add(const store_statfs_t& o) {
2461 store_stats.add(o);
2462 ++num_store_stats;
2463 }
2464 void sub(const store_statfs_t& o) {
2465 store_stats.sub(o);
2466 --num_store_stats;
2467 }
2468
2469 void add(const pg_stat_t& o) {
2470 stats.add(o.stats);
2471 log_size += o.log_size;
2472 ondisk_log_size += o.ondisk_log_size;
2473 up += o.up.size();
2474 acting += o.acting.size();
2475 }
2476 void sub(const pg_stat_t& o) {
2477 stats.sub(o.stats);
2478 log_size -= o.log_size;
2479 ondisk_log_size -= o.ondisk_log_size;
2480 up -= o.up.size();
2481 acting -= o.acting.size();
2482 }
2483
2484 bool is_zero() const {
2485 return (stats.is_zero() &&
11fdf7f2 2486 store_stats.is_zero() &&
2487 log_size == 0 &&
2488 ondisk_log_size == 0 &&
2489 up == 0 &&
2490 acting == 0 &&
2491 num_store_stats == 0);
2492 }
2493
 2494  // helper accessors to retrieve used/net bytes depending on the
 2495  // collection method: new per-pool objectstore report or legacy PG
 2496  // summation at the OSD.
 2497  // In legacy mode the used and net values are the same. For the new
 2498  // per-pool collection, 'used' is the amount of space ALLOCATED on all
 2499  // related OSDs and 'net' is the amount of stored user data.
81eedcae 2500 uint64_t get_allocated_bytes(bool per_pool) const {
11fdf7f2 2501 uint64_t allocated_bytes;
81eedcae 2502 if (per_pool) {
2503 allocated_bytes = store_stats.allocated;
2504 } else {
2505 // legacy mode, use numbers from 'stats'
2506 allocated_bytes = stats.sum.num_bytes +
2507 stats.sum.num_bytes_hit_set_archive;
2508 }
2509 // omap is not broken out by pool by nautilus bluestore
2510 allocated_bytes += stats.sum.num_omap_bytes;
2511 return allocated_bytes;
2512 }
81eedcae 2513 uint64_t get_user_bytes(float raw_used_rate, bool per_pool) const {
11fdf7f2 2514 uint64_t user_bytes;
81eedcae 2515 if (per_pool) {
2516 user_bytes = raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2517 } else {
2518 // legacy mode, use numbers from 'stats'
2519 user_bytes = stats.sum.num_bytes +
2520 stats.sum.num_bytes_hit_set_archive;
2521 }
2522 // omap is not broken out by pool by nautilus bluestore
2523 user_bytes += stats.sum.num_omap_bytes;
2524 return user_bytes;
2525 }
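  // Editor's sketch: for a 3x replicated pool whose per-pool statfs sums to
  // data_stored == 300 GiB across all OSDs, get_user_bytes(3.0, true)
  // reports roughly 100 GiB of user data (plus omap, which is still accounted
  // pool-wide), while with per_pool == false both helpers fall back to the
  // PG-summed num_bytes.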
2526
2527 void dump(Formatter *f) const;
2528 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 2529 void decode(bufferlist::const_iterator &bl);
2530 static void generate_test_instances(list<pool_stat_t*>& o);
2531};
2532WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2533
2534
2535// -----------------------------------------
2536
2537/**
2538 * pg_hit_set_info_t - information about a single recorded HitSet
2539 *
11fdf7f2 2540 * Track basic metadata about a HitSet, like the number of insertions
2541 * and the time range it covers.
2542 */
2543struct pg_hit_set_info_t {
2544 utime_t begin, end; ///< time interval
2545 eversion_t version; ///< version this HitSet object was written
2546 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2547
2548 friend bool operator==(const pg_hit_set_info_t& l,
2549 const pg_hit_set_info_t& r) {
2550 return
2551 l.begin == r.begin &&
2552 l.end == r.end &&
2553 l.version == r.version &&
2554 l.using_gmt == r.using_gmt;
2555 }
2556
2557 explicit pg_hit_set_info_t(bool using_gmt = true)
2558 : using_gmt(using_gmt) {}
2559
2560 void encode(bufferlist &bl) const;
11fdf7f2 2561 void decode(bufferlist::const_iterator &bl);
2562 void dump(Formatter *f) const;
2563 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2564};
2565WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2566
2567/**
2568 * pg_hit_set_history_t - information about a history of hitsets
2569 *
2570 * Include information about the currently accumulating hit set as well
2571 * as archived/historical ones.
2572 */
2573struct pg_hit_set_history_t {
2574 eversion_t current_last_update; ///< last version inserted into current set
2575 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2576
2577 friend bool operator==(const pg_hit_set_history_t& l,
2578 const pg_hit_set_history_t& r) {
2579 return
2580 l.current_last_update == r.current_last_update &&
2581 l.history == r.history;
2582 }
2583
2584 void encode(bufferlist &bl) const;
11fdf7f2 2585 void decode(bufferlist::const_iterator &bl);
2586 void dump(Formatter *f) const;
2587 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2588};
2589WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2590
2591
2592// -----------------------------------------
2593
2594/**
2595 * pg_history_t - information about recent pg peering/mapping history
2596 *
2597 * This is aggressively shared between OSDs to bound the amount of past
2598 * history they need to worry about.
2599 */
2600struct pg_history_t {
2601 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2602 epoch_t epoch_pool_created; // epoch in which *pool* was created
2603 // (note: may be pg creation epoch for
2604 // pre-luminous clusters)
2605 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2606 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2607 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2608 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2609 epoch_t last_epoch_split; // as parent or child
2610 epoch_t last_epoch_marked_full; // pool or cluster
2611
2612 /**
2613 * In the event of a map discontinuity, same_*_since may reflect the first
2614 * map the osd has seen in the new map sequence rather than the actual start
2615 * of the interval. This is ok since a discontinuity at epoch e means there
2616 * must have been a clean interval between e and now and that we cannot be
2617 * in the active set during the interval containing e.
2618 */
 2619   epoch_t same_up_since;       // same up set since
2620 epoch_t same_interval_since; // same acting AND up set since
2621 epoch_t same_primary_since; // same primary at least back through this epoch.
2622
2623 eversion_t last_scrub;
2624 eversion_t last_deep_scrub;
2625 utime_t last_scrub_stamp;
2626 utime_t last_deep_scrub_stamp;
2627 utime_t last_clean_scrub_stamp;
2628
2629 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2630 return
2631 l.epoch_created == r.epoch_created &&
31f18b77 2632 l.epoch_pool_created == r.epoch_pool_created &&
2633 l.last_epoch_started == r.last_epoch_started &&
2634 l.last_interval_started == r.last_interval_started &&
2635 l.last_epoch_clean == r.last_epoch_clean &&
2636 l.last_interval_clean == r.last_interval_clean &&
2637 l.last_epoch_split == r.last_epoch_split &&
2638 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2639 l.same_up_since == r.same_up_since &&
2640 l.same_interval_since == r.same_interval_since &&
2641 l.same_primary_since == r.same_primary_since &&
2642 l.last_scrub == r.last_scrub &&
2643 l.last_deep_scrub == r.last_deep_scrub &&
2644 l.last_scrub_stamp == r.last_scrub_stamp &&
2645 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2646 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2647 }
2648
2649 pg_history_t()
2650 : epoch_created(0),
31f18b77 2651 epoch_pool_created(0),
2652 last_epoch_started(0),
2653 last_interval_started(0),
2654 last_epoch_clean(0),
2655 last_interval_clean(0),
2656 last_epoch_split(0),
2657 last_epoch_marked_full(0),
2658 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2659
2660 bool merge(const pg_history_t &other) {
2661 // Here, we only update the fields which cannot be calculated from the OSDmap.
2662 bool modified = false;
2663 if (epoch_created < other.epoch_created) {
2664 epoch_created = other.epoch_created;
2665 modified = true;
2666 }
2667 if (epoch_pool_created < other.epoch_pool_created) {
2668 // FIXME: for jewel compat only; this should either be 0 or always the
2669 // same value across all pg instances.
2670 epoch_pool_created = other.epoch_pool_created;
2671 modified = true;
2672 }
2673 if (last_epoch_started < other.last_epoch_started) {
2674 last_epoch_started = other.last_epoch_started;
2675 modified = true;
2676 }
2677 if (last_interval_started < other.last_interval_started) {
2678 last_interval_started = other.last_interval_started;
2679 modified = true;
2680 }
2681 if (last_epoch_clean < other.last_epoch_clean) {
2682 last_epoch_clean = other.last_epoch_clean;
2683 modified = true;
2684 }
2685 if (last_interval_clean < other.last_interval_clean) {
2686 last_interval_clean = other.last_interval_clean;
2687 modified = true;
2688 }
2689 if (last_epoch_split < other.last_epoch_split) {
2690 last_epoch_split = other.last_epoch_split;
2691 modified = true;
2692 }
2693 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2694 last_epoch_marked_full = other.last_epoch_marked_full;
2695 modified = true;
2696 }
2697 if (other.last_scrub > last_scrub) {
2698 last_scrub = other.last_scrub;
2699 modified = true;
2700 }
2701 if (other.last_scrub_stamp > last_scrub_stamp) {
2702 last_scrub_stamp = other.last_scrub_stamp;
2703 modified = true;
2704 }
2705 if (other.last_deep_scrub > last_deep_scrub) {
2706 last_deep_scrub = other.last_deep_scrub;
2707 modified = true;
2708 }
2709 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2710 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2711 modified = true;
2712 }
2713 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2714 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2715 modified = true;
2716 }
2717 return modified;
2718 }
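  // Editor's note (illustrative; names outside pg_history_t are hypothetical):
  // peers fold histories together and persist on change, e.g.
  //
  //   if (info.history.merge(peer_info.history))
  //     dirty_info = true;   // something advanced, re-write the pg metadata
  //
  // merge() only moves epochs and stamps forward, so applying it repeatedly
  // with the same input is idempotent.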
2719
2720 void encode(bufferlist& bl) const;
11fdf7f2 2721 void decode(bufferlist::const_iterator& p);
2722 void dump(Formatter *f) const;
2723 static void generate_test_instances(list<pg_history_t*>& o);
2724};
2725WRITE_CLASS_ENCODER(pg_history_t)
2726
2727inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2728 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2729 << " lis/c " << h.last_interval_started
2730 << "/" << h.last_interval_clean
2731 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2732 << "/" << h.last_epoch_marked_full
2733 << " " << h.same_up_since
2734 << "/" << h.same_interval_since
2735 << "/" << h.same_primary_since;
2736}
2737
2738
2739/**
2740 * pg_info_t - summary of PG statistics.
2741 *
2742 * some notes:
2743 * - last_complete implies we have all objects that existed as of that
2744 * stamp, OR a newer object, OR have already applied a later delete.
2745 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2746 * otherwise, we have no idea what the pg is supposed to contain.
2747 */
2748struct pg_info_t {
2749 spg_t pgid;
2750 eversion_t last_update; ///< last object version applied to store.
2751 eversion_t last_complete; ///< last version pg was complete through.
2752 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2753 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2754
2755 version_t last_user_version; ///< last user object version applied to store
2756
2757 eversion_t log_tail; ///< oldest log entry.
2758
2759 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2760 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2761
2762 interval_set<snapid_t> purged_snaps;
2763
2764 pg_stat_t stats;
2765
2766 pg_history_t history;
2767 pg_hit_set_history_t hit_set;
2768
2769 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2770 return
2771 l.pgid == r.pgid &&
2772 l.last_update == r.last_update &&
2773 l.last_complete == r.last_complete &&
2774 l.last_epoch_started == r.last_epoch_started &&
2775 l.last_interval_started == r.last_interval_started &&
2776 l.last_user_version == r.last_user_version &&
2777 l.log_tail == r.log_tail &&
2778 l.last_backfill == r.last_backfill &&
2779 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2780 l.purged_snaps == r.purged_snaps &&
2781 l.stats == r.stats &&
2782 l.history == r.history &&
2783 l.hit_set == r.hit_set;
2784 }
2785
2786 pg_info_t()
2787 : last_epoch_started(0),
2788 last_interval_started(0),
2789 last_user_version(0),
2790 last_backfill(hobject_t::get_max()),
2791 last_backfill_bitwise(false)
2792 { }
2793 // cppcheck-suppress noExplicitConstructor
2794 pg_info_t(spg_t p)
2795 : pgid(p),
2796 last_epoch_started(0),
2797 last_interval_started(0),
2798 last_user_version(0),
2799 last_backfill(hobject_t::get_max()),
2800 last_backfill_bitwise(false)
2801 { }
2802
2803 void set_last_backfill(hobject_t pos) {
2804 last_backfill = pos;
2805 last_backfill_bitwise = true;
2806 }
2807
2808 bool is_empty() const { return last_update.version == 0; }
2809 bool dne() const { return history.epoch_created == 0; }
2810
11fdf7f2 2811 bool has_missing() const { return last_complete != last_update; }
2812 bool is_incomplete() const { return !last_backfill.is_max(); }
2813
2814 void encode(bufferlist& bl) const;
11fdf7f2 2815 void decode(bufferlist::const_iterator& p);
7c673cae 2816 void dump(Formatter *f) const;
2817 static void generate_test_instances(list<pg_info_t*>& o);
2818};
2819WRITE_CLASS_ENCODER(pg_info_t)
2820
2821inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2822{
2823 out << pgi.pgid << "(";
2824 if (pgi.dne())
2825 out << " DNE";
2826 if (pgi.is_empty())
2827 out << " empty";
2828 else {
2829 out << " v " << pgi.last_update;
2830 if (pgi.last_complete != pgi.last_update)
2831 out << " lc " << pgi.last_complete;
2832 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2833 }
2834 if (pgi.is_incomplete())
2835 out << " lb " << pgi.last_backfill
2836 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2837 //out << " c " << pgi.epoch_created;
2838 out << " local-lis/les=" << pgi.last_interval_started
2839 << "/" << pgi.last_epoch_started;
2840 out << " n=" << pgi.stats.stats.sum.num_objects;
2841 out << " " << pgi.history
2842 << ")";
2843 return out;
2844}
2845
2846/**
2847 * pg_fast_info_t - common pg_info_t fields
2848 *
2849 * These are the fields of pg_info_t (and children) that are updated for
2850 * most IO operations.
2851 *
2852 * ** WARNING **
2853 * Because we rely on these fields to be applied to the normal
2854 * info struct, adding a new field here that is not also new in info
2855 * means that we must set an incompat OSD feature bit!
2856 */
2857struct pg_fast_info_t {
2858 eversion_t last_update;
2859 eversion_t last_complete;
2860 version_t last_user_version;
2861 struct { // pg_stat_t stats
2862 eversion_t version;
2863 version_t reported_seq;
2864 utime_t last_fresh;
2865 utime_t last_active;
2866 utime_t last_peered;
2867 utime_t last_clean;
2868 utime_t last_unstale;
2869 utime_t last_undegraded;
2870 utime_t last_fullsized;
2871 int64_t log_size; // (also ondisk_log_size, which has the same value)
2872 struct { // object_stat_collection_t stats;
 2873     struct { // object_stat_sum_t sum
2874 int64_t num_bytes; // in bytes
2875 int64_t num_objects;
2876 int64_t num_object_copies;
2877 int64_t num_rd;
2878 int64_t num_rd_kb;
2879 int64_t num_wr;
2880 int64_t num_wr_kb;
2881 int64_t num_objects_dirty;
2882 } sum;
2883 } stats;
2884 } stats;
2885
2886 void populate_from(const pg_info_t& info) {
2887 last_update = info.last_update;
2888 last_complete = info.last_complete;
2889 last_user_version = info.last_user_version;
2890 stats.version = info.stats.version;
2891 stats.reported_seq = info.stats.reported_seq;
2892 stats.last_fresh = info.stats.last_fresh;
2893 stats.last_active = info.stats.last_active;
2894 stats.last_peered = info.stats.last_peered;
2895 stats.last_clean = info.stats.last_clean;
2896 stats.last_unstale = info.stats.last_unstale;
2897 stats.last_undegraded = info.stats.last_undegraded;
2898 stats.last_fullsized = info.stats.last_fullsized;
2899 stats.log_size = info.stats.log_size;
2900 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2901 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2902 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2903 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2904 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2905 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2906 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2907 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2908 }
2909
2910 bool try_apply_to(pg_info_t* info) {
2911 if (last_update <= info->last_update)
2912 return false;
2913 info->last_update = last_update;
2914 info->last_complete = last_complete;
2915 info->last_user_version = last_user_version;
2916 info->stats.version = stats.version;
2917 info->stats.reported_seq = stats.reported_seq;
2918 info->stats.last_fresh = stats.last_fresh;
2919 info->stats.last_active = stats.last_active;
2920 info->stats.last_peered = stats.last_peered;
2921 info->stats.last_clean = stats.last_clean;
2922 info->stats.last_unstale = stats.last_unstale;
2923 info->stats.last_undegraded = stats.last_undegraded;
2924 info->stats.last_fullsized = stats.last_fullsized;
2925 info->stats.log_size = stats.log_size;
2926 info->stats.ondisk_log_size = stats.log_size;
2927 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2928 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2929 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2930 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2931 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2932 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2933 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2934 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2935 return true;
2936 }
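  // Editor-added sketch of the intended round trip: the write path persists
  // only this small struct,
  //
  //   pg_fast_info_t fast;
  //   fast.populate_from(info);      // capture the hot pg_info_t fields
  //   // ...encode and store as the pg's "fastinfo" attribute...
  //
  // and on load the full pg_info_t is patched with
  //
  //   fast.try_apply_to(&info);      // no-op if info is already newer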
2937
2938 void encode(bufferlist& bl) const {
2939 ENCODE_START(1, 1, bl);
2940 encode(last_update, bl);
2941 encode(last_complete, bl);
2942 encode(last_user_version, bl);
2943 encode(stats.version, bl);
2944 encode(stats.reported_seq, bl);
2945 encode(stats.last_fresh, bl);
2946 encode(stats.last_active, bl);
2947 encode(stats.last_peered, bl);
2948 encode(stats.last_clean, bl);
2949 encode(stats.last_unstale, bl);
2950 encode(stats.last_undegraded, bl);
2951 encode(stats.last_fullsized, bl);
2952 encode(stats.log_size, bl);
2953 encode(stats.stats.sum.num_bytes, bl);
2954 encode(stats.stats.sum.num_objects, bl);
2955 encode(stats.stats.sum.num_object_copies, bl);
2956 encode(stats.stats.sum.num_rd, bl);
2957 encode(stats.stats.sum.num_rd_kb, bl);
2958 encode(stats.stats.sum.num_wr, bl);
2959 encode(stats.stats.sum.num_wr_kb, bl);
2960 encode(stats.stats.sum.num_objects_dirty, bl);
2961 ENCODE_FINISH(bl);
2962 }
11fdf7f2 2963 void decode(bufferlist::const_iterator& p) {
7c673cae 2964 DECODE_START(1, p);
2965 decode(last_update, p);
2966 decode(last_complete, p);
2967 decode(last_user_version, p);
2968 decode(stats.version, p);
2969 decode(stats.reported_seq, p);
2970 decode(stats.last_fresh, p);
2971 decode(stats.last_active, p);
2972 decode(stats.last_peered, p);
2973 decode(stats.last_clean, p);
2974 decode(stats.last_unstale, p);
2975 decode(stats.last_undegraded, p);
2976 decode(stats.last_fullsized, p);
2977 decode(stats.log_size, p);
2978 decode(stats.stats.sum.num_bytes, p);
2979 decode(stats.stats.sum.num_objects, p);
2980 decode(stats.stats.sum.num_object_copies, p);
2981 decode(stats.stats.sum.num_rd, p);
2982 decode(stats.stats.sum.num_rd_kb, p);
2983 decode(stats.stats.sum.num_wr, p);
2984 decode(stats.stats.sum.num_wr_kb, p);
2985 decode(stats.stats.sum.num_objects_dirty, p);
2986 DECODE_FINISH(p);
2987 }
2988};
2989WRITE_CLASS_ENCODER(pg_fast_info_t)
2990
2991
2992struct pg_notify_t {
2993 epoch_t query_epoch;
2994 epoch_t epoch_sent;
2995 pg_info_t info;
2996 shard_id_t to;
2997 shard_id_t from;
2998 pg_notify_t() :
2999 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3000 from(shard_id_t::NO_SHARD) {}
3001 pg_notify_t(
3002 shard_id_t to,
3003 shard_id_t from,
3004 epoch_t query_epoch,
3005 epoch_t epoch_sent,
3006 const pg_info_t &info)
3007 : query_epoch(query_epoch),
3008 epoch_sent(epoch_sent),
3009 info(info), to(to), from(from) {
11fdf7f2 3010 ceph_assert(from == info.pgid.shard);
3011 }
3012 void encode(bufferlist &bl) const;
11fdf7f2 3013 void decode(bufferlist::const_iterator &p);
3014 void dump(Formatter *f) const;
3015 static void generate_test_instances(list<pg_notify_t*> &o);
3016};
3017WRITE_CLASS_ENCODER(pg_notify_t)
3018ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
3019
3020
3021class OSDMap;
3022/**
3023 * PastIntervals -- information needed to determine the PriorSet and
3024 * the might_have_unfound set
3025 */
3026class PastIntervals {
3027public:
3028 struct pg_interval_t {
3029 vector<int32_t> up, acting;
3030 epoch_t first, last;
3031 bool maybe_went_rw;
3032 int32_t primary;
3033 int32_t up_primary;
3034
3035 pg_interval_t()
3036 : first(0), last(0),
3037 maybe_went_rw(false),
3038 primary(-1),
3039 up_primary(-1)
3040 {}
3041
3042 pg_interval_t(
3043 vector<int32_t> &&up,
3044 vector<int32_t> &&acting,
3045 epoch_t first,
3046 epoch_t last,
3047 bool maybe_went_rw,
3048 int32_t primary,
3049 int32_t up_primary)
3050 : up(up), acting(acting), first(first), last(last),
3051 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3052 {}
3053
3054 void encode(bufferlist& bl) const;
11fdf7f2 3055 void decode(bufferlist::const_iterator& bl);
3056 void dump(Formatter *f) const;
3057 static void generate_test_instances(list<pg_interval_t*>& o);
3058 };
3059
11fdf7f2 3060 PastIntervals();
3061 PastIntervals(PastIntervals &&rhs) = default;
3062 PastIntervals &operator=(PastIntervals &&rhs) = default;
3063
3064 PastIntervals(const PastIntervals &rhs);
3065 PastIntervals &operator=(const PastIntervals &rhs);
3066
3067 class interval_rep {
3068 public:
3069 virtual size_t size() const = 0;
3070 virtual bool empty() const = 0;
3071 virtual void clear() = 0;
3072 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
3073 virtual set<pg_shard_t> get_all_participants(
3074 bool ec_pool) const = 0;
3075 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3076 virtual unique_ptr<interval_rep> clone() const = 0;
3077 virtual ostream &print(ostream &out) const = 0;
3078 virtual void encode(bufferlist &bl) const = 0;
11fdf7f2 3079 virtual void decode(bufferlist::const_iterator &bl) = 0;
7c673cae 3080 virtual void dump(Formatter *f) const = 0;
7c673cae 3081 virtual void iterate_mayberw_back_to(
3082 epoch_t les,
3083 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
3084
3085 virtual bool has_full_intervals() const { return false; }
3086 virtual void iterate_all_intervals(
3087 std::function<void(const pg_interval_t &)> &&f) const {
3088 ceph_assert(!has_full_intervals());
3089 ceph_abort_msg("not valid for this implementation");
7c673cae 3090 }
11fdf7f2 3091 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
3092
3093 virtual ~interval_rep() {}
3094 };
3095 friend class pi_compact_rep;
3096private:
3097
3098 unique_ptr<interval_rep> past_intervals;
3099
11fdf7f2 3100 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
3101
3102public:
3103 void add_interval(bool ec_pool, const pg_interval_t &interval) {
11fdf7f2 3104 ceph_assert(past_intervals);
3105 return past_intervals->add_interval(ec_pool, interval);
3106 }
3107
3108 void encode(bufferlist &bl) const {
3109 ENCODE_START(1, 1, bl);
3110 if (past_intervals) {
3111 __u8 type = 2;
3112 encode(type, bl);
3113 past_intervals->encode(bl);
3114 } else {
11fdf7f2 3115 encode((__u8)0, bl);
3116 }
3117 ENCODE_FINISH(bl);
3118 }
7c673cae 3119
11fdf7f2 3120 void decode(bufferlist::const_iterator &bl);
3121
3122 void dump(Formatter *f) const {
11fdf7f2 3123 ceph_assert(past_intervals);
3124 past_intervals->dump(f);
3125 }
3126 static void generate_test_instances(list<PastIntervals *> & o);
3127
3128 /**
3129 * Determines whether there is an interval change
3130 */
3131 static bool is_new_interval(
3132 int old_acting_primary,
3133 int new_acting_primary,
3134 const vector<int> &old_acting,
3135 const vector<int> &new_acting,
3136 int old_up_primary,
3137 int new_up_primary,
3138 const vector<int> &old_up,
3139 const vector<int> &new_up,
3140 int old_size,
3141 int new_size,
3142 int old_min_size,
3143 int new_min_size,
3144 unsigned old_pg_num,
3145 unsigned new_pg_num,
3146 unsigned old_pg_num_pending,
3147 unsigned new_pg_num_pending,
3148 bool old_sort_bitwise,
3149 bool new_sort_bitwise,
3150 bool old_recovery_deletes,
3151 bool new_recovery_deletes,
3152 pg_t pgid
3153 );
3154
3155 /**
3156 * Determines whether there is an interval change
3157 */
3158 static bool is_new_interval(
3159 int old_acting_primary, ///< [in] primary as of lastmap
 3160     int new_acting_primary, ///< [in] primary as of osdmap
3161 const vector<int> &old_acting, ///< [in] acting as of lastmap
3162 const vector<int> &new_acting, ///< [in] acting as of osdmap
3163 int old_up_primary, ///< [in] up primary of lastmap
3164 int new_up_primary, ///< [in] up primary of osdmap
3165 const vector<int> &old_up, ///< [in] up as of lastmap
3166 const vector<int> &new_up, ///< [in] up as of osdmap
3167 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3168 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
3169 pg_t pgid ///< [in] pgid for pg
3170 );
3171
3172 /**
3173 * Integrates a new map into *past_intervals, returns true
3174 * if an interval was closed out.
3175 */
3176 static bool check_new_interval(
3177 int old_acting_primary, ///< [in] primary as of lastmap
3178 int new_acting_primary, ///< [in] primary as of osdmap
3179 const vector<int> &old_acting, ///< [in] acting as of lastmap
3180 const vector<int> &new_acting, ///< [in] acting as of osdmap
3181 int old_up_primary, ///< [in] up primary of lastmap
3182 int new_up_primary, ///< [in] up primary of osdmap
3183 const vector<int> &old_up, ///< [in] up as of lastmap
3184 const vector<int> &new_up, ///< [in] up as of osdmap
3185 epoch_t same_interval_since, ///< [in] as of osdmap
3186 epoch_t last_epoch_clean, ///< [in] current
3187 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3188 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
7c673cae 3189 pg_t pgid, ///< [in] pgid for pg
11fdf7f2 3190 IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate whether the pg can be active
3191 PastIntervals *past_intervals, ///< [out] intervals
3192 ostream *out = 0 ///< [out] debug ostream
3193 );
c07f9fc5 3194
3195 friend ostream& operator<<(ostream& out, const PastIntervals &i);
3196
3197 template <typename F>
3198 void iterate_mayberw_back_to(
3199 epoch_t les,
3200 F &&f) const {
3201 ceph_assert(past_intervals);
3202 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
3203 }
3204 void clear() {
11fdf7f2 3205 ceph_assert(past_intervals);
3206 past_intervals->clear();
3207 }
3208
3209 /**
3210 * Should return a value which gives an indication of the amount
3211 * of state contained
3212 */
3213 size_t size() const {
11fdf7f2 3214 ceph_assert(past_intervals);
3215 return past_intervals->size();
3216 }
3217
3218 bool empty() const {
11fdf7f2 3219 ceph_assert(past_intervals);
3220 return past_intervals->empty();
3221 }
3222
3223 void swap(PastIntervals &other) {
3224 using std::swap;
3225 swap(other.past_intervals, past_intervals);
3226 }
3227
3228 /**
3229 * Return all shards which have been in the acting set back to the
3230 * latest epoch to which we have trimmed except for pg_whoami
3231 */
3232 set<pg_shard_t> get_might_have_unfound(
3233 pg_shard_t pg_whoami,
3234 bool ec_pool) const {
11fdf7f2 3235 ceph_assert(past_intervals);
3236 auto ret = past_intervals->get_all_participants(ec_pool);
3237 ret.erase(pg_whoami);
3238 return ret;
3239 }
3240
3241 /**
3242 * Return all shards which we might want to talk to for peering
3243 */
3244 set<pg_shard_t> get_all_probe(
3245 bool ec_pool) const {
11fdf7f2 3246 ceph_assert(past_intervals);
3247 return past_intervals->get_all_participants(ec_pool);
3248 }
3249
3250 /* Return the set of epochs [start, end) represented by the
3251 * past_interval set.
3252 */
3253 pair<epoch_t, epoch_t> get_bounds() const {
11fdf7f2 3254 ceph_assert(past_intervals);
3255 return past_intervals->get_bounds();
3256 }
3257
3258 void adjust_start_backwards(epoch_t last_epoch_clean) {
3259 ceph_assert(past_intervals);
3260 past_intervals->adjust_start_backwards(last_epoch_clean);
3261 }
3262
3263 enum osd_state_t {
3264 UP,
3265 DOWN,
3266 DNE,
3267 LOST
3268 };
3269 struct PriorSet {
3270 bool ec_pool = false;
3271 set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3272 set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3273 map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
7c673cae 3274
11fdf7f2 3275 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3276 unique_ptr<IsPGRecoverablePredicate> pcontdec;
3277
3278 PriorSet() = default;
3279 PriorSet(PriorSet &&) = default;
3280 PriorSet &operator=(PriorSet &&) = default;
3281
3282 PriorSet &operator=(const PriorSet &) = delete;
3283 PriorSet(const PriorSet &) = delete;
3284
3285 bool operator==(const PriorSet &rhs) const {
3286 return (ec_pool == rhs.ec_pool) &&
3287 (probe == rhs.probe) &&
3288 (down == rhs.down) &&
3289 (blocked_by == rhs.blocked_by) &&
3290 (pg_down == rhs.pg_down);
3291 }
3292
3293 bool affected_by_map(
3294 const OSDMap &osdmap,
3295 const DoutPrefixProvider *dpp) const;
3296
3297 // For verifying tests
3298 PriorSet(
3299 bool ec_pool,
3300 set<pg_shard_t> probe,
3301 set<int> down,
3302 map<int, epoch_t> blocked_by,
3303 bool pg_down,
3304 IsPGRecoverablePredicate *pcontdec)
3305 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3306 pg_down(pg_down), pcontdec(pcontdec) {}
3307
3308 private:
3309 template <typename F>
3310 PriorSet(
3311 const PastIntervals &past_intervals,
3312 bool ec_pool,
3313 epoch_t last_epoch_started,
3314 IsPGRecoverablePredicate *c,
3315 F f,
3316 const vector<int> &up,
3317 const vector<int> &acting,
3318 const DoutPrefixProvider *dpp);
3319
3320 friend class PastIntervals;
3321 };
3322
3323 template <typename... Args>
3324 PriorSet get_prior_set(Args&&... args) const {
3325 return PriorSet(*this, std::forward<Args>(args)...);
3326 }
3327};
3328WRITE_CLASS_ENCODER(PastIntervals)
3329
3330ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
3331ostream& operator<<(ostream& out, const PastIntervals &i);
3332ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
3333
3334template <typename F>
3335PastIntervals::PriorSet::PriorSet(
3336 const PastIntervals &past_intervals,
3337 bool ec_pool,
3338 epoch_t last_epoch_started,
3339 IsPGRecoverablePredicate *c,
3340 F f,
3341 const vector<int> &up,
3342 const vector<int> &acting,
3343 const DoutPrefixProvider *dpp)
3344 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3345{
3346 /*
3347 * We have to be careful to gracefully deal with situations like
3348 * so. Say we have a power outage or something that takes out both
3349 * OSDs, but the monitor doesn't mark them down in the same epoch.
3350 * The history may look like
3351 *
3352 * 1: A B
3353 * 2: B
3354 * 3: let's say B dies for good, too (say, from the power spike)
3355 * 4: A
3356 *
3357 * which makes it look like B may have applied updates to the PG
3358 * that we need in order to proceed. This sucks...
3359 *
3360 * To minimize the risk of this happening, we CANNOT go active if
3361 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3362 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3363 * Then, we have something like
3364 *
3365 * 1: A B
3366 * 2: B up_thru[B]=0
3367 * 3:
3368 * 4: A
3369 *
3370 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3371 * still 0).
3372 *
3373 * or,
3374 *
3375 * 1: A B
3376 * 2: B up_thru[B]=0
3377 * 3: B up_thru[B]=2
3378 * 4:
3379 * 5: A
3380 *
3381 * -> we must wait for B, bc it was alive through 2, and could have
3382 * written to the pg.
3383 *
3384 * If B is really dead, then an administrator will need to manually
3385 * intervene by marking the OSD as "lost."
3386 */
3387
3388 // Include current acting and up nodes... not because they may
3389 // contain old data (this interval hasn't gone active, obviously),
3390 // but because we want their pg_info to inform choose_acting(), and
3391 // so that we know what they do/do not have explicitly before
3392 // sending them any new info/logs/whatever.
3393 for (unsigned i = 0; i < acting.size(); i++) {
3394 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3395 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3396 }
3397 // It may be possible to exclude the up nodes, but let's keep them in
3398 // there for now.
3399 for (unsigned i = 0; i < up.size(); i++) {
3400 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3401 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3402 }
3403
3404 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3405 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3406 for (auto &&i: all_probe) {
3407 switch (f(0, i.osd, nullptr)) {
3408 case UP: {
3409 probe.insert(i);
3410 break;
3411 }
3412 case DNE:
3413 case LOST:
3414 case DOWN: {
3415 down.insert(i.osd);
3416 break;
3417 }
3418 }
3419 }
3420
3421 past_intervals.iterate_mayberw_back_to(
7c673cae
FG
3422 last_epoch_started,
3423 [&](epoch_t start, const set<pg_shard_t> &acting) {
3424 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3425 << ", acting: " << acting << dendl;
3426
3427 // look at candidate osds during this interval. each falls into
3428 // one of three categories: up, down (but potentially
3429 // interesting), or lost (down, but we won't wait for it).
3430 set<pg_shard_t> up_now;
3431 map<int, epoch_t> candidate_blocked_by;
3432 // any candidates down now (that might have useful data)
3433 bool any_down_now = false;
3434
3435 // consider ACTING osds
3436 for (auto &&so: acting) {
3437 epoch_t lost_at = 0;
3438 switch (f(start, so.osd, &lost_at)) {
3439 case UP: {
3440 // include past acting osds if they are up.
3441 up_now.insert(so);
3442 break;
3443 }
3444 case DNE: {
3445 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3446 << " no longer exists" << dendl;
3447 break;
3448 }
3449 case LOST: {
3450 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3451 << " is down, but lost_at " << lost_at << dendl;
3452 up_now.insert(so);
3453 break;
3454 }
3455 case DOWN: {
3456 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3457 << " is down" << dendl;
3458 candidate_blocked_by[so.osd] = lost_at;
3459 any_down_now = true;
3460 break;
3461 }
3462 }
3463 }
3464
3465 // if not enough osds survived this interval, and we may have gone rw,
3466 // then we need to wait for one of those osds to recover to
3467 // ensure that we haven't lost any information.
3468 if (!(*pcontdec)(up_now) && any_down_now) {
3469 // fixme: how do we identify a "clean" shutdown anyway?
3470 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3471 << " insufficient up; including down osds" << dendl;
11fdf7f2 3472 ceph_assert(!candidate_blocked_by.empty());
7c673cae
FG
3473 pg_down = true;
3474 blocked_by.insert(
3475 candidate_blocked_by.begin(),
3476 candidate_blocked_by.end());
3477 }
3478 });
3479
3480 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3481 << " down " << down
3482 << " blocked_by " << blocked_by
3483 << (pg_down ? " pg_down":"")
3484 << dendl;
3485}
3486
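// A minimal sketch (not part of the original header) of how a caller builds a
// PriorSet via get_prior_set().  The classifier F is invoked as
// f(interval_start, osd, &lost_at) and must return one of
// PastIntervals::UP/DOWN/DNE/LOST, filling in lost_at for down OSDs.  Real
// callers consult the current OSDMap; here a plain std::map of per-OSD state
// stands in for it, and the recoverability predicate is assumed to come from
// the PG backend.  Note that the PriorSet adopts the predicate pointer into a
// unique_ptr.
struct example_osd_state {
  PastIntervals::osd_state_t state;
  epoch_t lost_at = 0;
};

inline PastIntervals::PriorSet example_build_prior(
  const PastIntervals &pi,
  const std::map<int, example_osd_state> &osd_states, // assumed state source
  epoch_t last_epoch_started,
  IsPGRecoverablePredicate *recoverable,              // adopted by the PriorSet
  const std::vector<int> &up,
  const std::vector<int> &acting,
  const DoutPrefixProvider *dpp)
{
  return pi.get_prior_set(
    false /* ec_pool: replicated in this sketch */,
    last_epoch_started,
    recoverable,
    [&](epoch_t /*interval_start*/, int osd, epoch_t *lost_at) {
      auto it = osd_states.find(osd);
      if (it == osd_states.end())
        return PastIntervals::DNE;
      if (lost_at)
        *lost_at = it->second.lost_at;
      return it->second.state;
    },
    up, acting, dpp);
}
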
3487/**
3488 * pg_query_t - used to ask a peer for information about a pg.
3489 *
3490 * note: if since=0 and type=LOG, then we just provide our full log.
3491 */
3492struct pg_query_t {
3493 enum {
3494 INFO = 0,
3495 LOG = 1,
3496 MISSING = 4,
3497 FULLLOG = 5,
3498 };
11fdf7f2 3499 std::string_view get_type_name() const {
7c673cae
FG
3500 switch (type) {
3501 case INFO: return "info";
3502 case LOG: return "log";
3503 case MISSING: return "missing";
3504 case FULLLOG: return "fulllog";
3505 default: return "???";
3506 }
3507 }
3508
3509 __s32 type;
3510 eversion_t since;
3511 pg_history_t history;
3512 epoch_t epoch_sent;
3513 shard_id_t to;
3514 shard_id_t from;
3515
3516 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3517 from(shard_id_t::NO_SHARD) {}
3518 pg_query_t(
3519 int t,
3520 shard_id_t to,
3521 shard_id_t from,
3522 const pg_history_t& h,
3523 epoch_t epoch_sent)
3524 : type(t),
3525 history(h),
3526 epoch_sent(epoch_sent),
3527 to(to), from(from) {
11fdf7f2 3528 ceph_assert(t != LOG);
7c673cae
FG
3529 }
3530 pg_query_t(
3531 int t,
3532 shard_id_t to,
3533 shard_id_t from,
3534 eversion_t s,
3535 const pg_history_t& h,
3536 epoch_t epoch_sent)
3537 : type(t), since(s), history(h),
3538 epoch_sent(epoch_sent), to(to), from(from) {
11fdf7f2 3539 ceph_assert(t == LOG);
7c673cae
FG
3540 }
3541
3542 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 3543 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3544
3545 void dump(Formatter *f) const;
3546 static void generate_test_instances(list<pg_query_t*>& o);
3547};
3548WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3549
3550inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3551 out << "query(" << q.get_type_name() << " " << q.since;
3552 if (q.type == pg_query_t::LOG)
3553 out << " " << q.history;
11fdf7f2 3554 out << " epoch_sent " << q.epoch_sent;
7c673cae
FG
3555 out << ")";
3556 return out;
3557}
3558
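// A minimal sketch (identifiers invented for illustration): an INFO query uses
// the shorter constructor, while a LOG query must use the constructor carrying
// the 'since' version; the two ceph_asserts above enforce that split.  With
// since == eversion_t() (0'0), a LOG query asks the peer for its full log, as
// the struct comment notes.
inline pg_query_t example_info_query(const pg_history_t &h, epoch_t cur_epoch)
{
  return pg_query_t(pg_query_t::INFO,
                    shard_id_t::NO_SHARD /* to */,
                    shard_id_t::NO_SHARD /* from */,
                    h, cur_epoch);
}

inline pg_query_t example_full_log_query(const pg_history_t &h, epoch_t cur_epoch)
{
  return pg_query_t(pg_query_t::LOG,
                    shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
                    eversion_t() /* since == 0'0 -> send the whole log */,
                    h, cur_epoch);
}
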
3559class PGBackend;
3560class ObjectModDesc {
3561 bool can_local_rollback;
3562 bool rollback_info_completed;
3563
3564 // version required to decode, reflected in encode/decode version
3565 __u8 max_required_version = 1;
3566public:
3567 class Visitor {
3568 public:
3569 virtual void append(uint64_t old_offset) {}
3570 virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
3571 virtual void rmobject(version_t old_version) {}
3572 /**
3573 * Used to support the unfound_lost_delete log event: if the stashed
3574 * version exists, we unstash it, otherwise, we do nothing. This way
3575 * each replica rolls back to whatever state it had prior to the
3576 * mark_unfound_lost delete attempt.
3577 */
3578 virtual void try_rmobject(version_t old_version) {
3579 rmobject(old_version);
3580 }
3581 virtual void create() {}
3582 virtual void update_snaps(const set<snapid_t> &old_snaps) {}
3583 virtual void rollback_extents(
3584 version_t gen,
3585 const vector<pair<uint64_t, uint64_t> > &extents) {}
3586 virtual ~Visitor() {}
3587 };
3588 void visit(Visitor *visitor) const;
3589 mutable bufferlist bl;
3590 enum ModID {
3591 APPEND = 1,
3592 SETATTRS = 2,
3593 DELETE = 3,
3594 CREATE = 4,
3595 UPDATE_SNAPS = 5,
3596 TRY_DELETE = 6,
3597 ROLLBACK_EXTENTS = 7
3598 };
31f18b77
FG
3599 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3600 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3601 }
7c673cae
FG
3602 void claim(ObjectModDesc &other) {
3603 bl.clear();
3604 bl.claim(other.bl);
3605 can_local_rollback = other.can_local_rollback;
3606 rollback_info_completed = other.rollback_info_completed;
3607 }
3608 void claim_append(ObjectModDesc &other) {
3609 if (!can_local_rollback || rollback_info_completed)
3610 return;
3611 if (!other.can_local_rollback) {
3612 mark_unrollbackable();
3613 return;
3614 }
3615 bl.claim_append(other.bl);
3616 rollback_info_completed = other.rollback_info_completed;
3617 }
3618 void swap(ObjectModDesc &other) {
3619 bl.swap(other.bl);
3620
31f18b77
FG
3621 using std::swap;
3622 swap(other.can_local_rollback, can_local_rollback);
3623 swap(other.rollback_info_completed, rollback_info_completed);
3624 swap(other.max_required_version, max_required_version);
7c673cae
FG
3625 }
3626 void append_id(ModID id) {
11fdf7f2 3627 using ceph::encode;
7c673cae 3628 uint8_t _id(id);
11fdf7f2 3629 encode(_id, bl);
7c673cae
FG
3630 }
3631 void append(uint64_t old_size) {
3632 if (!can_local_rollback || rollback_info_completed)
3633 return;
3634 ENCODE_START(1, 1, bl);
3635 append_id(APPEND);
11fdf7f2 3636 encode(old_size, bl);
7c673cae
FG
3637 ENCODE_FINISH(bl);
3638 }
3639 void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
3640 if (!can_local_rollback || rollback_info_completed)
3641 return;
3642 ENCODE_START(1, 1, bl);
3643 append_id(SETATTRS);
11fdf7f2 3644 encode(old_attrs, bl);
7c673cae
FG
3645 ENCODE_FINISH(bl);
3646 }
3647 bool rmobject(version_t deletion_version) {
3648 if (!can_local_rollback || rollback_info_completed)
3649 return false;
3650 ENCODE_START(1, 1, bl);
3651 append_id(DELETE);
11fdf7f2 3652 encode(deletion_version, bl);
7c673cae
FG
3653 ENCODE_FINISH(bl);
3654 rollback_info_completed = true;
3655 return true;
3656 }
3657 bool try_rmobject(version_t deletion_version) {
3658 if (!can_local_rollback || rollback_info_completed)
3659 return false;
3660 ENCODE_START(1, 1, bl);
3661 append_id(TRY_DELETE);
11fdf7f2 3662 encode(deletion_version, bl);
7c673cae
FG
3663 ENCODE_FINISH(bl);
3664 rollback_info_completed = true;
3665 return true;
3666 }
3667 void create() {
3668 if (!can_local_rollback || rollback_info_completed)
3669 return;
3670 rollback_info_completed = true;
3671 ENCODE_START(1, 1, bl);
3672 append_id(CREATE);
3673 ENCODE_FINISH(bl);
3674 }
3675 void update_snaps(const set<snapid_t> &old_snaps) {
3676 if (!can_local_rollback || rollback_info_completed)
3677 return;
3678 ENCODE_START(1, 1, bl);
3679 append_id(UPDATE_SNAPS);
11fdf7f2 3680 encode(old_snaps, bl);
7c673cae
FG
3681 ENCODE_FINISH(bl);
3682 }
3683 void rollback_extents(
3684 version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
11fdf7f2
TL
3685 ceph_assert(can_local_rollback);
3686 ceph_assert(!rollback_info_completed);
7c673cae
FG
3687 if (max_required_version < 2)
3688 max_required_version = 2;
3689 ENCODE_START(2, 2, bl);
3690 append_id(ROLLBACK_EXTENTS);
11fdf7f2
TL
3691 encode(gen, bl);
3692 encode(extents, bl);
7c673cae
FG
3693 ENCODE_FINISH(bl);
3694 }
3695
3696 // cannot be rolled back
3697 void mark_unrollbackable() {
3698 can_local_rollback = false;
3699 bl.clear();
3700 }
3701 bool can_rollback() const {
3702 return can_local_rollback;
3703 }
3704 bool empty() const {
3705 return can_local_rollback && (bl.length() == 0);
3706 }
3707
3708 bool requires_kraken() const {
3709 return max_required_version >= 2;
3710 }
3711
3712 /**
3713 * Create fresh copy of bl bytes to avoid keeping large buffers around
3714 * in the case that bl contains ptrs which point into a much larger
3715 * message buffer
3716 */
31f18b77 3717 void trim_bl() const {
7c673cae
FG
3718 if (bl.length() > 0)
3719 bl.rebuild();
3720 }
3721 void encode(bufferlist &bl) const;
11fdf7f2 3722 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3723 void dump(Formatter *f) const;
3724 static void generate_test_instances(list<ObjectModDesc*>& o);
3725};
3726WRITE_CLASS_ENCODER(ObjectModDesc)
3727
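// A minimal sketch (class and variable names invented): ObjectModDesc
// accumulates rollback records in the order they are appended, and visit()
// replays them through a Visitor.  A visitor that only cares about a subset of
// record types overrides just those hooks; the base class provides empty
// defaults for the rest.
class ExampleRollbackVisitor : public ObjectModDesc::Visitor {
public:
  uint64_t truncate_to = 0;
  bool object_removed = false;
  void append(uint64_t old_size) override {
    truncate_to = old_size;    // an APPEND record carries the pre-write size
  }
  void rmobject(version_t) override {
    object_removed = true;     // a DELETE record would end the rollback chain
  }
};

inline void example_record_and_replay()
{
  ObjectModDesc desc;
  desc.append(4096);           // object was 4096 bytes before the write
  map<string, boost::optional<bufferlist>> old_attrs;
  desc.setattrs(old_attrs);    // previous xattr values (empty in this sketch)

  ExampleRollbackVisitor v;
  desc.visit(&v);              // replays APPEND, then SETATTRS (default no-op)
}
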
3728
3729/**
3730 * pg_log_entry_t - single entry/event in pg log
3731 *
3732 */
3733struct pg_log_entry_t {
3734 enum {
3735 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3736 CLONE = 2, // cloned object from head
3737 DELETE = 3, // deleted object
11fdf7f2 3738 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
7c673cae
FG
3739 LOST_REVERT = 5, // lost new version, revert to an older version.
3740 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3741 LOST_MARK = 7, // lost new version, now EIO
3742 PROMOTE = 8, // promoted object from another tier
3743 CLEAN = 9, // mark an object clean
3744 ERROR = 10, // write that returned an error
3745 };
3746 static const char *get_op_name(int op) {
3747 switch (op) {
3748 case MODIFY:
3749 return "modify";
3750 case PROMOTE:
3751 return "promote";
3752 case CLONE:
3753 return "clone";
3754 case DELETE:
3755 return "delete";
7c673cae
FG
3756 case LOST_REVERT:
3757 return "l_revert";
3758 case LOST_DELETE:
3759 return "l_delete";
3760 case LOST_MARK:
3761 return "l_mark";
3762 case CLEAN:
3763 return "clean";
3764 case ERROR:
3765 return "error";
3766 default:
3767 return "unknown";
3768 }
3769 }
3770 const char *get_op_name() const {
3771 return get_op_name(op);
3772 }
3773
3774 // describes state for a locally-rollbackable entry
3775 ObjectModDesc mod_desc;
3776 bufferlist snaps; // only for clone entries
3777 hobject_t soid;
3778 osd_reqid_t reqid; // caller+tid to uniquely identify request
31f18b77 3779 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
11fdf7f2
TL
3780
3781 /// map extra_reqids by index to error return code (if any)
3782 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
3783
7c673cae
FG
3784 eversion_t version, prior_version, reverting_to;
3785 version_t user_version; // the user version for this entry
3786 utime_t mtime; // this is the _user_ mtime, mind you
3787 int32_t return_code; // only stored for ERRORs for dup detection
3788
3789 __s32 op;
3790 bool invalid_hash; // only when decoding sobject_t based entries
3791 bool invalid_pool; // only when decoding pool-less hobject based entries
3792
3793 pg_log_entry_t()
3794 : user_version(0), return_code(0), op(0),
31f18b77
FG
3795 invalid_hash(false), invalid_pool(false) {
3796 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3797 }
7c673cae
FG
3798 pg_log_entry_t(int _op, const hobject_t& _soid,
3799 const eversion_t& v, const eversion_t& pv,
3800 version_t uv,
3801 const osd_reqid_t& rid, const utime_t& mt,
3802 int return_code)
3803 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3804 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
3805 invalid_hash(false), invalid_pool(false) {
3806 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3807 }
7c673cae
FG
3808
3809 bool is_clone() const { return op == CLONE; }
3810 bool is_modify() const { return op == MODIFY; }
3811 bool is_promote() const { return op == PROMOTE; }
3812 bool is_clean() const { return op == CLEAN; }
7c673cae
FG
3813 bool is_lost_revert() const { return op == LOST_REVERT; }
3814 bool is_lost_delete() const { return op == LOST_DELETE; }
3815 bool is_lost_mark() const { return op == LOST_MARK; }
3816 bool is_error() const { return op == ERROR; }
3817
3818 bool is_update() const {
3819 return
3820 is_clone() || is_modify() || is_promote() || is_clean() ||
11fdf7f2 3821 is_lost_revert() || is_lost_mark();
7c673cae
FG
3822 }
3823 bool is_delete() const {
3824 return op == DELETE || op == LOST_DELETE;
3825 }
3826
3827 bool can_rollback() const {
3828 return mod_desc.can_rollback();
3829 }
3830
3831 void mark_unrollbackable() {
3832 mod_desc.mark_unrollbackable();
3833 }
3834
3835 bool requires_kraken() const {
3836 return mod_desc.requires_kraken();
3837 }
3838
3839 // Errors are only used for dup detection, whereas
3840 // the index by objects is used by recovery, copy_get,
3841 // and other facilities that don't expect or need to
3842 // be aware of error entries.
3843 bool object_is_indexed() const {
3844 return !is_error();
3845 }
3846
3847 bool reqid_is_indexed() const {
3848 return reqid != osd_reqid_t() &&
3849 (op == MODIFY || op == DELETE || op == ERROR);
3850 }
3851
3852 string get_key_name() const;
3853 void encode_with_checksum(bufferlist& bl) const;
11fdf7f2 3854 void decode_with_checksum(bufferlist::const_iterator& p);
7c673cae
FG
3855
3856 void encode(bufferlist &bl) const;
11fdf7f2 3857 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3858 void dump(Formatter *f) const;
3859 static void generate_test_instances(list<pg_log_entry_t*>& o);
3860
3861};
3862WRITE_CLASS_ENCODER(pg_log_entry_t)
3863
3864ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3865
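// A minimal sketch (versions, user_version and reqid are made up): a MODIFY
// entry records the new version, the version it supersedes (prior_version),
// the user-visible version, and the client request id used for dup detection.
// For such an entry is_update() is true; is_delete() only holds for
// DELETE/LOST_DELETE entries.
inline pg_log_entry_t example_modify_entry(const hobject_t &oid)
{
  return pg_log_entry_t(
    pg_log_entry_t::MODIFY,
    oid,
    eversion_t(12, 34)  /* version: epoch 12, v 34 */,
    eversion_t(12, 33)  /* prior_version */,
    100                 /* user_version */,
    osd_reqid_t()       /* reqid */,
    utime_t()           /* mtime */,
    0                   /* return_code */);
}
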
c07f9fc5
FG
3866struct pg_log_dup_t {
3867 osd_reqid_t reqid; // caller+tid to uniquely identify request
3868 eversion_t version;
3869 version_t user_version; // the user version for this entry
3870 int32_t return_code; // only stored for ERRORs for dup detection
7c673cae 3871
c07f9fc5
FG
3872 pg_log_dup_t()
3873 : user_version(0), return_code(0)
3874 {}
3875 explicit pg_log_dup_t(const pg_log_entry_t& entry)
3876 : reqid(entry.reqid), version(entry.version),
3877 user_version(entry.user_version), return_code(entry.return_code)
3878 {}
3879 pg_log_dup_t(const eversion_t& v, version_t uv,
3880 const osd_reqid_t& rid, int return_code)
3881 : reqid(rid), version(v), user_version(uv),
3882 return_code(return_code)
3883 {}
3884
3885 string get_key_name() const;
3886 void encode(bufferlist &bl) const;
11fdf7f2 3887 void decode(bufferlist::const_iterator &bl);
c07f9fc5
FG
3888 void dump(Formatter *f) const;
3889 static void generate_test_instances(list<pg_log_dup_t*>& o);
3890
181888fb
FG
3891 bool operator==(const pg_log_dup_t &rhs) const {
3892 return reqid == rhs.reqid &&
3893 version == rhs.version &&
3894 user_version == rhs.user_version &&
3895 return_code == rhs.return_code;
3896 }
3897 bool operator!=(const pg_log_dup_t &rhs) const {
3898 return !(*this == rhs);
3899 }
3900
c07f9fc5
FG
3901 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3902};
3903WRITE_CLASS_ENCODER(pg_log_dup_t)
3904
3905std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
3906
3907/**
3908 * pg_log_t - incremental log of recent pg changes.
3909 *
3910 * serves as a recovery queue for recent changes.
3911 */
3912struct pg_log_t {
3913 /*
3914 * head - newest entry (update|delete)
3915 * tail - entry previous to oldest (update|delete) for which we have
3916 * complete negative information.
3917 * i.e. we can infer pg contents for any store whose last_update >= tail.
3918 */
3919 eversion_t head; // newest entry
3920 eversion_t tail; // version prior to oldest
3921
3922protected:
3923 // We can rollback rollback-able entries > can_rollback_to
3924 eversion_t can_rollback_to;
3925
3926 // always <= can_rollback_to, indicates how far stashed rollback
3927 // data can be found
3928 eversion_t rollback_info_trimmed_to;
3929
3930public:
c07f9fc5
FG
3931 // the actual log
3932 mempool::osd_pglog::list<pg_log_entry_t> log;
3933
3934 // entries just for dup op detection ordered oldest to newest
3935 mempool::osd_pglog::list<pg_log_dup_t> dups;
3936
7c673cae
FG
3937 pg_log_t() = default;
3938 pg_log_t(const eversion_t &last_update,
3939 const eversion_t &log_tail,
3940 const eversion_t &can_rollback_to,
3941 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3942 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3943 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
3944 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3945 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 3946 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
3947 pg_log_t(const eversion_t &last_update,
3948 const eversion_t &log_tail,
3949 const eversion_t &can_rollback_to,
3950 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3951 const std::list<pg_log_entry_t> &entries,
3952 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
3953 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3954 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3955 for (auto &&entry: entries) {
3956 log.push_back(entry);
3957 }
c07f9fc5
FG
3958 for (auto &&entry: dup_entries) {
3959 dups.push_back(entry);
3960 }
7c673cae
FG
3961 }
3962
3963 void clear() {
3964 eversion_t z;
3965 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3966 log.clear();
c07f9fc5 3967 dups.clear();
7c673cae
FG
3968 }
3969
3970 eversion_t get_rollback_info_trimmed_to() const {
3971 return rollback_info_trimmed_to;
3972 }
3973 eversion_t get_can_rollback_to() const {
3974 return can_rollback_to;
3975 }
3976
3977
3978 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 3979 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
3980 oldlog.swap(log);
3981
3982 eversion_t old_tail;
3983 unsigned mask = ~((~0)<<split_bits);
3984 for (auto i = oldlog.begin();
3985 i != oldlog.end();
3986 ) {
3987 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3988 childlog.push_back(*i);
3989 } else {
3990 log.push_back(*i);
3991 }
3992 oldlog.erase(i++);
3993 }
3994
c07f9fc5
FG
3995 // osd_reqid is unique, so it doesn't matter if there are extra
3996 // dup entries in each pg. To avoid storing oid with the dup
3997 // entries, just copy the whole list.
3998 auto childdups(dups);
3999
7c673cae
FG
4000 return pg_log_t(
4001 head,
4002 tail,
4003 can_rollback_to,
4004 rollback_info_trimmed_to,
c07f9fc5
FG
4005 std::move(childlog),
4006 std::move(childdups));
4007 }
7c673cae 4008
31f18b77 4009 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
11fdf7f2 4010 ceph_assert(newhead >= tail);
7c673cae 4011
31f18b77
FG
4012 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4013 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
4014 while (true) {
4015 if (p == log.begin()) {
4016 // yikes, the whole thing is divergent!
31f18b77
FG
4017 using std::swap;
4018 swap(divergent, log);
7c673cae
FG
4019 break;
4020 }
4021 --p;
4022 if (p->version.version <= newhead.version) {
4023 /*
4024 * look at eversion.version here. we want to avoid a situation like:
4025 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4026 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4027 * lower_bound = 100'9
4028 * i.e., same request, different version. If the eversion.version is > the
4029 * lower_bound, it is divergent.
4030 */
4031 ++p;
4032 divergent.splice(divergent.begin(), log, p, log.end());
4033 break;
4034 }
11fdf7f2 4035 ceph_assert(p->version > newhead);
7c673cae
FG
4036 }
4037 head = newhead;
4038
4039 if (can_rollback_to > newhead)
4040 can_rollback_to = newhead;
4041
4042 if (rollback_info_trimmed_to > newhead)
4043 rollback_info_trimmed_to = newhead;
4044
4045 return divergent;
4046 }
4047
11fdf7f2
TL
4048 void merge_from(const vector<pg_log_t*>& slogs, eversion_t last_update) {
4049 log.clear();
4050
4051 // sort and merge dups
4052 multimap<eversion_t,pg_log_dup_t> sorted;
4053 for (auto& d : dups) {
4054 sorted.emplace(d.version, d);
4055 }
4056 for (auto l : slogs) {
4057 for (auto& d : l->dups) {
4058 sorted.emplace(d.version, d);
4059 }
4060 }
4061 dups.clear();
4062 for (auto& i : sorted) {
4063 dups.push_back(i.second);
4064 }
4065
4066 head = last_update;
4067 tail = last_update;
4068 can_rollback_to = last_update;
4069 rollback_info_trimmed_to = last_update;
4070 }
4071
7c673cae
FG
4072 bool empty() const {
4073 return log.empty();
4074 }
4075
4076 bool null() const {
4077 return head.version == 0 && head.epoch == 0;
4078 }
4079
4080 size_t approx_size() const {
4081 return head.version - tail.version;
4082 }
4083
4084 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4085 const string &hit_set_namespace, const pg_log_t &in,
4086 pg_log_t &out, pg_log_t &reject);
4087
4088 /**
4089 * copy entries from the tail of another pg_log_t
4090 *
4091 * @param other pg_log_t to copy from
4092 * @param from copy entries after this version
4093 */
81eedcae 4094 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
7c673cae
FG
4095
4096 /**
4097 * copy up to N entries
4098 *
4099 * @param other source log
4100 * @param max max number of entries to copy
4101 */
81eedcae 4102 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
7c673cae
FG
4103
4104 ostream& print(ostream& out) const;
4105
4106 void encode(bufferlist &bl) const;
11fdf7f2 4107 void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
7c673cae
FG
4108 void dump(Formatter *f) const;
4109 static void generate_test_instances(list<pg_log_t*>& o);
4110};
4111WRITE_CLASS_ENCODER(pg_log_t)
4112
c07f9fc5 4113inline ostream& operator<<(ostream& out, const pg_log_t& log)
7c673cae
FG
4114{
4115 out << "log((" << log.tail << "," << log.head << "], crt="
4116 << log.get_can_rollback_to() << ")";
4117 return out;
4118}
4119
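// A minimal sketch (the versions are made up): the log holds the entries in
// (tail, head], newest at the back, and rewind_from_head() splices off and
// returns everything newer than the requested head for the caller to handle
// as divergent.
inline void example_rewind(pg_log_t &log)
{
  // suppose the log currently covers (0'0, 12'34] ...
  auto divergent = log.rewind_from_head(eversion_t(12, 30));
  // ... now log.head == 12'30, can_rollback_to has been clamped to it, and
  // 'divergent' holds the trailing entries newer than 12'30 in their original
  // order.
}
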
4120
4121/**
4122 * pg_missing_t - summary of missing objects.
4123 *
4124 * kept in memory, as a supplement to pg_log_t
4125 * also used to pass missing info in messages.
4126 */
4127struct pg_missing_item {
4128 eversion_t need, have;
c07f9fc5
FG
4129 enum missing_flags_t {
4130 FLAG_NONE = 0,
4131 FLAG_DELETE = 1,
4132 } flags;
4133 pg_missing_item() : flags(FLAG_NONE) {}
4134 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
4135 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
4136 set_delete(is_delete);
4137 }
4138
4139 void encode(bufferlist& bl, uint64_t features) const {
11fdf7f2 4140 using ceph::encode;
c07f9fc5
FG
4141 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
4142 // encoding a zeroed eversion_t to differentiate between this and
4143 // legacy unversioned encoding - a need value of 0'0 is not
4144 // possible. This can be replaced with the legacy encoding
4145 // macros post-luminous.
4146 eversion_t e;
11fdf7f2
TL
4147 encode(e, bl);
4148 encode(need, bl);
4149 encode(have, bl);
4150 encode(static_cast<uint8_t>(flags), bl);
c07f9fc5
FG
4151 } else {
4152 // legacy unversioned encoding
11fdf7f2
TL
4153 encode(need, bl);
4154 encode(have, bl);
c07f9fc5 4155 }
7c673cae 4156 }
11fdf7f2
TL
4157 void decode(bufferlist::const_iterator& bl) {
4158 using ceph::decode;
c07f9fc5 4159 eversion_t e;
11fdf7f2 4160 decode(e, bl);
c07f9fc5
FG
4161 if (e != eversion_t()) {
4162 // legacy encoding, this is the need value
4163 need = e;
11fdf7f2 4164 decode(have, bl);
c07f9fc5 4165 } else {
11fdf7f2
TL
4166 decode(need, bl);
4167 decode(have, bl);
c07f9fc5 4168 uint8_t f;
11fdf7f2 4169 decode(f, bl);
c07f9fc5
FG
4170 flags = static_cast<missing_flags_t>(f);
4171 }
4172 }
4173
4174 void set_delete(bool is_delete) {
4175 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4176 }
4177
4178 bool is_delete() const {
4179 return (flags & FLAG_DELETE) == FLAG_DELETE;
4180 }
4181
4182 string flag_str() const {
4183 if (flags == FLAG_NONE) {
4184 return "none";
4185 } else {
4186 return "delete";
4187 }
7c673cae 4188 }
c07f9fc5 4189
7c673cae
FG
4190 void dump(Formatter *f) const {
4191 f->dump_stream("need") << need;
4192 f->dump_stream("have") << have;
c07f9fc5 4193 f->dump_stream("flags") << flag_str();
7c673cae
FG
4194 }
4195 static void generate_test_instances(list<pg_missing_item*>& o) {
4196 o.push_back(new pg_missing_item);
4197 o.push_back(new pg_missing_item);
4198 o.back()->need = eversion_t(1, 2);
4199 o.back()->have = eversion_t(1, 1);
c07f9fc5
FG
4200 o.push_back(new pg_missing_item);
4201 o.back()->need = eversion_t(3, 5);
4202 o.back()->have = eversion_t(3, 4);
4203 o.back()->flags = FLAG_DELETE;
7c673cae
FG
4204 }
4205 bool operator==(const pg_missing_item &rhs) const {
c07f9fc5 4206 return need == rhs.need && have == rhs.have && flags == rhs.flags;
7c673cae
FG
4207 }
4208 bool operator!=(const pg_missing_item &rhs) const {
4209 return !(*this == rhs);
4210 }
4211};
c07f9fc5 4212WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
7c673cae
FG
4213ostream& operator<<(ostream& out, const pg_missing_item &item);
4214
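// A minimal sketch reusing the values from generate_test_instances(): a
// need/have pair of 5'6 / 5'1 means "we have the object at 5'1 and still need
// to recover it up to 5'6"; the FLAG_DELETE form instead says the recovery
// action is to delete the object.  The delete flag only survives encoding when
// the peer has OSD_RECOVERY_DELETES, which is what the zeroed eversion_t
// sentinel above is for.
inline pg_missing_item example_missing_update()
{
  return pg_missing_item(eversion_t(5, 6) /* need */,
                         eversion_t(5, 1) /* have */,
                         false /* is_delete */);
}
inline pg_missing_item example_missing_delete()
{
  return pg_missing_item(eversion_t(5, 6), eversion_t(5, 1), true);
}
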
4215class pg_missing_const_i {
4216public:
4217 virtual const map<hobject_t, pg_missing_item> &
4218 get_items() const = 0;
4219 virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
c07f9fc5 4220 virtual bool get_may_include_deletes() const = 0;
7c673cae
FG
4221 virtual unsigned int num_missing() const = 0;
4222 virtual bool have_missing() const = 0;
4223 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4224 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
7c673cae
FG
4225 virtual ~pg_missing_const_i() {}
4226};
4227
4228
4229template <bool Track>
4230class ChangeTracker {
4231public:
4232 void changed(const hobject_t &obj) {}
4233 template <typename F>
4234 void get_changed(F &&f) const {}
4235 void flush() {}
4236 bool is_clean() const {
4237 return true;
4238 }
4239};
4240template <>
4241class ChangeTracker<true> {
4242 set<hobject_t> _changed;
4243public:
4244 void changed(const hobject_t &obj) {
4245 _changed.insert(obj);
4246 }
4247 template <typename F>
4248 void get_changed(F &&f) const {
4249 for (auto const &i: _changed) {
4250 f(i);
4251 }
4252 }
4253 void flush() {
4254 _changed.clear();
4255 }
4256 bool is_clean() const {
4257 return _changed.empty();
4258 }
4259};
4260
4261template <bool TrackChanges>
4262class pg_missing_set : public pg_missing_const_i {
4263 using item = pg_missing_item;
4264 map<hobject_t, item> missing; // oid -> (need v, have v)
4265 map<version_t, hobject_t> rmissing; // v -> oid
4266 ChangeTracker<TrackChanges> tracker;
4267
4268public:
4269 pg_missing_set() = default;
4270
4271 template <typename missing_type>
4272 pg_missing_set(const missing_type &m) {
7c673cae
FG
4273 missing = m.get_items();
4274 rmissing = m.get_rmissing();
c07f9fc5 4275 may_include_deletes = m.get_may_include_deletes();
7c673cae
FG
4276 for (auto &&i: missing)
4277 tracker.changed(i.first);
4278 }
4279
c07f9fc5
FG
4280 bool may_include_deletes = false;
4281
7c673cae
FG
4282 const map<hobject_t, item> &get_items() const override {
4283 return missing;
4284 }
4285 const map<version_t, hobject_t> &get_rmissing() const override {
4286 return rmissing;
4287 }
c07f9fc5
FG
4288 bool get_may_include_deletes() const override {
4289 return may_include_deletes;
4290 }
7c673cae
FG
4291 unsigned int num_missing() const override {
4292 return missing.size();
4293 }
4294 bool have_missing() const override {
4295 return !missing.empty();
4296 }
4297 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4298 auto iter = missing.find(oid);
4299 if (iter == missing.end())
4300 return false;
4301 if (out)
4302 *out = iter->second;
4303 return true;
4304 }
4305 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4306 map<hobject_t, item>::const_iterator m =
4307 missing.find(oid);
4308 if (m == missing.end())
4309 return false;
4310 const item &item(m->second);
4311 if (item.need > v)
4312 return false;
4313 return true;
4314 }
11fdf7f2
TL
4315 eversion_t get_oldest_need() const {
4316 if (missing.empty()) {
7c673cae 4317 return eversion_t();
11fdf7f2
TL
4318 }
4319 auto it = missing.find(rmissing.begin()->second);
4320 ceph_assert(it != missing.end());
4321 return it->second.need;
7c673cae
FG
4322 }
4323
4324 void claim(pg_missing_set& o) {
4325 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4326 missing.swap(o.missing);
4327 rmissing.swap(o.rmissing);
4328 }
4329
4330 /*
4331 * this needs to be called in log order as we extend the log. it
4332 * assumes missing is accurate up through the previous log entry.
4333 */
4334 void add_next_event(const pg_log_entry_t& e) {
c07f9fc5
FG
4335 map<hobject_t, item>::iterator missing_it;
4336 missing_it = missing.find(e.soid);
4337 bool is_missing_divergent_item = missing_it != missing.end();
4338 if (e.prior_version == eversion_t() || e.is_clone()) {
4339 // new object.
4340 if (is_missing_divergent_item) { // use iterator
7c673cae 4341 rmissing.erase((missing_it->second).need.version);
c07f9fc5
FG
4342 missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4343 } else // create new element in missing map
4344 missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4345 } else if (is_missing_divergent_item) {
4346 // already missing (prior).
4347 rmissing.erase((missing_it->second).need.version);
4348 (missing_it->second).need = e.version; // leave .have unchanged.
4349 missing_it->second.set_delete(e.is_delete());
c07f9fc5
FG
4350 } else {
4351 // not missing, we must have prior_version (if any)
11fdf7f2 4352 ceph_assert(!is_missing_divergent_item);
c07f9fc5 4353 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
7c673cae 4354 }
c07f9fc5 4355 rmissing[e.version.version] = e.soid;
7c673cae
FG
4356 tracker.changed(e.soid);
4357 }
4358
c07f9fc5 4359 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
7c673cae
FG
4360 if (missing.count(oid)) {
4361 rmissing.erase(missing[oid].need.version);
4362 missing[oid].need = need; // do not adjust .have
c07f9fc5 4363 missing[oid].set_delete(is_delete);
7c673cae 4364 } else {
c07f9fc5 4365 missing[oid] = item(need, eversion_t(), is_delete);
7c673cae
FG
4366 }
4367 rmissing[need.version] = oid;
4368
4369 tracker.changed(oid);
4370 }
4371
4372 void revise_have(hobject_t oid, eversion_t have) {
4373 if (missing.count(oid)) {
4374 tracker.changed(oid);
4375 missing[oid].have = have;
4376 }
4377 }
4378
c07f9fc5
FG
4379 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4380 bool is_delete) {
4381 missing[oid] = item(need, have, is_delete);
7c673cae
FG
4382 rmissing[need.version] = oid;
4383 tracker.changed(oid);
4384 }
4385
4386 void rm(const hobject_t& oid, eversion_t v) {
4387 std::map<hobject_t, item>::iterator p = missing.find(oid);
4388 if (p != missing.end() && p->second.need <= v)
4389 rm(p);
4390 }
4391
4392 void rm(std::map<hobject_t, item>::const_iterator m) {
4393 tracker.changed(m->first);
4394 rmissing.erase(m->second.need.version);
4395 missing.erase(m);
4396 }
4397
4398 void got(const hobject_t& oid, eversion_t v) {
4399 std::map<hobject_t, item>::iterator p = missing.find(oid);
11fdf7f2
TL
4400 ceph_assert(p != missing.end());
4401 ceph_assert(p->second.need <= v || p->second.is_delete());
7c673cae
FG
4402 got(p);
4403 }
4404
4405 void got(std::map<hobject_t, item>::const_iterator m) {
4406 tracker.changed(m->first);
4407 rmissing.erase(m->second.need.version);
4408 missing.erase(m);
4409 }
4410
4411 void split_into(
4412 pg_t child_pgid,
4413 unsigned split_bits,
4414 pg_missing_set *omissing) {
c07f9fc5 4415 omissing->may_include_deletes = may_include_deletes;
7c673cae
FG
4416 unsigned mask = ~((~0)<<split_bits);
4417 for (map<hobject_t, item>::iterator i = missing.begin();
4418 i != missing.end();
4419 ) {
4420 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
c07f9fc5
FG
4421 omissing->add(i->first, i->second.need, i->second.have,
4422 i->second.is_delete());
7c673cae
FG
4423 rm(i++);
4424 } else {
4425 ++i;
4426 }
4427 }
4428 }
4429
4430 void clear() {
4431 for (auto const &i: missing)
4432 tracker.changed(i.first);
4433 missing.clear();
4434 rmissing.clear();
4435 }
4436
4437 void encode(bufferlist &bl) const {
c07f9fc5 4438 ENCODE_START(4, 2, bl);
11fdf7f2
TL
4439 encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
4440 encode(may_include_deletes, bl);
7c673cae
FG
4441 ENCODE_FINISH(bl);
4442 }
11fdf7f2 4443 void decode(bufferlist::const_iterator &bl, int64_t pool = -1) {
7c673cae
FG
4444 for (auto const &i: missing)
4445 tracker.changed(i.first);
c07f9fc5 4446 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
11fdf7f2 4447 decode(missing, bl);
c07f9fc5 4448 if (struct_v >= 4) {
11fdf7f2 4449 decode(may_include_deletes, bl);
c07f9fc5 4450 }
7c673cae
FG
4451 DECODE_FINISH(bl);
4452
4453 if (struct_v < 3) {
4454 // Handle hobject_t upgrade
4455 map<hobject_t, item> tmp;
4456 for (map<hobject_t, item>::iterator i =
4457 missing.begin();
4458 i != missing.end();
4459 ) {
4460 if (!i->first.is_max() && i->first.pool == -1) {
4461 hobject_t to_insert(i->first);
4462 to_insert.pool = pool;
4463 tmp[to_insert] = i->second;
4464 missing.erase(i++);
4465 } else {
4466 ++i;
4467 }
4468 }
4469 missing.insert(tmp.begin(), tmp.end());
4470 }
4471
4472 for (map<hobject_t,item>::iterator it =
4473 missing.begin();
4474 it != missing.end();
4475 ++it)
4476 rmissing[it->second.need.version] = it->first;
4477 for (auto const &i: missing)
4478 tracker.changed(i.first);
4479 }
4480 void dump(Formatter *f) const {
4481 f->open_array_section("missing");
4482 for (map<hobject_t,item>::const_iterator p =
4483 missing.begin(); p != missing.end(); ++p) {
4484 f->open_object_section("item");
4485 f->dump_stream("object") << p->first;
4486 p->second.dump(f);
4487 f->close_section();
4488 }
4489 f->close_section();
c07f9fc5 4490 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4491 }
4492 template <typename F>
4493 void filter_objects(F &&f) {
4494 for (auto i = missing.begin(); i != missing.end();) {
4495 if (f(i->first)) {
4496 rm(i++);
4497 } else {
4498 ++i;
4499 }
4500 }
4501 }
4502 static void generate_test_instances(list<pg_missing_set*>& o) {
4503 o.push_back(new pg_missing_set);
4504 o.push_back(new pg_missing_set);
4505 o.back()->add(
4506 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
c07f9fc5
FG
4507 eversion_t(5, 6), eversion_t(5, 1), false);
4508 o.push_back(new pg_missing_set);
4509 o.back()->add(
4510 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4511 eversion_t(5, 6), eversion_t(5, 1), true);
4512 o.back()->may_include_deletes = true;
7c673cae
FG
4513 }
4514 template <typename F>
4515 void get_changed(F &&f) const {
4516 tracker.get_changed(f);
4517 }
4518 void flush() {
4519 tracker.flush();
4520 }
4521 bool is_clean() const {
4522 return tracker.is_clean();
4523 }
4524 template <typename missing_t>
4525 bool debug_verify_from_init(
4526 const missing_t &init_missing,
4527 ostream *oss) const {
4528 if (!TrackChanges)
4529 return true;
4530 auto check_missing(init_missing.get_items());
4531 tracker.get_changed([&](const hobject_t &hoid) {
4532 check_missing.erase(hoid);
4533 if (missing.count(hoid)) {
4534 check_missing.insert(*(missing.find(hoid)));
4535 }
4536 });
4537 bool ok = true;
4538 if (check_missing.size() != missing.size()) {
4539 if (oss) {
4540 *oss << "Size mismatch, check: " << check_missing.size()
4541 << ", actual: " << missing.size() << "\n";
4542 }
4543 ok = false;
4544 }
4545 for (auto &i: missing) {
4546 if (!check_missing.count(i.first)) {
4547 if (oss)
4548 *oss << "check_missing missing " << i.first << "\n";
4549 ok = false;
4550 } else if (check_missing[i.first] != i.second) {
4551 if (oss)
4552 *oss << "check_missing missing item mismatch on " << i.first
4553 << ", check: " << check_missing[i.first]
4554 << ", actual: " << i.second << "\n";
4555 ok = false;
4556 }
4557 }
4558 if (oss && !ok) {
4559 *oss << "check_missing: " << check_missing << "\n";
4560 set<hobject_t> changed;
4561 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4562 *oss << "changed: " << changed << "\n";
4563 }
4564 return ok;
4565 }
4566};
4567template <bool TrackChanges>
4568void encode(
4569 const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
4570 ENCODE_DUMP_PRE();
4571 c.encode(bl);
4572 ENCODE_DUMP_POST(cl);
4573}
4574template <bool TrackChanges>
11fdf7f2 4575void decode(pg_missing_set<TrackChanges> &c, bufferlist::const_iterator &p) {
7c673cae
FG
4576 c.decode(p);
4577}
4578template <bool TrackChanges>
4579ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
4580{
c07f9fc5
FG
4581 out << "missing(" << missing.num_missing()
4582 << " may_include_deletes = " << missing.may_include_deletes;
7c673cae
FG
4583 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4584 out << ")";
4585 return out;
4586}
4587
4588using pg_missing_t = pg_missing_set<false>;
4589using pg_missing_tracker_t = pg_missing_set<true>;
4590
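// A minimal sketch (the entry is assumed to be a fresh update, e.g. the MODIFY
// entry sketched earlier): the missing set is populated by feeding it log
// entries in order via add_next_event(), and recovery clears entries with
// got().  pg_missing_tracker_t (TrackChanges == true) additionally remembers
// which objects changed so debug_verify_from_init() can cross-check.
inline void example_track_missing(const pg_log_entry_t &e)
{
  pg_missing_tracker_t missing;
  missing.add_next_event(e);        // e.soid now needs e.version
  ceph_assert(missing.is_missing(e.soid));
  missing.got(e.soid, e.version);   // recovered (or deleted) at e.version
  ceph_assert(!missing.have_missing());
  missing.flush();                  // drop the accumulated change tracking
}
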
4591
4592/**
4593 * pg list objects response format
4594 *
4595 */
4596struct pg_nls_response_t {
4597 collection_list_handle_t handle;
4598 list<librados::ListObjectImpl> entries;
4599
4600 void encode(bufferlist& bl) const {
4601 ENCODE_START(1, 1, bl);
11fdf7f2 4602 encode(handle, bl);
7c673cae 4603 __u32 n = (__u32)entries.size();
11fdf7f2 4604 encode(n, bl);
7c673cae 4605 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
11fdf7f2
TL
4606 encode(i->nspace, bl);
4607 encode(i->oid, bl);
4608 encode(i->locator, bl);
7c673cae
FG
4609 }
4610 ENCODE_FINISH(bl);
4611 }
11fdf7f2 4612 void decode(bufferlist::const_iterator& bl) {
7c673cae 4613 DECODE_START(1, bl);
11fdf7f2 4614 decode(handle, bl);
7c673cae 4615 __u32 n;
11fdf7f2 4616 decode(n, bl);
7c673cae
FG
4617 entries.clear();
4618 while (n--) {
4619 librados::ListObjectImpl i;
11fdf7f2
TL
4620 decode(i.nspace, bl);
4621 decode(i.oid, bl);
4622 decode(i.locator, bl);
7c673cae
FG
4623 entries.push_back(i);
4624 }
4625 DECODE_FINISH(bl);
4626 }
4627 void dump(Formatter *f) const {
4628 f->dump_stream("handle") << handle;
4629 f->open_array_section("entries");
4630 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4631 f->open_object_section("object");
4632 f->dump_string("namespace", p->nspace);
4633 f->dump_string("object", p->oid);
4634 f->dump_string("key", p->locator);
4635 f->close_section();
4636 }
4637 f->close_section();
4638 }
4639 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4640 o.push_back(new pg_nls_response_t);
4641 o.push_back(new pg_nls_response_t);
4642 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4643 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4644 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4645 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4646 o.push_back(new pg_nls_response_t);
4647 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4648 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4649 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4650 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4651 o.push_back(new pg_nls_response_t);
4652 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4653 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4654 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4655 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4656 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4657 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4658 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4659 }
4660};
4661
4662WRITE_CLASS_ENCODER(pg_nls_response_t)
4663
4664// For backwards compatibility with older OSD requests
4665struct pg_ls_response_t {
4666 collection_list_handle_t handle;
4667 list<pair<object_t, string> > entries;
4668
4669 void encode(bufferlist& bl) const {
11fdf7f2 4670 using ceph::encode;
7c673cae 4671 __u8 v = 1;
11fdf7f2
TL
4672 encode(v, bl);
4673 encode(handle, bl);
4674 encode(entries, bl);
7c673cae 4675 }
11fdf7f2
TL
4676 void decode(bufferlist::const_iterator& bl) {
4677 using ceph::decode;
7c673cae 4678 __u8 v;
11fdf7f2
TL
4679 decode(v, bl);
4680 ceph_assert(v == 1);
4681 decode(handle, bl);
4682 decode(entries, bl);
7c673cae
FG
4683 }
4684 void dump(Formatter *f) const {
4685 f->dump_stream("handle") << handle;
4686 f->open_array_section("entries");
4687 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4688 f->open_object_section("object");
4689 f->dump_stream("object") << p->first;
4690 f->dump_string("key", p->second);
4691 f->close_section();
4692 }
4693 f->close_section();
4694 }
4695 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4696 o.push_back(new pg_ls_response_t);
4697 o.push_back(new pg_ls_response_t);
4698 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4699 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4700 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4701 }
4702};
4703
4704WRITE_CLASS_ENCODER(pg_ls_response_t)
4705
4706/**
4707 * object_copy_cursor_t
4708 */
4709struct object_copy_cursor_t {
4710 uint64_t data_offset;
4711 string omap_offset;
4712 bool attr_complete;
4713 bool data_complete;
4714 bool omap_complete;
4715
4716 object_copy_cursor_t()
4717 : data_offset(0),
4718 attr_complete(false),
4719 data_complete(false),
4720 omap_complete(false)
4721 {}
4722
4723 bool is_initial() const {
4724 return !attr_complete && data_offset == 0 && omap_offset.empty();
4725 }
4726 bool is_complete() const {
4727 return attr_complete && data_complete && omap_complete;
4728 }
4729
4730 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4731 void encode(bufferlist& bl) const;
11fdf7f2 4732 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4733 void dump(Formatter *f) const;
4734};
4735WRITE_CLASS_ENCODER(object_copy_cursor_t)
4736
4737/**
4738 * object_copy_data_t
4739 *
4740 * Return data from a copy request. The semantics are a little strange
4741 * as a result of the encoding's heritage.
4742 *
4743 * In particular, the sender unconditionally fills in the cursor (from what
4744 * it receives and sends), the size, and the mtime, but is responsible for
4745 * figuring out whether it should put any data in the attrs, data, or
4746 * omap members (corresponding to xattrs, object data, and the omap entries)
4747 * based on external data (the client includes a max amount to return with
4748 * the copy request). The client then looks into the attrs, data, and/or omap
4749 * based on the contents of the cursor.
4750 */
4751struct object_copy_data_t {
4752 enum {
4753 FLAG_DATA_DIGEST = 1<<0,
4754 FLAG_OMAP_DIGEST = 1<<1,
4755 };
4756 object_copy_cursor_t cursor;
4757 uint64_t size;
4758 utime_t mtime;
4759 uint32_t data_digest, omap_digest;
4760 uint32_t flags;
4761 map<string, bufferlist> attrs;
4762 bufferlist data;
4763 bufferlist omap_header;
4764 bufferlist omap_data;
4765
4766 /// which snaps we are defined for (if a snap and not the head)
4767 vector<snapid_t> snaps;
11fdf7f2 4768 /// latest snap seq for the object (if head)
7c673cae
FG
4769 snapid_t snap_seq;
4770
11fdf7f2 4771 /// recent reqids on this object
31f18b77 4772 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
7c673cae 4773
11fdf7f2
TL
4774 /// map reqids by index to error return code (if any)
4775 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
4776
7c673cae
FG
4777 uint64_t truncate_seq;
4778 uint64_t truncate_size;
4779
4780public:
4781 object_copy_data_t() :
4782 size((uint64_t)-1), data_digest(-1),
4783 omap_digest(-1), flags(0),
4784 truncate_seq(0),
4785 truncate_size(0) {}
4786
4787 static void generate_test_instances(list<object_copy_data_t*>& o);
4788 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4789 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4790 void dump(Formatter *f) const;
4791};
4792WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
4793
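// A minimal sketch (fetch_next is a stand-in for whatever actually issues the
// copy-get and fills in a reply): the cursor is carried from response to
// response until is_complete().  Attrs arrive first, then object data (tracked
// by data_offset), then omap (tracked by omap_offset), exactly as the comment
// above describes.
template <typename FetchNext>
inline uint64_t example_copy_loop(FetchNext &&fetch_next)
{
  object_copy_cursor_t cursor;          // freshly constructed: is_initial()
  uint64_t total_data = 0;
  while (!cursor.is_complete()) {
    object_copy_data_t reply = fetch_next(cursor);
    total_data += reply.data.length();  // object data chunk, if any this round
    cursor = reply.cursor;              // resume where the sender stopped
  }
  return total_data;
}
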
4794/**
4795 * pg creation info
4796 */
4797struct pg_create_t {
4798 epoch_t created; // epoch pg created
4799 pg_t parent; // split from parent (if != pg_t())
4800 __s32 split_bits;
4801
4802 pg_create_t()
4803 : created(0), split_bits(0) {}
4804 pg_create_t(unsigned c, pg_t p, int s)
4805 : created(c), parent(p), split_bits(s) {}
4806
4807 void encode(bufferlist &bl) const;
11fdf7f2 4808 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4809 void dump(Formatter *f) const;
4810 static void generate_test_instances(list<pg_create_t*>& o);
4811};
4812WRITE_CLASS_ENCODER(pg_create_t)
4813
7c673cae
FG
4814// -----------------------------------------
4815
4816class ObjectExtent {
4817 /**
4818 * ObjectExtents are used for specifying IO behavior against RADOS
4819 * objects when one is using the ObjectCacher.
4820 *
4821 * To use this in a real system, *every member* must be filled
4822 * out correctly. In particular, make sure to initialize the
4823 * oloc correctly, as its default values are deliberate poison
4824 * and will cause internal ObjectCacher asserts.
4825 *
4826 * Similarly, your buffer_extents vector *must* specify a total
4827 * size equal to your length. If the buffer_extents inadvertently
4828 * contain less space than the length member specifies, you
4829 * will get unintelligible asserts deep in the ObjectCacher.
4830 *
4831 * If you are trying to do testing and don't care about actual
4832 * RADOS function, the simplest thing to do is to initialize
4833 * the ObjectExtent (truncate_size can be 0), create a single entry
4834 * in buffer_extents matching the length, and set oloc.pool to 0.
4835 */
4836 public:
4837 object_t oid; // object id
4838 uint64_t objectno;
4839 uint64_t offset; // in object
4840 uint64_t length; // in object
4841 uint64_t truncate_size; // in object
4842
4843 object_locator_t oloc; // object locator (pool etc)
4844
4845 vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4846
4847 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4848 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
4849 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
4850};
4851
4852inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4853{
4854 return out << "extent("
4855 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4856 << " " << ex.offset << "~" << ex.length
4857 << " -> " << ex.buffer_extents
4858 << ")";
4859}
4860
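// A minimal, test-only sketch following the comment above: a single-entry
// buffer_extents covering the whole length, truncate_size 0, and oloc.pool set
// to 0 so the ObjectCacher's deliberately poisoned defaults are overridden.
// The object name and length are made up.
inline ObjectExtent example_test_extent()
{
  ObjectExtent ex(object_t("test-object"), 0 /* objectno */,
                  0 /* offset */, 4096 /* length */, 0 /* truncate_size */);
  ex.oloc.pool = 0;                        // default oloc is deliberate poison
  ex.buffer_extents.push_back({0, 4096});  // must sum to 'length'
  return ex;
}
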
4861
7c673cae
FG
4862// ---------------------------------------
4863
4864class OSDSuperblock {
4865public:
4866 uuid_d cluster_fsid, osd_fsid;
4867 int32_t whoami; // my role in this fs.
4868 epoch_t current_epoch; // most recent epoch
4869 epoch_t oldest_map, newest_map; // oldest/newest maps we have.
4870 double weight;
4871
4872 CompatSet compat_features;
4873
4874 // last interval over which i mounted and was then active
4875 epoch_t mounted; // last epoch i mounted
4876 epoch_t clean_thru; // epoch i was active and clean thru
4877
4878 OSDSuperblock() :
4879 whoami(-1),
4880 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4881 mounted(0), clean_thru(0) {
4882 }
4883
4884 void encode(bufferlist &bl) const;
11fdf7f2 4885 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4886 void dump(Formatter *f) const;
4887 static void generate_test_instances(list<OSDSuperblock*>& o);
4888};
4889WRITE_CLASS_ENCODER(OSDSuperblock)
4890
4891inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4892{
4893 return out << "sb(" << sb.cluster_fsid
4894 << " osd." << sb.whoami
4895 << " " << sb.osd_fsid
4896 << " e" << sb.current_epoch
4897 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4898 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4899 << ")";
4900}
4901
4902
4903// -------
4904
4905
4906
4907
4908
4909
4910/*
4911 * attached to object head. describes most recent snap context, and
4912 * set of existing clones.
4913 */
4914struct SnapSet {
4915 snapid_t seq;
7c673cae
FG
4916 vector<snapid_t> snaps; // descending
4917 vector<snapid_t> clones; // ascending
4918 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4919 map<snapid_t, uint64_t> clone_size;
4920 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4921
11fdf7f2 4922 SnapSet() : seq(0) {}
7c673cae 4923 explicit SnapSet(bufferlist& bl) {
11fdf7f2 4924 auto p = std::cbegin(bl);
7c673cae
FG
4925 decode(p);
4926 }
4927
7c673cae
FG
4928 /// populate SnapSet from a librados::snap_set_t
4929 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4930
4931 /// get space accounted to clone
4932 uint64_t get_clone_bytes(snapid_t clone) const;
4933
4934 void encode(bufferlist& bl) const;
11fdf7f2 4935 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4936 void dump(Formatter *f) const;
4937 static void generate_test_instances(list<SnapSet*>& o);
4938
4939 SnapContext get_ssc_as_of(snapid_t as_of) const {
4940 SnapContext out;
4941 out.seq = as_of;
4942 for (vector<snapid_t>::const_iterator i = snaps.begin();
4943 i != snaps.end();
4944 ++i) {
4945 if (*i <= as_of)
4946 out.snaps.push_back(*i);
4947 }
4948 return out;
4949 }
4950
7c673cae
FG
4951
4952 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4953 void filter(const pg_pool_t &pinfo);
4954};
4955WRITE_CLASS_ENCODER(SnapSet)
4956
4957ostream& operator<<(ostream& out, const SnapSet& cs);
4958
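// A minimal sketch (snap ids are made up): get_ssc_as_of() filters the
// descending 'snaps' vector down to the snapshots that already existed at the
// requested snapid and returns them as a SnapContext whose seq is as_of.
inline SnapContext example_snapc_as_of()
{
  SnapSet ss;
  ss.seq = 8;
  ss.snaps = {8, 5, 2};          // descending, newest first
  return ss.get_ssc_as_of(5);    // -> seq 5, snaps [5, 2]
}
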
4959
4960
4961#define OI_ATTR "_"
4962#define SS_ATTR "snapset"
4963
4964struct watch_info_t {
4965 uint64_t cookie;
4966 uint32_t timeout_seconds;
4967 entity_addr_t addr;
4968
4969 watch_info_t() : cookie(0), timeout_seconds(0) { }
4970 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
4971
4972 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4973 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4974 void dump(Formatter *f) const;
4975 static void generate_test_instances(list<watch_info_t*>& o);
4976};
4977WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4978
4979static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4980 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4981 && l.addr == r.addr;
4982}
4983
4984static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
4985 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
4986 << " " << w.addr << ")";
4987}
4988
4989struct notify_info_t {
4990 uint64_t cookie;
4991 uint64_t notify_id;
4992 uint32_t timeout;
4993 bufferlist bl;
4994};
4995
4996static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
4997 return out << "notify(cookie " << n.cookie
4998 << " notify" << n.notify_id
4999 << " " << n.timeout << "s)";
5000}
5001
struct chunk_info_t {
  typedef enum {
    FLAG_DIRTY = 1,
    FLAG_MISSING = 2,
    FLAG_HAS_REFERENCE = 4,
    FLAG_HAS_FINGERPRINT = 8,
  } cflag_t;
  uint32_t offset;
  uint32_t length;
  hobject_t oid;
  cflag_t flags;   // FLAG_*

  chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }

  static string get_flag_string(uint64_t flags) {
    string r;
    if (flags & FLAG_DIRTY) {
      r += "|dirty";
    }
    if (flags & FLAG_MISSING) {
      r += "|missing";
    }
    if (flags & FLAG_HAS_REFERENCE) {
      r += "|has_reference";
    }
    if (flags & FLAG_HAS_FINGERPRINT) {
      r += "|has_fingerprint";
    }
    if (r.length())
      return r.substr(1);
    return r;
  }
  bool test_flag(cflag_t f) const {
    return (flags & f) == f;
  }
  void set_flag(cflag_t f) {
    flags = (cflag_t)(flags | f);
  }
  void set_flags(cflag_t f) {
    flags = f;
  }
  void clear_flag(cflag_t f) {
    flags = (cflag_t)(flags & ~f);
  }
  void clear_flags() {
    flags = (cflag_t)0;
  }
  bool is_dirty() const {
    return test_flag(FLAG_DIRTY);
  }
  bool is_missing() const {
    return test_flag(FLAG_MISSING);
  }
  bool has_reference() const {
    return test_flag(FLAG_HAS_REFERENCE);
  }
  bool has_fingerprint() const {
    return test_flag(FLAG_HAS_FINGERPRINT);
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  void dump(Formatter *f) const;
  friend ostream& operator<<(ostream& out, const chunk_info_t& ci);
};
WRITE_CLASS_ENCODER(chunk_info_t)
ostream& operator<<(ostream& out, const chunk_info_t& ci);

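// Illustrative sketch (hypothetical helper, arbitrary offset/length): mark a
// chunk as fingerprinted and referenced, then render the flags.
// get_flag_string() joins the set flags with '|', so this returns
// "has_reference|has_fingerprint".
inline string example_chunk_flags()
{
  chunk_info_t ci;
  ci.offset = 0;
  ci.length = 4096;
  ci.set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
  ci.set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
  return chunk_info_t::get_flag_string(ci.flags);
}
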
struct object_info_t;
struct object_manifest_t {
  enum {
    TYPE_NONE = 0,
    TYPE_REDIRECT = 1,
    TYPE_CHUNKED = 2,
  };
  uint8_t type;  // redirect, chunked, ...
  hobject_t redirect_target;
  map<uint64_t, chunk_info_t> chunk_map;

  object_manifest_t() : type(0) { }
  object_manifest_t(uint8_t type, const hobject_t& redirect_target)
    : type(type), redirect_target(redirect_target) { }

  bool is_empty() const {
    return type == TYPE_NONE;
  }
  bool is_redirect() const {
    return type == TYPE_REDIRECT;
  }
  bool is_chunked() const {
    return type == TYPE_CHUNKED;
  }
  static std::string_view get_type_name(uint8_t m) {
    switch (m) {
    case TYPE_NONE: return "none";
    case TYPE_REDIRECT: return "redirect";
    case TYPE_CHUNKED: return "chunked";
    default: return "unknown";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }
  void clear() {
    type = 0;
    redirect_target = hobject_t();
    chunk_map.clear();
  }
  static void generate_test_instances(list<object_manifest_t*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  void dump(Formatter *f) const;
  friend ostream& operator<<(ostream& out, const object_info_t& oi);
};
WRITE_CLASS_ENCODER(object_manifest_t)
ostream& operator<<(ostream& out, const object_manifest_t& oi);

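// Illustrative sketch (hypothetical helper; `target` is an assumed example
// object): build a redirect manifest pointing at another object, inspect its
// type, then reset it back to TYPE_NONE with clear().
inline void example_manifest(const hobject_t& target)
{
  object_manifest_t m(object_manifest_t::TYPE_REDIRECT, target);
  ceph_assert(m.is_redirect());
  ceph_assert(m.get_type_name() == "redirect");
  m.clear();                 // back to TYPE_NONE, empty chunk_map
  ceph_assert(m.is_empty());
}
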
struct object_info_t {
  hobject_t soid;
  eversion_t version, prior_version;
  version_t user_version;
  osd_reqid_t last_reqid;

  uint64_t size;
  utime_t mtime;
  utime_t local_mtime; // local mtime

  // note: these are currently encoded into a total 16 bits; see
  // encode()/decode() for the weirdness.
  typedef enum {
    FLAG_LOST        = 1<<0,
    FLAG_WHITEOUT    = 1<<1, // object logically does not exist
    FLAG_DIRTY       = 1<<2, // object has been modified since last flushed or undirtied
    FLAG_OMAP        = 1<<3, // has (or may have) some/any omap data
    FLAG_DATA_DIGEST = 1<<4, // has data crc
    FLAG_OMAP_DIGEST = 1<<5, // has omap crc
    FLAG_CACHE_PIN   = 1<<6, // pin the object in cache tier
    FLAG_MANIFEST    = 1<<7, // has manifest
    FLAG_USES_TMAP   = 1<<8, // deprecated; no longer used
    FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
  } flag_t;

  flag_t flags;

  static string get_flag_string(flag_t flags) {
    string s;
    vector<string> sv = get_flag_vector(flags);
    for (auto ss : sv) {
      s += string("|") + ss;
    }
    if (s.length())
      return s.substr(1);
    return s;
  }
  static vector<string> get_flag_vector(flag_t flags) {
    vector<string> sv;
    if (flags & FLAG_LOST)
      sv.insert(sv.end(), "lost");
    if (flags & FLAG_WHITEOUT)
      sv.insert(sv.end(), "whiteout");
    if (flags & FLAG_DIRTY)
      sv.insert(sv.end(), "dirty");
    if (flags & FLAG_USES_TMAP)
      sv.insert(sv.end(), "uses_tmap");
    if (flags & FLAG_OMAP)
      sv.insert(sv.end(), "omap");
    if (flags & FLAG_DATA_DIGEST)
      sv.insert(sv.end(), "data_digest");
    if (flags & FLAG_OMAP_DIGEST)
      sv.insert(sv.end(), "omap_digest");
    if (flags & FLAG_CACHE_PIN)
      sv.insert(sv.end(), "cache_pin");
    if (flags & FLAG_MANIFEST)
      sv.insert(sv.end(), "manifest");
    if (flags & FLAG_REDIRECT_HAS_REFERENCE)
      sv.insert(sv.end(), "redirect_has_reference");
    return sv;
  }
  string get_flag_string() const {
    return get_flag_string(flags);
  }

  uint64_t truncate_seq, truncate_size;

  map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;

  // opportunistic checksums; may or may not be present
  __u32 data_digest;  ///< data crc32c
  __u32 omap_digest;  ///< omap crc32c

  // alloc hint attribute
  uint64_t expected_object_size, expected_write_size;
  uint32_t alloc_hint_flags;

  struct object_manifest_t manifest;

  void copy_user_bits(const object_info_t& other);

  bool test_flag(flag_t f) const {
    return (flags & f) == f;
  }
  void set_flag(flag_t f) {
    flags = (flag_t)(flags | f);
  }
  void clear_flag(flag_t f) {
    flags = (flag_t)(flags & ~f);
  }
  bool is_lost() const {
    return test_flag(FLAG_LOST);
  }
  bool is_whiteout() const {
    return test_flag(FLAG_WHITEOUT);
  }
  bool is_dirty() const {
    return test_flag(FLAG_DIRTY);
  }
  bool is_omap() const {
    return test_flag(FLAG_OMAP);
  }
  bool is_data_digest() const {
    return test_flag(FLAG_DATA_DIGEST);
  }
  bool is_omap_digest() const {
    return test_flag(FLAG_OMAP_DIGEST);
  }
  bool is_cache_pinned() const {
    return test_flag(FLAG_CACHE_PIN);
  }
  bool has_manifest() const {
    return test_flag(FLAG_MANIFEST);
  }
  void set_data_digest(__u32 d) {
    set_flag(FLAG_DATA_DIGEST);
    data_digest = d;
  }
  void set_omap_digest(__u32 d) {
    set_flag(FLAG_OMAP_DIGEST);
    omap_digest = d;
  }
  void clear_data_digest() {
    clear_flag(FLAG_DATA_DIGEST);
    data_digest = -1;
  }
  void clear_omap_digest() {
    clear_flag(FLAG_OMAP_DIGEST);
    omap_digest = -1;
  }
  void new_object() {
    clear_data_digest();
    clear_omap_digest();
  }

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::const_iterator& bl);
  void decode(bufferlist& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<object_info_t*>& o);

  explicit object_info_t()
    : user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(const hobject_t& s)
    : soid(s),
      user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(bufferlist& bl) {
    decode(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(object_info_t)

ostream& operator<<(ostream& out, const object_info_t& oi);

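// Illustrative sketch (hypothetical helper; the crc value is an arbitrary
// example): how the opportunistic digest flags are driven. Setting a digest
// raises the corresponding flag; clearing drops the flag and resets the
// stored crc to -1; new_object() drops both digests.
inline void example_object_info_digests(object_info_t& oi)
{
  oi.set_data_digest(0xffffffff);
  ceph_assert(oi.is_data_digest());
  oi.clear_data_digest();            // digest no longer trustworthy
  ceph_assert(!oi.is_data_digest());
  oi.new_object();                   // fresh object: no data or omap digest
}
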
// Object recovery
struct ObjectRecoveryInfo {
  hobject_t soid;
  eversion_t version;
  uint64_t size;
  object_info_t oi;
  SnapSet ss;   // only populated if soid is_snap()
  interval_set<uint64_t> copy_subset;
  map<hobject_t, interval_set<uint64_t>> clone_subset;

  ObjectRecoveryInfo() : size(0) { }

  static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);

struct ObjectRecoveryProgress {
  uint64_t data_recovered_to;
  string omap_recovered_to;
  bool first;
  bool data_complete;
  bool omap_complete;
  bool error = false;

  ObjectRecoveryProgress()
    : data_recovered_to(0),
      first(true),
      data_complete(false), omap_complete(false) { }

  bool is_complete(const ObjectRecoveryInfo& info) const {
    return (data_recovered_to >= (
      info.copy_subset.empty() ?
      0 : info.copy_subset.range_end())) &&
      omap_complete;
  }

  static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);

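// Illustrative sketch (hypothetical helper; the 0..4096 extent is an example
// value): recovery of an object is complete once data_recovered_to has
// reached the end of copy_subset and the omap copy has finished.
inline bool example_recovery_complete()
{
  ObjectRecoveryInfo info;
  info.copy_subset.insert(0, 4096);    // need bytes [0, 4096)

  ObjectRecoveryProgress prog;
  prog.data_recovered_to = 4096;       // reached copy_subset.range_end()
  prog.omap_complete = true;
  return prog.is_complete(info);       // -> true
}
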
struct PushReplyOp {
  hobject_t soid;

  static void generate_test_instances(list<PushReplyOp*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER(PushReplyOp)
ostream& operator<<(ostream& out, const PushReplyOp &op);

struct PullOp {
  hobject_t soid;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress recovery_progress;

  static void generate_test_instances(list<PullOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PullOp)
ostream& operator<<(ostream& out, const PullOp &op);

struct PushOp {
  hobject_t soid;
  eversion_t version;
  bufferlist data;
  interval_set<uint64_t> data_included;
  bufferlist omap_header;
  map<string, bufferlist> omap_entries;
  map<string, bufferlist> attrset;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;
  ObjectRecoveryProgress after_progress;

  static void generate_test_instances(list<PushOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PushOp)
ostream& operator<<(ostream& out, const PushOp &op);


/*
 * summarize pg contents for purposes of a scrub
 */
struct ScrubMap {
  struct object {
    map<string,bufferptr> attrs;
    uint64_t size;
    __u32 omap_digest;         ///< omap crc32c
    __u32 digest;              ///< data crc32c
    bool negative:1;
    bool digest_present:1;
    bool omap_digest_present:1;
    bool read_error:1;
    bool stat_error:1;
    bool ec_hash_mismatch:1;
    bool ec_size_mismatch:1;
    bool large_omap_object_found:1;
    uint64_t large_omap_object_key_count = 0;
    uint64_t large_omap_object_value_size = 0;
    uint64_t object_omap_bytes = 0;
    uint64_t object_omap_keys = 0;

    object() :
      // Init invalid size so it won't match if we get a stat EIO error
      size(-1), omap_digest(0), digest(0),
      negative(false), digest_present(false), omap_digest_present(false),
      read_error(false), stat_error(false), ec_hash_mismatch(false),
      ec_size_mismatch(false), large_omap_object_found(false) {}

    void encode(bufferlist& bl) const;
    void decode(bufferlist::const_iterator& bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<object*>& o);
  };
  WRITE_CLASS_ENCODER(object)

  map<hobject_t,object> objects;
  eversion_t valid_through;
  eversion_t incr_since;
  bool has_large_omap_object_errors:1;
  bool has_omap_keys:1;

  void merge_incr(const ScrubMap &l);
  void clear_from(const hobject_t& start) {
    objects.erase(objects.lower_bound(start), objects.end());
  }
  void insert(const ScrubMap &r) {
    objects.insert(r.objects.begin(), r.objects.end());
  }
  void swap(ScrubMap &r) {
    using std::swap;
    swap(objects, r.objects);
    swap(valid_through, r.valid_through);
    swap(incr_since, r.incr_since);
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl, int64_t pool=-1);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ScrubMap*>& o);
};
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)

struct ScrubMapBuilder {
  bool deep = false;
  vector<hobject_t> ls;
  size_t pos = 0;
  int64_t data_pos = 0;
  string omap_pos;
  int ret = 0;
  bufferhash data_hash, omap_hash;  ///< accumulating hash value
  uint64_t omap_keys = 0;
  uint64_t omap_bytes = 0;

  bool empty() {
    return ls.empty();
  }
  bool done() {
    return pos >= ls.size();
  }
  void reset() {
    *this = ScrubMapBuilder();
  }

  bool data_done() {
    return data_pos < 0;
  }

  void next_object() {
    ++pos;
    data_pos = 0;
    omap_pos.clear();
    omap_keys = 0;
    omap_bytes = 0;
  }

  friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
    out << "(" << pos.pos << "/" << pos.ls.size();
    if (pos.pos < pos.ls.size()) {
      out << " " << pos.ls[pos.pos];
    }
    if (pos.data_pos < 0) {
      out << " byte " << pos.data_pos;
    }
    if (!pos.omap_pos.empty()) {
      out << " key " << pos.omap_pos;
    }
    if (pos.deep) {
      out << " deep";
    }
    if (pos.ret) {
      out << " ret " << pos.ret;
    }
    return out << ")";
  }
};

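// Illustrative sketch (hypothetical helper): how a scrub pass walks its
// ScrubMapBuilder. The object list `ls` is assumed to have been filled in by
// the caller; next_object() advances the cursor and resets the per-object
// data/omap positions and counters.
inline void example_scrubmap_builder_walk(ScrubMapBuilder& pos)
{
  while (!pos.done()) {
    const hobject_t& cur = pos.ls[pos.pos];
    (void)cur;            // a real scrub would stat/read/checksum `cur` here
    pos.next_object();
  }
}
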
struct OSDOp {
  ceph_osd_op op;
  sobject_t soid;

  bufferlist indata, outdata;
  errorcode32_t rval;

  OSDOp() : rval(0) {
    memset(&op, 0, sizeof(ceph_osd_op));
  }

  /**
   * split a bufferlist into constituent indata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge indata members of a vector of OSDOps into a single bufferlist
   *
   * Notably this also encodes certain other OSDOp data into the data
   * buffer, including the sobject_t soid.
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * split a bufferlist into constituent outdata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge outdata members of a vector of OSDOps into a single bufferlist
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * Clear data as much as possible, leave minimal data for historical op dump
   *
   * @param ops [in] vector of OSDOps
   */
  static void clear_data(vector<OSDOp>& ops);
};

ostream& operator<<(ostream& out, const OSDOp& op);

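// Illustrative sketch (hypothetical helper) of the indata round trip
// described by the comments above: per-op input payloads are merged into one
// combined buffer for transport, and the receiving side splits that buffer
// back into the ops' indata members.
inline void example_osd_op_indata_roundtrip(vector<OSDOp>& ops)
{
  bufferlist merged;
  OSDOp::merge_osd_op_vector_in_data(ops, merged);   // gather every op's indata

  // the receiver reconstructs each op's indata from the combined buffer
  OSDOp::split_osd_op_vector_in_data(ops, merged);
}
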
struct watch_item_t {
  entity_name_t name;
  uint64_t cookie;
  uint32_t timeout_seconds;
  entity_addr_t addr;

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
	       const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
      addr(addr) { }

  void encode(bufferlist &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    encode(name, bl);
    encode(cookie, bl);
    encode(timeout_seconds, bl);
    encode(addr, bl, features);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator &bl) {
    DECODE_START(2, bl);
    decode(name, bl);
    decode(cookie, bl);
    decode(timeout_seconds, bl);
    if (struct_v >= 2) {
      decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(watch_item_t)

struct obj_watch_item_t {
  hobject_t obj;
  watch_item_t wi;
};

/**
 * obj list watch response format
 *
 */
struct obj_list_watch_response_t {
  list<watch_item_t> entries;

  void encode(bufferlist& bl, uint64_t features) const {
    ENCODE_START(1, 1, bl);
    encode(entries, bl, features);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(entries, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const {
    f->open_array_section("entries");
    for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
      f->open_object_section("watch");
      f->dump_stream("watcher") << p->name;
      f->dump_int("cookie", p->cookie);
      f->dump_int("timeout", p->timeout_seconds);
      f->open_object_section("addr");
      p->addr.dump(f);
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }
  static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
    entity_addr_t ea;
    o.push_back(new obj_list_watch_response_t);
    o.push_back(new obj_list_watch_response_t);
    ea.set_type(entity_addr_t::TYPE_LEGACY);
    ea.set_nonce(1000);
    ea.set_family(AF_INET);
    ea.set_in4_quad(0, 127);
    ea.set_in4_quad(1, 0);
    ea.set_in4_quad(2, 0);
    ea.set_in4_quad(3, 1);
    ea.set_port(1024);
    o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
    ea.set_nonce(1001);
    ea.set_in4_quad(3, 2);
    ea.set_port(1025);
    o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
  }
};
WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)

struct clone_info {
  snapid_t cloneid;
  vector<snapid_t> snaps;  // ascending
  vector< pair<uint64_t,uint64_t> > overlap;
  uint64_t size;

  clone_info() : cloneid(CEPH_NOSNAP), size(0) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(cloneid, bl);
    encode(snaps, bl);
    encode(overlap, bl);
    encode(size, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(cloneid, bl);
    decode(snaps, bl);
    decode(overlap, bl);
    decode(size, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const {
    if (cloneid == CEPH_NOSNAP)
      f->dump_string("cloneid", "HEAD");
    else
      f->dump_unsigned("cloneid", cloneid.val);
    f->open_array_section("snapshots");
    for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
      f->open_object_section("snap");
      f->dump_unsigned("id", p->val);
      f->close_section();
    }
    f->close_section();
    f->open_array_section("overlaps");
    for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
	 q != overlap.end(); ++q) {
      f->open_object_section("overlap");
      f->dump_unsigned("offset", q->first);
      f->dump_unsigned("length", q->second);
      f->close_section();
    }
    f->close_section();
    f->dump_unsigned("size", size);
  }
  static void generate_test_instances(list<clone_info*>& o) {
    o.push_back(new clone_info);
    o.push_back(new clone_info);
    o.back()->cloneid = 1;
    o.back()->snaps.push_back(1);
    o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
    o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
    o.back()->size = 16384;
    o.push_back(new clone_info);
    o.back()->cloneid = CEPH_NOSNAP;
    o.back()->size = 32768;
  }
};
WRITE_CLASS_ENCODER(clone_info)

/**
 * obj list snaps response format
 *
 */
struct obj_list_snap_response_t {
  vector<clone_info> clones;   // ascending
  snapid_t seq;

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl);
    encode(clones, bl);
    encode(seq, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);
    decode(clones, bl);
    if (struct_v >= 2)
      decode(seq, bl);
    else
      seq = CEPH_NOSNAP;
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const {
    f->open_array_section("clones");
    for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
      f->open_object_section("clone");
      p->dump(f);
      f->close_section();
    }
    f->dump_unsigned("seq", seq);
    f->close_section();
  }
  static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
    o.push_back(new obj_list_snap_response_t);
    o.push_back(new obj_list_snap_response_t);
    clone_info cl;
    cl.cloneid = 1;
    cl.snaps.push_back(1);
    cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
    cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
    cl.size = 16384;
    o.back()->clones.push_back(cl);
    cl.cloneid = CEPH_NOSNAP;
    cl.snaps.clear();
    cl.overlap.clear();
    cl.size = 32768;
    o.back()->clones.push_back(cl);
    o.back()->seq = 123;
  }
};

WRITE_CLASS_ENCODER(obj_list_snap_response_t)

// PromoteCounter

struct PromoteCounter {
  std::atomic<unsigned long long> attempts{0};
  std::atomic<unsigned long long> objects{0};
  std::atomic<unsigned long long> bytes{0};

  void attempt() {
    attempts++;
  }

  void finish(uint64_t size) {
    objects++;
    bytes += size;
  }

  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts;
    *o = objects;
    *b = bytes;
    attempts = *a / 2;
    objects = *o / 2;
    bytes = *b / 2;
  }
};

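// Illustrative sketch (hypothetical helper; the 4 MiB object size is an
// example): record promotion attempts and completions, then sample the
// totals. sample_and_attenuate() halves the internal counters so each sample
// is weighted toward recent activity.
inline void example_promote_counter()
{
  PromoteCounter pc;
  pc.attempt();
  pc.finish(4 * 1024 * 1024);   // one object of 4 MiB promoted

  uint64_t attempts, objects, bytes;
  pc.sample_and_attenuate(&attempts, &objects, &bytes);
  // attempts == 1, objects == 1, bytes == 4 MiB; counters are now halved.
}
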
struct pool_pg_num_history_t {
  /// last epoch updated
  epoch_t epoch = 0;
  /// poolid -> epoch -> pg_num
  map<int64_t,map<epoch_t,uint32_t>> pg_nums;
  /// pair(epoch, poolid)
  set<pair<epoch_t,int64_t>> deleted_pools;

  void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
    pg_nums[pool][epoch] = pg_num;
  }
  void log_pool_delete(epoch_t epoch, int64_t pool) {
    deleted_pools.insert(make_pair(epoch, pool));
  }

  /// prune history based on oldest osdmap epoch in the cluster
  void prune(epoch_t oldest_epoch) {
    auto i = deleted_pools.begin();
    while (i != deleted_pools.end()) {
      if (i->first >= oldest_epoch) {
	break;
      }
      pg_nums.erase(i->second);
      i = deleted_pools.erase(i);
    }
    for (auto& j : pg_nums) {
      auto k = j.second.lower_bound(oldest_epoch);
      // keep this and the entry before it (just to be paranoid)
      if (k != j.second.begin()) {
	--k;
	j.second.erase(j.second.begin(), k);
      }
    }
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(epoch, bl);
    encode(pg_nums, bl);
    encode(deleted_pools, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& p) {
    DECODE_START(1, p);
    decode(epoch, p);
    decode(pg_nums, p);
    decode(deleted_pools, p);
    DECODE_FINISH(p);
  }
  void dump(Formatter *f) const {
    f->dump_unsigned("epoch", epoch);
    f->open_object_section("pools");
    for (auto& i : pg_nums) {
      f->open_object_section("pool");
      f->dump_unsigned("pool_id", i.first);
      f->open_array_section("changes");
      for (auto& j : i.second) {
	f->open_object_section("change");
	f->dump_unsigned("epoch", j.first);
	f->dump_unsigned("pg_num", j.second);
	f->close_section();
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
    f->open_array_section("deleted_pools");
    for (auto& i : deleted_pools) {
      f->open_object_section("deletion");
      f->dump_unsigned("pool_id", i.second);
      f->dump_unsigned("epoch", i.first);
      f->close_section();
    }
    f->close_section();
  }
  static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {
    ls.push_back(new pool_pg_num_history_t);
  }
  friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {
    return out << "pg_num_history(e" << h.epoch
	       << " pg_nums " << h.pg_nums
	       << " deleted_pools " << h.deleted_pools
	       << ")";
  }
};
WRITE_CLASS_ENCODER(pool_pg_num_history_t)

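// Illustrative sketch (hypothetical helper; pool ids, epochs and pg_num
// values are arbitrary examples): record pg_num changes and a pool deletion,
// then prune once the whole cluster has moved past an epoch. prune() drops
// deletions older than the cutoff and trims each per-pool map, keeping one
// entry before the cutoff.
inline void example_pg_num_history()
{
  pool_pg_num_history_t h;
  h.log_pg_num_change(100, 1, 64);    // pool 1 had pg_num 64 at epoch 100
  h.log_pg_num_change(120, 1, 128);   // split to 128 at epoch 120
  h.log_pool_delete(130, 2);          // pool 2 deleted at epoch 130

  h.prune(150);                       // keeps only the epoch-120 entry for pool 1
}
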
// omap specific stats
struct omap_stat_t {
  int large_omap_objects;
  int64_t omap_bytes;
  int64_t omap_keys;
};

#endif