1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
21#include <sstream>
22#include <stdio.h>
23#include <memory>
24#include <string_view>
25#include <boost/scoped_ptr.hpp>
26#include <boost/optional/optional_io.hpp>
27#include <boost/variant.hpp>
28
29#include "include/rados/rados_types.hpp"
30#include "include/mempool.h"
31
32#include "msg/msg_types.h"
33#include "include/types.h"
34#include "include/utime.h"
35#include "include/CompatSet.h"
36#include "common/histogram.h"
37#include "include/interval_set.h"
38#include "include/inline_memory.h"
39#include "common/Formatter.h"
40#include "common/bloom_filter.hpp"
41#include "common/hobject.h"
42#include "common/snap_types.h"
43#include "HitSet.h"
44#include "Watch.h"
45#include "include/cmp.h"
46#include "librados/ListObjectImpl.h"
47#include "compressor/Compressor.h"
48#include <atomic>
49
50#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
51
52#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
53#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
54#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
55#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
56#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
57#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
58#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
59#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
60#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
61#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
62#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
63#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
64#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
65#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
66#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
67#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
68
69
70/// min recovery priority for MBackfillReserve
71#define OSD_RECOVERY_PRIORITY_MIN 0
72
73/// base backfill priority for MBackfillReserve
74#define OSD_BACKFILL_PRIORITY_BASE 100
75
76/// base backfill priority for MBackfillReserve (degraded PG)
77#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
78
79/// base recovery priority for MBackfillReserve
80#define OSD_RECOVERY_PRIORITY_BASE 180
81
82/// base backfill priority for MBackfillReserve (inactive PG)
83#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
84
85/// max manually/automatically set recovery priority for MBackfillReserve
86#define OSD_RECOVERY_PRIORITY_MAX 253
87
88/// backfill priority for MBackfillReserve, when forced manually
89#define OSD_BACKFILL_PRIORITY_FORCED 254
90
91/// recovery priority for MRecoveryReserve, when forced manually
92#define OSD_RECOVERY_PRIORITY_FORCED 255
93
94/// priority for pg deletion when osd is not fullish
95#define OSD_DELETE_PRIORITY_NORMAL 179
96
97/// priority for pg deletion when osd is approaching full
98#define OSD_DELETE_PRIORITY_FULLISH 219
99
100/// priority when more full
101#define OSD_DELETE_PRIORITY_FULL 255
102
103
104typedef hobject_t collection_list_handle_t;
105
106/// convert a single CEPH_OSD_FLAG_* to a string
107const char *ceph_osd_flag_name(unsigned flag);
108/// convert a single CEPH_OSD_OP_FLAG_* to a string
109const char *ceph_osd_op_flag_name(unsigned flag);
110
111/// convert CEPH_OSD_FLAG_* op flags to a string
112string ceph_osd_flag_string(unsigned flags);
113/// convert CEPH_OSD_OP_FLAG_* op flags to a string
114string ceph_osd_op_flag_string(unsigned flags);
115/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
116string ceph_osd_alloc_hint_flag_string(unsigned flags);
117
118typedef map<string,string> osd_alert_list_t;
119/// map osd id -> alert_list_t
120typedef map<int, osd_alert_list_t> osd_alerts_t;
121void dump(Formatter* f, const osd_alerts_t& alerts);
122
123/**
124 * osd request identifier
125 *
126 * caller name + incarnation# + tid to uniquely identify this request.
127 */
128struct osd_reqid_t {
129 entity_name_t name; // who
130 ceph_tid_t tid;
131 int32_t inc; // incarnation
132
133 osd_reqid_t()
134 : tid(0), inc(0)
135 {}
136 osd_reqid_t(const osd_reqid_t& other)
137 : name(other.name), tid(other.tid), inc(other.inc)
138 {}
139 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
140 : name(a), tid(t), inc(i)
141 {}
142
143 DENC(osd_reqid_t, v, p) {
144 DENC_START(2, 2, p);
145 denc(v.name, p);
146 denc(v.tid, p);
147 denc(v.inc, p);
148 DENC_FINISH(p);
149 }
150 void dump(Formatter *f) const;
151 static void generate_test_instances(list<osd_reqid_t*>& o);
152};
153WRITE_CLASS_DENC(osd_reqid_t)
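// Illustrative usage sketch (not part of the original header; entity_name_t::CLIENT()
// is assumed from msg/msg_types.h). The constructor takes (name, incarnation, tid),
// and the operator<< further below prints the id as "name.inc:tid":
//
//   osd_reqid_t rid(entity_name_t::CLIENT(4267), 0, 1234);
//   std::cout << rid;   // expected output: client.4267.0:1234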
154
155
156
157struct pg_shard_t {
158 static const int32_t NO_OSD = 0x7fffffff;
159 int32_t osd;
160 shard_id_t shard;
161 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
162 explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
163 pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
164 bool is_undefined() const {
165 return osd == -1;
166 }
167 string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
168 void encode(bufferlist &bl) const;
169 void decode(bufferlist::const_iterator &bl);
170 void dump(Formatter *f) const {
171 f->dump_unsigned("osd", osd);
172 if (shard != shard_id_t::NO_SHARD) {
173 f->dump_unsigned("shard", shard);
174 }
175 }
176};
177WRITE_CLASS_ENCODER(pg_shard_t)
178WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
179WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
180ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
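// Illustrative sketch (not part of the original header): a pg_shard_t names an OSD
// plus, for erasure-coded pools, the shard it holds; replicated pools use NO_SHARD.
//
//   pg_shard_t rep(3);                    // replicated: osd.3, shard_id_t::NO_SHARD
//   pg_shard_t ec(3, shard_id_t(1));      // erasure-coded: shard 1 on osd.3
//   rep.is_undefined();                   // false -- only osd == -1 is undefined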
181
182class IsPGRecoverablePredicate {
183public:
184 /**
185 * have encodes the shards available
186 */
187 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
188 virtual ~IsPGRecoverablePredicate() {}
189};
190
191class IsPGReadablePredicate {
192public:
193 /**
194 * have encodes the shards available
195 */
196 virtual bool operator()(const set<pg_shard_t> &have) const = 0;
197 virtual ~IsPGReadablePredicate() {}
198};
199
200inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
201 return out << r.name << "." << r.inc << ":" << r.tid;
202}
203
204inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
205 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
206}
207inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
208 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
209}
210inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
211 return (l.name < r.name) || (l.inc < r.inc) ||
212 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
213}
214inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
215 return (l.name < r.name) || (l.inc < r.inc) ||
216 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
217}
218inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
219inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
220
221namespace std {
222 template<> struct hash<osd_reqid_t> {
223 size_t operator()(const osd_reqid_t &r) const {
224 static hash<uint64_t> H;
225 return H(r.name.num() ^ r.tid ^ r.inc);
226 }
227 };
228} // namespace std
229
230
231// -----
232
233// a locator constrains the placement of an object.  mainly, which pool
234// it goes in.
235struct object_locator_t {
236 // You specify either the hash or the key -- not both
237 int64_t pool; ///< pool id
238 string key; ///< key string (if non-empty)
239 string nspace; ///< namespace
240 int64_t hash; ///< hash position (if >= 0)
241
242 explicit object_locator_t()
243 : pool(-1), hash(-1) {}
244 explicit object_locator_t(int64_t po)
245 : pool(po), hash(-1) {}
246 explicit object_locator_t(int64_t po, int64_t ps)
247 : pool(po), hash(ps) {}
248 explicit object_locator_t(int64_t po, string ns)
249 : pool(po), nspace(ns), hash(-1) {}
250 explicit object_locator_t(int64_t po, string ns, int64_t ps)
251 : pool(po), nspace(ns), hash(ps) {}
252 explicit object_locator_t(int64_t po, string ns, string s)
253 : pool(po), key(s), nspace(ns), hash(-1) {}
254 explicit object_locator_t(const hobject_t& soid)
255 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
256
257 int64_t get_pool() const {
258 return pool;
259 }
260
261 void clear() {
262 pool = -1;
263 key = "";
264 nspace = "";
265 hash = -1;
266 }
267
268 bool empty() const {
269 return pool == -1;
270 }
271
272 void encode(bufferlist& bl) const;
273 void decode(bufferlist::const_iterator& p);
274 void dump(Formatter *f) const;
275 static void generate_test_instances(list<object_locator_t*>& o);
276};
277WRITE_CLASS_ENCODER(object_locator_t)
278
279inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
280 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
281}
282inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
283 return !(l == r);
284}
285
286inline ostream& operator<<(ostream& out, const object_locator_t& loc)
287{
288 out << "@" << loc.pool;
289 if (loc.nspace.length())
290 out << ";" << loc.nspace;
291 if (loc.key.length())
292 out << ":" << loc.key;
293 return out;
294}
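// Illustrative sketch (not part of the original header), following the operator<<
// just above: the locator prints as "@<pool>[;<namespace>][:<key>]".
//
//   object_locator_t loc(7, "ns1", "lockkey");   // pool 7, namespace ns1, key lockkey
//   std::cout << loc;                            // expected output: @7;ns1:lockkey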
295
296struct request_redirect_t {
297private:
298 object_locator_t redirect_locator; ///< this is authoritative
299 string redirect_object; ///< If non-empty, the request goes to this object name
300
301 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
302public:
303
304 request_redirect_t() {}
305 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
306 redirect_locator(orig) { redirect_locator.pool = rpool; }
307 explicit request_redirect_t(const object_locator_t& rloc) :
308 redirect_locator(rloc) {}
309 explicit request_redirect_t(const object_locator_t& orig,
310 const string& robj) :
311 redirect_locator(orig), redirect_object(robj) {}
312
313 bool empty() const { return redirect_locator.empty() &&
314 redirect_object.empty(); }
315
316 void combine_with_locator(object_locator_t& orig, string& obj) const {
317 orig = redirect_locator;
318 if (!redirect_object.empty())
319 obj = redirect_object;
320 }
321
322 void encode(bufferlist& bl) const;
323 void decode(bufferlist::const_iterator& bl);
324 void dump(Formatter *f) const;
325 static void generate_test_instances(list<request_redirect_t*>& o);
326};
327WRITE_CLASS_ENCODER(request_redirect_t)
328
329inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
330 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
331 return out;
332}
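// Illustrative sketch (not part of the original header): a redirect replaces the
// request's locator and, if set, its object name.
//
//   object_locator_t loc(7);
//   string obj = "orig_object";
//   request_redirect_t redir(object_locator_t(8, "cache"), "other_object");
//   redir.combine_with_locator(loc, obj);  // loc -> pool 8/"cache", obj -> "other_object"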
333
334// Internal OSD op flags - set by the OSD based on the op types
335enum {
336 CEPH_OSD_RMW_FLAG_READ = (1 << 1),
337 CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
338 CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
339 CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
340 CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
341 CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
342 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
343 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
344 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
345 CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
346};
347
348
349// pg stuff
350
351#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
352
353// placement seed (a hash value)
354typedef uint32_t ps_t;
355
356// old (v1) pg_t encoding (wrap old struct ceph_pg)
357struct old_pg_t {
358 ceph_pg v;
359 void encode(bufferlist& bl) const {
360 ::encode_raw(v, bl);
361 }
362 void decode(bufferlist::const_iterator& bl) {
363 ::decode_raw(v, bl);
364 }
365};
366WRITE_CLASS_ENCODER(old_pg_t)
367
368// placement group id
369struct pg_t {
370 uint64_t m_pool;
371 uint32_t m_seed;
372
373 pg_t() : m_pool(0), m_seed(0) {}
374 pg_t(ps_t seed, uint64_t pool) :
375 m_pool(pool), m_seed(seed) {}
376 // cppcheck-suppress noExplicitConstructor
377 pg_t(const ceph_pg& cpg) :
378 m_pool(cpg.pool), m_seed(cpg.ps) {}
379
380 // cppcheck-suppress noExplicitConstructor
381 pg_t(const old_pg_t& opg) {
382 *this = opg.v;
383 }
384
385 old_pg_t get_old_pg() const {
386 old_pg_t o;
387 ceph_assert(m_pool < 0xffffffffull);
388 o.v.pool = m_pool;
389 o.v.ps = m_seed;
390 o.v.preferred = (__s16)-1;
391 return o;
392 }
393
394 ps_t ps() const {
395 return m_seed;
396 }
397 int64_t pool() const {
398 return m_pool;
399 }
400
401 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
402 char *calc_name(char *buf, const char *suffix_backwords) const;
403
404 void set_ps(ps_t p) {
405 m_seed = p;
406 }
407 void set_pool(uint64_t p) {
408 m_pool = p;
409 }
410
411 pg_t get_parent() const;
412 pg_t get_ancestor(unsigned old_pg_num) const;
413
414 int print(char *o, int maxlen) const;
415 bool parse(const char *s);
416
417 bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;
418
419 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
420 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
421 return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
422 }
423
424 /**
425 * Returns b such that for all object o:
426 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
427 */
428 unsigned get_split_bits(unsigned pg_num) const;
429
430 bool contains(int bits, const ghobject_t& oid) {
431 return
432 (int64_t)m_pool == oid.hobj.get_logical_pool() &&
433 oid.match(bits, ps());
434 }
435 bool contains(int bits, const hobject_t& oid) {
436 return
437 (int64_t)m_pool == oid.get_logical_pool() &&
438 oid.match(bits, ps());
439 }
440
441 hobject_t get_hobj_start() const;
442 hobject_t get_hobj_end(unsigned pg_num) const;
443
444 void encode(bufferlist& bl) const {
445 using ceph::encode;
446 __u8 v = 1;
447 encode(v, bl);
448 encode(m_pool, bl);
449 encode(m_seed, bl);
450 encode((int32_t)-1, bl); // was preferred
451 }
452 void decode(bufferlist::const_iterator& bl) {
453 using ceph::decode;
454 __u8 v;
455 decode(v, bl);
456 decode(m_pool, bl);
457 decode(m_seed, bl);
458 bl.advance(sizeof(int32_t)); // was preferred
459 }
460 void decode_old(bufferlist::const_iterator& bl) {
461 using ceph::decode;
462 old_pg_t opg;
463 decode(opg, bl);
464 *this = opg;
465 }
466 void dump(Formatter *f) const;
467 static void generate_test_instances(list<pg_t*>& o);
468};
469WRITE_CLASS_ENCODER(pg_t)
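// Illustrative sketch (not part of the original header): a pg_t is printed and
// parsed as "<pool>.<seed-in-hex>", the familiar PG id notation.
//
//   pg_t pgid(0x1a, 2);          // seed 0x1a in pool 2
//   std::cout << pgid;           // expected output: 2.1a
//   pg_t parsed;
//   parsed.parse("2.1a");        // parsed == pgid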
470
471inline bool operator<(const pg_t& l, const pg_t& r) {
472 return l.pool() < r.pool() ||
473 (l.pool() == r.pool() && (l.ps() < r.ps()));
474}
475inline bool operator<=(const pg_t& l, const pg_t& r) {
476 return l.pool() < r.pool() ||
477 (l.pool() == r.pool() && (l.ps() <= r.ps()));
478}
479inline bool operator==(const pg_t& l, const pg_t& r) {
480 return l.pool() == r.pool() &&
481 l.ps() == r.ps();
482}
483inline bool operator!=(const pg_t& l, const pg_t& r) {
484 return l.pool() != r.pool() ||
485 l.ps() != r.ps();
486}
487inline bool operator>(const pg_t& l, const pg_t& r) {
488 return l.pool() > r.pool() ||
489 (l.pool() == r.pool() && (l.ps() > r.ps()));
490}
491inline bool operator>=(const pg_t& l, const pg_t& r) {
492 return l.pool() > r.pool() ||
493 (l.pool() == r.pool() && (l.ps() >= r.ps()));
494}
495
496ostream& operator<<(ostream& out, const pg_t &pg);
497
498namespace std {
499 template<> struct hash< pg_t >
500 {
501 size_t operator()( const pg_t& x ) const
502 {
503 static hash<uint32_t> H;
504 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
505 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
506 }
507 };
508} // namespace std
509
510struct spg_t {
511 pg_t pgid;
512 shard_id_t shard;
513 spg_t() : shard(shard_id_t::NO_SHARD) {}
514 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
515 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
516 unsigned get_split_bits(unsigned pg_num) const {
517 return pgid.get_split_bits(pg_num);
518 }
519 spg_t get_parent() const {
520 return spg_t(pgid.get_parent(), shard);
521 }
522 ps_t ps() const {
523 return pgid.ps();
524 }
525 uint64_t pool() const {
526 return pgid.pool();
527 }
528
529 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
530 char *calc_name(char *buf, const char *suffix_backwords) const;
531
532 bool parse(const char *s);
533 bool parse(const std::string& s) {
534 return parse(s.c_str());
535 }
536
537 spg_t get_ancestor(unsigned old_pg_num) const {
538 return spg_t(pgid.get_ancestor(old_pg_num), shard);
539 }
540
541 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
542 set<spg_t> *pchildren) const {
543 set<pg_t> _children;
544 set<pg_t> *children = pchildren ? &_children : NULL;
545 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
546 if (pchildren && is_split) {
547 for (set<pg_t>::iterator i = _children.begin();
548 i != _children.end();
549 ++i) {
550 pchildren->insert(spg_t(*i, shard));
551 }
552 }
553 return is_split;
554 }
555 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
556 return pgid.is_merge_target(old_pg_num, new_pg_num);
557 }
558 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
559 spg_t *parent) const {
560 spg_t out = *this;
561 bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
562 if (r && parent) {
563 *parent = out;
564 }
565 return r;
566 }
567
568 bool is_no_shard() const {
569 return shard == shard_id_t::NO_SHARD;
570 }
571
572 ghobject_t make_pgmeta_oid() const {
573 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
574 }
575
576 void encode(bufferlist &bl) const {
577 ENCODE_START(1, 1, bl);
578 encode(pgid, bl);
579 encode(shard, bl);
580 ENCODE_FINISH(bl);
581 }
582 void decode(bufferlist::const_iterator& bl) {
583 DECODE_START(1, bl);
584 decode(pgid, bl);
585 decode(shard, bl);
586 DECODE_FINISH(bl);
587 }
588
589 ghobject_t make_temp_ghobject(const string& name) const {
590 return ghobject_t(
591 hobject_t(object_t(name), "", CEPH_NOSNAP,
592 pgid.ps(),
593 hobject_t::get_temp_pool(pgid.pool()),
594 ""),
595 ghobject_t::NO_GEN,
596 shard);
597 }
598
599 unsigned hash_to_shard(unsigned num_shards) const {
600 return ps() % num_shards;
601 }
602};
603WRITE_CLASS_ENCODER(spg_t)
604WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
605WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
606
607namespace std {
608 template<> struct hash< spg_t >
609 {
610 size_t operator()( const spg_t& x ) const
611 {
612 static hash<uint32_t> H;
613 return H(hash<pg_t>()(x.pgid) ^ x.shard);
614 }
615 };
616} // namespace std
617
618ostream& operator<<(ostream& out, const spg_t &pg);
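// Illustrative sketch (not part of the original header): an spg_t adds the shard to
// the PG id; the conventional rendering is "<pool>.<seed>s<shard>" for EC shards and
// just "<pool>.<seed>" when shard == NO_SHARD.
//
//   spg_t rep(pg_t(0x1a, 2));                  // replicated: "2.1a"
//   spg_t ec(pg_t(0x1a, 2), shard_id_t(0));    // EC shard 0: "2.1as0"
//   ec.is_no_shard();                          // false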
619
620// ----------------------
621
622class coll_t {
623 enum type_t {
624 TYPE_META = 0,
625 TYPE_LEGACY_TEMP = 1, /* no longer used */
626 TYPE_PG = 2,
627 TYPE_PG_TEMP = 3,
628 };
629 type_t type;
630 spg_t pgid;
631 uint64_t removal_seq; // note: deprecated, not encoded
632
633 char _str_buff[spg_t::calc_name_buf_size];
634 char *_str;
635
636 void calc_str();
637
638 coll_t(type_t t, spg_t p, uint64_t r)
639 : type(t), pgid(p), removal_seq(r) {
640 calc_str();
641 }
642
643public:
644 coll_t() : type(TYPE_META), removal_seq(0)
645 {
646 calc_str();
647 }
648
649 coll_t(const coll_t& other)
650 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
651 calc_str();
652 }
653
654 explicit coll_t(spg_t pgid)
655 : type(TYPE_PG), pgid(pgid), removal_seq(0)
656 {
657 calc_str();
658 }
659
660 coll_t& operator=(const coll_t& rhs)
661 {
662 this->type = rhs.type;
663 this->pgid = rhs.pgid;
664 this->removal_seq = rhs.removal_seq;
665 this->calc_str();
666 return *this;
667 }
668
669 // named constructors
670 static coll_t meta() {
671 return coll_t();
672 }
673 static coll_t pg(spg_t p) {
674 return coll_t(p);
675 }
676
677 const std::string to_str() const {
678 return string(_str);
679 }
680 const char *c_str() const {
681 return _str;
682 }
683
684 bool parse(const std::string& s);
685
686 int operator<(const coll_t &rhs) const {
687 return type < rhs.type ||
688 (type == rhs.type && pgid < rhs.pgid);
689 }
690
691 bool is_meta() const {
692 return type == TYPE_META;
693 }
694 bool is_pg_prefix(spg_t *pgid_) const {
695 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
696 *pgid_ = pgid;
697 return true;
698 }
699 return false;
700 }
701 bool is_pg() const {
702 return type == TYPE_PG;
703 }
704 bool is_pg(spg_t *pgid_) const {
705 if (type == TYPE_PG) {
706 *pgid_ = pgid;
707 return true;
708 }
709 return false;
710 }
711 bool is_temp() const {
712 return type == TYPE_PG_TEMP;
713 }
714 bool is_temp(spg_t *pgid_) const {
715 if (type == TYPE_PG_TEMP) {
716 *pgid_ = pgid;
717 return true;
718 }
719 return false;
720 }
721
722 void encode(bufferlist& bl) const;
723 void decode(bufferlist::const_iterator& bl);
724 size_t encoded_size() const;
725
726 inline bool operator==(const coll_t& rhs) const {
727 // only compare type if meta
728 if (type != rhs.type)
729 return false;
730 if (type == TYPE_META)
731 return true;
732 return type == rhs.type && pgid == rhs.pgid;
733 }
734 inline bool operator!=(const coll_t& rhs) const {
735 return !(*this == rhs);
736 }
737
738 // get a TEMP collection that corresponds to the current collection,
739 // which we presume is a pg collection.
740 coll_t get_temp() const {
741 ceph_assert(type == TYPE_PG);
742 return coll_t(TYPE_PG_TEMP, pgid, 0);
743 }
744
745 ghobject_t get_min_hobj() const {
746 ghobject_t o;
747 switch (type) {
748 case TYPE_PG:
749 o.hobj.pool = pgid.pool();
750 o.set_shard(pgid.shard);
751 break;
752 case TYPE_META:
753 o.hobj.pool = -1;
754 break;
755 default:
756 break;
757 }
758 return o;
759 }
760
761 unsigned hash_to_shard(unsigned num_shards) const {
762 if (type == TYPE_PG)
763 return pgid.hash_to_shard(num_shards);
764 return 0; // whatever.
765 }
766
767 void dump(Formatter *f) const;
768 static void generate_test_instances(list<coll_t*>& o);
769};
770
771WRITE_CLASS_ENCODER(coll_t)
772
773inline ostream& operator<<(ostream& out, const coll_t& c) {
774 out << c.to_str();
775 return out;
776}
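// Illustrative sketch (not part of the original header; the exact string forms are
// assumptions based on how these collections appear on disk):
//
//   coll_t m = coll_t::meta();                    // m.to_str() == "meta"
//   coll_t c = coll_t::pg(spg_t(pg_t(0x1a, 2)));  // c.to_str() == "2.1a_head"
//   coll_t t = c.get_temp();                      // t.to_str() == "2.1a_TEMP"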
777
778namespace std {
779 template<> struct hash<coll_t> {
780 size_t operator()(const coll_t &c) const {
781 size_t h = 0;
782 string str(c.to_str());
783 std::string::const_iterator end(str.end());
784 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
785 h += *s;
786 h += (h << 10);
787 h ^= (h >> 6);
788 }
789 h += (h << 3);
790 h ^= (h >> 11);
791 h += (h << 15);
792 return h;
793 }
794 };
795} // namespace std
796
797inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
798{
799 out << pg_t(ol.ol_pgid);
800 int su = ol.ol_stripe_unit;
801 if (su)
802 out << ".su=" << su;
803 return out;
804}
805
806
807
808// compound rados version type
809/* WARNING: If you add a member to eversion_t, please make sure the encode/decode
810 * functions still work. For little-endian machines, make sure there is no
811 * padding on either 32-bit or 64-bit builds.
812 */
813class eversion_t {
814public:
815 version_t version;
816 epoch_t epoch;
817 __u32 __pad;
818 eversion_t() : version(0), epoch(0), __pad(0) {}
819 eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
820
821 // cppcheck-suppress noExplicitConstructor
822 eversion_t(const ceph_eversion& ce) :
823 version(ce.version),
824 epoch(ce.epoch),
825 __pad(0) { }
826
827 explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }
828
829 static const eversion_t& max() {
830 static const eversion_t max(-1,-1);
831 return max;
832 }
833
834 operator ceph_eversion() {
835 ceph_eversion c;
836 c.epoch = epoch;
837 c.version = version;
838 return c;
839 }
840
841 string get_key_name() const;
842
843 // key must point to the beginning of a block of 32 chars
844 inline void get_key_name(char* key) const {
845 // Below is equivalent of sprintf("%010u.%020llu");
846 key[31] = 0;
847 ritoa<uint64_t, 10, 20>(version, key + 31);
848 key[10] = '.';
849 ritoa<uint32_t, 10, 10>(epoch, key + 10);
850 }
851
852 void encode(bufferlist &bl) const {
853#if defined(CEPH_LITTLE_ENDIAN)
854 bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
855#else
856 using ceph::encode;
857 encode(version, bl);
858 encode(epoch, bl);
859#endif
860 }
861 void decode(bufferlist::const_iterator &bl) {
862#if defined(CEPH_LITTLE_ENDIAN)
863 bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
864#else
865 using ceph::decode;
866 decode(version, bl);
867 decode(epoch, bl);
868#endif
869 }
870 void decode(bufferlist& bl) {
871 auto p = std::cbegin(bl);
872 decode(p);
873 }
874};
875WRITE_CLASS_ENCODER(eversion_t)
876
877inline bool operator==(const eversion_t& l, const eversion_t& r) {
878 return (l.epoch == r.epoch) && (l.version == r.version);
879}
880inline bool operator!=(const eversion_t& l, const eversion_t& r) {
881 return (l.epoch != r.epoch) || (l.version != r.version);
882}
883inline bool operator<(const eversion_t& l, const eversion_t& r) {
884 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
885}
886inline bool operator<=(const eversion_t& l, const eversion_t& r) {
887 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
888}
889inline bool operator>(const eversion_t& l, const eversion_t& r) {
890 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
891}
892inline bool operator>=(const eversion_t& l, const eversion_t& r) {
893 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
894}
895inline ostream& operator<<(ostream& out, const eversion_t& e) {
896 return out << e.epoch << "'" << e.version;
897}
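// Illustrative sketch (not part of the original header), following the operator<<
// and get_key_name() above:
//
//   eversion_t v(20, 134);       // epoch 20, version 134
//   std::cout << v;              // expected output: 20'134
//   char key[32];
//   v.get_key_name(key);         // key == "0000000020.00000000000000000134"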
898
899/**
900 * objectstore_perf_stat_t
901 *
902 * current perf information about the osd
903 */
904struct objectstore_perf_stat_t {
905 // cur_op_latency is in ns since double add/sub are not associative
906 uint64_t os_commit_latency_ns;
907 uint64_t os_apply_latency_ns;
908
909 objectstore_perf_stat_t() :
910 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
911
912 bool operator==(const objectstore_perf_stat_t &r) const {
913 return os_commit_latency_ns == r.os_commit_latency_ns &&
914 os_apply_latency_ns == r.os_apply_latency_ns;
915 }
916
917 void add(const objectstore_perf_stat_t &o) {
918 os_commit_latency_ns += o.os_commit_latency_ns;
919 os_apply_latency_ns += o.os_apply_latency_ns;
920 }
921 void sub(const objectstore_perf_stat_t &o) {
922 os_commit_latency_ns -= o.os_commit_latency_ns;
923 os_apply_latency_ns -= o.os_apply_latency_ns;
924 }
925 void dump(Formatter *f) const;
926 void encode(bufferlist &bl, uint64_t features) const;
927 void decode(bufferlist::const_iterator &bl);
928 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
929};
930WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
931
932/*
933 * pg states
934 */
935#define PG_STATE_CREATING (1ULL << 0) // creating
936#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
937#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
938#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
939#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
940#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
941#define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
942#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
943//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
944#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
945#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
946#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
947#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
948#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
949#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
950#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
951#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
952#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
953#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
954#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
955#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
956#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
957#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
958#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
959#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
960#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
961#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
962#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
963#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
964#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
965#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
966#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
967
968std::string pg_state_string(uint64_t state);
969std::string pg_vector_string(const vector<int32_t> &a);
970boost::optional<uint64_t> pg_string_state(const std::string& state);
971
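// Illustrative sketch (not part of the original header): pg_state_string() joins the
// set bits into the familiar "+"-separated form used by `ceph -s`, and
// pg_string_state() maps a single state name back to its bit.
//
//   std::string s = pg_state_string(PG_STATE_ACTIVE | PG_STATE_CLEAN);  // "active+clean"
//   auto bit = pg_string_state("clean");   // boost::optional holding PG_STATE_CLEAN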
972
973/*
974 * pool_snap_info_t
975 *
976 * attributes for a single pool snapshot.
977 */
978struct pool_snap_info_t {
979 snapid_t snapid;
980 utime_t stamp;
981 string name;
982
983 void dump(Formatter *f) const;
984 void encode(bufferlist& bl, uint64_t features) const;
985 void decode(bufferlist::const_iterator& bl);
986 static void generate_test_instances(list<pool_snap_info_t*>& o);
987};
988WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
989
990inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
991 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
992}
993
994
995/*
996 * pool_opts_t
997 *
998 * pool options.
999 */
1000
1001class pool_opts_t {
1002public:
1003 enum key_t {
1004 SCRUB_MIN_INTERVAL,
1005 SCRUB_MAX_INTERVAL,
1006 DEEP_SCRUB_INTERVAL,
1007 RECOVERY_PRIORITY,
1008 RECOVERY_OP_PRIORITY,
1009 SCRUB_PRIORITY,
1010 COMPRESSION_MODE,
1011 COMPRESSION_ALGORITHM,
1012 COMPRESSION_REQUIRED_RATIO,
1013 COMPRESSION_MAX_BLOB_SIZE,
1014 COMPRESSION_MIN_BLOB_SIZE,
1015 CSUM_TYPE,
1016 CSUM_MAX_BLOCK,
1017 CSUM_MIN_BLOCK,
1018 FINGERPRINT_ALGORITHM,
1019 PG_NUM_MIN, // min pg_num
1020 TARGET_SIZE_BYTES, // total bytes in pool
1021 TARGET_SIZE_RATIO, // fraction of total cluster
1022 PG_AUTOSCALE_BIAS,
1023 };
1024
1025 enum type_t {
1026 STR,
1027 INT,
1028 DOUBLE,
1029 };
1030
1031 struct opt_desc_t {
1032 key_t key;
1033 type_t type;
1034
1035 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1036
1037 bool operator==(const opt_desc_t& rhs) const {
1038 return key == rhs.key && type == rhs.type;
1039 }
1040 };
1041
1042 typedef boost::variant<std::string,int64_t,double> value_t;
1043
1044 static bool is_opt_name(const std::string& name);
1045 static opt_desc_t get_opt_desc(const std::string& name);
1046
1047 pool_opts_t() : opts() {}
1048
1049 bool is_set(key_t key) const;
1050
1051 template<typename T>
1052 void set(key_t key, const T &val) {
1053 value_t value = val;
1054 opts[key] = value;
1055 }
1056
1057 template<typename T>
1058 bool get(key_t key, T *val) const {
1059 opts_t::const_iterator i = opts.find(key);
1060 if (i == opts.end()) {
1061 return false;
1062 }
1063 *val = boost::get<T>(i->second);
1064 return true;
1065 }
1066
1067 const value_t& get(key_t key) const;
1068
1069 bool unset(key_t key);
1070
1071 void dump(const std::string& name, Formatter *f) const;
1072
1073 void dump(Formatter *f) const;
1074 void encode(bufferlist &bl, uint64_t features) const;
1075 void decode(bufferlist::const_iterator &bl);
1076
1077private:
1078 typedef std::map<key_t, value_t> opts_t;
1079 opts_t opts;
1080
1081 friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
1082};
1083WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
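// Illustrative usage sketch (not part of the original header): values are stored as a
// boost::variant<std::string,int64_t,double>, so the type passed to set()/get() must
// match the option's declared type (boost::get throws boost::bad_get otherwise).
//
//   pool_opts_t opts;
//   opts.set(pool_opts_t::RECOVERY_PRIORITY, static_cast<int64_t>(5));
//   int64_t prio = 0;
//   if (opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio)) {
//     // prio == 5
//   }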
1084
1085struct pg_merge_meta_t {
1086 pg_t source_pgid;
1087 epoch_t ready_epoch = 0;
1088 epoch_t last_epoch_started = 0;
1089 epoch_t last_epoch_clean = 0;
1090 eversion_t source_version;
1091 eversion_t target_version;
1092
1093 void encode(bufferlist& bl) const {
1094 ENCODE_START(1, 1, bl);
1095 encode(source_pgid, bl);
1096 encode(ready_epoch, bl);
1097 encode(last_epoch_started, bl);
1098 encode(last_epoch_clean, bl);
1099 encode(source_version, bl);
1100 encode(target_version, bl);
1101 ENCODE_FINISH(bl);
1102 }
1103 void decode(bufferlist::const_iterator& p) {
1104 DECODE_START(1, p);
1105 decode(source_pgid, p);
1106 decode(ready_epoch, p);
1107 decode(last_epoch_started, p);
1108 decode(last_epoch_clean, p);
1109 decode(source_version, p);
1110 decode(target_version, p);
1111 DECODE_FINISH(p);
1112 }
1113 void dump(Formatter *f) const {
1114 f->dump_stream("source_pgid") << source_pgid;
1115 f->dump_unsigned("ready_epoch", ready_epoch);
1116 f->dump_unsigned("last_epoch_started", last_epoch_started);
1117 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
1118 f->dump_stream("source_version") << source_version;
1119 f->dump_stream("target_version") << target_version;
1120 }
1121};
1122WRITE_CLASS_ENCODER(pg_merge_meta_t)
1123
1124/*
1125 * pg_pool
1126 */
1127struct pg_pool_t {
1128 static const char *APPLICATION_NAME_CEPHFS;
1129 static const char *APPLICATION_NAME_RBD;
1130 static const char *APPLICATION_NAME_RGW;
1131
1132 enum {
1133 TYPE_REPLICATED = 1, // replication
1134 //TYPE_RAID4 = 2, // raid4 (never implemented)
1135 TYPE_ERASURE = 3, // erasure-coded
1136 };
1137 static std::string_view get_type_name(int t) {
1138 switch (t) {
1139 case TYPE_REPLICATED: return "replicated";
1140 //case TYPE_RAID4: return "raid4";
1141 case TYPE_ERASURE: return "erasure";
1142 default: return "???";
1143 }
1144 }
1145 std::string_view get_type_name() const {
1146 return get_type_name(type);
1147 }
1148
1149 enum {
1150 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1151 FLAG_FULL = 1<<1, // pool is full
1152 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1153 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1154 FLAG_NODELETE = 1<<4, // pool can't be deleted
1155 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1156 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1157 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1158 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1159 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
1160 FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1161 FLAG_NEARFULL = 1<<11, // pool is nearfull
1162 FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
1163 FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
1164 FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
1165 FLAG_CREATING = 1<<15, // initial pool PGs are being created
1166 };
1167
1168 static const char *get_flag_name(int f) {
1169 switch (f) {
1170 case FLAG_HASHPSPOOL: return "hashpspool";
1171 case FLAG_FULL: return "full";
1172 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1173 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1174 case FLAG_NODELETE: return "nodelete";
1175 case FLAG_NOPGCHANGE: return "nopgchange";
1176 case FLAG_NOSIZECHANGE: return "nosizechange";
1177 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1178 case FLAG_NOSCRUB: return "noscrub";
1179 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
1180 case FLAG_FULL_QUOTA: return "full_quota";
1181 case FLAG_NEARFULL: return "nearfull";
1182 case FLAG_BACKFILLFULL: return "backfillfull";
1183 case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
1184 case FLAG_POOL_SNAPS: return "pool_snaps";
1185 case FLAG_CREATING: return "creating";
1186 default: return "???";
1187 }
1188 }
1189 static string get_flags_string(uint64_t f) {
1190 string s;
1191 for (unsigned n=0; f && n<64; ++n) {
1192 if (f & (1ull << n)) {
1193 if (s.length())
1194 s += ",";
1195 s += get_flag_name(1ull << n);
1196 }
1197 }
1198 return s;
1199 }
1200 string get_flags_string() const {
1201 return get_flags_string(flags);
1202 }
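// Illustrative sketch (not part of the original header): flags are emitted in
// ascending bit order, e.g. get_flags_string(FLAG_HASHPSPOOL|FLAG_NODELETE)
// yields "hashpspool,nodelete", and get_flag_by_name("nodelete") below returns
// FLAG_NODELETE (or 0 for an unknown name).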
1203 static uint64_t get_flag_by_name(const string& name) {
1204 if (name == "hashpspool")
1205 return FLAG_HASHPSPOOL;
1206 if (name == "full")
1207 return FLAG_FULL;
1208 if (name == "ec_overwrites")
1209 return FLAG_EC_OVERWRITES;
1210 if (name == "incomplete_clones")
1211 return FLAG_INCOMPLETE_CLONES;
1212 if (name == "nodelete")
1213 return FLAG_NODELETE;
1214 if (name == "nopgchange")
1215 return FLAG_NOPGCHANGE;
1216 if (name == "nosizechange")
1217 return FLAG_NOSIZECHANGE;
1218 if (name == "write_fadvise_dontneed")
1219 return FLAG_WRITE_FADVISE_DONTNEED;
1220 if (name == "noscrub")
1221 return FLAG_NOSCRUB;
1222 if (name == "nodeep-scrub")
1223 return FLAG_NODEEP_SCRUB;
1224 if (name == "full_quota")
1225 return FLAG_FULL_QUOTA;
1226 if (name == "nearfull")
1227 return FLAG_NEARFULL;
1228 if (name == "backfillfull")
1229 return FLAG_BACKFILLFULL;
1230 if (name == "selfmanaged_snaps")
1231 return FLAG_SELFMANAGED_SNAPS;
1232 if (name == "pool_snaps")
1233 return FLAG_POOL_SNAPS;
1234 if (name == "creating")
1235 return FLAG_CREATING;
1236 return 0;
1237 }
1238
1239 /// converts the acting/up vector to a set of pg shards
1240 void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
1241
1242 typedef enum {
1243 CACHEMODE_NONE = 0, ///< no caching
1244 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1245 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1246 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1247 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
1248 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
1249 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1250 } cache_mode_t;
1251 static const char *get_cache_mode_name(cache_mode_t m) {
1252 switch (m) {
1253 case CACHEMODE_NONE: return "none";
1254 case CACHEMODE_WRITEBACK: return "writeback";
1255 case CACHEMODE_FORWARD: return "forward";
1256 case CACHEMODE_READONLY: return "readonly";
1257 case CACHEMODE_READFORWARD: return "readforward";
1258 case CACHEMODE_READPROXY: return "readproxy";
1259 case CACHEMODE_PROXY: return "proxy";
1260 default: return "unknown";
1261 }
1262 }
1263 static cache_mode_t get_cache_mode_from_str(const string& s) {
1264 if (s == "none")
1265 return CACHEMODE_NONE;
1266 if (s == "writeback")
1267 return CACHEMODE_WRITEBACK;
1268 if (s == "forward")
1269 return CACHEMODE_FORWARD;
1270 if (s == "readonly")
1271 return CACHEMODE_READONLY;
1272 if (s == "readforward")
1273 return CACHEMODE_READFORWARD;
1274 if (s == "readproxy")
1275 return CACHEMODE_READPROXY;
1276 if (s == "proxy")
1277 return CACHEMODE_PROXY;
1278 return (cache_mode_t)-1;
1279 }
1280 const char *get_cache_mode_name() const {
1281 return get_cache_mode_name(cache_mode);
1282 }
1283 bool cache_mode_requires_hit_set() const {
1284 switch (cache_mode) {
1285 case CACHEMODE_NONE:
1286 case CACHEMODE_FORWARD:
1287 case CACHEMODE_READONLY:
1288 case CACHEMODE_PROXY:
1289 return false;
1290 case CACHEMODE_WRITEBACK:
1291 case CACHEMODE_READFORWARD:
1292 case CACHEMODE_READPROXY:
1293 return true;
1294 default:
1295 ceph_abort_msg("implement me");
1296 }
1297 }
1298
1299 enum {
1300 PG_AUTOSCALE_MODE_OFF = 0,
1301 PG_AUTOSCALE_MODE_WARN = 1,
1302 PG_AUTOSCALE_MODE_ON = 2,
1303 };
1304 static const char *get_pg_autoscale_mode_name(int m) {
1305 switch (m) {
1306 case PG_AUTOSCALE_MODE_OFF: return "off";
1307 case PG_AUTOSCALE_MODE_ON: return "on";
1308 case PG_AUTOSCALE_MODE_WARN: return "warn";
1309 default: return "???";
1310 }
1311 }
1312 static int get_pg_autoscale_mode_by_name(const string& m) {
1313 if (m == "off") {
1314 return PG_AUTOSCALE_MODE_OFF;
1315 }
1316 if (m == "warn") {
1317 return PG_AUTOSCALE_MODE_WARN;
1318 }
1319 if (m == "on") {
1320 return PG_AUTOSCALE_MODE_ON;
1321 }
1322 return -1;
1323 }
1324
1325 utime_t create_time;
1326 uint64_t flags; ///< FLAG_*
1327 __u8 type; ///< TYPE_*
1328 __u8 size, min_size; ///< number of osds in each pg
1329 __u8 crush_rule; ///< crush placement rule
1330 __u8 object_hash; ///< hash mapping object name to ps
1331 __u8 pg_autoscale_mode; ///< PG_AUTOSCALE_MODE_
1332private:
1333 __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
1334 __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
1335 __u32 pg_num_target = 0; ///< pg_num we should converge toward
1336 __u32 pgp_num_target = 0; ///< pgp_num we should converge toward
1337
1338public:
1339 map<string,string> properties; ///< OBSOLETE
1340 string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1341 epoch_t last_change; ///< most recent epoch changed, excluding snapshot changes
1342
1343 /// last epoch that forced clients to resend
1344 epoch_t last_force_op_resend = 0;
1345 /// last epoch that forced clients to resend (pre-nautilus clients only)
1346 epoch_t last_force_op_resend_prenautilus = 0;
1347 /// last epoch that forced clients to resend (pre-luminous clients only)
1348 epoch_t last_force_op_resend_preluminous = 0;
1349
1350 /// metadata for the most recent PG merge
1351 pg_merge_meta_t last_pg_merge_meta;
1352
1353 snapid_t snap_seq; ///< seq for per-pool snapshot
1354 epoch_t snap_epoch; ///< osdmap epoch of last snap
1355 uint64_t auid; ///< who owns the pg
1356
1357 uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
1358 uint64_t quota_max_objects; ///< maximum number of objects for this pool
1359
1360 /*
1361 * Pool snaps (global to this pool). These define a SnapContext for
1362 * the pool, unless the client manually specifies an alternate
1363 * context.
1364 */
1365 map<snapid_t, pool_snap_info_t> snaps;
1366 /*
1367 * Alternatively, if we are defining non-pool snaps (e.g. via the
1368 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1369 * used). Snaps and removed_snaps are to be used exclusive of each
1370 * other!
1371 */
1372 interval_set<snapid_t> removed_snaps;
1373
1374 unsigned pg_num_mask, pgp_num_mask;
1375
1376 set<uint64_t> tiers; ///< pools that are tiers of us
1377 int64_t tier_of; ///< pool for which we are a tier
1378 // Note that write wins for read+write ops
1379 int64_t read_tier; ///< pool/tier for objecter to direct reads to
1380 int64_t write_tier; ///< pool/tier for objecter to direct writes to
1381 cache_mode_t cache_mode; ///< cache pool mode
1382
1383 bool is_tier() const { return tier_of >= 0; }
1384 bool has_tiers() const { return !tiers.empty(); }
1385 void clear_tier() {
1386 tier_of = -1;
1387 clear_read_tier();
1388 clear_write_tier();
1389 clear_tier_tunables();
1390 }
1391 bool has_read_tier() const { return read_tier >= 0; }
1392 void clear_read_tier() { read_tier = -1; }
1393 bool has_write_tier() const { return write_tier >= 0; }
1394 void clear_write_tier() { write_tier = -1; }
1395 void clear_tier_tunables() {
1396 if (cache_mode != CACHEMODE_NONE)
1397 flags |= FLAG_INCOMPLETE_CLONES;
1398 cache_mode = CACHEMODE_NONE;
1399
1400 target_max_bytes = 0;
1401 target_max_objects = 0;
1402 cache_target_dirty_ratio_micro = 0;
1403 cache_target_dirty_high_ratio_micro = 0;
1404 cache_target_full_ratio_micro = 0;
1405 hit_set_params = HitSet::Params();
1406 hit_set_period = 0;
1407 hit_set_count = 0;
1408 hit_set_grade_decay_rate = 0;
1409 hit_set_search_last_n = 0;
1410 grade_table.resize(0);
1411 }
1412
1413 uint64_t target_max_bytes; ///< tiering: target max pool size
1414 uint64_t target_max_objects; ///< tiering: target max pool size
1415
1416 uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
1417 uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush with high speed
1418 uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest
1419
1420 uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush
1421 uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict
1422
1423 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1424 uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
1425 uint32_t hit_set_count; ///< number of periods to retain
1426 bool use_gmt_hitset; ///< use gmt to name the hitset archive object
1427 uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
1428 uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
1429 uint32_t hit_set_grade_decay_rate; ///< current hit_set has highest priority for object
1430 ///< temperature counting; each subsequent hit_set's priority
1431 ///< decays by this rate relative to the previous hit_set
1432 uint32_t hit_set_search_last_n; ///< accumulate at most N hit_sets for temperature
1433
1434 uint32_t stripe_width; ///< erasure coded stripe size in bytes
1435
1436 uint64_t expected_num_objects; ///< expected number of objects on this pool; a value of 0
1437 ///< indicates the user did not specify an expected value
1438 bool fast_read; ///< whether turn on fast read on the pool or not
1439
1440 pool_opts_t opts; ///< options
1441
1442 typedef enum {
1443 TYPE_FINGERPRINT_NONE = 0,
1444 TYPE_FINGERPRINT_SHA1 = 1,
1445 } fingerprint_t;
1446 static fingerprint_t get_fingerprint_from_str(const string& s) {
1447 if (s == "none")
1448 return TYPE_FINGERPRINT_NONE;
1449 if (s == "sha1")
1450 return TYPE_FINGERPRINT_SHA1;
1451 return (fingerprint_t)-1;
1452 }
1453 const fingerprint_t get_fingerprint_type() const {
1454 string fp_str;
1455 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1456 return get_fingerprint_from_str(fp_str);
1457 }
1458 const char *get_fingerprint_name() const {
1459 string fp_str;
1460 fingerprint_t fp_t;
1461 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1462 fp_t = get_fingerprint_from_str(fp_str);
1463 return get_fingerprint_name(fp_t);
1464 }
1465 static const char *get_fingerprint_name(fingerprint_t m) {
1466 switch (m) {
1467 case TYPE_FINGERPRINT_NONE: return "none";
1468 case TYPE_FINGERPRINT_SHA1: return "sha1";
1469 default: return "unknown";
1470 }
1471 }
1472
1473 /// application -> key/value metadata
1474 map<string, std::map<string, string>> application_metadata;
1475
1476private:
1477 vector<uint32_t> grade_table;
1478
1479public:
1480 uint32_t get_grade(unsigned i) const {
1481 if (grade_table.size() <= i)
1482 return 0;
1483 return grade_table[i];
1484 }
1485 void calc_grade_table() {
1486 unsigned v = 1000000;
1487 grade_table.resize(hit_set_count);
1488 for (unsigned i = 0; i < hit_set_count; i++) {
1489 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1490 grade_table[i] = v;
1491 }
1492 }
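// Worked example (illustrative, not part of the original header): with
// hit_set_count = 4 and hit_set_grade_decay_rate = 20, the loop above computes
//
//   grade_table = { 800000, 640000, 512000, 409600 }
//
// i.e. the first entry is already decayed once and each older HitSet carries
// 80% of the weight of the one before it.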
1493
1494 pg_pool_t()
1495 : flags(0), type(0), size(0), min_size(0),
1496 crush_rule(0), object_hash(0),
1497 last_change(0),
1498 snap_seq(0), snap_epoch(0),
1499 auid(0),
1500 quota_max_bytes(0), quota_max_objects(0),
1501 pg_num_mask(0), pgp_num_mask(0),
1502 tier_of(-1), read_tier(-1), write_tier(-1),
1503 cache_mode(CACHEMODE_NONE),
1504 target_max_bytes(0), target_max_objects(0),
1505 cache_target_dirty_ratio_micro(0),
1506 cache_target_dirty_high_ratio_micro(0),
1507 cache_target_full_ratio_micro(0),
1508 cache_min_flush_age(0),
1509 cache_min_evict_age(0),
1510 hit_set_params(),
1511 hit_set_period(0),
1512 hit_set_count(0),
1513 use_gmt_hitset(true),
1514 min_read_recency_for_promote(0),
1515 min_write_recency_for_promote(0),
1516 hit_set_grade_decay_rate(0),
1517 hit_set_search_last_n(0),
1518 stripe_width(0),
1519 expected_num_objects(0),
1520 fast_read(false),
1521 opts()
1522 { }
1523
1524 void dump(Formatter *f) const;
1525
1526 const utime_t &get_create_time() const { return create_time; }
1527 uint64_t get_flags() const { return flags; }
1528 bool has_flag(uint64_t f) const { return flags & f; }
1529 void set_flag(uint64_t f) { flags |= f; }
1530 void unset_flag(uint64_t f) { flags &= ~f; }
1531
1532 bool require_rollback() const {
1533 return is_erasure();
1534 }
1535
1536 /// true if incomplete clones may be present
1537 bool allow_incomplete_clones() const {
1538 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1539 }
1540
1541 unsigned get_type() const { return type; }
1542 unsigned get_size() const { return size; }
1543 unsigned get_min_size() const { return min_size; }
1544 int get_crush_rule() const { return crush_rule; }
1545 int get_object_hash() const { return object_hash; }
1546 const char *get_object_hash_name() const {
1547 return ceph_str_hash_name(get_object_hash());
1548 }
1549 epoch_t get_last_change() const { return last_change; }
1550 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
1551 epoch_t get_last_force_op_resend_prenautilus() const {
1552 return last_force_op_resend_prenautilus;
1553 }
1554 epoch_t get_last_force_op_resend_preluminous() const {
1555 return last_force_op_resend_preluminous;
1556 }
1557 epoch_t get_snap_epoch() const { return snap_epoch; }
1558 snapid_t get_snap_seq() const { return snap_seq; }
1559 uint64_t get_auid() const { return auid; }
1560
1561 void set_snap_seq(snapid_t s) { snap_seq = s; }
1562 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1563
1564 void set_stripe_width(uint32_t s) { stripe_width = s; }
1565 uint32_t get_stripe_width() const { return stripe_width; }
1566
1567 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1568 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1569
1570 bool supports_omap() const {
1571 return !(get_type() == TYPE_ERASURE);
1572 }
1573
1574 bool requires_aligned_append() const {
1575 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1576 }
1577 uint64_t required_alignment() const { return stripe_width; }
1578
1579 bool allows_ecoverwrites() const {
1580 return has_flag(FLAG_EC_OVERWRITES);
1581 }
1582
1583 bool can_shift_osds() const {
1584 switch (get_type()) {
1585 case TYPE_REPLICATED:
1586 return true;
1587 case TYPE_ERASURE:
1588 return false;
1589 default:
1590 ceph_abort_msg("unhandled pool type");
1591 }
1592 }
1593
1594 unsigned get_pg_num() const { return pg_num; }
1595 unsigned get_pgp_num() const { return pgp_num; }
1596 unsigned get_pg_num_target() const { return pg_num_target; }
1597 unsigned get_pgp_num_target() const { return pgp_num_target; }
1598 unsigned get_pg_num_pending() const { return pg_num_pending; }
1599
1600 unsigned get_pg_num_mask() const { return pg_num_mask; }
1601 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1602
1603 // if pg_num is not a power of two, pgs are not equally sized.
1604 // return, for a given pg, the fraction (denominator) of the total
1605 // pool size that it represents.
1606 unsigned get_pg_num_divisor(pg_t pgid) const;
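// Worked example (illustrative; assumes the usual low-seed-first split order):
// with pg_num = 12 the pool is effectively part-way through a split from 8
// toward 16 PGs.  Seeds 0-3 have already split (producing 8-11), so PGs 0-3
// and 8-11 each cover 1/16 of the hash space (divisor 16) while PGs 4-7 still
// cover 1/8 (divisor 8).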
1607
1608 bool is_pending_merge(pg_t pgid, bool *target) const;
1609
1610 void set_pg_num(int p) {
1611 pg_num = p;
1612 pg_num_pending = p;
1613 calc_pg_masks();
1614 }
1615 void set_pgp_num(int p) {
1616 pgp_num = p;
1617 calc_pg_masks();
1618 }
1619 void set_pg_num_pending(int p) {
1620 pg_num_pending = p;
1621 calc_pg_masks();
1622 }
1623 void set_pg_num_target(int p) {
1624 pg_num_target = p;
1625 }
1626 void set_pgp_num_target(int p) {
1627 pgp_num_target = p;
1628 }
1629 void dec_pg_num(pg_t source_pgid,
1630 epoch_t ready_epoch,
1631 eversion_t source_version,
1632 eversion_t target_version,
1633 epoch_t last_epoch_started,
1634 epoch_t last_epoch_clean) {
1635 --pg_num;
1636 last_pg_merge_meta.source_pgid = source_pgid;
1637 last_pg_merge_meta.ready_epoch = ready_epoch;
1638 last_pg_merge_meta.source_version = source_version;
1639 last_pg_merge_meta.target_version = target_version;
1640 last_pg_merge_meta.last_epoch_started = last_epoch_started;
1641 last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
1642 calc_pg_masks();
1643 }
1644
1645 void set_quota_max_bytes(uint64_t m) {
1646 quota_max_bytes = m;
1647 }
1648 uint64_t get_quota_max_bytes() {
1649 return quota_max_bytes;
1650 }
1651
1652 void set_quota_max_objects(uint64_t m) {
1653 quota_max_objects = m;
1654 }
1655 uint64_t get_quota_max_objects() {
1656 return quota_max_objects;
1657 }
1658
1659 void set_last_force_op_resend(uint64_t t) {
1660 last_force_op_resend = t;
1661 last_force_op_resend_prenautilus = t;
1662 last_force_op_resend_preluminous = t;
1663 }
1664
1665 void calc_pg_masks();
1666
1667 /*
1668 * we have two snap modes:
1669 * - pool global snaps
1670 * - snap existence/non-existence defined by snaps[] and snap_seq
1671 * - user managed snaps
1672 * - removal governed by removed_snaps
1673 *
1674 * we know which mode we're using based on whether removed_snaps is empty.
1675 * If nothing has been created, both functions report false.
1676 */
1677 bool is_pool_snaps_mode() const;
1678 bool is_unmanaged_snaps_mode() const;
1679 bool is_removed_snap(snapid_t s) const;
1680
1681 /*
1682 * build set of known-removed sets from either pool snaps or
1683 * explicit removed_snaps set.
1684 */
1685 void build_removed_snaps(interval_set<snapid_t>& rs) const;
91327a77 1686 bool maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const;
7c673cae
FG
1687 snapid_t snap_exists(const char *s) const;
1688 void add_snap(const char *n, utime_t stamp);
1689 void add_unmanaged_snap(uint64_t& snapid);
1690 void remove_snap(snapid_t s);
1691 void remove_unmanaged_snap(snapid_t s);
1692
1693 SnapContext get_snap_context() const;
1694
1695 /// hash an object name+namespace key to a hash position
1696 uint32_t hash_key(const string& key, const string& ns) const;
1697
1698 /// round a hash position down to a pg num
1699 uint32_t raw_hash_to_pg(uint32_t v) const;
1700
1701 /*
1702 * map a raw pg (with full precision ps) into an actual pg, for storage
1703 */
1704 pg_t raw_pg_to_pg(pg_t pg) const;
1705
1706 /*
1707 * map raw pg (full precision ps) into a placement seed. include
1708 * pool id in that value so that different pools don't use the same
1709 * seeds.
1710 */
1711 ps_t raw_pg_to_pps(pg_t pg) const;
1712
1713 /// choose a random hash position within a pg
1714 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
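  // End-to-end mapping sketch tying the declarations above together (the
  // implementations live in osd_types.cc; oid, ns and pool_id are the
  // caller's values):
  //
  //   uint32_t h   = p.hash_key(oid, ns);                 // name+namespace -> hash
  //   pg_t raw     = pg_t(p.raw_hash_to_pg(h), pool_id);  // full-precision ps
  //   pg_t actual  = p.raw_pg_to_pg(raw);                 // mask ps down to pg_num
  //   ps_t seed    = p.raw_pg_to_pps(raw);                // placement seed for CRUSH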
1715
1716 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 1717 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1718
1719 static void generate_test_instances(list<pg_pool_t*>& o);
1720};
1721WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1722
1723ostream& operator<<(ostream& out, const pg_pool_t& p);
1724
1725
1726/**
1727 * a summation of object stats
1728 *
1729 * This is just a container for object stats; it does not know what the
1730 * stats are for.
1731 *
1732 * If you add members to object_stat_sum_t, make sure there is no padding
1733 * among these members.
1734 * You should also update the padding_check function.
1735 */
1736struct object_stat_sum_t {
1737 /**************************************************************************
1738 * WARNING: be sure to update operator==, floor, and split when
1739 * adding/removing fields!
1740 **************************************************************************/
1741 int64_t num_bytes; // in bytes
1742 int64_t num_objects;
1743 int64_t num_object_clones;
1744 int64_t num_object_copies; // num_objects * num_replicas
1745 int64_t num_objects_missing_on_primary;
1746 int64_t num_objects_degraded;
1747 int64_t num_objects_unfound;
1748 int64_t num_rd;
1749 int64_t num_rd_kb;
1750 int64_t num_wr;
1751 int64_t num_wr_kb;
1752 int64_t num_scrub_errors; // total deep and shallow scrub errors
1753 int64_t num_objects_recovered;
1754 int64_t num_bytes_recovered;
1755 int64_t num_keys_recovered;
1756 int64_t num_shallow_scrub_errors;
1757 int64_t num_deep_scrub_errors;
1758 int64_t num_objects_dirty;
1759 int64_t num_whiteouts;
1760 int64_t num_objects_omap;
1761 int64_t num_objects_hit_set_archive;
1762 int64_t num_objects_misplaced;
1763 int64_t num_bytes_hit_set_archive;
1764 int64_t num_flush;
1765 int64_t num_flush_kb;
1766 int64_t num_evict;
1767 int64_t num_evict_kb;
1768 int64_t num_promote;
1769 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1770 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1771 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1772 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1773 int64_t num_objects_pinned;
1774 int64_t num_objects_missing;
1775 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
28e407b8 1776 int64_t num_large_omap_objects = 0;
11fdf7f2
TL
1777 int64_t num_objects_manifest = 0;
1778 int64_t num_omap_bytes = 0;
1779 int64_t num_omap_keys = 0;
1780 int64_t num_objects_repaired = 0;
7c673cae
FG
1781
1782 object_stat_sum_t()
1783 : num_bytes(0),
1784 num_objects(0), num_object_clones(0), num_object_copies(0),
1785 num_objects_missing_on_primary(0), num_objects_degraded(0),
1786 num_objects_unfound(0),
1787 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1788 num_scrub_errors(0),
1789 num_objects_recovered(0),
1790 num_bytes_recovered(0),
1791 num_keys_recovered(0),
1792 num_shallow_scrub_errors(0),
1793 num_deep_scrub_errors(0),
1794 num_objects_dirty(0),
1795 num_whiteouts(0),
1796 num_objects_omap(0),
1797 num_objects_hit_set_archive(0),
1798 num_objects_misplaced(0),
1799 num_bytes_hit_set_archive(0),
1800 num_flush(0),
1801 num_flush_kb(0),
1802 num_evict(0),
1803 num_evict_kb(0),
1804 num_promote(0),
1805 num_flush_mode_high(0), num_flush_mode_low(0),
1806 num_evict_mode_some(0), num_evict_mode_full(0),
1807 num_objects_pinned(0),
1808 num_objects_missing(0),
1809 num_legacy_snapsets(0)
1810 {}
1811
1812 void floor(int64_t f) {
1813#define FLOOR(x) if (x < f) x = f
1814 FLOOR(num_bytes);
1815 FLOOR(num_objects);
1816 FLOOR(num_object_clones);
1817 FLOOR(num_object_copies);
1818 FLOOR(num_objects_missing_on_primary);
1819 FLOOR(num_objects_missing);
1820 FLOOR(num_objects_degraded);
1821 FLOOR(num_objects_misplaced);
1822 FLOOR(num_objects_unfound);
1823 FLOOR(num_rd);
1824 FLOOR(num_rd_kb);
1825 FLOOR(num_wr);
1826 FLOOR(num_wr_kb);
28e407b8 1827 FLOOR(num_large_omap_objects);
11fdf7f2
TL
1828 FLOOR(num_objects_manifest);
1829 FLOOR(num_omap_bytes);
1830 FLOOR(num_omap_keys);
7c673cae
FG
1831 FLOOR(num_shallow_scrub_errors);
1832 FLOOR(num_deep_scrub_errors);
94b18763 1833 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
7c673cae
FG
1834 FLOOR(num_objects_recovered);
1835 FLOOR(num_bytes_recovered);
1836 FLOOR(num_keys_recovered);
1837 FLOOR(num_objects_dirty);
1838 FLOOR(num_whiteouts);
1839 FLOOR(num_objects_omap);
1840 FLOOR(num_objects_hit_set_archive);
1841 FLOOR(num_bytes_hit_set_archive);
1842 FLOOR(num_flush);
1843 FLOOR(num_flush_kb);
1844 FLOOR(num_evict);
1845 FLOOR(num_evict_kb);
1846 FLOOR(num_promote);
1847 FLOOR(num_flush_mode_high);
1848 FLOOR(num_flush_mode_low);
1849 FLOOR(num_evict_mode_some);
1850 FLOOR(num_evict_mode_full);
1851 FLOOR(num_objects_pinned);
1852 FLOOR(num_legacy_snapsets);
11fdf7f2 1853 FLOOR(num_objects_repaired);
7c673cae
FG
1854#undef FLOOR
1855 }
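  // floor() is mostly used to clamp the transient negative values that can
  // appear after a sub(); a typical (illustrative) caller does:
  //
  //   sum.sub(removed);
  //   sum.floor(0);   // never report negative object/byte counts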
1856
1857 void split(vector<object_stat_sum_t> &out) const {
1858#define SPLIT(PARAM) \
1859 for (unsigned i = 0; i < out.size(); ++i) { \
1860 out[i].PARAM = PARAM / out.size(); \
1861 if (i < (PARAM % out.size())) { \
1862 out[i].PARAM++; \
1863 } \
1864 }
1865#define SPLIT_PRESERVE_NONZERO(PARAM) \
1866 for (unsigned i = 0; i < out.size(); ++i) { \
1867 if (PARAM) \
1868 out[i].PARAM = 1 + PARAM / out.size(); \
1869 else \
1870 out[i].PARAM = 0; \
1871 }
1872
1873 SPLIT(num_bytes);
1874 SPLIT(num_objects);
1875 SPLIT(num_object_clones);
1876 SPLIT(num_object_copies);
1877 SPLIT(num_objects_missing_on_primary);
1878 SPLIT(num_objects_missing);
1879 SPLIT(num_objects_degraded);
1880 SPLIT(num_objects_misplaced);
1881 SPLIT(num_objects_unfound);
1882 SPLIT(num_rd);
1883 SPLIT(num_rd_kb);
1884 SPLIT(num_wr);
1885 SPLIT(num_wr_kb);
11fdf7f2
TL
1886 SPLIT(num_large_omap_objects);
1887 SPLIT(num_objects_manifest);
1888 SPLIT(num_omap_bytes);
1889 SPLIT(num_omap_keys);
1890 SPLIT(num_objects_repaired);
94b18763
FG
1891 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1892 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1893 for (unsigned i = 0; i < out.size(); ++i) {
1894 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1895 out[i].num_deep_scrub_errors;
1896 }
7c673cae
FG
1897 SPLIT(num_objects_recovered);
1898 SPLIT(num_bytes_recovered);
1899 SPLIT(num_keys_recovered);
1900 SPLIT(num_objects_dirty);
1901 SPLIT(num_whiteouts);
1902 SPLIT(num_objects_omap);
1903 SPLIT(num_objects_hit_set_archive);
1904 SPLIT(num_bytes_hit_set_archive);
1905 SPLIT(num_flush);
1906 SPLIT(num_flush_kb);
1907 SPLIT(num_evict);
1908 SPLIT(num_evict_kb);
1909 SPLIT(num_promote);
1910 SPLIT(num_flush_mode_high);
1911 SPLIT(num_flush_mode_low);
1912 SPLIT(num_evict_mode_some);
1913 SPLIT(num_evict_mode_full);
1914 SPLIT(num_objects_pinned);
1915 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1916#undef SPLIT
1917#undef SPLIT_PRESERVE_NONZERO
1918 }
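  // Worked example of the macros above: splitting num_objects = 10 across
  // out.size() = 4 children gives 10/4 = 2 with remainder 2, so the children
  // receive 3, 3, 2, 2 (total preserved). SPLIT_PRESERVE_NONZERO instead
  // guarantees a nonzero parent counter stays nonzero in every child, so a
  // single scrub error does not silently vanish after a PG split.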
1919
1920 void clear() {
1921 memset(this, 0, sizeof(*this));
1922 }
1923
1924 void calc_copies(int nrep) {
1925 num_object_copies = nrep * num_objects;
1926 }
1927
1928 bool is_zero() const {
1929 return mem_is_zero((char*)this, sizeof(*this));
1930 }
1931
1932 void add(const object_stat_sum_t& o);
1933 void sub(const object_stat_sum_t& o);
1934
1935 void dump(Formatter *f) const;
1936 void padding_check() {
1937 static_assert(
1938 sizeof(object_stat_sum_t) ==
1939 sizeof(num_bytes) +
1940 sizeof(num_objects) +
1941 sizeof(num_object_clones) +
1942 sizeof(num_object_copies) +
1943 sizeof(num_objects_missing_on_primary) +
1944 sizeof(num_objects_degraded) +
1945 sizeof(num_objects_unfound) +
1946 sizeof(num_rd) +
1947 sizeof(num_rd_kb) +
1948 sizeof(num_wr) +
1949 sizeof(num_wr_kb) +
1950 sizeof(num_scrub_errors) +
28e407b8 1951 sizeof(num_large_omap_objects) +
11fdf7f2
TL
1952 sizeof(num_objects_manifest) +
1953 sizeof(num_omap_bytes) +
1954 sizeof(num_omap_keys) +
1955 sizeof(num_objects_repaired) +
7c673cae
FG
1956 sizeof(num_objects_recovered) +
1957 sizeof(num_bytes_recovered) +
1958 sizeof(num_keys_recovered) +
1959 sizeof(num_shallow_scrub_errors) +
1960 sizeof(num_deep_scrub_errors) +
1961 sizeof(num_objects_dirty) +
1962 sizeof(num_whiteouts) +
1963 sizeof(num_objects_omap) +
1964 sizeof(num_objects_hit_set_archive) +
1965 sizeof(num_objects_misplaced) +
1966 sizeof(num_bytes_hit_set_archive) +
1967 sizeof(num_flush) +
1968 sizeof(num_flush_kb) +
1969 sizeof(num_evict) +
1970 sizeof(num_evict_kb) +
1971 sizeof(num_promote) +
1972 sizeof(num_flush_mode_high) +
1973 sizeof(num_flush_mode_low) +
1974 sizeof(num_evict_mode_some) +
1975 sizeof(num_evict_mode_full) +
1976 sizeof(num_objects_pinned) +
1977 sizeof(num_objects_missing) +
1978 sizeof(num_legacy_snapsets)
1979 ,
1980 "object_stat_sum_t have padding");
1981 }
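  // Why the padding check matters: clear() memsets the whole struct and
  // is_zero() scans its raw bytes with mem_is_zero(), so compiler-inserted
  // padding would make those byte-wise operations unreliable. The
  // static_assert holds exactly when sizeof(object_stat_sum_t) equals the
  // sum of its members' sizes, i.e. when there is no padding.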
1982 void encode(bufferlist& bl) const;
11fdf7f2 1983 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1984 static void generate_test_instances(list<object_stat_sum_t*>& o);
1985};
1986WRITE_CLASS_ENCODER(object_stat_sum_t)
1987
1988bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1989
1990/**
1991 * a collection of object stat sums
1992 *
1993 * This is a collection of stat sums over different categories.
1994 */
1995struct object_stat_collection_t {
1996 /**************************************************************************
1997 * WARNING: be sure to update the operator== when adding/removing fields! *
1998 **************************************************************************/
1999 object_stat_sum_t sum;
2000
2001 void calc_copies(int nrep) {
2002 sum.calc_copies(nrep);
2003 }
2004
2005 void dump(Formatter *f) const;
2006 void encode(bufferlist& bl) const;
11fdf7f2 2007 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
2008 static void generate_test_instances(list<object_stat_collection_t*>& o);
2009
2010 bool is_zero() const {
2011 return sum.is_zero();
2012 }
2013
2014 void clear() {
2015 sum.clear();
2016 }
2017
2018 void floor(int64_t f) {
2019 sum.floor(f);
2020 }
2021
2022 void add(const object_stat_sum_t& o) {
2023 sum.add(o);
2024 }
2025
2026 void add(const object_stat_collection_t& o) {
2027 sum.add(o.sum);
2028 }
2029 void sub(const object_stat_collection_t& o) {
2030 sum.sub(o.sum);
2031 }
2032};
2033WRITE_CLASS_ENCODER(object_stat_collection_t)
2034
2035inline bool operator==(const object_stat_collection_t& l,
2036 const object_stat_collection_t& r) {
2037 return l.sum == r.sum;
2038}
2039
2040
2041/** pg_stat
2042 * aggregate stats for a single PG.
2043 */
2044struct pg_stat_t {
2045 /**************************************************************************
2046 * WARNING: be sure to update the operator== when adding/removing fields! *
2047 **************************************************************************/
2048 eversion_t version;
2049 version_t reported_seq; // sequence number
2050 epoch_t reported_epoch; // epoch of this report
11fdf7f2 2051 uint64_t state;
7c673cae
FG
2052 utime_t last_fresh; // last reported
2053 utime_t last_change; // new state != previous state
2054 utime_t last_active; // state & PG_STATE_ACTIVE
2055 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2056 utime_t last_clean; // state & PG_STATE_CLEAN
2057 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2058 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2059 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2060
2061 eversion_t log_start; // (log_start,version]
2062 eversion_t ondisk_log_start; // there may be more on disk
2063
2064 epoch_t created;
2065 epoch_t last_epoch_clean;
2066 pg_t parent;
2067 __u32 parent_split_bits;
2068
2069 eversion_t last_scrub;
2070 eversion_t last_deep_scrub;
2071 utime_t last_scrub_stamp;
2072 utime_t last_deep_scrub_stamp;
2073 utime_t last_clean_scrub_stamp;
2074
2075 object_stat_collection_t stats;
2076
2077 int64_t log_size;
2078 int64_t ondisk_log_size; // >= active_log_size
2079
2080 vector<int32_t> up, acting;
2081 epoch_t mapping_epoch;
2082
2083 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
2084
11fdf7f2
TL
2085 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2086
7c673cae
FG
2087 utime_t last_became_active;
2088 utime_t last_became_peered;
2089
2090 /// up, acting primaries
2091 int32_t up_primary;
2092 int32_t acting_primary;
2093
b32b8144
FG
2094 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2095 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2096 uint32_t snaptrimq_len;
2097
7c673cae
FG
2098 bool stats_invalid:1;
2099 /// true if num_objects_dirty is not accurate (because it was not
2100 /// maintained starting from pool creation)
2101 bool dirty_stats_invalid:1;
2102 bool omap_stats_invalid:1;
2103 bool hitset_stats_invalid:1;
2104 bool hitset_bytes_stats_invalid:1;
2105 bool pin_stats_invalid:1;
11fdf7f2 2106 bool manifest_stats_invalid:1;
7c673cae
FG
2107
2108 pg_stat_t()
2109 : reported_seq(0),
2110 reported_epoch(0),
2111 state(0),
2112 created(0), last_epoch_clean(0),
2113 parent_split_bits(0),
2114 log_size(0), ondisk_log_size(0),
2115 mapping_epoch(0),
2116 up_primary(-1),
2117 acting_primary(-1),
b32b8144 2118 snaptrimq_len(0),
7c673cae
FG
2119 stats_invalid(false),
2120 dirty_stats_invalid(false),
2121 omap_stats_invalid(false),
2122 hitset_stats_invalid(false),
2123 hitset_bytes_stats_invalid(false),
11fdf7f2
TL
2124 pin_stats_invalid(false),
2125 manifest_stats_invalid(false)
7c673cae
FG
2126 { }
2127
2128 epoch_t get_effective_last_epoch_clean() const {
2129 if (state & PG_STATE_CLEAN) {
2130 // we are clean as of this report, and should thus take the
2131 // reported epoch
2132 return reported_epoch;
2133 } else {
2134 return last_epoch_clean;
2135 }
2136 }
2137
2138 pair<epoch_t, version_t> get_version_pair() const {
2139 return make_pair(reported_epoch, reported_seq);
2140 }
2141
2142 void floor(int64_t f) {
2143 stats.floor(f);
2144 if (log_size < f)
2145 log_size = f;
2146 if (ondisk_log_size < f)
2147 ondisk_log_size = f;
b32b8144
FG
2148 if (snaptrimq_len < f)
2149 snaptrimq_len = f;
7c673cae
FG
2150 }
2151
11fdf7f2
TL
2152 void add_sub_invalid_flags(const pg_stat_t& o) {
2153 // adding (or subtracting!) invalid stats renders our stats invalid too
2154 stats_invalid |= o.stats_invalid;
2155 dirty_stats_invalid |= o.dirty_stats_invalid;
2156 hitset_stats_invalid |= o.hitset_stats_invalid;
2157 pin_stats_invalid |= o.pin_stats_invalid;
2158 manifest_stats_invalid |= o.manifest_stats_invalid;
2159 }
7c673cae
FG
2160 void add(const pg_stat_t& o) {
2161 stats.add(o.stats);
2162 log_size += o.log_size;
2163 ondisk_log_size += o.ondisk_log_size;
11fdf7f2
TL
2164 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2165 (uint64_t)(1ull << 31));
2166 add_sub_invalid_flags(o);
7c673cae
FG
2167 }
2168 void sub(const pg_stat_t& o) {
2169 stats.sub(o.stats);
2170 log_size -= o.log_size;
2171 ondisk_log_size -= o.ondisk_log_size;
b32b8144
FG
2172 if (o.snaptrimq_len < snaptrimq_len) {
2173 snaptrimq_len -= o.snaptrimq_len;
2174 } else {
2175 snaptrimq_len = 0;
2176 }
11fdf7f2 2177 add_sub_invalid_flags(o);
7c673cae
FG
2178 }
2179
2180 bool is_acting_osd(int32_t osd, bool primary) const;
2181 void dump(Formatter *f) const;
2182 void dump_brief(Formatter *f) const;
2183 void encode(bufferlist &bl) const;
11fdf7f2 2184 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
2185 static void generate_test_instances(list<pg_stat_t*>& o);
2186};
2187WRITE_CLASS_ENCODER(pg_stat_t)
2188
2189bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2190
11fdf7f2
TL
2191/** store_statfs_t
2192 * ObjectStore full statfs information
2193 */
2194struct store_statfs_t
2195{
2196 uint64_t total = 0; ///< Total bytes
2197 uint64_t available = 0; ///< Free bytes available
2198 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2199
2200 int64_t allocated = 0; ///< Bytes allocated by the store
2201
2202 int64_t data_stored = 0; ///< Bytes actually stored by the user
2203 int64_t data_compressed = 0; ///< Bytes stored after compression
2204 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2205 int64_t data_compressed_original = 0; ///< Bytes that were compressed
2206
2207 int64_t omap_allocated = 0; ///< approx usage of omap data
2208 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2209
2210 void reset() {
2211 *this = store_statfs_t();
2212 }
2213 void floor(int64_t f) {
2214#define FLOOR(x) if (int64_t(x) < f) x = f
2215 FLOOR(total);
2216 FLOOR(available);
2217 FLOOR(internally_reserved);
2218 FLOOR(allocated);
2219 FLOOR(data_stored);
2220 FLOOR(data_compressed);
2221 FLOOR(data_compressed_allocated);
2222 FLOOR(data_compressed_original);
2223
2224 FLOOR(omap_allocated);
2225 FLOOR(internal_metadata);
2226#undef FLOOR
2227 }
2228
2229 bool operator ==(const store_statfs_t& other) const;
2230 bool is_zero() const {
2231 return *this == store_statfs_t();
2232 }
2233
2234 uint64_t get_used() const {
2235 return total - available - internally_reserved;
2236 }
2237
2238 // this includes both the actually used bytes and statfs's internally_reserved
2239 uint64_t get_used_raw() const {
2240 return total - available;
2241 }
2242
2243 float get_used_raw_ratio() const {
2244 if (total) {
2245 return (float)get_used_raw() / (float)total;
2246 } else {
2247 return 0.0;
2248 }
2249 }
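  // Numerical example (illustrative values): total = 1000 GiB,
  // available = 400 GiB, internally_reserved = 50 GiB gives
  //
  //   get_used_raw()       = 1000 - 400      = 600 GiB
  //   get_used()           = 1000 - 400 - 50 = 550 GiB
  //   get_used_raw_ratio() = 600 / 1000      = 0.6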
2250
2251 // helpers to ease legacy code porting
2252 uint64_t kb_avail() const {
2253 return available >> 10;
2254 }
2255 uint64_t kb() const {
2256 return total >> 10;
2257 }
2258 uint64_t kb_used() const {
2259 return (total - available - internally_reserved) >> 10;
2260 }
2261 uint64_t kb_used_raw() const {
2262 return get_used_raw() >> 10;
2263 }
2264
2265 uint64_t kb_used_data() const {
2266 return allocated >> 10;
2267 }
2268 uint64_t kb_used_omap() const {
2269 return omap_allocated >> 10;
2270 }
2271
2272 uint64_t kb_used_internal_metadata() const {
2273 return internal_metadata >> 10;
2274 }
2275
2276 void add(const store_statfs_t& o) {
2277 total += o.total;
2278 available += o.available;
2279 internally_reserved += o.internally_reserved;
2280 allocated += o.allocated;
2281 data_stored += o.data_stored;
2282 data_compressed += o.data_compressed;
2283 data_compressed_allocated += o.data_compressed_allocated;
2284 data_compressed_original += o.data_compressed_original;
2285 omap_allocated += o.omap_allocated;
2286 internal_metadata += o.internal_metadata;
2287 }
2288 void sub(const store_statfs_t& o) {
2289 total -= o.total;
2290 available -= o.available;
2291 internally_reserved -= o.internally_reserved;
2292 allocated -= o.allocated;
2293 data_stored -= o.data_stored;
2294 data_compressed -= o.data_compressed;
2295 data_compressed_allocated -= o.data_compressed_allocated;
2296 data_compressed_original -= o.data_compressed_original;
2297 omap_allocated -= o.omap_allocated;
2298 internal_metadata -= o.internal_metadata;
2299 }
2300 void dump(Formatter *f) const;
2301 DENC(store_statfs_t, v, p) {
2302 DENC_START(1, 1, p);
2303 denc(v.total, p);
2304 denc(v.available, p);
2305 denc(v.internally_reserved, p);
2306 denc(v.allocated, p);
2307 denc(v.data_stored, p);
2308 denc(v.data_compressed, p);
2309 denc(v.data_compressed_allocated, p);
2310 denc(v.data_compressed_original, p);
2311 denc(v.omap_allocated, p);
2312 denc(v.internal_metadata, p);
2313 DENC_FINISH(p);
2314 }
2315 static void generate_test_instances(list<store_statfs_t*>& o);
2316};
2317WRITE_CLASS_DENC(store_statfs_t)
2318
2319ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
2320
2321/** osd_stat
2322 * aggregate stats for an osd
2323 */
2324struct osd_stat_t {
2325 store_statfs_t statfs;
2326 vector<int> hb_peers;
2327 int32_t snap_trim_queue_len, num_snap_trimming;
2328 uint64_t num_shards_repaired;
2329
2330 pow2_hist_t op_queue_age_hist;
2331
2332 objectstore_perf_stat_t os_perf_stat;
2333 osd_alerts_t os_alerts;
2334
2335 epoch_t up_from = 0;
2336 uint64_t seq = 0;
2337
2338 uint32_t num_pgs = 0;
2339
2340 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2341 num_shards_repaired(0) {}
2342
2343 void add(const osd_stat_t& o) {
2344 statfs.add(o.statfs);
2345 snap_trim_queue_len += o.snap_trim_queue_len;
2346 num_snap_trimming += o.num_snap_trimming;
2347 num_shards_repaired += o.num_shards_repaired;
2348 op_queue_age_hist.add(o.op_queue_age_hist);
2349 os_perf_stat.add(o.os_perf_stat);
2350 num_pgs += o.num_pgs;
2351 for (const auto& a : o.os_alerts) {
2352 auto& target = os_alerts[a.first];
2353 for (auto& i : a.second) {
2354 target.emplace(i.first, i.second);
2355 }
2356 }
2357 }
2358 void sub(const osd_stat_t& o) {
2359 statfs.sub(o.statfs);
2360 snap_trim_queue_len -= o.snap_trim_queue_len;
2361 num_snap_trimming -= o.num_snap_trimming;
2362 num_shards_repaired -= o.num_shards_repaired;
2363 op_queue_age_hist.sub(o.op_queue_age_hist);
2364 os_perf_stat.sub(o.os_perf_stat);
2365 num_pgs -= o.num_pgs;
2366 for (const auto& a : o.os_alerts) {
2367 auto& target = os_alerts[a.first];
2368 for (auto& i : a.second) {
2369 target.erase(i.first);
2370 }
2371 if (target.empty()) {
2372 os_alerts.erase(a.first);
2373 }
2374 }
2375 }
2376 void dump(Formatter *f) const;
2377 void encode(bufferlist &bl, uint64_t features) const;
2378 void decode(bufferlist::const_iterator &bl);
2379 static void generate_test_instances(std::list<osd_stat_t*>& o);
2380};
2381WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2382
2383inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2384 return l.statfs == r.statfs &&
2385 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2386 l.num_snap_trimming == r.num_snap_trimming &&
2387 l.num_shards_repaired == r.num_shards_repaired &&
2388 l.hb_peers == r.hb_peers &&
2389 l.op_queue_age_hist == r.op_queue_age_hist &&
2390 l.os_perf_stat == r.os_perf_stat &&
2391 l.num_pgs == r.num_pgs;
2392}
2393inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2394 return !(l == r);
2395}
2396
2397inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
2398 return out << "osd_stat(" << s.statfs << ", "
2399 << "peers " << s.hb_peers
2400 << " op hist " << s.op_queue_age_hist.h
2401 << ")";
2402}
2403
7c673cae
FG
2404/*
2405 * summation over an entire pool
2406 */
2407struct pool_stat_t {
2408 object_stat_collection_t stats;
11fdf7f2 2409 store_statfs_t store_stats;
7c673cae
FG
2410 int64_t log_size;
2411 int64_t ondisk_log_size; // >= active_log_size
2412 int32_t up; ///< number of up replicas or shards
2413 int32_t acting; ///< number of acting replicas or shards
11fdf7f2 2414 int32_t num_store_stats; ///< number of store_statfs_t entries accumulated
7c673cae 2415
11fdf7f2
TL
2416 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2417 num_store_stats(0)
7c673cae
FG
2418 { }
2419
2420 void floor(int64_t f) {
2421 stats.floor(f);
11fdf7f2 2422 store_stats.floor(f);
7c673cae
FG
2423 if (log_size < f)
2424 log_size = f;
2425 if (ondisk_log_size < f)
2426 ondisk_log_size = f;
2427 if (up < f)
2428 up = f;
2429 if (acting < f)
2430 acting = f;
11fdf7f2
TL
2431 if (num_store_stats < f)
2432 num_store_stats = f;
2433 }
2434
2435 void add(const store_statfs_t& o) {
2436 store_stats.add(o);
2437 ++num_store_stats;
2438 }
2439 void sub(const store_statfs_t& o) {
2440 store_stats.sub(o);
2441 --num_store_stats;
7c673cae
FG
2442 }
2443
2444 void add(const pg_stat_t& o) {
2445 stats.add(o.stats);
2446 log_size += o.log_size;
2447 ondisk_log_size += o.ondisk_log_size;
2448 up += o.up.size();
2449 acting += o.acting.size();
2450 }
2451 void sub(const pg_stat_t& o) {
2452 stats.sub(o.stats);
2453 log_size -= o.log_size;
2454 ondisk_log_size -= o.ondisk_log_size;
2455 up -= o.up.size();
2456 acting -= o.acting.size();
2457 }
2458
2459 bool is_zero() const {
2460 return (stats.is_zero() &&
11fdf7f2 2461 store_stats.is_zero() &&
7c673cae
FG
2462 log_size == 0 &&
2463 ondisk_log_size == 0 &&
2464 up == 0 &&
11fdf7f2
TL
2465 acting == 0 &&
2466 num_store_stats == 0);
2467 }
2468
2469 // helper accessors to retrieve used/net bytes depending on the
2470 // collection method: the new per-pool objectstore report or the legacy
2471 // per-PG summation at the OSD.
2472 // In legacy mode the used and net values are the same. For the new per-pool
2473 // collection, 'used' is the amount of space ALLOCATED across all related OSDs
2474 // and 'net' is the amount of stored user data.
2475 uint64_t get_allocated_bytes() const {
2476 uint64_t allocated_bytes;
2477 if (num_store_stats) {
2478 allocated_bytes = store_stats.allocated;
2479 } else {
2480 // legacy mode, use numbers from 'stats'
2481 allocated_bytes = stats.sum.num_bytes +
2482 stats.sum.num_bytes_hit_set_archive;
2483 }
2484 // omap is not broken out per pool by nautilus bluestore
2485 allocated_bytes += stats.sum.num_omap_bytes;
2486 return allocated_bytes;
2487 }
2488 uint64_t get_user_bytes(float raw_used_rate) const {
2489 uint64_t user_bytes;
2490 if (num_store_stats) {
2491 user_bytes = raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2492 } else {
2493 // legacy mode, use numbers from 'stats'
2494 user_bytes = stats.sum.num_bytes +
2495 stats.sum.num_bytes_hit_set_archive;
2496 }
2497 // omap is not broken out per pool by nautilus bluestore
2498 user_bytes += stats.sum.num_omap_bytes;
2499 return user_bytes;
7c673cae
FG
2500 }
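  // Example of the raw_used_rate parameter (illustrative numbers): for a 3x
  // replicated pool raw_used_rate is 3.0, so 300 GiB of data_stored across
  // the OSDs reports as 100 GiB of user data; a k=4,m=2 EC pool would use
  // 1.5. The legacy branch ignores the rate because per-PG stats already
  // count logical bytes.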
2501
2502 void dump(Formatter *f) const;
2503 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 2504 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
2505 static void generate_test_instances(list<pool_stat_t*>& o);
2506};
2507WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2508
2509
2510// -----------------------------------------
2511
2512/**
2513 * pg_hit_set_info_t - information about a single recorded HitSet
2514 *
11fdf7f2 2515 * Track basic metadata about a HitSet, like the number of insertions
7c673cae
FG
2516 * and the time range it covers.
2517 */
2518struct pg_hit_set_info_t {
2519 utime_t begin, end; ///< time interval
2520 eversion_t version; ///< version this HitSet object was written
2521 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2522
2523 friend bool operator==(const pg_hit_set_info_t& l,
2524 const pg_hit_set_info_t& r) {
2525 return
2526 l.begin == r.begin &&
2527 l.end == r.end &&
2528 l.version == r.version &&
2529 l.using_gmt == r.using_gmt;
2530 }
2531
2532 explicit pg_hit_set_info_t(bool using_gmt = true)
2533 : using_gmt(using_gmt) {}
2534
2535 void encode(bufferlist &bl) const;
11fdf7f2 2536 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
2537 void dump(Formatter *f) const;
2538 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2539};
2540WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2541
2542/**
2543 * pg_hit_set_history_t - information about a history of hitsets
2544 *
2545 * Include information about the currently accumulating hit set as well
2546 * as archived/historical ones.
2547 */
2548struct pg_hit_set_history_t {
2549 eversion_t current_last_update; ///< last version inserted into current set
2550 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2551
2552 friend bool operator==(const pg_hit_set_history_t& l,
2553 const pg_hit_set_history_t& r) {
2554 return
2555 l.current_last_update == r.current_last_update &&
2556 l.history == r.history;
2557 }
2558
2559 void encode(bufferlist &bl) const;
11fdf7f2 2560 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
2561 void dump(Formatter *f) const;
2562 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2563};
2564WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2565
2566
2567// -----------------------------------------
2568
2569/**
2570 * pg_history_t - information about recent pg peering/mapping history
2571 *
2572 * This is aggressively shared between OSDs to bound the amount of past
2573 * history they need to worry about.
2574 */
2575struct pg_history_t {
31f18b77
FG
2576 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2577 epoch_t epoch_pool_created; // epoch in which *pool* was created
2578 // (note: may be pg creation epoch for
2579 // pre-luminous clusters)
7c673cae
FG
2580 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2581 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2582 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2583 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2584 epoch_t last_epoch_split; // as parent or child
7c673cae
FG
2585 epoch_t last_epoch_marked_full; // pool or cluster
2586
2587 /**
2588 * In the event of a map discontinuity, same_*_since may reflect the first
2589 * map the osd has seen in the new map sequence rather than the actual start
2590 * of the interval. This is ok since a discontinuity at epoch e means there
2591 * must have been a clean interval between e and now and that we cannot be
2592 * in the active set during the interval containing e.
2593 */
2594 epoch_t same_up_since; // same up set since
2595 epoch_t same_interval_since; // same acting AND up set since
2596 epoch_t same_primary_since; // same primary at least back through this epoch.
2597
2598 eversion_t last_scrub;
2599 eversion_t last_deep_scrub;
2600 utime_t last_scrub_stamp;
2601 utime_t last_deep_scrub_stamp;
2602 utime_t last_clean_scrub_stamp;
2603
2604 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2605 return
2606 l.epoch_created == r.epoch_created &&
31f18b77 2607 l.epoch_pool_created == r.epoch_pool_created &&
7c673cae
FG
2608 l.last_epoch_started == r.last_epoch_started &&
2609 l.last_interval_started == r.last_interval_started &&
2610 l.last_epoch_clean == r.last_epoch_clean &&
2611 l.last_interval_clean == r.last_interval_clean &&
2612 l.last_epoch_split == r.last_epoch_split &&
2613 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2614 l.same_up_since == r.same_up_since &&
2615 l.same_interval_since == r.same_interval_since &&
2616 l.same_primary_since == r.same_primary_since &&
2617 l.last_scrub == r.last_scrub &&
2618 l.last_deep_scrub == r.last_deep_scrub &&
2619 l.last_scrub_stamp == r.last_scrub_stamp &&
2620 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2621 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2622 }
2623
2624 pg_history_t()
2625 : epoch_created(0),
31f18b77 2626 epoch_pool_created(0),
7c673cae
FG
2627 last_epoch_started(0),
2628 last_interval_started(0),
2629 last_epoch_clean(0),
2630 last_interval_clean(0),
2631 last_epoch_split(0),
2632 last_epoch_marked_full(0),
2633 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2634
2635 bool merge(const pg_history_t &other) {
2636 // Here, we only update the fields which cannot be calculated from the OSDmap.
2637 bool modified = false;
2638 if (epoch_created < other.epoch_created) {
2639 epoch_created = other.epoch_created;
2640 modified = true;
2641 }
31f18b77
FG
2642 if (epoch_pool_created < other.epoch_pool_created) {
2643 // FIXME: for jewel compat only; this should either be 0 or always the
2644 // same value across all pg instances.
2645 epoch_pool_created = other.epoch_pool_created;
2646 modified = true;
2647 }
7c673cae
FG
2648 if (last_epoch_started < other.last_epoch_started) {
2649 last_epoch_started = other.last_epoch_started;
2650 modified = true;
2651 }
2652 if (last_interval_started < other.last_interval_started) {
2653 last_interval_started = other.last_interval_started;
2654 modified = true;
2655 }
2656 if (last_epoch_clean < other.last_epoch_clean) {
2657 last_epoch_clean = other.last_epoch_clean;
2658 modified = true;
2659 }
2660 if (last_interval_clean < other.last_interval_clean) {
2661 last_interval_clean = other.last_interval_clean;
2662 modified = true;
2663 }
2664 if (last_epoch_split < other.last_epoch_split) {
2665 last_epoch_split = other.last_epoch_split;
2666 modified = true;
2667 }
2668 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2669 last_epoch_marked_full = other.last_epoch_marked_full;
2670 modified = true;
2671 }
2672 if (other.last_scrub > last_scrub) {
2673 last_scrub = other.last_scrub;
2674 modified = true;
2675 }
2676 if (other.last_scrub_stamp > last_scrub_stamp) {
2677 last_scrub_stamp = other.last_scrub_stamp;
2678 modified = true;
2679 }
2680 if (other.last_deep_scrub > last_deep_scrub) {
2681 last_deep_scrub = other.last_deep_scrub;
2682 modified = true;
2683 }
2684 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2685 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2686 modified = true;
2687 }
2688 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2689 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2690 modified = true;
2691 }
2692 return modified;
2693 }
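  // merge() is a pointwise max over the fields peers may legitimately
  // disagree on; the return value tells the caller whether anything
  // advanced, e.g. (member names here are an assumption about the caller):
  //
  //   if (info.history.merge(peer_info.history))
  //     dirty_info = true;   // persist the updated history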
2694
2695 void encode(bufferlist& bl) const;
11fdf7f2 2696 void decode(bufferlist::const_iterator& p);
7c673cae
FG
2697 void dump(Formatter *f) const;
2698 static void generate_test_instances(list<pg_history_t*>& o);
2699};
2700WRITE_CLASS_ENCODER(pg_history_t)
2701
2702inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2703 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
7c673cae
FG
2704 << " lis/c " << h.last_interval_started
2705 << "/" << h.last_interval_clean
2706 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2707 << "/" << h.last_epoch_marked_full
2708 << " " << h.same_up_since
2709 << "/" << h.same_interval_since
2710 << "/" << h.same_primary_since;
2711}
2712
2713
2714/**
2715 * pg_info_t - summary of PG statistics.
2716 *
2717 * some notes:
2718 * - last_complete implies we have all objects that existed as of that
2719 * stamp, OR a newer object, OR have already applied a later delete.
2720 * - if last_complete >= log.tail, then we know the pg contents through log.head;
2721 * otherwise, we have no idea what the pg is supposed to contain.
2722 */
2723struct pg_info_t {
2724 spg_t pgid;
2725 eversion_t last_update; ///< last object version applied to store.
2726 eversion_t last_complete; ///< last version pg was complete through.
2727 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2728 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2729
2730 version_t last_user_version; ///< last user object version applied to store
2731
2732 eversion_t log_tail; ///< oldest log entry.
2733
2734 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2735 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2736
2737 interval_set<snapid_t> purged_snaps;
2738
2739 pg_stat_t stats;
2740
2741 pg_history_t history;
2742 pg_hit_set_history_t hit_set;
2743
2744 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2745 return
2746 l.pgid == r.pgid &&
2747 l.last_update == r.last_update &&
2748 l.last_complete == r.last_complete &&
2749 l.last_epoch_started == r.last_epoch_started &&
2750 l.last_interval_started == r.last_interval_started &&
2751 l.last_user_version == r.last_user_version &&
2752 l.log_tail == r.log_tail &&
2753 l.last_backfill == r.last_backfill &&
2754 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2755 l.purged_snaps == r.purged_snaps &&
2756 l.stats == r.stats &&
2757 l.history == r.history &&
2758 l.hit_set == r.hit_set;
2759 }
2760
2761 pg_info_t()
2762 : last_epoch_started(0),
2763 last_interval_started(0),
2764 last_user_version(0),
2765 last_backfill(hobject_t::get_max()),
2766 last_backfill_bitwise(false)
2767 { }
2768 // cppcheck-suppress noExplicitConstructor
2769 pg_info_t(spg_t p)
2770 : pgid(p),
2771 last_epoch_started(0),
2772 last_interval_started(0),
2773 last_user_version(0),
2774 last_backfill(hobject_t::get_max()),
2775 last_backfill_bitwise(false)
2776 { }
2777
2778 void set_last_backfill(hobject_t pos) {
2779 last_backfill = pos;
2780 last_backfill_bitwise = true;
2781 }
2782
2783 bool is_empty() const { return last_update.version == 0; }
2784 bool dne() const { return history.epoch_created == 0; }
2785
11fdf7f2 2786 bool has_missing() const { return last_complete != last_update; }
7c673cae
FG
2787 bool is_incomplete() const { return !last_backfill.is_max(); }
2788
2789 void encode(bufferlist& bl) const;
11fdf7f2 2790 void decode(bufferlist::const_iterator& p);
7c673cae 2791 void dump(Formatter *f) const;
7c673cae
FG
2792 static void generate_test_instances(list<pg_info_t*>& o);
2793};
2794WRITE_CLASS_ENCODER(pg_info_t)
2795
2796inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2797{
2798 out << pgi.pgid << "(";
2799 if (pgi.dne())
2800 out << " DNE";
2801 if (pgi.is_empty())
2802 out << " empty";
2803 else {
2804 out << " v " << pgi.last_update;
2805 if (pgi.last_complete != pgi.last_update)
2806 out << " lc " << pgi.last_complete;
2807 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2808 }
2809 if (pgi.is_incomplete())
2810 out << " lb " << pgi.last_backfill
2811 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2812 //out << " c " << pgi.epoch_created;
2813 out << " local-lis/les=" << pgi.last_interval_started
2814 << "/" << pgi.last_epoch_started;
2815 out << " n=" << pgi.stats.stats.sum.num_objects;
2816 out << " " << pgi.history
2817 << ")";
2818 return out;
2819}
2820
2821/**
2822 * pg_fast_info_t - common pg_info_t fields
2823 *
2824 * These are the fields of pg_info_t (and children) that are updated for
2825 * most IO operations.
2826 *
2827 * ** WARNING **
2828 * Because we rely on these fields to be applied to the normal
2829 * info struct, adding a new field here that is not also new in info
2830 * means that we must set an incompat OSD feature bit!
2831 */
2832struct pg_fast_info_t {
2833 eversion_t last_update;
2834 eversion_t last_complete;
2835 version_t last_user_version;
2836 struct { // pg_stat_t stats
2837 eversion_t version;
2838 version_t reported_seq;
2839 utime_t last_fresh;
2840 utime_t last_active;
2841 utime_t last_peered;
2842 utime_t last_clean;
2843 utime_t last_unstale;
2844 utime_t last_undegraded;
2845 utime_t last_fullsized;
2846 int64_t log_size; // (also ondisk_log_size, which has the same value)
2847 struct { // object_stat_collection_t stats;
2848 struct { // object_stat_sum_t sum
2849 int64_t num_bytes; // in bytes
2850 int64_t num_objects;
2851 int64_t num_object_copies;
2852 int64_t num_rd;
2853 int64_t num_rd_kb;
2854 int64_t num_wr;
2855 int64_t num_wr_kb;
2856 int64_t num_objects_dirty;
2857 } sum;
2858 } stats;
2859 } stats;
2860
2861 void populate_from(const pg_info_t& info) {
2862 last_update = info.last_update;
2863 last_complete = info.last_complete;
2864 last_user_version = info.last_user_version;
2865 stats.version = info.stats.version;
2866 stats.reported_seq = info.stats.reported_seq;
2867 stats.last_fresh = info.stats.last_fresh;
2868 stats.last_active = info.stats.last_active;
2869 stats.last_peered = info.stats.last_peered;
2870 stats.last_clean = info.stats.last_clean;
2871 stats.last_unstale = info.stats.last_unstale;
2872 stats.last_undegraded = info.stats.last_undegraded;
2873 stats.last_fullsized = info.stats.last_fullsized;
2874 stats.log_size = info.stats.log_size;
2875 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2876 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2877 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2878 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2879 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2880 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2881 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2882 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2883 }
2884
2885 bool try_apply_to(pg_info_t* info) {
2886 if (last_update <= info->last_update)
2887 return false;
2888 info->last_update = last_update;
2889 info->last_complete = last_complete;
2890 info->last_user_version = last_user_version;
2891 info->stats.version = stats.version;
2892 info->stats.reported_seq = stats.reported_seq;
2893 info->stats.last_fresh = stats.last_fresh;
2894 info->stats.last_active = stats.last_active;
2895 info->stats.last_peered = stats.last_peered;
2896 info->stats.last_clean = stats.last_clean;
2897 info->stats.last_unstale = stats.last_unstale;
2898 info->stats.last_undegraded = stats.last_undegraded;
2899 info->stats.last_fullsized = stats.last_fullsized;
2900 info->stats.log_size = stats.log_size;
2901 info->stats.ondisk_log_size = stats.log_size;
2902 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2903 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2904 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2905 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2906 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2907 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2908 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2909 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2910 return true;
2911 }
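  // Intended round trip, assembled from the two methods above (where the
  // encoded blob is stored is the OSD's business, not this struct's):
  //
  //   pg_fast_info_t fast;
  //   fast.populate_from(info);   // snapshot the hot pg_info_t fields
  //   // ... encode(fast, bl) and persist it alongside the pg ...
  //
  //   // on load, after decoding both structs:
  //   fast.try_apply_to(&info);   // no-op (returns false) unless fast is newer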
2912
2913 void encode(bufferlist& bl) const {
2914 ENCODE_START(1, 1, bl);
11fdf7f2
TL
2915 encode(last_update, bl);
2916 encode(last_complete, bl);
2917 encode(last_user_version, bl);
2918 encode(stats.version, bl);
2919 encode(stats.reported_seq, bl);
2920 encode(stats.last_fresh, bl);
2921 encode(stats.last_active, bl);
2922 encode(stats.last_peered, bl);
2923 encode(stats.last_clean, bl);
2924 encode(stats.last_unstale, bl);
2925 encode(stats.last_undegraded, bl);
2926 encode(stats.last_fullsized, bl);
2927 encode(stats.log_size, bl);
2928 encode(stats.stats.sum.num_bytes, bl);
2929 encode(stats.stats.sum.num_objects, bl);
2930 encode(stats.stats.sum.num_object_copies, bl);
2931 encode(stats.stats.sum.num_rd, bl);
2932 encode(stats.stats.sum.num_rd_kb, bl);
2933 encode(stats.stats.sum.num_wr, bl);
2934 encode(stats.stats.sum.num_wr_kb, bl);
2935 encode(stats.stats.sum.num_objects_dirty, bl);
7c673cae
FG
2936 ENCODE_FINISH(bl);
2937 }
11fdf7f2 2938 void decode(bufferlist::const_iterator& p) {
7c673cae 2939 DECODE_START(1, p);
11fdf7f2
TL
2940 decode(last_update, p);
2941 decode(last_complete, p);
2942 decode(last_user_version, p);
2943 decode(stats.version, p);
2944 decode(stats.reported_seq, p);
2945 decode(stats.last_fresh, p);
2946 decode(stats.last_active, p);
2947 decode(stats.last_peered, p);
2948 decode(stats.last_clean, p);
2949 decode(stats.last_unstale, p);
2950 decode(stats.last_undegraded, p);
2951 decode(stats.last_fullsized, p);
2952 decode(stats.log_size, p);
2953 decode(stats.stats.sum.num_bytes, p);
2954 decode(stats.stats.sum.num_objects, p);
2955 decode(stats.stats.sum.num_object_copies, p);
2956 decode(stats.stats.sum.num_rd, p);
2957 decode(stats.stats.sum.num_rd_kb, p);
2958 decode(stats.stats.sum.num_wr, p);
2959 decode(stats.stats.sum.num_wr_kb, p);
2960 decode(stats.stats.sum.num_objects_dirty, p);
7c673cae
FG
2961 DECODE_FINISH(p);
2962 }
2963};
2964WRITE_CLASS_ENCODER(pg_fast_info_t)
2965
2966
2967struct pg_notify_t {
2968 epoch_t query_epoch;
2969 epoch_t epoch_sent;
2970 pg_info_t info;
2971 shard_id_t to;
2972 shard_id_t from;
2973 pg_notify_t() :
2974 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2975 from(shard_id_t::NO_SHARD) {}
2976 pg_notify_t(
2977 shard_id_t to,
2978 shard_id_t from,
2979 epoch_t query_epoch,
2980 epoch_t epoch_sent,
2981 const pg_info_t &info)
2982 : query_epoch(query_epoch),
2983 epoch_sent(epoch_sent),
2984 info(info), to(to), from(from) {
11fdf7f2 2985 ceph_assert(from == info.pgid.shard);
7c673cae
FG
2986 }
2987 void encode(bufferlist &bl) const;
11fdf7f2 2988 void decode(bufferlist::const_iterator &p);
7c673cae
FG
2989 void dump(Formatter *f) const;
2990 static void generate_test_instances(list<pg_notify_t*> &o);
2991};
2992WRITE_CLASS_ENCODER(pg_notify_t)
2993ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2994
2995
2996class OSDMap;
2997/**
2998 * PastIntervals -- information needed to determine the PriorSet and
2999 * the might_have_unfound set
3000 */
3001class PastIntervals {
3002public:
3003 struct pg_interval_t {
3004 vector<int32_t> up, acting;
3005 epoch_t first, last;
3006 bool maybe_went_rw;
3007 int32_t primary;
3008 int32_t up_primary;
3009
3010 pg_interval_t()
3011 : first(0), last(0),
3012 maybe_went_rw(false),
3013 primary(-1),
3014 up_primary(-1)
3015 {}
3016
3017 pg_interval_t(
3018 vector<int32_t> &&up,
3019 vector<int32_t> &&acting,
3020 epoch_t first,
3021 epoch_t last,
3022 bool maybe_went_rw,
3023 int32_t primary,
3024 int32_t up_primary)
3025 : up(up), acting(acting), first(first), last(last),
3026 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3027 {}
3028
3029 void encode(bufferlist& bl) const;
11fdf7f2 3030 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
3031 void dump(Formatter *f) const;
3032 static void generate_test_instances(list<pg_interval_t*>& o);
3033 };
3034
11fdf7f2 3035 PastIntervals();
7c673cae
FG
3036 PastIntervals(PastIntervals &&rhs) = default;
3037 PastIntervals &operator=(PastIntervals &&rhs) = default;
3038
3039 PastIntervals(const PastIntervals &rhs);
3040 PastIntervals &operator=(const PastIntervals &rhs);
3041
3042 class interval_rep {
3043 public:
3044 virtual size_t size() const = 0;
3045 virtual bool empty() const = 0;
3046 virtual void clear() = 0;
3047 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
3048 virtual set<pg_shard_t> get_all_participants(
3049 bool ec_pool) const = 0;
3050 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3051 virtual unique_ptr<interval_rep> clone() const = 0;
3052 virtual ostream &print(ostream &out) const = 0;
3053 virtual void encode(bufferlist &bl) const = 0;
11fdf7f2 3054 virtual void decode(bufferlist::const_iterator &bl) = 0;
7c673cae 3055 virtual void dump(Formatter *f) const = 0;
7c673cae 3056 virtual void iterate_mayberw_back_to(
7c673cae
FG
3057 epoch_t les,
3058 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
3059
3060 virtual bool has_full_intervals() const { return false; }
3061 virtual void iterate_all_intervals(
3062 std::function<void(const pg_interval_t &)> &&f) const {
11fdf7f2
TL
3063 ceph_assert(!has_full_intervals());
3064 ceph_abort_msg("not valid for this implementation");
7c673cae 3065 }
11fdf7f2 3066 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
7c673cae
FG
3067
3068 virtual ~interval_rep() {}
3069 };
7c673cae
FG
3070 friend class pi_compact_rep;
3071private:
3072
3073 unique_ptr<interval_rep> past_intervals;
3074
11fdf7f2 3075 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
7c673cae
FG
3076
3077public:
3078 void add_interval(bool ec_pool, const pg_interval_t &interval) {
11fdf7f2 3079 ceph_assert(past_intervals);
7c673cae
FG
3080 return past_intervals->add_interval(ec_pool, interval);
3081 }
3082
7c673cae
FG
3083 void encode(bufferlist &bl) const {
3084 ENCODE_START(1, 1, bl);
3085 if (past_intervals) {
11fdf7f2
TL
3086 __u8 type = 2;
3087 encode(type, bl);
7c673cae
FG
3088 past_intervals->encode(bl);
3089 } else {
11fdf7f2 3090 encode((__u8)0, bl);
7c673cae
FG
3091 }
3092 ENCODE_FINISH(bl);
3093 }
7c673cae 3094
11fdf7f2 3095 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3096
3097 void dump(Formatter *f) const {
11fdf7f2 3098 ceph_assert(past_intervals);
7c673cae
FG
3099 past_intervals->dump(f);
3100 }
3101 static void generate_test_instances(list<PastIntervals *> & o);
3102
3103 /**
3104 * Determines whether there is an interval change
3105 */
3106 static bool is_new_interval(
3107 int old_acting_primary,
3108 int new_acting_primary,
3109 const vector<int> &old_acting,
3110 const vector<int> &new_acting,
3111 int old_up_primary,
3112 int new_up_primary,
3113 const vector<int> &old_up,
3114 const vector<int> &new_up,
3115 int old_size,
3116 int new_size,
3117 int old_min_size,
3118 int new_min_size,
3119 unsigned old_pg_num,
3120 unsigned new_pg_num,
11fdf7f2
TL
3121 unsigned old_pg_num_pending,
3122 unsigned new_pg_num_pending,
7c673cae
FG
3123 bool old_sort_bitwise,
3124 bool new_sort_bitwise,
c07f9fc5
FG
3125 bool old_recovery_deletes,
3126 bool new_recovery_deletes,
7c673cae
FG
3127 pg_t pgid
3128 );
3129
3130 /**
3131 * Determines whether there is an interval change
3132 */
3133 static bool is_new_interval(
3134 int old_acting_primary, ///< [in] primary as of lastmap
3135 int new_acting_primary, ///< [in] primary as of osdmap
3136 const vector<int> &old_acting, ///< [in] acting as of lastmap
3137 const vector<int> &new_acting, ///< [in] acting as of osdmap
3138 int old_up_primary, ///< [in] up primary of lastmap
3139 int new_up_primary, ///< [in] up primary of osdmap
3140 const vector<int> &old_up, ///< [in] up as of lastmap
3141 const vector<int> &new_up, ///< [in] up as of osdmap
11fdf7f2
TL
3142 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3143 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
7c673cae
FG
3144 pg_t pgid ///< [in] pgid for pg
3145 );
3146
3147 /**
3148 * Integrates a new map into *past_intervals, returns true
3149 * if an interval was closed out.
3150 */
3151 static bool check_new_interval(
3152 int old_acting_primary, ///< [in] primary as of lastmap
3153 int new_acting_primary, ///< [in] primary as of osdmap
3154 const vector<int> &old_acting, ///< [in] acting as of lastmap
3155 const vector<int> &new_acting, ///< [in] acting as of osdmap
3156 int old_up_primary, ///< [in] up primary of lastmap
3157 int new_up_primary, ///< [in] up primary of osdmap
3158 const vector<int> &old_up, ///< [in] up as of lastmap
3159 const vector<int> &new_up, ///< [in] up as of osdmap
3160 epoch_t same_interval_since, ///< [in] as of osdmap
3161 epoch_t last_epoch_clean, ///< [in] current
11fdf7f2
TL
3162 std::shared_ptr<const OSDMap> osdmap, ///< [in] current map
3163 std::shared_ptr<const OSDMap> lastmap, ///< [in] last map
7c673cae 3164 pg_t pgid, ///< [in] pgid for pg
11fdf7f2 3165 IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate for whether the pg could have gone active
7c673cae
FG
3166 PastIntervals *past_intervals, ///< [out] intervals
3167 ostream *out = 0 ///< [out] debug ostream
3168 );
c07f9fc5 3169
7c673cae
FG
3170 friend ostream& operator<<(ostream& out, const PastIntervals &i);
3171
3172 template <typename F>
3173 void iterate_mayberw_back_to(
7c673cae
FG
3174 epoch_t les,
3175 F &&f) const {
11fdf7f2
TL
3176 ceph_assert(past_intervals);
3177 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
7c673cae
FG
3178 }
3179 void clear() {
11fdf7f2 3180 ceph_assert(past_intervals);
7c673cae
FG
3181 past_intervals->clear();
3182 }
3183
3184 /**
3185 * Should return a value which gives an indication of the amount
3186 * of state contained
3187 */
3188 size_t size() const {
11fdf7f2 3189 ceph_assert(past_intervals);
7c673cae
FG
3190 return past_intervals->size();
3191 }
3192
3193 bool empty() const {
11fdf7f2 3194 ceph_assert(past_intervals);
7c673cae
FG
3195 return past_intervals->empty();
3196 }
3197
3198 void swap(PastIntervals &other) {
31f18b77
FG
3199 using std::swap;
3200 swap(other.past_intervals, past_intervals);
7c673cae
FG
3201 }
3202
3203 /**
3204 * Return all shards which have been in the acting set back to the
3205 * latest epoch to which we have trimmed except for pg_whoami
3206 */
3207 set<pg_shard_t> get_might_have_unfound(
3208 pg_shard_t pg_whoami,
3209 bool ec_pool) const {
11fdf7f2 3210 ceph_assert(past_intervals);
7c673cae
FG
3211 auto ret = past_intervals->get_all_participants(ec_pool);
3212 ret.erase(pg_whoami);
3213 return ret;
3214 }
3215
3216 /**
3217 * Return all shards which we might want to talk to for peering
3218 */
3219 set<pg_shard_t> get_all_probe(
3220 bool ec_pool) const {
11fdf7f2 3221 ceph_assert(past_intervals);
7c673cae
FG
3222 return past_intervals->get_all_participants(ec_pool);
3223 }
3224
3225 /* Return the set of epochs [start, end) represented by the
3226 * past_interval set.
3227 */
3228 pair<epoch_t, epoch_t> get_bounds() const {
11fdf7f2 3229 ceph_assert(past_intervals);
7c673cae
FG
3230 return past_intervals->get_bounds();
3231 }
3232
11fdf7f2
TL
3233 void adjust_start_backwards(epoch_t last_epoch_clean) {
3234 ceph_assert(past_intervals);
3235 past_intervals->adjust_start_backwards(last_epoch_clean);
3236 }
3237
7c673cae
FG
3238 enum osd_state_t {
3239 UP,
3240 DOWN,
3241 DNE,
3242 LOST
3243 };
3244 struct PriorSet {
3245 bool ec_pool = false;
11fdf7f2
TL
3246 set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3247 set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3248 map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
7c673cae 3249
11fdf7f2 3250 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
7c673cae
FG
3251 unique_ptr<IsPGRecoverablePredicate> pcontdec;
3252
3253 PriorSet() = default;
3254 PriorSet(PriorSet &&) = default;
3255 PriorSet &operator=(PriorSet &&) = default;
3256
3257 PriorSet &operator=(const PriorSet &) = delete;
3258 PriorSet(const PriorSet &) = delete;
3259
3260 bool operator==(const PriorSet &rhs) const {
3261 return (ec_pool == rhs.ec_pool) &&
3262 (probe == rhs.probe) &&
3263 (down == rhs.down) &&
3264 (blocked_by == rhs.blocked_by) &&
3265 (pg_down == rhs.pg_down);
3266 }
3267
3268 bool affected_by_map(
3269 const OSDMap &osdmap,
3270 const DoutPrefixProvider *dpp) const;
3271
3272 // For verifying tests
3273 PriorSet(
3274 bool ec_pool,
3275 set<pg_shard_t> probe,
3276 set<int> down,
3277 map<int, epoch_t> blocked_by,
3278 bool pg_down,
3279 IsPGRecoverablePredicate *pcontdec)
3280 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3281 pg_down(pg_down), pcontdec(pcontdec) {}
3282
3283 private:
3284 template <typename F>
3285 PriorSet(
3286 const PastIntervals &past_intervals,
3287 bool ec_pool,
3288 epoch_t last_epoch_started,
3289 IsPGRecoverablePredicate *c,
3290 F f,
3291 const vector<int> &up,
3292 const vector<int> &acting,
3293 const DoutPrefixProvider *dpp);
3294
3295 friend class PastIntervals;
3296 };
3297
7c673cae
FG
3298 template <typename... Args>
3299 PriorSet get_prior_set(Args&&... args) const {
3300 return PriorSet(*this, std::forward<Args>(args)...);
3301 }
3302};
3303WRITE_CLASS_ENCODER(PastIntervals)
3304
3305ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
3306ostream& operator<<(ostream& out, const PastIntervals &i);
3307ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
3308
3309template <typename F>
3310PastIntervals::PriorSet::PriorSet(
3311 const PastIntervals &past_intervals,
3312 bool ec_pool,
3313 epoch_t last_epoch_started,
3314 IsPGRecoverablePredicate *c,
3315 F f,
3316 const vector<int> &up,
3317 const vector<int> &acting,
3318 const DoutPrefixProvider *dpp)
3319 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3320{
3321 /*
3322 * We have to be careful to deal gracefully with situations like the
3323 * following. Say we have a power outage or something that takes out both
3324 * OSDs, but the monitor doesn't mark them down in the same epoch.
3325 * The history may look like
3326 *
3327 * 1: A B
3328 * 2: B
3329 * 3: let's say B dies for good, too (say, from the power spike)
3330 * 4: A
3331 *
3332 * which makes it look like B may have applied updates to the PG
3333 * that we need in order to proceed. This sucks...
3334 *
3335 * To minimize the risk of this happening, we CANNOT go active if
3336 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3337 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3338 * Then, we have something like
3339 *
3340 * 1: A B
3341 * 2: B up_thru[B]=0
3342 * 3:
3343 * 4: A
3344 *
3345 * -> we can ignore B, bc it couldn't have gone active (up_thru
3346 * still 0).
3347 *
3348 * or,
3349 *
3350 * 1: A B
3351 * 2: B up_thru[B]=0
3352 * 3: B up_thru[B]=2
3353 * 4:
3354 * 5: A
3355 *
3356 * -> we must wait for B, bc it was alive through 2, and could have
3357 * written to the pg.
3358 *
3359 * If B is really dead, then an administrator will need to manually
3360 * intervene by marking the OSD as "lost."
3361 */
3362
3363 // Include current acting and up nodes... not because they may
3364 // contain old data (this interval hasn't gone active, obviously),
3365 // but because we want their pg_info to inform choose_acting(), and
3366 // so that we know what they do/do not have explicitly before
3367 // sending them any new info/logs/whatever.
3368 for (unsigned i = 0; i < acting.size(); i++) {
3369 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3370 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3371 }
3372 // It may be possible to exclude the up nodes, but let's keep them in
3373 // there for now.
3374 for (unsigned i = 0; i < up.size(); i++) {
3375 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3376 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3377 }
3378
3379 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3380 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3381 for (auto &&i: all_probe) {
3382 switch (f(0, i.osd, nullptr)) {
3383 case UP: {
3384 probe.insert(i);
3385 break;
3386 }
3387 case DNE:
3388 case LOST:
3389 case DOWN: {
3390 down.insert(i.osd);
3391 break;
3392 }
3393 }
3394 }
3395
3396 past_intervals.iterate_mayberw_back_to(
7c673cae
FG
3397 last_epoch_started,
3398 [&](epoch_t start, const set<pg_shard_t> &acting) {
3399 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3400 << ", acting: " << acting << dendl;
3401
3402 // look at candidate osds during this interval. each falls into
3403 // one of three categories: up, down (but potentially
3404 // interesting), or lost (down, but we won't wait for it).
3405 set<pg_shard_t> up_now;
3406 map<int, epoch_t> candidate_blocked_by;
3407 // any candidates down now (that might have useful data)
3408 bool any_down_now = false;
3409
3410 // consider ACTING osds
3411 for (auto &&so: acting) {
3412 epoch_t lost_at = 0;
3413 switch (f(start, so.osd, &lost_at)) {
3414 case UP: {
3415 // include past acting osds if they are up.
3416 up_now.insert(so);
3417 break;
3418 }
3419 case DNE: {
3420 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3421 << " no longer exists" << dendl;
3422 break;
3423 }
3424 case LOST: {
3425 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3426 << " is down, but lost_at " << lost_at << dendl;
3427 up_now.insert(so);
3428 break;
3429 }
3430 case DOWN: {
3431 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3432 << " is down" << dendl;
3433 candidate_blocked_by[so.osd] = lost_at;
3434 any_down_now = true;
3435 break;
3436 }
3437 }
3438 }
3439
3440 // if not enough osds survived this interval, and we may have gone rw,
3441 // then we need to wait for one of those osds to recover to
3442 // ensure that we haven't lost any information.
3443 if (!(*pcontdec)(up_now) && any_down_now) {
3444 // fixme: how do we identify a "clean" shutdown anyway?
3445 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3446 << " insufficient up; including down osds" << dendl;
11fdf7f2 3447 ceph_assert(!candidate_blocked_by.empty());
7c673cae
FG
3448 pg_down = true;
3449 blocked_by.insert(
3450 candidate_blocked_by.begin(),
3451 candidate_blocked_by.end());
3452 }
3453 });
3454
3455 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3456 << " down " << down
3457 << " blocked_by " << blocked_by
3458 << (pg_down ? " pg_down":"")
3459 << dendl;
3460}
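// Illustrative sketch (not part of the original header): the classifier 'f'
// handed to get_prior_set() maps (epoch, osd, &lost_at) to one of UP, DOWN,
// DNE or LOST, optionally reporting the epoch at which a down osd was marked
// lost. The lambda body and surrounding variables are hypothetical; real
// callers consult the current OSDMap.
//
//   auto classify = [&](epoch_t start, int osd, epoch_t *lost_at) {
//     if (lost_at)
//       *lost_at = 0;               // fill in the osd's lost_at epoch, if any
//     return PastIntervals::UP;     // or DOWN / DNE / LOST
//   };
//   IsPGRecoverablePredicate *pred = ...;  // PriorSet takes ownership of this
//   PastIntervals::PriorSet prior = past_intervals.get_prior_set(
//     ec_pool, last_epoch_started, pred, classify, up, acting, dpp);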
3461
3462/**
3463 * pg_query_t - used to ask a peer for information about a pg.
3464 *
3465 * note: if version=0, type=LOG, then we just provide our full log.
3466 */
3467struct pg_query_t {
3468 enum {
3469 INFO = 0,
3470 LOG = 1,
3471 MISSING = 4,
3472 FULLLOG = 5,
3473 };
11fdf7f2 3474 std::string_view get_type_name() const {
7c673cae
FG
3475 switch (type) {
3476 case INFO: return "info";
3477 case LOG: return "log";
3478 case MISSING: return "missing";
3479 case FULLLOG: return "fulllog";
3480 default: return "???";
3481 }
3482 }
3483
3484 __s32 type;
3485 eversion_t since;
3486 pg_history_t history;
3487 epoch_t epoch_sent;
3488 shard_id_t to;
3489 shard_id_t from;
3490
3491 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3492 from(shard_id_t::NO_SHARD) {}
3493 pg_query_t(
3494 int t,
3495 shard_id_t to,
3496 shard_id_t from,
3497 const pg_history_t& h,
3498 epoch_t epoch_sent)
3499 : type(t),
3500 history(h),
3501 epoch_sent(epoch_sent),
3502 to(to), from(from) {
11fdf7f2 3503 ceph_assert(t != LOG);
7c673cae
FG
3504 }
3505 pg_query_t(
3506 int t,
3507 shard_id_t to,
3508 shard_id_t from,
3509 eversion_t s,
3510 const pg_history_t& h,
3511 epoch_t epoch_sent)
3512 : type(t), since(s), history(h),
3513 epoch_sent(epoch_sent), to(to), from(from) {
11fdf7f2 3514 ceph_assert(t == LOG);
7c673cae
FG
3515 }
3516
3517 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 3518 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3519
3520 void dump(Formatter *f) const;
3521 static void generate_test_instances(list<pg_query_t*>& o);
3522};
3523WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3524
3525inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3526 out << "query(" << q.get_type_name() << " " << q.since;
3527 if (q.type == pg_query_t::LOG)
3528 out << " " << q.history;
11fdf7f2 3529 out << " epoch_sent " << q.epoch_sent;
7c673cae
FG
3530 out << ")";
3531 return out;
3532}
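// Illustrative sketch (not part of the original header): constructing the two
// flavours of pg_query_t. Non-LOG queries use the constructor without a
// 'since' version; LOG queries must supply one (the constructors assert
// this). The example_* helper names are hypothetical.
inline pg_query_t example_info_query(const pg_history_t& h, epoch_t sent)
{
  return pg_query_t(pg_query_t::INFO,
                    shard_id_t::NO_SHARD, shard_id_t::NO_SHARD, h, sent);
}
inline pg_query_t example_log_query(eversion_t since, const pg_history_t& h,
                                    epoch_t sent)
{
  // ask the peer for its log entries since 'since'
  return pg_query_t(pg_query_t::LOG,
                    shard_id_t::NO_SHARD, shard_id_t::NO_SHARD, since, h, sent);
}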
3533
3534class PGBackend;
3535class ObjectModDesc {
3536 bool can_local_rollback;
3537 bool rollback_info_completed;
3538
3539 // version required to decode, reflected in encode/decode version
3540 __u8 max_required_version = 1;
3541public:
3542 class Visitor {
3543 public:
3544 virtual void append(uint64_t old_offset) {}
3545 virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
3546 virtual void rmobject(version_t old_version) {}
3547 /**
3548 * Used to support the unfound_lost_delete log event: if the stashed
3549 * version exists, we unstash it, otherwise, we do nothing. This way
3550 * each replica rolls back to whatever state it had prior to the attempt
3551 * at mark unfound lost delete
3552 */
3553 virtual void try_rmobject(version_t old_version) {
3554 rmobject(old_version);
3555 }
3556 virtual void create() {}
3557 virtual void update_snaps(const set<snapid_t> &old_snaps) {}
3558 virtual void rollback_extents(
3559 version_t gen,
3560 const vector<pair<uint64_t, uint64_t> > &extents) {}
3561 virtual ~Visitor() {}
3562 };
3563 void visit(Visitor *visitor) const;
3564 mutable bufferlist bl;
3565 enum ModID {
3566 APPEND = 1,
3567 SETATTRS = 2,
3568 DELETE = 3,
3569 CREATE = 4,
3570 UPDATE_SNAPS = 5,
3571 TRY_DELETE = 6,
3572 ROLLBACK_EXTENTS = 7
3573 };
31f18b77
FG
3574 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3575 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3576 }
7c673cae
FG
3577 void claim(ObjectModDesc &other) {
3578 bl.clear();
3579 bl.claim(other.bl);
3580 can_local_rollback = other.can_local_rollback;
3581 rollback_info_completed = other.rollback_info_completed;
3582 }
3583 void claim_append(ObjectModDesc &other) {
3584 if (!can_local_rollback || rollback_info_completed)
3585 return;
3586 if (!other.can_local_rollback) {
3587 mark_unrollbackable();
3588 return;
3589 }
3590 bl.claim_append(other.bl);
3591 rollback_info_completed = other.rollback_info_completed;
3592 }
3593 void swap(ObjectModDesc &other) {
3594 bl.swap(other.bl);
3595
31f18b77
FG
3596 using std::swap;
3597 swap(other.can_local_rollback, can_local_rollback);
3598 swap(other.rollback_info_completed, rollback_info_completed);
3599 swap(other.max_required_version, max_required_version);
7c673cae
FG
3600 }
3601 void append_id(ModID id) {
11fdf7f2 3602 using ceph::encode;
7c673cae 3603 uint8_t _id(id);
11fdf7f2 3604 encode(_id, bl);
7c673cae
FG
3605 }
3606 void append(uint64_t old_size) {
3607 if (!can_local_rollback || rollback_info_completed)
3608 return;
3609 ENCODE_START(1, 1, bl);
3610 append_id(APPEND);
11fdf7f2 3611 encode(old_size, bl);
7c673cae
FG
3612 ENCODE_FINISH(bl);
3613 }
3614 void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
3615 if (!can_local_rollback || rollback_info_completed)
3616 return;
3617 ENCODE_START(1, 1, bl);
3618 append_id(SETATTRS);
11fdf7f2 3619 encode(old_attrs, bl);
7c673cae
FG
3620 ENCODE_FINISH(bl);
3621 }
3622 bool rmobject(version_t deletion_version) {
3623 if (!can_local_rollback || rollback_info_completed)
3624 return false;
3625 ENCODE_START(1, 1, bl);
3626 append_id(DELETE);
11fdf7f2 3627 encode(deletion_version, bl);
7c673cae
FG
3628 ENCODE_FINISH(bl);
3629 rollback_info_completed = true;
3630 return true;
3631 }
3632 bool try_rmobject(version_t deletion_version) {
3633 if (!can_local_rollback || rollback_info_completed)
3634 return false;
3635 ENCODE_START(1, 1, bl);
3636 append_id(TRY_DELETE);
11fdf7f2 3637 encode(deletion_version, bl);
7c673cae
FG
3638 ENCODE_FINISH(bl);
3639 rollback_info_completed = true;
3640 return true;
3641 }
3642 void create() {
3643 if (!can_local_rollback || rollback_info_completed)
3644 return;
3645 rollback_info_completed = true;
3646 ENCODE_START(1, 1, bl);
3647 append_id(CREATE);
3648 ENCODE_FINISH(bl);
3649 }
3650 void update_snaps(const set<snapid_t> &old_snaps) {
3651 if (!can_local_rollback || rollback_info_completed)
3652 return;
3653 ENCODE_START(1, 1, bl);
3654 append_id(UPDATE_SNAPS);
11fdf7f2 3655 encode(old_snaps, bl);
7c673cae
FG
3656 ENCODE_FINISH(bl);
3657 }
3658 void rollback_extents(
3659 version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
11fdf7f2
TL
3660 ceph_assert(can_local_rollback);
3661 ceph_assert(!rollback_info_completed);
7c673cae
FG
3662 if (max_required_version < 2)
3663 max_required_version = 2;
3664 ENCODE_START(2, 2, bl);
3665 append_id(ROLLBACK_EXTENTS);
11fdf7f2
TL
3666 encode(gen, bl);
3667 encode(extents, bl);
7c673cae
FG
3668 ENCODE_FINISH(bl);
3669 }
3670
3671 // cannot be rolled back
3672 void mark_unrollbackable() {
3673 can_local_rollback = false;
3674 bl.clear();
3675 }
3676 bool can_rollback() const {
3677 return can_local_rollback;
3678 }
3679 bool empty() const {
3680 return can_local_rollback && (bl.length() == 0);
3681 }
3682
3683 bool requires_kraken() const {
3684 return max_required_version >= 2;
3685 }
3686
3687 /**
3688 * Create fresh copy of bl bytes to avoid keeping large buffers around
3689 * in the case that bl contains ptrs which point into a much larger
3690 * message buffer
3691 */
31f18b77 3692 void trim_bl() const {
7c673cae
FG
3693 if (bl.length() > 0)
3694 bl.rebuild();
3695 }
3696 void encode(bufferlist &bl) const;
11fdf7f2 3697 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3698 void dump(Formatter *f) const;
3699 static void generate_test_instances(list<ObjectModDesc*>& o);
3700};
3701WRITE_CLASS_ENCODER(ObjectModDesc)
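// Illustrative sketch (not part of the original header): recording local
// rollback information for an operation that appended to an object and
// replaced one xattr. Once rmobject()/create() complete the description, or
// mark_unrollbackable() is called, further appends are ignored. The helper
// name and attribute key are hypothetical.
inline ObjectModDesc example_rollback_desc(
  uint64_t old_size,
  const boost::optional<bufferlist>& old_xattr)
{
  ObjectModDesc desc;
  desc.append(old_size);              // remember the pre-append object size
  map<string, boost::optional<bufferlist>> attrs;
  attrs["_example_attr"] = old_xattr; // boost::none means "attr did not exist"
  desc.setattrs(attrs);               // remember the pre-write xattr value
  return desc;                        // desc.can_rollback() is still true
}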
3702
3703
3704/**
3705 * pg_log_entry_t - single entry/event in pg log
3706 *
3707 */
3708struct pg_log_entry_t {
3709 enum {
3710 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3711 CLONE = 2, // cloned object from head
3712 DELETE = 3, // deleted object
11fdf7f2 3713 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
7c673cae
FG
3714 LOST_REVERT = 5, // lost new version, revert to an older version.
3715 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3716 LOST_MARK = 7, // lost new version, now EIO
3717 PROMOTE = 8, // promoted object from another tier
3718 CLEAN = 9, // mark an object clean
3719 ERROR = 10, // write that returned an error
3720 };
3721 static const char *get_op_name(int op) {
3722 switch (op) {
3723 case MODIFY:
3724 return "modify";
3725 case PROMOTE:
3726 return "promote";
3727 case CLONE:
3728 return "clone";
3729 case DELETE:
3730 return "delete";
7c673cae
FG
3731 case LOST_REVERT:
3732 return "l_revert";
3733 case LOST_DELETE:
3734 return "l_delete";
3735 case LOST_MARK:
3736 return "l_mark";
3737 case CLEAN:
3738 return "clean";
3739 case ERROR:
3740 return "error";
3741 default:
3742 return "unknown";
3743 }
3744 }
3745 const char *get_op_name() const {
3746 return get_op_name(op);
3747 }
3748
3749 // describes state for a locally-rollbackable entry
3750 ObjectModDesc mod_desc;
3751 bufferlist snaps; // only for clone entries
3752 hobject_t soid;
3753 osd_reqid_t reqid; // caller+tid to uniquely identify request
31f18b77 3754 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
11fdf7f2
TL
3755
3756 /// map extra_reqids by index to error return code (if any)
3757 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
3758
7c673cae
FG
3759 eversion_t version, prior_version, reverting_to;
3760 version_t user_version; // the user version for this entry
3761 utime_t mtime; // this is the _user_ mtime, mind you
3762 int32_t return_code; // only stored for ERRORs for dup detection
3763
3764 __s32 op;
3765 bool invalid_hash; // only when decoding sobject_t based entries
3766 bool invalid_pool; // only when decoding pool-less hobject based entries
3767
3768 pg_log_entry_t()
3769 : user_version(0), return_code(0), op(0),
31f18b77
FG
3770 invalid_hash(false), invalid_pool(false) {
3771 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3772 }
7c673cae
FG
3773 pg_log_entry_t(int _op, const hobject_t& _soid,
3774 const eversion_t& v, const eversion_t& pv,
3775 version_t uv,
3776 const osd_reqid_t& rid, const utime_t& mt,
3777 int return_code)
3778 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3779 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
3780 invalid_hash(false), invalid_pool(false) {
3781 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3782 }
7c673cae
FG
3783
3784 bool is_clone() const { return op == CLONE; }
3785 bool is_modify() const { return op == MODIFY; }
3786 bool is_promote() const { return op == PROMOTE; }
3787 bool is_clean() const { return op == CLEAN; }
7c673cae
FG
3788 bool is_lost_revert() const { return op == LOST_REVERT; }
3789 bool is_lost_delete() const { return op == LOST_DELETE; }
3790 bool is_lost_mark() const { return op == LOST_MARK; }
3791 bool is_error() const { return op == ERROR; }
3792
3793 bool is_update() const {
3794 return
3795 is_clone() || is_modify() || is_promote() || is_clean() ||
11fdf7f2 3796 is_lost_revert() || is_lost_mark();
7c673cae
FG
3797 }
3798 bool is_delete() const {
3799 return op == DELETE || op == LOST_DELETE;
3800 }
3801
3802 bool can_rollback() const {
3803 return mod_desc.can_rollback();
3804 }
3805
3806 void mark_unrollbackable() {
3807 mod_desc.mark_unrollbackable();
3808 }
3809
3810 bool requires_kraken() const {
3811 return mod_desc.requires_kraken();
3812 }
3813
3814 // Errors are only used for dup detection, whereas
3815 // the index by objects is used by recovery, copy_get,
3816 // and other facilities that don't expect or need to
3817 // be aware of error entries.
3818 bool object_is_indexed() const {
3819 return !is_error();
3820 }
3821
3822 bool reqid_is_indexed() const {
3823 return reqid != osd_reqid_t() &&
3824 (op == MODIFY || op == DELETE || op == ERROR);
3825 }
3826
3827 string get_key_name() const;
3828 void encode_with_checksum(bufferlist& bl) const;
11fdf7f2 3829 void decode_with_checksum(bufferlist::const_iterator& p);
7c673cae
FG
3830
3831 void encode(bufferlist &bl) const;
11fdf7f2 3832 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
3833 void dump(Formatter *f) const;
3834 static void generate_test_instances(list<pg_log_entry_t*>& o);
3835
3836};
3837WRITE_CLASS_ENCODER(pg_log_entry_t)
3838
3839ostream& operator<<(ostream& out, const pg_log_entry_t& e);
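// Illustrative sketch (not part of the original header): building a MODIFY
// log entry as a primary might when applying a client write. All values and
// the example_* helper name are hypothetical; return_code is only meaningful
// for ERROR entries.
inline pg_log_entry_t example_modify_entry(const hobject_t& oid,
                                           eversion_t v,
                                           eversion_t prior,
                                           version_t user_version,
                                           const osd_reqid_t& reqid,
                                           const utime_t& mtime)
{
  pg_log_entry_t e(pg_log_entry_t::MODIFY, oid, v, prior, user_version,
                   reqid, mtime, 0);
  // e.is_update() is true; whether e.can_rollback() holds depends on what the
  // backend records in e.mod_desc.
  return e;
}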
3840
c07f9fc5
FG
3841struct pg_log_dup_t {
3842 osd_reqid_t reqid; // caller+tid to uniquely identify request
3843 eversion_t version;
3844 version_t user_version; // the user version for this entry
3845 int32_t return_code; // only stored for ERRORs for dup detection
7c673cae 3846
c07f9fc5
FG
3847 pg_log_dup_t()
3848 : user_version(0), return_code(0)
3849 {}
3850 explicit pg_log_dup_t(const pg_log_entry_t& entry)
3851 : reqid(entry.reqid), version(entry.version),
3852 user_version(entry.user_version), return_code(entry.return_code)
3853 {}
3854 pg_log_dup_t(const eversion_t& v, version_t uv,
3855 const osd_reqid_t& rid, int return_code)
3856 : reqid(rid), version(v), user_version(uv),
3857 return_code(return_code)
3858 {}
3859
3860 string get_key_name() const;
3861 void encode(bufferlist &bl) const;
11fdf7f2 3862 void decode(bufferlist::const_iterator &bl);
c07f9fc5
FG
3863 void dump(Formatter *f) const;
3864 static void generate_test_instances(list<pg_log_dup_t*>& o);
3865
181888fb
FG
3866 bool operator==(const pg_log_dup_t &rhs) const {
3867 return reqid == rhs.reqid &&
3868 version == rhs.version &&
3869 user_version == rhs.user_version &&
3870 return_code == rhs.return_code;
3871 }
3872 bool operator!=(const pg_log_dup_t &rhs) const {
3873 return !(*this == rhs);
3874 }
3875
c07f9fc5
FG
3876 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3877};
3878WRITE_CLASS_ENCODER(pg_log_dup_t)
3879
3880std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
3881
3882/**
3883 * pg_log_t - incremental log of recent pg changes.
3884 *
3885 * serves as a recovery queue for recent changes.
3886 */
3887struct pg_log_t {
3888 /*
3889 * head - newest entry (update|delete)
3890 * tail - entry previous to oldest (update|delete) for which we have
3891 * complete negative information.
3892 * i.e. we can infer pg contents for any store whose last_update >= tail.
3893 */
3894 eversion_t head; // newest entry
3895 eversion_t tail; // version prior to oldest
3896
3897protected:
3898 // We can rollback rollback-able entries > can_rollback_to
3899 eversion_t can_rollback_to;
3900
3901 // always <= can_rollback_to, indicates how far stashed rollback
3902 // data can be found
3903 eversion_t rollback_info_trimmed_to;
3904
3905public:
c07f9fc5
FG
3906 // the actual log
3907 mempool::osd_pglog::list<pg_log_entry_t> log;
3908
3909 // entries just for dup op detection ordered oldest to newest
3910 mempool::osd_pglog::list<pg_log_dup_t> dups;
3911
7c673cae
FG
3912 pg_log_t() = default;
3913 pg_log_t(const eversion_t &last_update,
3914 const eversion_t &log_tail,
3915 const eversion_t &can_rollback_to,
3916 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3917 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3918 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
3919 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3920 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 3921 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
3922 pg_log_t(const eversion_t &last_update,
3923 const eversion_t &log_tail,
3924 const eversion_t &can_rollback_to,
3925 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3926 const std::list<pg_log_entry_t> &entries,
3927 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
3928 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3929 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3930 for (auto &&entry: entries) {
3931 log.push_back(entry);
3932 }
c07f9fc5
FG
3933 for (auto &&entry: dup_entries) {
3934 dups.push_back(entry);
3935 }
7c673cae
FG
3936 }
3937
3938 void clear() {
3939 eversion_t z;
3940 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3941 log.clear();
c07f9fc5 3942 dups.clear();
7c673cae
FG
3943 }
3944
3945 eversion_t get_rollback_info_trimmed_to() const {
3946 return rollback_info_trimmed_to;
3947 }
3948 eversion_t get_can_rollback_to() const {
3949 return can_rollback_to;
3950 }
3951
3952
3953 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 3954 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
3955 oldlog.swap(log);
3956
3957 eversion_t old_tail;
3958 unsigned mask = ~((~0)<<split_bits);
3959 for (auto i = oldlog.begin();
3960 i != oldlog.end();
3961 ) {
3962 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3963 childlog.push_back(*i);
3964 } else {
3965 log.push_back(*i);
3966 }
3967 oldlog.erase(i++);
3968 }
3969
c07f9fc5
FG
3970 // osd_reqid is unique, so it doesn't matter if there are extra
3971 // dup entries in each pg. To avoid storing oid with the dup
3972 // entries, just copy the whole list.
3973 auto childdups(dups);
3974
7c673cae
FG
3975 return pg_log_t(
3976 head,
3977 tail,
3978 can_rollback_to,
3979 rollback_info_trimmed_to,
c07f9fc5
FG
3980 std::move(childlog),
3981 std::move(childdups));
3982 }
7c673cae 3983
31f18b77 3984 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
11fdf7f2 3985 ceph_assert(newhead >= tail);
7c673cae 3986
31f18b77
FG
3987 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
3988 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
3989 while (true) {
3990 if (p == log.begin()) {
3991 // yikes, the whole thing is divergent!
31f18b77
FG
3992 using std::swap;
3993 swap(divergent, log);
7c673cae
FG
3994 break;
3995 }
3996 --p;
3997 if (p->version.version <= newhead.version) {
3998 /*
3999 * look at eversion.version here. we want to avoid a situation like:
4000 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4001 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4002 * lower_bound = 100'9
4003 * i.e., same request, different version. If the eversion.version is > the
4004 * lower_bound, it is divergent.
4005 */
4006 ++p;
4007 divergent.splice(divergent.begin(), log, p, log.end());
4008 break;
4009 }
11fdf7f2 4010 ceph_assert(p->version > newhead);
7c673cae
FG
4011 }
4012 head = newhead;
4013
4014 if (can_rollback_to > newhead)
4015 can_rollback_to = newhead;
4016
4017 if (rollback_info_trimmed_to > newhead)
4018 rollback_info_trimmed_to = newhead;
4019
4020 return divergent;
4021 }
4022
11fdf7f2
TL
4023 void merge_from(const vector<pg_log_t*>& slogs, eversion_t last_update) {
4024 log.clear();
4025
4026 // sort and merge dups
4027 multimap<eversion_t,pg_log_dup_t> sorted;
4028 for (auto& d : dups) {
4029 sorted.emplace(d.version, d);
4030 }
4031 for (auto l : slogs) {
4032 for (auto& d : l->dups) {
4033 sorted.emplace(d.version, d);
4034 }
4035 }
4036 dups.clear();
4037 for (auto& i : sorted) {
4038 dups.push_back(i.second);
4039 }
4040
4041 head = last_update;
4042 tail = last_update;
4043 can_rollback_to = last_update;
4044 rollback_info_trimmed_to = last_update;
4045 }
4046
7c673cae
FG
4047 bool empty() const {
4048 return log.empty();
4049 }
4050
4051 bool null() const {
4052 return head.version == 0 && head.epoch == 0;
4053 }
4054
4055 size_t approx_size() const {
4056 return head.version - tail.version;
4057 }
4058
4059 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4060 const string &hit_set_namespace, const pg_log_t &in,
4061 pg_log_t &out, pg_log_t &reject);
4062
4063 /**
4064 * copy entries from the tail of another pg_log_t
4065 *
4066 * @param other pg_log_t to copy from
4067 * @param from copy entries after this version
4068 */
4069 void copy_after(const pg_log_t &other, eversion_t from);
4070
4071 /**
4072 * copy a range of entries from another pg_log_t
4073 *
4074 * @param other pg_log_t to copy from
4075 * @param from copy entries after this version
4076 * @param to up to and including this version
4077 */
4078 void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
4079
4080 /**
4081 * copy up to N entries
4082 *
4083 * @param other source log
4084 * @param max max number of entries to copy
4085 */
4086 void copy_up_to(const pg_log_t &other, int max);
4087
4088 ostream& print(ostream& out) const;
4089
4090 void encode(bufferlist &bl) const;
11fdf7f2 4091 void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
7c673cae
FG
4092 void dump(Formatter *f) const;
4093 static void generate_test_instances(list<pg_log_t*>& o);
4094};
4095WRITE_CLASS_ENCODER(pg_log_t)
4096
c07f9fc5 4097inline ostream& operator<<(ostream& out, const pg_log_t& log)
7c673cae
FG
4098{
4099 out << "log((" << log.tail << "," << log.head << "], crt="
4100 << log.get_can_rollback_to() << ")";
4101 return out;
4102}
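// Illustrative sketch (not part of the original header): how a replica whose
// log runs ahead of the authoritative head uses rewind_from_head(). 'log' and
// 'auth_head' are hypothetical.
//
//   eversion_t auth_head = ...;                  // head chosen during peering
//   auto divergent = log.rewind_from_head(auth_head);
//   // 'log' now ends at auth_head, with can_rollback_to and
//   // rollback_info_trimmed_to clamped to it; 'divergent' holds the entries
//   // newer than auth_head, oldest first, for divergent-entry processing.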
4103
4104
4105/**
4106 * pg_missing_t - summary of missing objects.
4107 *
4108 * kept in memory, as a supplement to pg_log_t
4109 * also used to pass missing info in messages.
4110 */
4111struct pg_missing_item {
4112 eversion_t need, have;
c07f9fc5
FG
4113 enum missing_flags_t {
4114 FLAG_NONE = 0,
4115 FLAG_DELETE = 1,
4116 } flags;
4117 pg_missing_item() : flags(FLAG_NONE) {}
4118 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
4119 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
4120 set_delete(is_delete);
4121 }
4122
4123 void encode(bufferlist& bl, uint64_t features) const {
11fdf7f2 4124 using ceph::encode;
c07f9fc5
FG
4125 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
4126 // encoding a zeroed eversion_t to differentiate between this and
4127 // legacy unversioned encoding - a need value of 0'0 is not
4128 // possible. This can be replaced with the legacy encoding
4129 // macros post-luminous.
4130 eversion_t e;
11fdf7f2
TL
4131 encode(e, bl);
4132 encode(need, bl);
4133 encode(have, bl);
4134 encode(static_cast<uint8_t>(flags), bl);
c07f9fc5
FG
4135 } else {
4136 // legacy unversioned encoding
11fdf7f2
TL
4137 encode(need, bl);
4138 encode(have, bl);
c07f9fc5 4139 }
7c673cae 4140 }
11fdf7f2
TL
4141 void decode(bufferlist::const_iterator& bl) {
4142 using ceph::decode;
c07f9fc5 4143 eversion_t e;
11fdf7f2 4144 decode(e, bl);
c07f9fc5
FG
4145 if (e != eversion_t()) {
4146 // legacy encoding, this is the need value
4147 need = e;
11fdf7f2 4148 decode(have, bl);
c07f9fc5 4149 } else {
11fdf7f2
TL
4150 decode(need, bl);
4151 decode(have, bl);
c07f9fc5 4152 uint8_t f;
11fdf7f2 4153 decode(f, bl);
c07f9fc5
FG
4154 flags = static_cast<missing_flags_t>(f);
4155 }
4156 }
4157
4158 void set_delete(bool is_delete) {
4159 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4160 }
4161
4162 bool is_delete() const {
4163 return (flags & FLAG_DELETE) == FLAG_DELETE;
4164 }
4165
4166 string flag_str() const {
4167 if (flags == FLAG_NONE) {
4168 return "none";
4169 } else {
4170 return "delete";
4171 }
7c673cae 4172 }
c07f9fc5 4173
7c673cae
FG
4174 void dump(Formatter *f) const {
4175 f->dump_stream("need") << need;
4176 f->dump_stream("have") << have;
c07f9fc5 4177 f->dump_stream("flags") << flag_str();
7c673cae
FG
4178 }
4179 static void generate_test_instances(list<pg_missing_item*>& o) {
4180 o.push_back(new pg_missing_item);
4181 o.push_back(new pg_missing_item);
4182 o.back()->need = eversion_t(1, 2);
4183 o.back()->have = eversion_t(1, 1);
c07f9fc5
FG
4184 o.push_back(new pg_missing_item);
4185 o.back()->need = eversion_t(3, 5);
4186 o.back()->have = eversion_t(3, 4);
4187 o.back()->flags = FLAG_DELETE;
7c673cae
FG
4188 }
4189 bool operator==(const pg_missing_item &rhs) const {
c07f9fc5 4190 return need == rhs.need && have == rhs.have && flags == rhs.flags;
7c673cae
FG
4191 }
4192 bool operator!=(const pg_missing_item &rhs) const {
4193 return !(*this == rhs);
4194 }
4195};
c07f9fc5 4196WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
7c673cae
FG
4197ostream& operator<<(ostream& out, const pg_missing_item &item);
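// Illustrative sketch (not part of the original header): the feature-gated
// encoding of pg_missing_item. When the peer supports recovery deletes, the
// item is written with a zeroed eversion_t marker followed by need/have/flags;
// otherwise only the legacy need/have pair is written. The helper name is
// hypothetical.
inline void example_encode_missing_item(const pg_missing_item& item,
                                        bufferlist& bl,
                                        bool peer_has_recovery_deletes)
{
  uint64_t features =
    peer_has_recovery_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
  encode(item, bl, features);  // free function from WRITE_CLASS_ENCODER_FEATURES
}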
4198
4199class pg_missing_const_i {
4200public:
4201 virtual const map<hobject_t, pg_missing_item> &
4202 get_items() const = 0;
4203 virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
c07f9fc5 4204 virtual bool get_may_include_deletes() const = 0;
7c673cae
FG
4205 virtual unsigned int num_missing() const = 0;
4206 virtual bool have_missing() const = 0;
4207 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4208 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
7c673cae
FG
4209 virtual ~pg_missing_const_i() {}
4210};
4211
4212
4213template <bool Track>
4214class ChangeTracker {
4215public:
4216 void changed(const hobject_t &obj) {}
4217 template <typename F>
4218 void get_changed(F &&f) const {}
4219 void flush() {}
4220 bool is_clean() const {
4221 return true;
4222 }
4223};
4224template <>
4225class ChangeTracker<true> {
4226 set<hobject_t> _changed;
4227public:
4228 void changed(const hobject_t &obj) {
4229 _changed.insert(obj);
4230 }
4231 template <typename F>
4232 void get_changed(F &&f) const {
4233 for (auto const &i: _changed) {
4234 f(i);
4235 }
4236 }
4237 void flush() {
4238 _changed.clear();
4239 }
4240 bool is_clean() const {
4241 return _changed.empty();
4242 }
4243};
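// Illustrative sketch (not part of the original header): the TrackChanges=true
// specialization remembers which objects were touched until flush(); the
// generic version makes all of these calls no-ops. The helper name is
// hypothetical.
inline void example_change_tracking(const hobject_t& oid)
{
  ChangeTracker<true> tracker;
  tracker.changed(oid);                      // record that 'oid' was modified
  tracker.get_changed([](const hobject_t&) {
    // visit each recorded object here, e.g. to persist its missing entry
  });
  tracker.flush();                           // forget the recorded set
  // tracker.is_clean() is true again here
}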
4244
4245template <bool TrackChanges>
4246class pg_missing_set : public pg_missing_const_i {
4247 using item = pg_missing_item;
4248 map<hobject_t, item> missing; // oid -> (need v, have v)
4249 map<version_t, hobject_t> rmissing; // v -> oid
4250 ChangeTracker<TrackChanges> tracker;
4251
4252public:
4253 pg_missing_set() = default;
4254
4255 template <typename missing_type>
4256 pg_missing_set(const missing_type &m) {
7c673cae
FG
4257 missing = m.get_items();
4258 rmissing = m.get_rmissing();
c07f9fc5 4259 may_include_deletes = m.get_may_include_deletes();
7c673cae
FG
4260 for (auto &&i: missing)
4261 tracker.changed(i.first);
4262 }
4263
c07f9fc5
FG
4264 bool may_include_deletes = false;
4265
7c673cae
FG
4266 const map<hobject_t, item> &get_items() const override {
4267 return missing;
4268 }
4269 const map<version_t, hobject_t> &get_rmissing() const override {
4270 return rmissing;
4271 }
c07f9fc5
FG
4272 bool get_may_include_deletes() const override {
4273 return may_include_deletes;
4274 }
7c673cae
FG
4275 unsigned int num_missing() const override {
4276 return missing.size();
4277 }
4278 bool have_missing() const override {
4279 return !missing.empty();
4280 }
4281 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4282 auto iter = missing.find(oid);
4283 if (iter == missing.end())
4284 return false;
4285 if (out)
4286 *out = iter->second;
4287 return true;
4288 }
4289 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4290 map<hobject_t, item>::const_iterator m =
4291 missing.find(oid);
4292 if (m == missing.end())
4293 return false;
4294 const item &item(m->second);
4295 if (item.need > v)
4296 return false;
4297 return true;
4298 }
11fdf7f2
TL
4299 eversion_t get_oldest_need() const {
4300 if (missing.empty()) {
7c673cae 4301 return eversion_t();
11fdf7f2
TL
4302 }
4303 auto it = missing.find(rmissing.begin()->second);
4304 ceph_assert(it != missing.end());
4305 return it->second.need;
7c673cae
FG
4306 }
4307
4308 void claim(pg_missing_set& o) {
4309 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4310 missing.swap(o.missing);
4311 rmissing.swap(o.rmissing);
4312 }
4313
4314 /*
4315 * this needs to be called in log order as we extend the log. it
4316 * assumes missing is accurate up through the previous log entry.
4317 */
4318 void add_next_event(const pg_log_entry_t& e) {
c07f9fc5
FG
4319 map<hobject_t, item>::iterator missing_it;
4320 missing_it = missing.find(e.soid);
4321 bool is_missing_divergent_item = missing_it != missing.end();
4322 if (e.prior_version == eversion_t() || e.is_clone()) {
4323 // new object.
4324 if (is_missing_divergent_item) { // use iterator
7c673cae 4325 rmissing.erase((missing_it->second).need.version);
c07f9fc5
FG
4326 missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4327 } else // create new element in missing map
4328 missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
4329 } else if (is_missing_divergent_item) {
4330 // already missing (prior).
4331 rmissing.erase((missing_it->second).need.version);
4332 (missing_it->second).need = e.version; // leave .have unchanged.
4333 missing_it->second.set_delete(e.is_delete());
c07f9fc5
FG
4334 } else {
4335 // not missing, we must have prior_version (if any)
11fdf7f2 4336 ceph_assert(!is_missing_divergent_item);
c07f9fc5 4337 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
7c673cae 4338 }
c07f9fc5 4339 rmissing[e.version.version] = e.soid;
7c673cae
FG
4340 tracker.changed(e.soid);
4341 }
4342
c07f9fc5 4343 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
7c673cae
FG
4344 if (missing.count(oid)) {
4345 rmissing.erase(missing[oid].need.version);
4346 missing[oid].need = need; // do not adjust .have
c07f9fc5 4347 missing[oid].set_delete(is_delete);
7c673cae 4348 } else {
c07f9fc5 4349 missing[oid] = item(need, eversion_t(), is_delete);
7c673cae
FG
4350 }
4351 rmissing[need.version] = oid;
4352
4353 tracker.changed(oid);
4354 }
4355
4356 void revise_have(hobject_t oid, eversion_t have) {
4357 if (missing.count(oid)) {
4358 tracker.changed(oid);
4359 missing[oid].have = have;
4360 }
4361 }
4362
c07f9fc5
FG
4363 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4364 bool is_delete) {
4365 missing[oid] = item(need, have, is_delete);
7c673cae
FG
4366 rmissing[need.version] = oid;
4367 tracker.changed(oid);
4368 }
4369
4370 void rm(const hobject_t& oid, eversion_t v) {
4371 std::map<hobject_t, item>::iterator p = missing.find(oid);
4372 if (p != missing.end() && p->second.need <= v)
4373 rm(p);
4374 }
4375
4376 void rm(std::map<hobject_t, item>::const_iterator m) {
4377 tracker.changed(m->first);
4378 rmissing.erase(m->second.need.version);
4379 missing.erase(m);
4380 }
4381
4382 void got(const hobject_t& oid, eversion_t v) {
4383 std::map<hobject_t, item>::iterator p = missing.find(oid);
11fdf7f2
TL
4384 ceph_assert(p != missing.end());
4385 ceph_assert(p->second.need <= v || p->second.is_delete());
7c673cae
FG
4386 got(p);
4387 }
4388
4389 void got(std::map<hobject_t, item>::const_iterator m) {
4390 tracker.changed(m->first);
4391 rmissing.erase(m->second.need.version);
4392 missing.erase(m);
4393 }
4394
4395 void split_into(
4396 pg_t child_pgid,
4397 unsigned split_bits,
4398 pg_missing_set *omissing) {
c07f9fc5 4399 omissing->may_include_deletes = may_include_deletes;
7c673cae
FG
4400 unsigned mask = ~((~0)<<split_bits);
4401 for (map<hobject_t, item>::iterator i = missing.begin();
4402 i != missing.end();
4403 ) {
4404 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
c07f9fc5
FG
4405 omissing->add(i->first, i->second.need, i->second.have,
4406 i->second.is_delete());
7c673cae
FG
4407 rm(i++);
4408 } else {
4409 ++i;
4410 }
4411 }
4412 }
4413
4414 void clear() {
4415 for (auto const &i: missing)
4416 tracker.changed(i.first);
4417 missing.clear();
4418 rmissing.clear();
4419 }
4420
4421 void encode(bufferlist &bl) const {
c07f9fc5 4422 ENCODE_START(4, 2, bl);
11fdf7f2
TL
4423 encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
4424 encode(may_include_deletes, bl);
7c673cae
FG
4425 ENCODE_FINISH(bl);
4426 }
11fdf7f2 4427 void decode(bufferlist::const_iterator &bl, int64_t pool = -1) {
7c673cae
FG
4428 for (auto const &i: missing)
4429 tracker.changed(i.first);
c07f9fc5 4430 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
11fdf7f2 4431 decode(missing, bl);
c07f9fc5 4432 if (struct_v >= 4) {
11fdf7f2 4433 decode(may_include_deletes, bl);
c07f9fc5 4434 }
7c673cae
FG
4435 DECODE_FINISH(bl);
4436
4437 if (struct_v < 3) {
4438 // Handle hobject_t upgrade
4439 map<hobject_t, item> tmp;
4440 for (map<hobject_t, item>::iterator i =
4441 missing.begin();
4442 i != missing.end();
4443 ) {
4444 if (!i->first.is_max() && i->first.pool == -1) {
4445 hobject_t to_insert(i->first);
4446 to_insert.pool = pool;
4447 tmp[to_insert] = i->second;
4448 missing.erase(i++);
4449 } else {
4450 ++i;
4451 }
4452 }
4453 missing.insert(tmp.begin(), tmp.end());
4454 }
4455
4456 for (map<hobject_t,item>::iterator it =
4457 missing.begin();
4458 it != missing.end();
4459 ++it)
4460 rmissing[it->second.need.version] = it->first;
4461 for (auto const &i: missing)
4462 tracker.changed(i.first);
4463 }
4464 void dump(Formatter *f) const {
4465 f->open_array_section("missing");
4466 for (map<hobject_t,item>::const_iterator p =
4467 missing.begin(); p != missing.end(); ++p) {
4468 f->open_object_section("item");
4469 f->dump_stream("object") << p->first;
4470 p->second.dump(f);
4471 f->close_section();
4472 }
4473 f->close_section();
c07f9fc5 4474 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4475 }
4476 template <typename F>
4477 void filter_objects(F &&f) {
4478 for (auto i = missing.begin(); i != missing.end();) {
4479 if (f(i->first)) {
4480 rm(i++);
4481 } else {
4482 ++i;
4483 }
4484 }
4485 }
4486 static void generate_test_instances(list<pg_missing_set*>& o) {
4487 o.push_back(new pg_missing_set);
4488 o.push_back(new pg_missing_set);
4489 o.back()->add(
4490 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
c07f9fc5
FG
4491 eversion_t(5, 6), eversion_t(5, 1), false);
4492 o.push_back(new pg_missing_set);
4493 o.back()->add(
4494 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4495 eversion_t(5, 6), eversion_t(5, 1), true);
4496 o.back()->may_include_deletes = true;
7c673cae
FG
4497 }
4498 template <typename F>
4499 void get_changed(F &&f) const {
4500 tracker.get_changed(f);
4501 }
4502 void flush() {
4503 tracker.flush();
4504 }
4505 bool is_clean() const {
4506 return tracker.is_clean();
4507 }
4508 template <typename missing_t>
4509 bool debug_verify_from_init(
4510 const missing_t &init_missing,
4511 ostream *oss) const {
4512 if (!TrackChanges)
4513 return true;
4514 auto check_missing(init_missing.get_items());
4515 tracker.get_changed([&](const hobject_t &hoid) {
4516 check_missing.erase(hoid);
4517 if (missing.count(hoid)) {
4518 check_missing.insert(*(missing.find(hoid)));
4519 }
4520 });
4521 bool ok = true;
4522 if (check_missing.size() != missing.size()) {
4523 if (oss) {
4524 *oss << "Size mismatch, check: " << check_missing.size()
4525 << ", actual: " << missing.size() << "\n";
4526 }
4527 ok = false;
4528 }
4529 for (auto &i: missing) {
4530 if (!check_missing.count(i.first)) {
4531 if (oss)
4532 *oss << "check_missing missing " << i.first << "\n";
4533 ok = false;
4534 } else if (check_missing[i.first] != i.second) {
4535 if (oss)
4536 *oss << "check_missing missing item mismatch on " << i.first
4537 << ", check: " << check_missing[i.first]
4538 << ", actual: " << i.second << "\n";
4539 ok = false;
4540 }
4541 }
4542 if (oss && !ok) {
4543 *oss << "check_missing: " << check_missing << "\n";
4544 set<hobject_t> changed;
4545 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4546 *oss << "changed: " << changed << "\n";
4547 }
4548 return ok;
4549 }
4550};
4551template <bool TrackChanges>
4552void encode(
4553 const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
4554 ENCODE_DUMP_PRE();
4555 c.encode(bl);
4556 ENCODE_DUMP_POST(cl);
4557}
4558template <bool TrackChanges>
11fdf7f2 4559void decode(pg_missing_set<TrackChanges> &c, bufferlist::const_iterator &p) {
7c673cae
FG
4560 c.decode(p);
4561}
4562template <bool TrackChanges>
4563ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
4564{
c07f9fc5
FG
4565 out << "missing(" << missing.num_missing()
4566 << " may_include_deletes = " << missing.may_include_deletes;
7c673cae
FG
4567 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4568 out << ")";
4569 return out;
4570}
4571
4572using pg_missing_t = pg_missing_set<false>;
4573using pg_missing_tracker_t = pg_missing_set<true>;
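// Illustrative sketch (not part of the original header): the life cycle of a
// missing entry in the non-tracking pg_missing_t. An object becomes missing
// at its 'need' version (here via add(); add_next_event() does the same from
// log entries) and is cleared with got() once it has been recovered. All
// versions and the helper name are hypothetical.
inline void example_missing_set(const hobject_t& oid)
{
  pg_missing_t missing;
  missing.add(oid, eversion_t(8, 20) /*need*/, eversion_t(8, 17) /*have*/,
              false /*is_delete*/);
  if (missing.is_missing(oid)) {
    // ... recover the object up to 8'20, then:
    missing.got(oid, eversion_t(8, 20));
  }
  // missing.have_missing() is false again here
}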
4574
4575
4576/**
4577 * pg list objects response format
4578 *
4579 */
4580struct pg_nls_response_t {
4581 collection_list_handle_t handle;
4582 list<librados::ListObjectImpl> entries;
4583
4584 void encode(bufferlist& bl) const {
4585 ENCODE_START(1, 1, bl);
11fdf7f2 4586 encode(handle, bl);
7c673cae 4587 __u32 n = (__u32)entries.size();
11fdf7f2 4588 encode(n, bl);
7c673cae 4589 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
11fdf7f2
TL
4590 encode(i->nspace, bl);
4591 encode(i->oid, bl);
4592 encode(i->locator, bl);
7c673cae
FG
4593 }
4594 ENCODE_FINISH(bl);
4595 }
11fdf7f2 4596 void decode(bufferlist::const_iterator& bl) {
7c673cae 4597 DECODE_START(1, bl);
11fdf7f2 4598 decode(handle, bl);
7c673cae 4599 __u32 n;
11fdf7f2 4600 decode(n, bl);
7c673cae
FG
4601 entries.clear();
4602 while (n--) {
4603 librados::ListObjectImpl i;
11fdf7f2
TL
4604 decode(i.nspace, bl);
4605 decode(i.oid, bl);
4606 decode(i.locator, bl);
7c673cae
FG
4607 entries.push_back(i);
4608 }
4609 DECODE_FINISH(bl);
4610 }
4611 void dump(Formatter *f) const {
4612 f->dump_stream("handle") << handle;
4613 f->open_array_section("entries");
4614 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4615 f->open_object_section("object");
4616 f->dump_string("namespace", p->nspace);
4617 f->dump_string("object", p->oid);
4618 f->dump_string("key", p->locator);
4619 f->close_section();
4620 }
4621 f->close_section();
4622 }
4623 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4624 o.push_back(new pg_nls_response_t);
4625 o.push_back(new pg_nls_response_t);
4626 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4627 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4628 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4629 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4630 o.push_back(new pg_nls_response_t);
4631 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4632 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4633 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4634 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4635 o.push_back(new pg_nls_response_t);
4636 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4637 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4638 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4639 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4640 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4641 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4642 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4643 }
4644};
4645
4646WRITE_CLASS_ENCODER(pg_nls_response_t)
4647
4648// For backwards compatibility with older OSD requests
4649struct pg_ls_response_t {
4650 collection_list_handle_t handle;
4651 list<pair<object_t, string> > entries;
4652
4653 void encode(bufferlist& bl) const {
11fdf7f2 4654 using ceph::encode;
7c673cae 4655 __u8 v = 1;
11fdf7f2
TL
4656 encode(v, bl);
4657 encode(handle, bl);
4658 encode(entries, bl);
7c673cae 4659 }
11fdf7f2
TL
4660 void decode(bufferlist::const_iterator& bl) {
4661 using ceph::decode;
7c673cae 4662 __u8 v;
11fdf7f2
TL
4663 decode(v, bl);
4664 ceph_assert(v == 1);
4665 decode(handle, bl);
4666 decode(entries, bl);
7c673cae
FG
4667 }
4668 void dump(Formatter *f) const {
4669 f->dump_stream("handle") << handle;
4670 f->open_array_section("entries");
4671 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4672 f->open_object_section("object");
4673 f->dump_stream("object") << p->first;
4674 f->dump_string("key", p->second);
4675 f->close_section();
4676 }
4677 f->close_section();
4678 }
4679 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4680 o.push_back(new pg_ls_response_t);
4681 o.push_back(new pg_ls_response_t);
4682 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4683 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4684 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4685 }
4686};
4687
4688WRITE_CLASS_ENCODER(pg_ls_response_t)
4689
4690/**
4691 * object_copy_cursor_t
4692 */
4693struct object_copy_cursor_t {
4694 uint64_t data_offset;
4695 string omap_offset;
4696 bool attr_complete;
4697 bool data_complete;
4698 bool omap_complete;
4699
4700 object_copy_cursor_t()
4701 : data_offset(0),
4702 attr_complete(false),
4703 data_complete(false),
4704 omap_complete(false)
4705 {}
4706
4707 bool is_initial() const {
4708 return !attr_complete && data_offset == 0 && omap_offset.empty();
4709 }
4710 bool is_complete() const {
4711 return attr_complete && data_complete && omap_complete;
4712 }
4713
4714 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4715 void encode(bufferlist& bl) const;
11fdf7f2 4716 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4717 void dump(Formatter *f) const;
4718};
4719WRITE_CLASS_ENCODER(object_copy_cursor_t)
4720
4721/**
4722 * object_copy_data_t
4723 *
4724 * Return data from a copy request. The semantics are a little strange
4725 * as a result of the encoding's heritage.
4726 *
4727 * In particular, the sender unconditionally fills in the cursor (from what
4728 * it receives and sends), the size, and the mtime, but is responsible for
4729 * figuring out whether it should put any data in the attrs, data, or
4730 * omap members (corresponding to xattrs, object data, and the omap entries)
4731 * based on external data (the client includes a max amount to return with
4732 * the copy request). The client then looks into the attrs, data, and/or omap
4733 * based on the contents of the cursor.
4734 */
4735struct object_copy_data_t {
4736 enum {
4737 FLAG_DATA_DIGEST = 1<<0,
4738 FLAG_OMAP_DIGEST = 1<<1,
4739 };
4740 object_copy_cursor_t cursor;
4741 uint64_t size;
4742 utime_t mtime;
4743 uint32_t data_digest, omap_digest;
4744 uint32_t flags;
4745 map<string, bufferlist> attrs;
4746 bufferlist data;
4747 bufferlist omap_header;
4748 bufferlist omap_data;
4749
4750 /// which snaps we are defined for (if a snap and not the head)
4751 vector<snapid_t> snaps;
11fdf7f2 4752 /// latest snap seq for the object (if head)
7c673cae
FG
4753 snapid_t snap_seq;
4754
11fdf7f2 4755 /// recent reqids on this object
31f18b77 4756 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
7c673cae 4757
11fdf7f2
TL
4758 /// map reqids by index to error return code (if any)
4759 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
4760
7c673cae
FG
4761 uint64_t truncate_seq;
4762 uint64_t truncate_size;
4763
4764public:
4765 object_copy_data_t() :
4766 size((uint64_t)-1), data_digest(-1),
4767 omap_digest(-1), flags(0),
4768 truncate_seq(0),
4769 truncate_size(0) {}
4770
4771 static void generate_test_instances(list<object_copy_data_t*>& o);
4772 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4773 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4774 void dump(Formatter *f) const;
4775};
4776WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
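// Illustrative sketch (not part of the original header): how a copy client is
// expected to drive the cursor described above. Each reply unconditionally
// fills in cursor/size/mtime; the client keeps issuing copy-get requests with
// the returned cursor until it reports completion. 'issue_copy_get' and its
// arguments are hypothetical.
//
//   object_copy_cursor_t cursor;                  // is_initial() == true
//   do {
//     object_copy_data_t reply = issue_copy_get(src_oid, cursor, max_bytes);
//     // consume reply.attrs, reply.data, reply.omap_header/omap_data as
//     // indicated by which parts the cursor says are still in progress
//     cursor = reply.cursor;
//   } while (!cursor.is_complete());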
4777
4778/**
4779 * pg creation info
4780 */
4781struct pg_create_t {
4782 epoch_t created; // epoch pg created
4783 pg_t parent; // split from parent (if != pg_t())
4784 __s32 split_bits;
4785
4786 pg_create_t()
4787 : created(0), split_bits(0) {}
4788 pg_create_t(unsigned c, pg_t p, int s)
4789 : created(c), parent(p), split_bits(s) {}
4790
4791 void encode(bufferlist &bl) const;
11fdf7f2 4792 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4793 void dump(Formatter *f) const;
4794 static void generate_test_instances(list<pg_create_t*>& o);
4795};
4796WRITE_CLASS_ENCODER(pg_create_t)
4797
7c673cae
FG
4798// -----------------------------------------
4799
4800class ObjectExtent {
4801 /**
4802 * ObjectExtents are used for specifying IO behavior against RADOS
4803 * objects when one is using the ObjectCacher.
4804 *
4805 * To use this in a real system, *every member* must be filled
4806 * out correctly. In particular, make sure to initialize the
4807 * oloc correctly, as its default values are deliberate poison
4808 * and will cause internal ObjectCacher asserts.
4809 *
4810 * Similarly, your buffer_extents vector *must* specify a total
4811 * size equal to your length. If the buffer_extents inadvertently
4812 * contain less space than the length member specifies, you
4813 * will get unintelligible asserts deep in the ObjectCacher.
4814 *
4815 * If you are trying to do testing and don't care about actual
4816 * RADOS function, the simplest thing to do is to initialize
4817 * the ObjectExtent (truncate_size can be 0), create a single entry
4818 * in buffer_extents matching the length, and set oloc.pool to 0.
4819 */
4820 public:
4821 object_t oid; // object id
4822 uint64_t objectno;
4823 uint64_t offset; // in object
4824 uint64_t length; // in object
4825 uint64_t truncate_size; // in object
4826
4827 object_locator_t oloc; // object locator (pool etc)
4828
4829 vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4830
4831 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4832 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
4833 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
4834};
4835
4836inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4837{
4838 return out << "extent("
4839 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4840 << " " << ex.offset << "~" << ex.length
4841 << " -> " << ex.buffer_extents
4842 << ")";
4843}
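// Illustrative sketch (not part of the original header): the minimal test
// setup described in the ObjectExtent comment above -- a single buffer extent
// covering the whole length and oloc.pool set to 0. The helper name and
// values are hypothetical.
inline ObjectExtent example_test_extent()
{
  ObjectExtent ex(object_t("test-object"), 0 /*objectno*/,
                  0 /*offset*/, 4096 /*length*/, 0 /*truncate_size*/);
  ex.oloc.pool = 0;                        // override the poison default
  ex.buffer_extents.emplace_back(0, 4096); // must cover exactly 'length' bytes
  return ex;
}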
4844
4845
7c673cae
FG
4846// ---------------------------------------
4847
4848class OSDSuperblock {
4849public:
4850 uuid_d cluster_fsid, osd_fsid;
4851 int32_t whoami; // my role in this fs.
4852 epoch_t current_epoch; // most recent epoch
4853 epoch_t oldest_map, newest_map; // oldest/newest maps we have.
4854 double weight;
4855
4856 CompatSet compat_features;
4857
4858 // last interval over which i mounted and was then active
4859 epoch_t mounted; // last epoch i mounted
4860 epoch_t clean_thru; // epoch i was active and clean thru
4861
4862 OSDSuperblock() :
4863 whoami(-1),
4864 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4865 mounted(0), clean_thru(0) {
4866 }
4867
4868 void encode(bufferlist &bl) const;
11fdf7f2 4869 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
4870 void dump(Formatter *f) const;
4871 static void generate_test_instances(list<OSDSuperblock*>& o);
4872};
4873WRITE_CLASS_ENCODER(OSDSuperblock)
4874
4875inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4876{
4877 return out << "sb(" << sb.cluster_fsid
4878 << " osd." << sb.whoami
4879 << " " << sb.osd_fsid
4880 << " e" << sb.current_epoch
4881 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4882 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4883 << ")";
4884}
4885
4886
4887// -------
4888
4889
4890
4891
4892
4893
4894/*
4895 * attached to object head. describes most recent snap context, and
4896 * set of existing clones.
4897 */
4898struct SnapSet {
4899 snapid_t seq;
7c673cae
FG
4900 vector<snapid_t> snaps; // descending
4901 vector<snapid_t> clones; // ascending
4902 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4903 map<snapid_t, uint64_t> clone_size;
4904 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4905
11fdf7f2 4906 SnapSet() : seq(0) {}
7c673cae 4907 explicit SnapSet(bufferlist& bl) {
11fdf7f2 4908 auto p = std::cbegin(bl);
7c673cae
FG
4909 decode(p);
4910 }
4911
7c673cae
FG
4912 /// populate SnapSet from a librados::snap_set_t
4913 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4914
4915 /// get space accounted to clone
4916 uint64_t get_clone_bytes(snapid_t clone) const;
4917
4918 void encode(bufferlist& bl) const;
11fdf7f2 4919 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4920 void dump(Formatter *f) const;
4921 static void generate_test_instances(list<SnapSet*>& o);
4922
4923 SnapContext get_ssc_as_of(snapid_t as_of) const {
4924 SnapContext out;
4925 out.seq = as_of;
4926 for (vector<snapid_t>::const_iterator i = snaps.begin();
4927 i != snaps.end();
4928 ++i) {
4929 if (*i <= as_of)
4930 out.snaps.push_back(*i);
4931 }
4932 return out;
4933 }
4934
7c673cae
FG
4935
4936 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4937 void filter(const pg_pool_t &pinfo);
4938};
4939WRITE_CLASS_ENCODER(SnapSet)
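// Illustrative sketch (not part of the original header): get_ssc_as_of()
// builds a SnapContext containing only the snaps that existed at or before
// the given snapid. The snap ids and helper name are hypothetical.
inline SnapContext example_ssc_as_of()
{
  SnapSet ss;
  ss.seq = 10;
  ss.snaps.push_back(10);                // descending, as stored on the head
  ss.snaps.push_back(7);
  ss.snaps.push_back(4);
  SnapContext ctx = ss.get_ssc_as_of(8);
  // ctx.seq == 8 and ctx.snaps == {7, 4}; snap 10 postdates 'as_of'
  return ctx;
}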
4940
4941ostream& operator<<(ostream& out, const SnapSet& cs);
4942
4943
4944
4945#define OI_ATTR "_"
4946#define SS_ATTR "snapset"
4947
4948struct watch_info_t {
4949 uint64_t cookie;
4950 uint32_t timeout_seconds;
4951 entity_addr_t addr;
4952
4953 watch_info_t() : cookie(0), timeout_seconds(0) { }
4954 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
4955
4956 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 4957 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
4958 void dump(Formatter *f) const;
4959 static void generate_test_instances(list<watch_info_t*>& o);
4960};
4961WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4962
4963static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4964 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4965 && l.addr == r.addr;
4966}
4967
4968static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
4969 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
4970 << " " << w.addr << ")";
4971}
4972
4973struct notify_info_t {
4974 uint64_t cookie;
4975 uint64_t notify_id;
4976 uint32_t timeout;
4977 bufferlist bl;
4978};
4979
4980static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
4981 return out << "notify(cookie " << n.cookie
4982 << " notify" << n.notify_id
4983 << " " << n.timeout << "s)";
4984}
4985
11fdf7f2
TL
4986struct chunk_info_t {
4987 typedef enum {
4988 FLAG_DIRTY = 1,
4989 FLAG_MISSING = 2,
4990 FLAG_HAS_REFERENCE = 4,
4991 FLAG_HAS_FINGERPRINT = 8,
4992 } cflag_t;
4993 uint32_t offset;
4994 uint32_t length;
4995 hobject_t oid;
4996 cflag_t flags; // FLAG_*
4997
4998 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
4999
5000 static string get_flag_string(uint64_t flags) {
5001 string r;
5002 if (flags & FLAG_DIRTY) {
5003 r += "|dirty";
5004 }
5005 if (flags & FLAG_MISSING) {
5006 r += "|missing";
5007 }
5008 if (flags & FLAG_HAS_REFERENCE) {
5009 r += "|has_reference";
5010 }
5011 if (flags & FLAG_HAS_FINGERPRINT) {
5012 r += "|has_fingerprint";
5013 }
5014 if (r.length())
5015 return r.substr(1);
5016 return r;
5017 }
5018 bool test_flag(cflag_t f) const {
5019 return (flags & f) == f;
5020 }
5021 void set_flag(cflag_t f) {
5022 flags = (cflag_t)(flags | f);
5023 }
5024 void set_flags(cflag_t f) {
5025 flags = f;
5026 }
5027 void clear_flag(cflag_t f) {
5028 flags = (cflag_t)(flags & ~f);
5029 }
5030 void clear_flags() {
5031 flags = (cflag_t)0;
5032 }
5033 bool is_dirty() const {
5034 return test_flag(FLAG_DIRTY);
5035 }
5036 bool is_missing() const {
5037 return test_flag(FLAG_MISSING);
5038 }
5039 bool has_reference() const {
5040 return test_flag(FLAG_HAS_REFERENCE);
5041 }
5042 bool has_fingerprint() const {
5043 return test_flag(FLAG_HAS_FINGERPRINT);
5044 }
5045 void encode(bufferlist &bl) const;
5046 void decode(bufferlist::const_iterator &bl);
5047 void dump(Formatter *f) const;
5048 friend ostream& operator<<(ostream& out, const chunk_info_t& ci);
5049};
5050WRITE_CLASS_ENCODER(chunk_info_t)
5051ostream& operator<<(ostream& out, const chunk_info_t& ci);
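// Illustrative sketch (not part of the original header): flag handling on a
// chunk_info_t describing a dirty, referenced chunk. The helper name and
// sizes are hypothetical.
inline chunk_info_t example_chunk()
{
  chunk_info_t ci;
  ci.offset = 0;
  ci.length = 64 * 1024;
  ci.set_flag(chunk_info_t::FLAG_DIRTY);
  ci.set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
  // chunk_info_t::get_flag_string(ci.flags) == "dirty|has_reference"
  ci.clear_flag(chunk_info_t::FLAG_DIRTY);   // e.g. after the chunk is flushed
  return ci;
}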
5052
31f18b77
FG
5053struct object_info_t;
5054struct object_manifest_t {
5055 enum {
5056 TYPE_NONE = 0,
11fdf7f2
TL
5057 TYPE_REDIRECT = 1,
5058 TYPE_CHUNKED = 2,
31f18b77
FG
5059 };
5060 uint8_t type; // redirect, chunked, ...
5061 hobject_t redirect_target;
11fdf7f2 5062 map <uint64_t, chunk_info_t> chunk_map;
31f18b77
FG
5063
5064 object_manifest_t() : type(0) { }
5065 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
5066 : type(type), redirect_target(redirect_target) { }
5067
5068 bool is_empty() const {
5069 return type == TYPE_NONE;
5070 }
5071 bool is_redirect() const {
5072 return type == TYPE_REDIRECT;
5073 }
5074 bool is_chunked() const {
5075 return type == TYPE_CHUNKED;
5076 }
11fdf7f2 5077 static std::string_view get_type_name(uint8_t m) {
31f18b77
FG
5078 switch (m) {
5079 case TYPE_NONE: return "none";
5080 case TYPE_REDIRECT: return "redirect";
5081 case TYPE_CHUNKED: return "chunked";
5082 default: return "unknown";
5083 }
5084 }
11fdf7f2 5085 std::string_view get_type_name() const {
31f18b77
FG
5086 return get_type_name(type);
5087 }
11fdf7f2
TL
5088 void clear() {
5089 type = 0;
5090 redirect_target = hobject_t();
5091 chunk_map.clear();
5092 }
31f18b77
FG
5093 static void generate_test_instances(list<object_manifest_t*>& o);
5094 void encode(bufferlist &bl) const;
11fdf7f2 5095 void decode(bufferlist::const_iterator &bl);
31f18b77
FG
5096 void dump(Formatter *f) const;
5097 friend ostream& operator<<(ostream& out, const object_info_t& oi);
5098};
5099WRITE_CLASS_ENCODER(object_manifest_t)
5100ostream& operator<<(ostream& out, const object_manifest_t& oi);
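// Usage sketch (illustrative only): the two non-trivial manifest types as a
// caller would build them. `target` and `some_chunk` are hypothetical values.
//
//   object_manifest_t redirect(object_manifest_t::TYPE_REDIRECT, target);
//   // redirect.is_redirect() == true, redirect.get_type_name() == "redirect"
//
//   object_manifest_t chunked;
//   chunked.type = object_manifest_t::TYPE_CHUNKED;
//   chunked.chunk_map[0] = some_chunk;   // chunk_info_t keyed by offset into the object
//   // chunked.clear() resets the type and drops redirect_target and chunk_map.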
7c673cae
FG
5101
5102struct object_info_t {
5103 hobject_t soid;
5104 eversion_t version, prior_version;
5105 version_t user_version;
5106 osd_reqid_t last_reqid;
5107
5108 uint64_t size;
5109 utime_t mtime;
5110 utime_t local_mtime; // time this OSD last wrote the object locally (vs. the client-supplied mtime)
5111
5112 // note: these are currently encoded into a total 16 bits; see
5113 // encode()/decode() for the weirdness.
5114 typedef enum {
11fdf7f2
TL
5115 FLAG_LOST = 1<<0,
5116 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5117 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5118 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5119 FLAG_DATA_DIGEST = 1<<4, // has data crc
5120 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5121 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5122 FLAG_MANIFEST = 1<<7, // has manifest
5123 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5124 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
7c673cae
FG
5125 } flag_t;
5126
5127 flag_t flags;
5128
5129 static string get_flag_string(flag_t flags) {
5130 string s;
94b18763
FG
5131 vector<string> sv = get_flag_vector(flags);
5132 for (const auto& ss : sv) {
5133 s += string("|") + ss;
5134 }
5135 if (s.length())
5136 return s.substr(1);
5137 return s;
5138 }
5139 static vector<string> get_flag_vector(flag_t flags) {
5140 vector<string> sv;
7c673cae 5141 if (flags & FLAG_LOST)
94b18763 5142 sv.insert(sv.end(), "lost");
7c673cae 5143 if (flags & FLAG_WHITEOUT)
94b18763 5144 sv.insert(sv.end(), "whiteout");
7c673cae 5145 if (flags & FLAG_DIRTY)
94b18763 5146 sv.insert(sv.end(), "dirty");
7c673cae 5147 if (flags & FLAG_USES_TMAP)
94b18763 5148 sv.insert(sv.end(), "uses_tmap");
7c673cae 5149 if (flags & FLAG_OMAP)
94b18763 5150 sv.insert(sv.end(), "omap");
7c673cae 5151 if (flags & FLAG_DATA_DIGEST)
94b18763 5152 sv.insert(sv.end(), "data_digest");
7c673cae 5153 if (flags & FLAG_OMAP_DIGEST)
94b18763 5154 sv.insert(sv.end(), "omap_digest");
7c673cae 5155 if (flags & FLAG_CACHE_PIN)
94b18763 5156 sv.insert(sv.end(), "cache_pin");
31f18b77 5157 if (flags & FLAG_MANIFEST)
94b18763 5158 sv.insert(sv.end(), "manifest");
11fdf7f2
TL
5159 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5160 sv.insert(sv.end(), "redirect_has_reference");
94b18763 5161 return sv;
7c673cae
FG
5162 }
5163 string get_flag_string() const {
5164 return get_flag_string(flags);
5165 }
5166
7c673cae
FG
5167 uint64_t truncate_seq, truncate_size;
5168
5169 map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
5170
5171 // opportunistic checksums; may or may not be present
5172 __u32 data_digest; ///< data crc32c
5173 __u32 omap_digest; ///< omap crc32c
5174
5175 // alloc hint attribute
5176 uint64_t expected_object_size, expected_write_size;
5177 uint32_t alloc_hint_flags;
5178
31f18b77
FG
5179 struct object_manifest_t manifest;
5180
7c673cae
FG
5181 void copy_user_bits(const object_info_t& other);
5182
7c673cae
FG
5183 bool test_flag(flag_t f) const {
5184 return (flags & f) == f;
5185 }
5186 void set_flag(flag_t f) {
5187 flags = (flag_t)(flags | f);
5188 }
5189 void clear_flag(flag_t f) {
5190 flags = (flag_t)(flags & ~f);
5191 }
5192 bool is_lost() const {
5193 return test_flag(FLAG_LOST);
5194 }
5195 bool is_whiteout() const {
5196 return test_flag(FLAG_WHITEOUT);
5197 }
5198 bool is_dirty() const {
5199 return test_flag(FLAG_DIRTY);
5200 }
5201 bool is_omap() const {
5202 return test_flag(FLAG_OMAP);
5203 }
5204 bool is_data_digest() const {
5205 return test_flag(FLAG_DATA_DIGEST);
5206 }
5207 bool is_omap_digest() const {
5208 return test_flag(FLAG_OMAP_DIGEST);
5209 }
5210 bool is_cache_pinned() const {
5211 return test_flag(FLAG_CACHE_PIN);
5212 }
31f18b77
FG
5213 bool has_manifest() const {
5214 return test_flag(FLAG_MANIFEST);
5215 }
7c673cae
FG
5216 void set_data_digest(__u32 d) {
5217 set_flag(FLAG_DATA_DIGEST);
5218 data_digest = d;
5219 }
5220 void set_omap_digest(__u32 d) {
5221 set_flag(FLAG_OMAP_DIGEST);
5222 omap_digest = d;
5223 }
5224 void clear_data_digest() {
5225 clear_flag(FLAG_DATA_DIGEST);
5226 data_digest = -1;
5227 }
5228 void clear_omap_digest() {
5229 clear_flag(FLAG_OMAP_DIGEST);
5230 omap_digest = -1;
5231 }
5232 void new_object() {
28e407b8
AA
5233 clear_data_digest();
5234 clear_omap_digest();
7c673cae
FG
5235 }
5236
5237 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 5238 void decode(bufferlist::const_iterator& bl);
7c673cae 5239 void decode(bufferlist& bl) {
11fdf7f2 5240 auto p = std::cbegin(bl);
7c673cae
FG
5241 decode(p);
5242 }
5243 void dump(Formatter *f) const;
5244 static void generate_test_instances(list<object_info_t*>& o);
5245
5246 explicit object_info_t()
5247 : user_version(0), size(0), flags((flag_t)0),
5248 truncate_seq(0), truncate_size(0),
5249 data_digest(-1), omap_digest(-1),
5250 expected_object_size(0), expected_write_size(0),
5251 alloc_hint_flags(0)
5252 {}
5253
5254 explicit object_info_t(const hobject_t& s)
5255 : soid(s),
5256 user_version(0), size(0), flags((flag_t)0),
5257 truncate_seq(0), truncate_size(0),
5258 data_digest(-1), omap_digest(-1),
5259 expected_object_size(0), expected_write_size(0),
5260 alloc_hint_flags(0)
5261 {}
5262
5263 explicit object_info_t(bufferlist& bl) {
5264 decode(bl);
5265 }
5266};
5267WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5268
5269ostream& operator<<(ostream& out, const object_info_t& oi);
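// Usage sketch (not part of the header): typical digest/flag handling on an
// object_info_t. `soid` and `crc` are hypothetical inputs.
//
//   object_info_t oi(soid);
//   oi.set_data_digest(crc);                  // sets FLAG_DATA_DIGEST and stores the crc32c
//   oi.set_flag(object_info_t::FLAG_DIRTY);
//   // oi.get_flag_string() -> "dirty|data_digest"
//   oi.clear_data_digest();                   // clears the flag, resets data_digest to -1
//   oi.new_object();                          // a freshly created object carries no digests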
5270
5271
5272
5273// Object recovery
5274struct ObjectRecoveryInfo {
5275 hobject_t soid;
5276 eversion_t version;
5277 uint64_t size;
5278 object_info_t oi;
5279 SnapSet ss; // only populated if soid is_snap()
5280 interval_set<uint64_t> copy_subset;
5281 map<hobject_t, interval_set<uint64_t>> clone_subset;
5282
5283 ObjectRecoveryInfo() : size(0) { }
5284
5285 static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
5286 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5287 void decode(bufferlist::const_iterator &bl, int64_t pool = -1);
7c673cae
FG
5288 ostream &print(ostream &out) const;
5289 void dump(Formatter *f) const;
5290};
5291WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
5292ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
5293
5294struct ObjectRecoveryProgress {
5295 uint64_t data_recovered_to;
5296 string omap_recovered_to;
5297 bool first;
5298 bool data_complete;
5299 bool omap_complete;
224ce89b 5300 bool error = false;
7c673cae
FG
5301
5302 ObjectRecoveryProgress()
5303 : data_recovered_to(0),
5304 first(true),
5305 data_complete(false), omap_complete(false) { }
5306
5307 bool is_complete(const ObjectRecoveryInfo& info) const {
5308 return (data_recovered_to >= (
5309 info.copy_subset.empty() ?
5310 0 : info.copy_subset.range_end())) &&
5311 omap_complete;
5312 }
5313
5314 static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
5315 void encode(bufferlist &bl) const;
11fdf7f2 5316 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5317 ostream &print(ostream &out) const;
5318 void dump(Formatter *f) const;
5319};
5320WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
5321ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
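// Usage sketch (illustrative only): how is_complete() relates the progress
// cursor to the recovery info's copy_subset. The 1 MiB extent is invented.
//
//   ObjectRecoveryInfo info;
//   info.copy_subset.insert(0, 1 << 20);      // plan to copy bytes [0, 1 MiB)
//
//   ObjectRecoveryProgress prog;
//   prog.data_recovered_to = 1 << 20;         // data cursor reached copy_subset.range_end()
//   prog.omap_complete = true;
//   // prog.is_complete(info) == true: the data cursor is at or past the end of
//   // copy_subset and the omap has been fully transferred.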
5322
5323struct PushReplyOp {
5324 hobject_t soid;
5325
5326 static void generate_test_instances(list<PushReplyOp*>& o);
5327 void encode(bufferlist &bl) const;
11fdf7f2 5328 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5329 ostream &print(ostream &out) const;
5330 void dump(Formatter *f) const;
5331
5332 uint64_t cost(CephContext *cct) const;
5333};
5334WRITE_CLASS_ENCODER(PushReplyOp)
5335ostream& operator<<(ostream& out, const PushReplyOp &op);
5336
5337struct PullOp {
5338 hobject_t soid;
5339
5340 ObjectRecoveryInfo recovery_info;
5341 ObjectRecoveryProgress recovery_progress;
5342
5343 static void generate_test_instances(list<PullOp*>& o);
5344 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5345 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5346 ostream &print(ostream &out) const;
5347 void dump(Formatter *f) const;
5348
5349 uint64_t cost(CephContext *cct) const;
5350};
5351WRITE_CLASS_ENCODER_FEATURES(PullOp)
5352ostream& operator<<(ostream& out, const PullOp &op);
5353
5354struct PushOp {
5355 hobject_t soid;
5356 eversion_t version;
5357 bufferlist data;
5358 interval_set<uint64_t> data_included;
5359 bufferlist omap_header;
5360 map<string, bufferlist> omap_entries;
5361 map<string, bufferlist> attrset;
5362
5363 ObjectRecoveryInfo recovery_info;
5364 ObjectRecoveryProgress before_progress;
5365 ObjectRecoveryProgress after_progress;
5366
5367 static void generate_test_instances(list<PushOp*>& o);
5368 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 5369 void decode(bufferlist::const_iterator &bl);
7c673cae
FG
5370 ostream &print(ostream &out) const;
5371 void dump(Formatter *f) const;
5372
5373 uint64_t cost(CephContext *cct) const;
5374};
5375WRITE_CLASS_ENCODER_FEATURES(PushOp)
5376ostream& operator<<(ostream& out, const PushOp &op);
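// Encode/decode sketch (illustrative only): PullOp/PushOp use the
// feature-aware encoder generated by WRITE_CLASS_ENCODER_FEATURES, so the
// peer's feature bits are passed to encode(). CEPH_FEATURES_ALL stands in
// here for the actual negotiated features; `soid` is hypothetical.
//
//   PushOp op;
//   op.soid = soid;
//   bufferlist bl;
//   encode(op, bl, CEPH_FEATURES_ALL);
//   PushOp decoded;
//   auto it = bl.cbegin();
//   decode(decoded, it);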
5377
5378
5379/*
5380 * summarize pg contents for purposes of a scrub
5381 */
5382struct ScrubMap {
5383 struct object {
5384 map<string,bufferptr> attrs;
5385 uint64_t size;
5386 __u32 omap_digest; ///< omap crc32c
5387 __u32 digest; ///< data crc32c
5388 bool negative:1;
5389 bool digest_present:1;
5390 bool omap_digest_present:1;
5391 bool read_error:1;
5392 bool stat_error:1;
5393 bool ec_hash_mismatch:1;
5394 bool ec_size_mismatch:1;
28e407b8
AA
5395 bool large_omap_object_found:1;
5396 uint64_t large_omap_object_key_count = 0;
5397 uint64_t large_omap_object_value_size = 0;
11fdf7f2
TL
5398 uint64_t object_omap_bytes = 0;
5399 uint64_t object_omap_keys = 0;
7c673cae
FG
5400
5401 object() :
5402 // Init invalid size so it won't match if we get a stat EIO error
5403 size(-1), omap_digest(0), digest(0),
28e407b8
AA
5404 negative(false), digest_present(false), omap_digest_present(false),
5405 read_error(false), stat_error(false), ec_hash_mismatch(false),
5406 ec_size_mismatch(false), large_omap_object_found(false) {}
7c673cae
FG
5407
5408 void encode(bufferlist& bl) const;
11fdf7f2 5409 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
5410 void dump(Formatter *f) const;
5411 static void generate_test_instances(list<object*>& o);
5412 };
5413 WRITE_CLASS_ENCODER(object)
5414
5415 map<hobject_t,object> objects;
5416 eversion_t valid_through;
5417 eversion_t incr_since;
28e407b8 5418 bool has_large_omap_object_errors:1;
11fdf7f2 5419 bool has_omap_keys:1;
7c673cae
FG
5420
5421 void merge_incr(const ScrubMap &l);
28e407b8
AA
5422 void clear_from(const hobject_t& start) {
5423 objects.erase(objects.lower_bound(start), objects.end());
5424 }
7c673cae
FG
5425 void insert(const ScrubMap &r) {
5426 objects.insert(r.objects.begin(), r.objects.end());
5427 }
5428 void swap(ScrubMap &r) {
31f18b77
FG
5429 using std::swap;
5430 swap(objects, r.objects);
5431 swap(valid_through, r.valid_through);
5432 swap(incr_since, r.incr_since);
7c673cae
FG
5433 }
5434
5435 void encode(bufferlist& bl) const;
11fdf7f2 5436 void decode(bufferlist::const_iterator& bl, int64_t pool=-1);
7c673cae
FG
5437 void dump(Formatter *f) const;
5438 static void generate_test_instances(list<ScrubMap*>& o);
5439};
5440WRITE_CLASS_ENCODER(ScrubMap::object)
5441WRITE_CLASS_ENCODER(ScrubMap)
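// Usage sketch (not part of the header): combining scrub results from two
// maps. insert() relies on std::map::insert semantics, so entries already
// present in the destination are left untouched; clear_from() drops every
// entry at or beyond the given boundary (e.g. when a chunk must be rescanned).
//
//   ScrubMap combined_map, incoming_map;
//   combined_map.clear_from(chunk_start);     // hypothetical restart boundary
//   combined_map.insert(incoming_map);        // merge without overwriting existing entries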
5442
28e407b8
AA
5443struct ScrubMapBuilder {
5444 bool deep = false;
5445 vector<hobject_t> ls;
5446 size_t pos = 0;
5447 int64_t data_pos = 0;
5448 string omap_pos;
5449 int ret = 0;
5450 bufferhash data_hash, omap_hash; ///< accumulating hash value
5451 uint64_t omap_keys = 0;
5452 uint64_t omap_bytes = 0;
5453
5454 bool empty() {
5455 return ls.empty();
5456 }
5457 bool done() {
5458 return pos >= ls.size();
5459 }
5460 void reset() {
5461 *this = ScrubMapBuilder();
5462 }
5463
5464 bool data_done() {
5465 return data_pos < 0;
5466 }
5467
5468 void next_object() {
5469 ++pos;
5470 data_pos = 0;
5471 omap_pos.clear();
5472 omap_keys = 0;
5473 omap_bytes = 0;
5474 }
5475
5476 friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
5477 out << "(" << pos.pos << "/" << pos.ls.size();
5478 if (pos.pos < pos.ls.size()) {
5479 out << " " << pos.ls[pos.pos];
5480 }
5481 if (pos.data_pos < 0) {
5482 out << " byte " << pos.data_pos;
5483 }
5484 if (!pos.omap_pos.empty()) {
5485 out << " key " << pos.omap_pos;
5486 }
5487 if (pos.deep) {
5488 out << " deep";
5489 }
5490 if (pos.ret) {
5491 out << " ret " << pos.ret;
5492 }
5493 return out << ")";
5494 }
5495};
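// Usage sketch (illustrative only): the builder is a resumable cursor over the
// objects of a scrub chunk. A caller loops until done() and advances with
// next_object() once an object's data and omap have been hashed;
// `objects_in_chunk` is a hypothetical vector<hobject_t>.
//
//   ScrubMapBuilder pos;
//   pos.deep = true;
//   pos.ls = objects_in_chunk;
//   while (!pos.done()) {
//     const hobject_t& cur = pos.ls[pos.pos];
//     // ... read from data_pos, feed data_hash/omap_hash, resume omap at omap_pos ...
//     pos.next_object();                      // resets the per-object cursors
//   }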
5496
7c673cae
FG
5497struct OSDOp {
5498 ceph_osd_op op;
5499 sobject_t soid;
5500
5501 bufferlist indata, outdata;
224ce89b 5502 errorcode32_t rval;
7c673cae
FG
5503
5504 OSDOp() : rval(0) {
5505 memset(&op, 0, sizeof(ceph_osd_op));
5506 }
5507
5508 /**
5509 * split a bufferlist into constituent indata members of a vector of OSDOps
5510 *
5511 * @param ops [out] vector of OSDOps
5512 * @param in [in] combined data buffer
5513 */
5514 static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);
5515
5516 /**
5517 * merge indata members of a vector of OSDOp into a single bufferlist
5518 *
5519 * Notably this also encodes certain other OSDOp data into the data
5520 * buffer, including the sobject_t soid.
5521 *
5522 * @param ops [in] vector of OSDOps
5523 * @param out [out] combined data buffer
5524 */
5525 static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
5526
5527 /**
5528 * split a bufferlist into constituent outdata members of a vector of OSDOps
5529 *
5530 * @param ops [out] vector of OSDOps
5531 * @param in [in] combined data buffer
5532 */
5533 static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);
5534
5535 /**
5536 * merge outdata members of a vector of OSDOps into a single bufferlist
5537 *
5538 * @param ops [in] vector of OSDOps
5539 * @param out [out] combined data buffer
5540 */
5541 static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
224ce89b
WB
5542
5543 /**
5544 * Clear data as much as possible, leave minimal data for historical op dump
5545 *
5546 * @param ops [in] vector of OSDOps
5547 */
5548 static void clear_data(vector<OSDOp>& ops);
7c673cae
FG
5549};
5550
5551ostream& operator<<(ostream& out, const OSDOp& op);
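// Usage sketch (not part of the header): the split/merge helpers move per-op
// payloads in and out of one combined bufferlist, which is how an op vector's
// data travels in a single message payload. The payloads here are invented.
//
//   vector<OSDOp> ops(2);
//   ops[0].indata.append("payload for op 0");
//   ops[1].indata.append("payload for op 1");
//   bufferlist combined;
//   OSDOp::merge_osd_op_vector_in_data(ops, combined);   // gather indata (plus per-op metadata, see above)
//   // ... ship `combined` alongside the encoded op vector ...
//   OSDOp::split_osd_op_vector_in_data(ops, combined);   // slice it back into per-op indata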
5552
5553struct watch_item_t {
5554 entity_name_t name;
5555 uint64_t cookie;
5556 uint32_t timeout_seconds;
5557 entity_addr_t addr;
5558
5559 watch_item_t() : cookie(0), timeout_seconds(0) { }
5560 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
5561 const entity_addr_t& addr)
5562 : name(name), cookie(cookie), timeout_seconds(timeout),
5563 addr(addr) { }
5564
5565 void encode(bufferlist &bl, uint64_t features) const {
5566 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5567 encode(name, bl);
5568 encode(cookie, bl);
5569 encode(timeout_seconds, bl);
5570 encode(addr, bl, features);
7c673cae
FG
5571 ENCODE_FINISH(bl);
5572 }
11fdf7f2 5573 void decode(bufferlist::const_iterator &bl) {
7c673cae 5574 DECODE_START(2, bl);
11fdf7f2
TL
5575 decode(name, bl);
5576 decode(cookie, bl);
5577 decode(timeout_seconds, bl);
7c673cae 5578 if (struct_v >= 2) {
11fdf7f2 5579 decode(addr, bl);
7c673cae
FG
5580 }
5581 DECODE_FINISH(bl);
5582 }
5583};
5584WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
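// Encode/decode sketch (illustrative only): watch_item_t uses versioned
// encoding. ENCODE_START(2, 1, bl) stamps the item as version 2 (compat 1),
// and decode() only reads `addr` when struct_v >= 2, so payloads written by
// code predating the address field still decode cleanly. `watcher_addr` is a
// hypothetical entity_addr_t; CEPH_FEATURES_ALL stands in for the peer's features.
//
//   watch_item_t w(entity_name_t(entity_name_t::TYPE_CLIENT, 1),
//                  /*cookie*/ 10, /*timeout*/ 30, watcher_addr);
//   bufferlist bl;
//   encode(w, bl, CEPH_FEATURES_ALL);
//   watch_item_t roundtrip;
//   auto it = bl.cbegin();
//   decode(roundtrip, it);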
5585
5586struct obj_watch_item_t {
5587 hobject_t obj;
5588 watch_item_t wi;
5589};
5590
5591/**
5592 * obj list watch response format
5593 *
5594 */
5595struct obj_list_watch_response_t {
5596 list<watch_item_t> entries;
5597
5598 void encode(bufferlist& bl, uint64_t features) const {
5599 ENCODE_START(1, 1, bl);
11fdf7f2 5600 encode(entries, bl, features);
7c673cae
FG
5601 ENCODE_FINISH(bl);
5602 }
11fdf7f2 5603 void decode(bufferlist::const_iterator& bl) {
7c673cae 5604 DECODE_START(1, bl);
11fdf7f2 5605 decode(entries, bl);
7c673cae
FG
5606 DECODE_FINISH(bl);
5607 }
5608 void dump(Formatter *f) const {
5609 f->open_array_section("entries");
5610 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5611 f->open_object_section("watch");
5612 f->dump_stream("watcher") << p->name;
5613 f->dump_int("cookie", p->cookie);
5614 f->dump_int("timeout", p->timeout_seconds);
5615 f->open_object_section("addr");
5616 p->addr.dump(f);
5617 f->close_section();
5618 f->close_section();
5619 }
5620 f->close_section();
5621 }
5622 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5623 entity_addr_t ea;
5624 o.push_back(new obj_list_watch_response_t);
5625 o.push_back(new obj_list_watch_response_t);
5626 ea.set_type(entity_addr_t::TYPE_LEGACY);
5627 ea.set_nonce(1000);
5628 ea.set_family(AF_INET);
5629 ea.set_in4_quad(0, 127);
5630 ea.set_in4_quad(1, 0);
5631 ea.set_in4_quad(2, 0);
5632 ea.set_in4_quad(3, 1);
5633 ea.set_port(1024);
5634 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5635 ea.set_nonce(1001);
5636 ea.set_in4_quad(3, 2);
5637 ea.set_port(1025);
5638 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5639 }
5640};
5641WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5642
5643struct clone_info {
5644 snapid_t cloneid;
5645 vector<snapid_t> snaps; // ascending
5646 vector< pair<uint64_t,uint64_t> > overlap;
5647 uint64_t size;
5648
5649 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5650
5651 void encode(bufferlist& bl) const {
5652 ENCODE_START(1, 1, bl);
11fdf7f2
TL
5653 encode(cloneid, bl);
5654 encode(snaps, bl);
5655 encode(overlap, bl);
5656 encode(size, bl);
7c673cae
FG
5657 ENCODE_FINISH(bl);
5658 }
11fdf7f2 5659 void decode(bufferlist::const_iterator& bl) {
7c673cae 5660 DECODE_START(1, bl);
11fdf7f2
TL
5661 decode(cloneid, bl);
5662 decode(snaps, bl);
5663 decode(overlap, bl);
5664 decode(size, bl);
7c673cae
FG
5665 DECODE_FINISH(bl);
5666 }
5667 void dump(Formatter *f) const {
5668 if (cloneid == CEPH_NOSNAP)
5669 f->dump_string("cloneid", "HEAD");
5670 else
5671 f->dump_unsigned("cloneid", cloneid.val);
5672 f->open_array_section("snapshots");
5673 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5674 f->open_object_section("snap");
5675 f->dump_unsigned("id", p->val);
5676 f->close_section();
5677 }
5678 f->close_section();
5679 f->open_array_section("overlaps");
5680 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5681 q != overlap.end(); ++q) {
5682 f->open_object_section("overlap");
5683 f->dump_unsigned("offset", q->first);
5684 f->dump_unsigned("length", q->second);
5685 f->close_section();
5686 }
5687 f->close_section();
5688 f->dump_unsigned("size", size);
5689 }
5690 static void generate_test_instances(list<clone_info*>& o) {
5691 o.push_back(new clone_info);
5692 o.push_back(new clone_info);
5693 o.back()->cloneid = 1;
5694 o.back()->snaps.push_back(1);
5695 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5696 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5697 o.back()->size = 16384;
5698 o.push_back(new clone_info);
5699 o.back()->cloneid = CEPH_NOSNAP;
5700 o.back()->size = 32768;
5701 }
5702};
5703WRITE_CLASS_ENCODER(clone_info)
5704
5705/**
5706 * obj list snaps response format
5707 *
5708 */
5709struct obj_list_snap_response_t {
5710 vector<clone_info> clones; // ascending
5711 snapid_t seq;
5712
5713 void encode(bufferlist& bl) const {
5714 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5715 encode(clones, bl);
5716 encode(seq, bl);
7c673cae
FG
5717 ENCODE_FINISH(bl);
5718 }
11fdf7f2 5719 void decode(bufferlist::const_iterator& bl) {
7c673cae 5720 DECODE_START(2, bl);
11fdf7f2 5721 decode(clones, bl);
7c673cae 5722 if (struct_v >= 2)
11fdf7f2 5723 decode(seq, bl);
7c673cae
FG
5724 else
5725 seq = CEPH_NOSNAP;
5726 DECODE_FINISH(bl);
5727 }
5728 void dump(Formatter *f) const {
5729 f->open_array_section("clones");
5730 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5731 f->open_object_section("clone");
5732 p->dump(f);
5733 f->close_section();
5734 }
5735 f->dump_unsigned("seq", seq);
5736 f->close_section();
5737 }
5738 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5739 o.push_back(new obj_list_snap_response_t);
5740 o.push_back(new obj_list_snap_response_t);
5741 clone_info cl;
5742 cl.cloneid = 1;
5743 cl.snaps.push_back(1);
5744 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5745 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5746 cl.size = 16384;
5747 o.back()->clones.push_back(cl);
5748 cl.cloneid = CEPH_NOSNAP;
5749 cl.snaps.clear();
5750 cl.overlap.clear();
5751 cl.size = 32768;
5752 o.back()->clones.push_back(cl);
5753 o.back()->seq = 123;
5754 }
5755};
5756
5757WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5758
5759// PromoteCounter
5760
5761struct PromoteCounter {
11fdf7f2
TL
5762 std::atomic<unsigned long long> attempts{0};
5763 std::atomic<unsigned long long> objects{0};
5764 std::atomic<unsigned long long> bytes{0};
7c673cae
FG
5765
5766 void attempt() {
5767 attempts++;
5768 }
5769
5770 void finish(uint64_t size) {
5771 objects++;
5772 bytes += size;
5773 }
5774
5775 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
5776 *a = attempts;
5777 *o = objects;
5778 *b = bytes;
5779 attempts = *a / 2;
5780 objects = *o / 2;
5781 bytes = *b / 2;
5782 }
5783};
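// Usage sketch (not part of the header): a caller samples and halves the
// counters in one step, giving an exponentially decaying view of recent
// promote activity. The 4 KiB object is invented.
//
//   PromoteCounter pc;
//   pc.attempt();                     // a promote was started
//   pc.finish(4096);                  // it completed, moving 4 KiB
//   uint64_t attempts, objects, bytes;
//   pc.sample_and_attenuate(&attempts, &objects, &bytes);
//   // attempts/objects/bytes now hold the current totals; the counters are
//   // cut in half so older activity fades out of later samples.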
5784
11fdf7f2
TL
5785struct pool_pg_num_history_t {
5786 /// last epoch updated
5787 epoch_t epoch = 0;
5788 /// poolid -> epoch -> pg_num
5789 map<int64_t,map<epoch_t,uint32_t>> pg_nums;
5790 /// pair(epoch, poolid)
5791 set<pair<epoch_t,int64_t>> deleted_pools;
7c673cae 5792
11fdf7f2
TL
5793 void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
5794 pg_nums[pool][epoch] = pg_num;
5795 }
5796 void log_pool_delete(epoch_t epoch, int64_t pool) {
5797 deleted_pools.insert(make_pair(epoch, pool));
5798 }
7c673cae 5799
11fdf7f2
TL
5800 /// prune history based on oldest osdmap epoch in the cluster
5801 void prune(epoch_t oldest_epoch) {
5802 auto i = deleted_pools.begin();
5803 while (i != deleted_pools.end()) {
5804 if (i->first >= oldest_epoch) {
5805 break;
5806 }
5807 pg_nums.erase(i->second);
5808 i = deleted_pools.erase(i);
5809 }
5810 for (auto& j : pg_nums) {
5811 auto k = j.second.lower_bound(oldest_epoch);
5812 // keep this and the entry before it (just to be paranoid)
5813 if (k != j.second.begin()) {
5814 --k;
5815 j.second.erase(j.second.begin(), k);
5816 }
5817 }
5818 }
5819
5820 void encode(bufferlist& bl) const {
5821 ENCODE_START(1, 1, bl);
5822 encode(epoch, bl);
5823 encode(pg_nums, bl);
5824 encode(deleted_pools, bl);
5825 ENCODE_FINISH(bl);
5826 }
5827 void decode(bufferlist::const_iterator& p) {
5828 DECODE_START(1, p);
5829 decode(epoch, p);
5830 decode(pg_nums, p);
5831 decode(deleted_pools, p);
5832 DECODE_FINISH(p);
5833 }
5834 void dump(Formatter *f) const {
5835 f->dump_unsigned("epoch", epoch);
5836 f->open_object_section("pools");
5837 for (auto& i : pg_nums) {
5838 f->open_object_section("pool");
5839 f->dump_unsigned("pool_id", i.first);
5840 f->open_array_section("changes");
5841 for (auto& j : i.second) {
5842 f->open_object_section("change");
5843 f->dump_unsigned("epoch", j.first);
5844 f->dump_unsigned("pg_num", j.second);
5845 f->close_section();
5846 }
5847 f->close_section();
5848 f->close_section();
5849 }
5850 f->close_section();
5851 f->open_array_section("deleted_pools");
5852 for (auto& i : deleted_pools) {
5853 f->open_object_section("deletion");
5854 f->dump_unsigned("pool_id", i.second);
5855 f->dump_unsigned("epoch", i.first);
5856 f->close_section();
5857 }
5858 f->close_section();
5859 }
5860 static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {
5861 ls.push_back(new pool_pg_num_history_t);
5862 }
5863 friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {
5864 return out << "pg_num_history(e" << h.epoch
5865 << " pg_nums " << h.pg_nums
5866 << " deleted_pools " << h.deleted_pools
5867 << ")";
7c673cae 5868 }
7c673cae 5869};
11fdf7f2
TL
5870WRITE_CLASS_ENCODER(pool_pg_num_history_t)
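// Usage sketch (illustrative only): recording pg_num changes and pruning once
// every OSD's oldest osdmap is past a given epoch. The epochs, pool ids and
// pg_num values are invented.
//
//   pool_pg_num_history_t h;
//   h.log_pg_num_change(100, /*pool*/ 3, /*pg_num*/ 64);
//   h.log_pg_num_change(120, /*pool*/ 3, /*pg_num*/ 128);
//   h.log_pool_delete(130, /*pool*/ 5);
//   h.prune(125);
//   // Pool 3 keeps only the epoch-120 entry (the epoch-100 one is dropped);
//   // the pool-5 deletion at epoch 130 is newer than 125, so it is kept.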
5871
5872// omap specific stats
5873struct omap_stat_t {
5874 int large_omap_objects;
5875 int64_t omap_bytes;
5876 int64_t omap_keys;
5877};
7c673cae
FG
5878
5879#endif