]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_types.h
update sources to 12.2.7
[ceph.git] / ceph / src / osd / osd_types.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
21#include <sstream>
22#include <stdio.h>
23#include <memory>
24#include <boost/scoped_ptr.hpp>
25#include <boost/optional/optional_io.hpp>
26#include <boost/variant.hpp>
27
28#include "include/rados/rados_types.hpp"
29#include "include/mempool.h"
30
31#include "msg/msg_types.h"
32#include "include/types.h"
33#include "include/utime.h"
34#include "include/CompatSet.h"
35#include "common/histogram.h"
36#include "include/interval_set.h"
37#include "include/inline_memory.h"
38#include "common/Formatter.h"
39#include "common/bloom_filter.hpp"
40#include "common/hobject.h"
41#include "common/snap_types.h"
42#include "HitSet.h"
43#include "Watch.h"
44#include "include/cmp.h"
45#include "librados/ListObjectImpl.h"
46#include "compressor/Compressor.h"
47#include <atomic>
48
49#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
50
51#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
52#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
53#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
54#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
55#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
56#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
57#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
58#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
59#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
60#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
61#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
62#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
63#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
64#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
65#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
c07f9fc5 66#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
7c673cae
FG
67
68
69/// min recovery priority for MBackfillReserve
70#define OSD_RECOVERY_PRIORITY_MIN 0
71
72/// base backfill priority for MBackfillReserve
73#define OSD_BACKFILL_PRIORITY_BASE 100
74
75/// base backfill priority for MBackfillReserve (degraded PG)
76#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
77
78/// base recovery priority for MBackfillReserve
79#define OSD_RECOVERY_PRIORITY_BASE 180
80
81/// base backfill priority for MBackfillReserve (inactive PG)
82#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
83
c07f9fc5
FG
84/// max manually/automatically set recovery priority for MBackfillReserve
85#define OSD_RECOVERY_PRIORITY_MAX 254
86
87/// max recovery priority for MBackfillReserve, only when forced manually
88#define OSD_RECOVERY_PRIORITY_FORCED 255
7c673cae
FG
89
90
91typedef hobject_t collection_list_handle_t;
92
/// convert a single CEPH_OSD_FLAG_* to a string
94const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a string
96const char *ceph_osd_op_flag_name(unsigned flag);
97
98/// convert CEPH_OSD_FLAG_* op flags to a string
99string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a string
101string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
103string ceph_osd_alloc_hint_flag_string(unsigned flags);
104
105
106/**
107 * osd request identifier
108 *
 * caller name + incarnation# + tid to uniquely identify this request.
110 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t tid;     // caller-assigned transaction id
  int32_t inc;        // incarnation

  // default: unnamed requester, tid 0, incarnation 0
  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const osd_reqid_t& other)
    : name(other.name), tid(other.tid), inc(other.inc)
  {}
  // note: parameter order is (name, incarnation, tid), which differs
  // from the member declaration order (name, tid, inc)
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  // wire format (DENC): struct_v 2, compat 2; fields in order name, tid, inc
  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)
137
138
139
struct pg_shard_t {
  // sentinel value (INT32_MAX) meaning "no osd"; rendered as "NONE"
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;       // osd id; -1 means undefined (see is_undefined())
  shard_id_t shard;  // shard within the pg; NO_SHARD when not sharded
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  // human-readable osd id; NO_OSD renders as "NONE"
  string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const {
    // NOTE(review): osd is signed and may be -1 or NO_OSD, but is dumped
    // via dump_unsigned, so -1 appears as a huge unsigned value -- confirm
    // downstream consumers expect this representation
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
164
/// abstract predicate: is a pg recoverable given a set of available shards?
class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};
173
/// abstract predicate: is a pg readable given a set of available shards?
class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};
182
183inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
184 return out << r.name << "." << r.inc << ":" << r.tid;
185}
186
187inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
188 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
189}
190inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
191 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
192}
193inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
194 return (l.name < r.name) || (l.inc < r.inc) ||
195 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
196}
197inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
198 return (l.name < r.name) || (l.inc < r.inc) ||
199 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
200}
// derived comparisons:  a > b  <=>  !(a <= b),   a >= b  <=>  !(a < b)
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
203
204namespace std {
205 template<> struct hash<osd_reqid_t> {
206 size_t operator()(const osd_reqid_t &r) const {
207 static hash<uint64_t> H;
208 return H(r.name.num() ^ r.tid ^ r.inc);
209 }
210 };
211} // namespace std
212
213
214// -----
215
216// a locator constrains the placement of an object. mainly, which pool
217// does it go in.
218struct object_locator_t {
219 // You specify either the hash or the key -- not both
220 int64_t pool; ///< pool id
221 string key; ///< key string (if non-empty)
222 string nspace; ///< namespace
223 int64_t hash; ///< hash position (if >= 0)
224
225 explicit object_locator_t()
226 : pool(-1), hash(-1) {}
227 explicit object_locator_t(int64_t po)
228 : pool(po), hash(-1) {}
229 explicit object_locator_t(int64_t po, int64_t ps)
230 : pool(po), hash(ps) {}
231 explicit object_locator_t(int64_t po, string ns)
232 : pool(po), nspace(ns), hash(-1) {}
233 explicit object_locator_t(int64_t po, string ns, int64_t ps)
234 : pool(po), nspace(ns), hash(ps) {}
235 explicit object_locator_t(int64_t po, string ns, string s)
236 : pool(po), key(s), nspace(ns), hash(-1) {}
237 explicit object_locator_t(const hobject_t& soid)
238 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
239
240 int64_t get_pool() const {
241 return pool;
242 }
243
244 void clear() {
245 pool = -1;
246 key = "";
247 nspace = "";
248 hash = -1;
249 }
250
251 bool empty() const {
252 return pool == -1;
253 }
254
255 void encode(bufferlist& bl) const;
256 void decode(bufferlist::iterator& p);
257 void dump(Formatter *f) const;
258 static void generate_test_instances(list<object_locator_t*>& o);
259};
260WRITE_CLASS_ENCODER(object_locator_t)
261
262inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
263 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
264}
265inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
266 return !(l == r);
267}
268
269inline ostream& operator<<(ostream& out, const object_locator_t& loc)
270{
271 out << "@" << loc.pool;
272 if (loc.nspace.length())
273 out << ";" << loc.nspace;
274 if (loc.key.length())
275 out << ":" << loc.key;
276 return out;
277}
278
279struct request_redirect_t {
280private:
281 object_locator_t redirect_locator; ///< this is authoritative
282 string redirect_object; ///< If non-empty, the request goes to this object name
283 bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
284
285 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
286public:
287
288 request_redirect_t() {}
289 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
290 redirect_locator(orig) { redirect_locator.pool = rpool; }
291 explicit request_redirect_t(const object_locator_t& rloc) :
292 redirect_locator(rloc) {}
293 explicit request_redirect_t(const object_locator_t& orig,
294 const string& robj) :
295 redirect_locator(orig), redirect_object(robj) {}
296
297 void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
298 const bufferlist& get_instructions() { return osd_instructions; }
299
300 bool empty() const { return redirect_locator.empty() &&
301 redirect_object.empty(); }
302
303 void combine_with_locator(object_locator_t& orig, string& obj) const {
304 orig = redirect_locator;
305 if (!redirect_object.empty())
306 obj = redirect_object;
307 }
308
309 void encode(bufferlist& bl) const;
310 void decode(bufferlist::iterator& bl);
311 void dump(Formatter *f) const;
312 static void generate_test_instances(list<request_redirect_t*>& o);
313};
314WRITE_CLASS_ENCODER(request_redirect_t)
315
316inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
317 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
318 return out;
319}
320
// Internal OSD op flags - set by the OSD based on the op types
// (note: bit 0 is unused; the flag values start at 1<<1)
enum {
  CEPH_OSD_RMW_FLAG_READ        = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE       = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ  = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE     = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED         = (1 << 10),
};
334
335
336// pg stuff
337
338#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
339
340// placement seed (a hash value)
341typedef uint32_t ps_t;
342
343// old (v1) pg_t encoding (wrap old struct ceph_pg)
// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  // raw (memcpy-style) encode/decode of the packed ceph_pg struct;
  // no version header is written
  void encode(bufferlist& bl) const {
    ::encode_raw(v, bl);
  }
  void decode(bufferlist::iterator& bl) {
    ::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)
354
355// placement group id
// placement group id
struct pg_t {
  uint64_t m_pool;      // pool id
  uint32_t m_seed;      // placement seed (hash position)
  int32_t m_preferred;  // preferred primary osd; -1 means none (legacy)

  pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
  // note: argument order is (seed, pool), opposite of the member order
  pg_t(ps_t seed, uint64_t pool, int pref=-1) :
    m_pool(pool), m_seed(seed), m_preferred(pref) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps), m_preferred((__s16)cpg.preferred) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  // convert to the legacy (v1) representation; the pool id must fit in
  // 32 bits (asserted)
  old_pg_t get_old_pg() const {
    old_pg_t o;
    assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)m_preferred;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  uint64_t pool() const {
    return m_pool;
  }
  int32_t preferred() const {
    return m_preferred;
  }

  static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  // renders the pg name into buf; the suffix is supplied reversed
  // ("backwords") -- see the implementation for the exact contract
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }
  void set_preferred(int32_t osd) {
    m_preferred = osd;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;

  /**
   * Returns b such that for all object o:
   * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;

  // true iff oid's hash matches this pg's seed under the given split bits
  bool contains(int bits, const ghobject_t& oid) {
    return oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) {
    return oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  // encoding v1: version byte, then pool, seed, preferred (no compat header)
  void encode(bufferlist& bl) const {
    __u8 v = 1;
    ::encode(v, bl);
    ::encode(m_pool, bl);
    ::encode(m_seed, bl);
    ::encode(m_preferred, bl);
  }
  void decode(bufferlist::iterator& bl) {
    __u8 v;
    ::decode(v, bl);
    ::decode(m_pool, bl);
    ::decode(m_seed, bl);
    ::decode(m_preferred, bl);
  }
  // decode the legacy (pre-v1) ceph_pg wire format
  void decode_old(bufferlist::iterator& bl) {
    old_pg_t opg;
    ::decode(opg, bl);
    *this = opg;
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)
452
453inline bool operator<(const pg_t& l, const pg_t& r) {
454 return l.pool() < r.pool() ||
455 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
456 (l.preferred() == r.preferred() && (l.ps() < r.ps()))));
457}
458inline bool operator<=(const pg_t& l, const pg_t& r) {
459 return l.pool() < r.pool() ||
460 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
461 (l.preferred() == r.preferred() && (l.ps() <= r.ps()))));
462}
463inline bool operator==(const pg_t& l, const pg_t& r) {
464 return l.pool() == r.pool() &&
465 l.preferred() == r.preferred() &&
466 l.ps() == r.ps();
467}
468inline bool operator!=(const pg_t& l, const pg_t& r) {
469 return l.pool() != r.pool() ||
470 l.preferred() != r.preferred() ||
471 l.ps() != r.ps();
472}
473inline bool operator>(const pg_t& l, const pg_t& r) {
474 return l.pool() > r.pool() ||
475 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
476 (l.preferred() == r.preferred() && (l.ps() > r.ps()))));
477}
478inline bool operator>=(const pg_t& l, const pg_t& r) {
479 return l.pool() > r.pool() ||
480 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
481 (l.preferred() == r.preferred() && (l.ps() >= r.ps()))));
482}
483
484ostream& operator<<(ostream& out, const pg_t &pg);
485
486namespace std {
487 template<> struct hash< pg_t >
488 {
489 size_t operator()( const pg_t& x ) const
490 {
491 static hash<uint32_t> H;
492 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
493 }
494 };
495} // namespace std
496
// sharded pg id: a pg_t plus a shard.  shard is NO_SHARD unless the
// caller supplies one explicitly.
struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  // parent keeps the same shard
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  int32_t preferred() const {
    return pgid.preferred();
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }
  // delegates to pg_t::is_split, re-attaching this shard id to each
  // resulting child pg
  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
		set<spg_t> *pchildren) const {
    set<pg_t> _children;
    set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (set<pg_t>::iterator i = _children.begin();
	   i != _children.end();
	   ++i) {
	pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  // the per-pg metadata object id for this pg/shard
  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  // wire format: v1/compat 1; pgid then shard
  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(pgid, bl);
    ::encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(1, bl);
    ::decode(pgid, bl);
    ::decode(shard, bl);
    DECODE_FINISH(bl);
  }

  // temp objects are placed in the temp-pool id range derived from this
  // pg's pool (POOL_TEMP_START - pool)
  ghobject_t make_temp_ghobject(const string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
		pgid.ps(),
		hobject_t::POOL_TEMP_START - pgid.pool(), ""),
      ghobject_t::NO_GEN,
      shard);
  }

  // map this pg onto one of num_shards worker shards by placement seed
  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
577
578namespace std {
579 template<> struct hash< spg_t >
580 {
581 size_t operator()( const spg_t& x ) const
582 {
583 static hash<uint32_t> H;
584 return H(hash<pg_t>()(x.pgid) ^ x.shard);
585 }
586 };
587} // namespace std
588
589ostream& operator<<(ostream& out, const spg_t &pg);
590
591// ----------------------
592
// a collection id: the meta collection, a pg collection, or a pg's temp
// collection.  the string form is computed eagerly and cached in
// _str_buff/_str.
class coll_t {
  enum type_t {
    TYPE_META = 0,
    TYPE_LEGACY_TEMP = 1, /* no longer used */
    TYPE_PG = 2,
    TYPE_PG_TEMP = 3,
  };
  type_t type;
  spg_t pgid;
  uint64_t removal_seq; // note: deprecated, not encoded

  char _str_buff[spg_t::calc_name_buf_size];
  char *_str;  // cached string form; points into our own _str_buff

  // recompute the cached string form from type/pgid
  void calc_str();

  coll_t(type_t t, spg_t p, uint64_t r)
    : type(t), pgid(p), removal_seq(r) {
    calc_str();
  }

public:
  coll_t() : type(TYPE_META), removal_seq(0)
  {
    calc_str();
  }

  // copy ctor / assignment re-run calc_str() so _str points into this
  // object's buffer rather than the source object's
  coll_t(const coll_t& other)
    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
    calc_str();
  }

  explicit coll_t(spg_t pgid)
    : type(TYPE_PG), pgid(pgid), removal_seq(0)
  {
    calc_str();
  }

  coll_t& operator=(const coll_t& rhs)
  {
    this->type = rhs.type;
    this->pgid = rhs.pgid;
    this->removal_seq = rhs.removal_seq;
    this->calc_str();
    return *this;
  }

  // named constructors
  static coll_t meta() {
    return coll_t();
  }
  static coll_t pg(spg_t p) {
    return coll_t(p);
  }

  const std::string to_str() const {
    return string(_str);
  }
  const char *c_str() const {
    return _str;
  }

  bool parse(const std::string& s);

  // NOTE(review): declared to return int but is semantically bool;
  // ordering is lexicographic on (type, pgid)
  int operator<(const coll_t &rhs) const {
    return type < rhs.type ||
		  (type == rhs.type && pgid < rhs.pgid);
  }

  bool is_meta() const {
    return type == TYPE_META;
  }
  // matches both the pg collection and its temp collection
  bool is_pg_prefix(spg_t *pgid_) const {
    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_pg() const {
    return type == TYPE_PG;
  }
  bool is_pg(spg_t *pgid_) const {
    if (type == TYPE_PG) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_temp() const {
    return type == TYPE_PG_TEMP;
  }
  bool is_temp(spg_t *pgid_) const {
    if (type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  size_t encoded_size() const;

  inline bool operator==(const coll_t& rhs) const {
    // only compare type if meta
    // NOTE(review): the final "type == rhs.type &&" is redundant -- the
    // early return above already guarantees equal types here
    if (type != rhs.type)
      return false;
    if (type == TYPE_META)
      return true;
    return type == rhs.type && pgid == rhs.pgid;
  }
  inline bool operator!=(const coll_t& rhs) const {
    return !(*this == rhs);
  }

  // get a TEMP collection that corresponds to the current collection,
  // which we presume is a pg collection.
  coll_t get_temp() const {
    assert(type == TYPE_PG);
    return coll_t(TYPE_PG_TEMP, pgid, 0);
  }

  // smallest ghobject that can live in this collection (used as a
  // lower bound for collection listings)
  ghobject_t get_min_hobj() const {
    ghobject_t o;
    switch (type) {
    case TYPE_PG:
      o.hobj.pool = pgid.pool();
      o.set_shard(pgid.shard);
      break;
    case TYPE_META:
      o.hobj.pool = -1;
      break;
    default:
      break;
    }
    return o;
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    if (type == TYPE_PG)
      return pgid.hash_to_shard(num_shards);
    return 0; // whatever.
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<coll_t*>& o);
};

WRITE_CLASS_ENCODER(coll_t)
743
744inline ostream& operator<<(ostream& out, const coll_t& c) {
745 out << c.to_str();
746 return out;
747}
748
749namespace std {
750 template<> struct hash<coll_t> {
751 size_t operator()(const coll_t &c) const {
752 size_t h = 0;
753 string str(c.to_str());
754 std::string::const_iterator end(str.end());
755 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
756 h += *s;
757 h += (h << 10);
758 h ^= (h >> 6);
759 }
760 h += (h << 3);
761 h ^= (h >> 11);
762 h += (h << 15);
763 return h;
764 }
765 };
766} // namespace std
767
768inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
769{
770 out << pg_t(ol.ol_pgid);
771 int su = ol.ol_stripe_unit;
772 if (su)
773 out << ".su=" << su;
774 return out;
775}
776
777
778
779// compound rados version type
780/* WARNING: If add member in eversion_t, please make sure the encode/decode function
781 * work well. For little-endian machine, we should make sure there is no padding
782 * in 32-bit machine and 64-bit machine.
783 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;  // explicit padding; see layout warning in the comment above
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }

  // sentinel maximum: 0 - 1 wraps both fields to their largest value
  // (NOTE(review): assumes version_t/epoch_t are unsigned -- confirm)
  static eversion_t max() {
    eversion_t max;
    max.version -= 1;
    max.epoch -= 1;
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  string get_key_name() const;

  // on little-endian hosts the (version, epoch) pair is appended as raw
  // bytes in one shot; this is only equivalent to the field-by-field
  // encoding because version and epoch are adjacent with no padding
  // between them (hence the layout warning above)
  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    ::encode(version, bl);
    ::encode(epoch, bl);
#endif
  }
  void decode(bufferlist::iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    ::decode(version, bl);
    ::decode(epoch, bl);
#endif
  }
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)
838
839inline bool operator==(const eversion_t& l, const eversion_t& r) {
840 return (l.epoch == r.epoch) && (l.version == r.version);
841}
842inline bool operator!=(const eversion_t& l, const eversion_t& r) {
843 return (l.epoch != r.epoch) || (l.version != r.version);
844}
845inline bool operator<(const eversion_t& l, const eversion_t& r) {
846 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
847}
848inline bool operator<=(const eversion_t& l, const eversion_t& r) {
849 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
850}
851inline bool operator>(const eversion_t& l, const eversion_t& r) {
852 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
853}
854inline bool operator>=(const eversion_t& l, const eversion_t& r) {
855 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
856}
857inline ostream& operator<<(ostream& out, const eversion_t& e) {
858 return out << e.epoch << "'" << e.version;
859}
860
861/**
862 * objectstore_perf_stat_t
863 *
864 * current perf information about the osd
865 */
866struct objectstore_perf_stat_t {
867 // cur_op_latency is in ms since double add/sub are not associative
868 uint32_t os_commit_latency;
869 uint32_t os_apply_latency;
870
871 objectstore_perf_stat_t() :
872 os_commit_latency(0), os_apply_latency(0) {}
873
874 bool operator==(const objectstore_perf_stat_t &r) const {
875 return os_commit_latency == r.os_commit_latency &&
876 os_apply_latency == r.os_apply_latency;
877 }
878
879 void add(const objectstore_perf_stat_t &o) {
880 os_commit_latency += o.os_commit_latency;
881 os_apply_latency += o.os_apply_latency;
882 }
883 void sub(const objectstore_perf_stat_t &o) {
884 os_commit_latency -= o.os_commit_latency;
885 os_apply_latency -= o.os_apply_latency;
886 }
887 void dump(Formatter *f) const;
888 void encode(bufferlist &bl) const;
889 void decode(bufferlist::iterator &bl);
890 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
891};
892WRITE_CLASS_ENCODER(objectstore_perf_stat_t)
893
894/** osd_stat
895 * aggregate stats for an osd
896 */
struct osd_stat_t {
  int64_t kb, kb_used, kb_avail;  // total/used/free space, in KB
  vector<int> hb_peers;           // presumably heartbeat peer osd ids -- confirm
  int32_t snap_trim_queue_len, num_snap_trimming;

  pow2_hist_t op_queue_age_hist;  // histogram of queued op ages

  objectstore_perf_stat_t os_perf_stat;

  epoch_t up_from = 0;
  uint64_t seq = 0;

  uint32_t num_pgs = 0;

  osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
		 snap_trim_queue_len(0), num_snap_trimming(0) {}

  // aggregate another osd's stats into this one; note that hb_peers,
  // up_from and seq are not aggregated here
  void add(const osd_stat_t& o) {
    kb += o.kb;
    kb_used += o.kb_used;
    kb_avail += o.kb_avail;
    snap_trim_queue_len += o.snap_trim_queue_len;
    num_snap_trimming += o.num_snap_trimming;
    op_queue_age_hist.add(o.op_queue_age_hist);
    os_perf_stat.add(o.os_perf_stat);
    num_pgs += o.num_pgs;
  }
  // inverse of add(); same fields are affected
  void sub(const osd_stat_t& o) {
    kb -= o.kb;
    kb_used -= o.kb_used;
    kb_avail -= o.kb_avail;
    snap_trim_queue_len -= o.snap_trim_queue_len;
    num_snap_trimming -= o.num_snap_trimming;
    op_queue_age_hist.sub(o.op_queue_age_hist);
    os_perf_stat.sub(o.os_perf_stat);
    num_pgs -= o.num_pgs;
  }

  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  static void generate_test_instances(std::list<osd_stat_t*>& o);
};
WRITE_CLASS_ENCODER(osd_stat_t)
941
942inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
943 return l.kb == r.kb &&
944 l.kb_used == r.kb_used &&
945 l.kb_avail == r.kb_avail &&
946 l.snap_trim_queue_len == r.snap_trim_queue_len &&
947 l.num_snap_trimming == r.num_snap_trimming &&
948 l.hb_peers == r.hb_peers &&
949 l.op_queue_age_hist == r.op_queue_age_hist &&
35e4c445
FG
950 l.os_perf_stat == r.os_perf_stat &&
951 l.num_pgs == r.num_pgs;
7c673cae
FG
952}
953inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
954 return !(l == r);
955}
956
957
958
959inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
960 return out << "osd_stat(" << kb_t(s.kb_used) << " used, "
961 << kb_t(s.kb_avail) << " avail, "
962 << kb_t(s.kb) << " total, "
963 << "peers " << s.hb_peers
964 << " op hist " << s.op_queue_age_hist.h
965 << ")";
966}
967
968
969/*
970 * pg states
971 */
972#define PG_STATE_CREATING (1<<0) // creating
973#define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
974#define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
975#define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
b32b8144
FG
976#define PG_STATE_RECOVERY_UNFOUND (1<<5) // recovery stopped due to unfound
977#define PG_STATE_BACKFILL_UNFOUND (1<<6) // backfill stopped due to unfound
7c673cae
FG
978//#define PG_STATE_SPLITTING (1<<7) // i am splitting
979#define PG_STATE_SCRUBBING (1<<8) // scrubbing
980//#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
981#define PG_STATE_DEGRADED (1<<10) // pg contains objects with reduced redundancy
982#define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be)
983#define PG_STATE_PEERING (1<<12) // pg is (re)peering
984#define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub
985#define PG_STATE_RECOVERING (1<<14) // pg is recovering/migrating objects
986#define PG_STATE_BACKFILL_WAIT (1<<15) // [active] reserving backfill
987#define PG_STATE_INCOMPLETE (1<<16) // incomplete content, peering failed.
988#define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
989#define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
990#define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
3efd9988 991#define PG_STATE_BACKFILLING (1<<20) // [active] backfilling pg content
7c673cae
FG
992#define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
993#define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
994#define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
995#define PG_STATE_ACTIVATING (1<<24) // pg is peered but not yet active
996#define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover
997#define PG_STATE_SNAPTRIM (1<<26) // trimming snaps
998#define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
999#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
224ce89b 1000#define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps
c07f9fc5
FG
1001#define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
1002#define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
7c673cae
FG
1003
1004std::string pg_state_string(int state);
1005std::string pg_vector_string(const vector<int32_t> &a);
3efd9988 1006boost::optional<uint64_t> pg_string_state(const std::string& state);
7c673cae
FG
1007
1008
1009/*
1010 * pool_snap_info_t
1011 *
1012 * attributes for a single pool snapshot.
1013 */
struct pool_snap_info_t {
  snapid_t snapid;  // snapshot id
  utime_t stamp;    // timestamp (see operator<< below, which prints it)
  string name;      // snapshot name

  void dump(Formatter *f) const;
  // encoding is feature-dependent (WRITE_CLASS_ENCODER_FEATURES below)
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1025
1026inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
1027 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1028}
1029
1030
1031/*
1032 * pool_opts_t
1033 *
1034 * pool options.
1035 */
1036
1037class pool_opts_t {
1038public:
1039 enum key_t {
1040 SCRUB_MIN_INTERVAL,
1041 SCRUB_MAX_INTERVAL,
1042 DEEP_SCRUB_INTERVAL,
1043 RECOVERY_PRIORITY,
1044 RECOVERY_OP_PRIORITY,
1045 SCRUB_PRIORITY,
1046 COMPRESSION_MODE,
1047 COMPRESSION_ALGORITHM,
1048 COMPRESSION_REQUIRED_RATIO,
1049 COMPRESSION_MAX_BLOB_SIZE,
1050 COMPRESSION_MIN_BLOB_SIZE,
1051 CSUM_TYPE,
1052 CSUM_MAX_BLOCK,
1053 CSUM_MIN_BLOCK,
1054 };
1055
1056 enum type_t {
1057 STR,
1058 INT,
1059 DOUBLE,
1060 };
1061
1062 struct opt_desc_t {
1063 key_t key;
1064 type_t type;
1065
1066 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1067
1068 bool operator==(const opt_desc_t& rhs) const {
1069 return key == rhs.key && type == rhs.type;
1070 }
1071 };
1072
1073 typedef boost::variant<std::string,int,double> value_t;
1074
1075 static bool is_opt_name(const std::string& name);
1076 static opt_desc_t get_opt_desc(const std::string& name);
1077
1078 pool_opts_t() : opts() {}
1079
1080 bool is_set(key_t key) const;
1081
1082 template<typename T>
1083 void set(key_t key, const T &val) {
1084 value_t value = val;
1085 opts[key] = value;
1086 }
1087
1088 template<typename T>
1089 bool get(key_t key, T *val) const {
1090 opts_t::const_iterator i = opts.find(key);
1091 if (i == opts.end()) {
1092 return false;
1093 }
1094 *val = boost::get<T>(i->second);
1095 return true;
1096 }
1097
1098 const value_t& get(key_t key) const;
1099
1100 bool unset(key_t key);
1101
1102 void dump(const std::string& name, Formatter *f) const;
1103
1104 void dump(Formatter *f) const;
1105 void encode(bufferlist &bl) const;
1106 void decode(bufferlist::iterator &bl);
1107
1108private:
1109 typedef std::map<key_t, value_t> opts_t;
1110 opts_t opts;
1111
1112 friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
1113};
1114WRITE_CLASS_ENCODER(pool_opts_t)
1115
1116/*
1117 * pg_pool
1118 */
1119struct pg_pool_t {
c07f9fc5
FG
1120 static const char *APPLICATION_NAME_CEPHFS;
1121 static const char *APPLICATION_NAME_RBD;
1122 static const char *APPLICATION_NAME_RGW;
1123
7c673cae
FG
1124 enum {
1125 TYPE_REPLICATED = 1, // replication
1126 //TYPE_RAID4 = 2, // raid4 (never implemented)
1127 TYPE_ERASURE = 3, // erasure-coded
1128 };
1129 static const char *get_type_name(int t) {
1130 switch (t) {
1131 case TYPE_REPLICATED: return "replicated";
1132 //case TYPE_RAID4: return "raid4";
1133 case TYPE_ERASURE: return "erasure";
1134 default: return "???";
1135 }
1136 }
1137 const char *get_type_name() const {
1138 return get_type_name(type);
1139 }
7c673cae
FG
1140
1141 enum {
1142 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1143 FLAG_FULL = 1<<1, // pool is full
1144 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1145 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1146 FLAG_NODELETE = 1<<4, // pool can't be deleted
1147 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1148 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1149 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1150 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1151 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
3efd9988
FG
1152 FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1153 FLAG_NEARFULL = 1<<11, // pool is nearfull
1154 FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
7c673cae
FG
1155 };
1156
1157 static const char *get_flag_name(int f) {
1158 switch (f) {
1159 case FLAG_HASHPSPOOL: return "hashpspool";
1160 case FLAG_FULL: return "full";
1161 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1162 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1163 case FLAG_NODELETE: return "nodelete";
1164 case FLAG_NOPGCHANGE: return "nopgchange";
1165 case FLAG_NOSIZECHANGE: return "nosizechange";
1166 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1167 case FLAG_NOSCRUB: return "noscrub";
1168 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
3efd9988
FG
1169 case FLAG_FULL_NO_QUOTA: return "full_no_quota";
1170 case FLAG_NEARFULL: return "nearfull";
1171 case FLAG_BACKFILLFULL: return "backfillfull";
7c673cae
FG
1172 default: return "???";
1173 }
1174 }
1175 static string get_flags_string(uint64_t f) {
1176 string s;
1177 for (unsigned n=0; f && n<64; ++n) {
1178 if (f & (1ull << n)) {
1179 if (s.length())
1180 s += ",";
1181 s += get_flag_name(1ull << n);
1182 }
1183 }
1184 return s;
1185 }
1186 string get_flags_string() const {
1187 return get_flags_string(flags);
1188 }
1189 static uint64_t get_flag_by_name(const string& name) {
1190 if (name == "hashpspool")
1191 return FLAG_HASHPSPOOL;
1192 if (name == "full")
1193 return FLAG_FULL;
1194 if (name == "ec_overwrites")
1195 return FLAG_EC_OVERWRITES;
1196 if (name == "incomplete_clones")
1197 return FLAG_INCOMPLETE_CLONES;
1198 if (name == "nodelete")
1199 return FLAG_NODELETE;
1200 if (name == "nopgchange")
1201 return FLAG_NOPGCHANGE;
1202 if (name == "nosizechange")
1203 return FLAG_NOSIZECHANGE;
1204 if (name == "write_fadvise_dontneed")
1205 return FLAG_WRITE_FADVISE_DONTNEED;
1206 if (name == "noscrub")
1207 return FLAG_NOSCRUB;
1208 if (name == "nodeep-scrub")
1209 return FLAG_NODEEP_SCRUB;
3efd9988
FG
1210 if (name == "full_no_quota")
1211 return FLAG_FULL_NO_QUOTA;
1212 if (name == "nearfull")
1213 return FLAG_NEARFULL;
1214 if (name == "backfillfull")
1215 return FLAG_BACKFILLFULL;
7c673cae
FG
1216 return 0;
1217 }
1218
1219 /// converts the acting/up vector to a set of pg shards
1220 void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
1221
1222 typedef enum {
1223 CACHEMODE_NONE = 0, ///< no caching
1224 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1225 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1226 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1227 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
1228 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
1229 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1230 } cache_mode_t;
1231 static const char *get_cache_mode_name(cache_mode_t m) {
1232 switch (m) {
1233 case CACHEMODE_NONE: return "none";
1234 case CACHEMODE_WRITEBACK: return "writeback";
1235 case CACHEMODE_FORWARD: return "forward";
1236 case CACHEMODE_READONLY: return "readonly";
1237 case CACHEMODE_READFORWARD: return "readforward";
1238 case CACHEMODE_READPROXY: return "readproxy";
1239 case CACHEMODE_PROXY: return "proxy";
1240 default: return "unknown";
1241 }
1242 }
1243 static cache_mode_t get_cache_mode_from_str(const string& s) {
1244 if (s == "none")
1245 return CACHEMODE_NONE;
1246 if (s == "writeback")
1247 return CACHEMODE_WRITEBACK;
1248 if (s == "forward")
1249 return CACHEMODE_FORWARD;
1250 if (s == "readonly")
1251 return CACHEMODE_READONLY;
1252 if (s == "readforward")
1253 return CACHEMODE_READFORWARD;
1254 if (s == "readproxy")
1255 return CACHEMODE_READPROXY;
1256 if (s == "proxy")
1257 return CACHEMODE_PROXY;
1258 return (cache_mode_t)-1;
1259 }
1260 const char *get_cache_mode_name() const {
1261 return get_cache_mode_name(cache_mode);
1262 }
1263 bool cache_mode_requires_hit_set() const {
1264 switch (cache_mode) {
1265 case CACHEMODE_NONE:
1266 case CACHEMODE_FORWARD:
1267 case CACHEMODE_READONLY:
1268 case CACHEMODE_PROXY:
1269 return false;
1270 case CACHEMODE_WRITEBACK:
1271 case CACHEMODE_READFORWARD:
1272 case CACHEMODE_READPROXY:
1273 return true;
1274 default:
1275 assert(0 == "implement me");
1276 }
1277 }
1278
1279 uint64_t flags; ///< FLAG_*
1280 __u8 type; ///< TYPE_*
1281 __u8 size, min_size; ///< number of osds in each pg
31f18b77 1282 __u8 crush_rule; ///< crush placement rule
7c673cae
FG
1283 __u8 object_hash; ///< hash mapping object name to ps
1284private:
1285 __u32 pg_num, pgp_num; ///< number of pgs
1286
1287
1288public:
1289 map<string,string> properties; ///< OBSOLETE
1290 string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1291 epoch_t last_change; ///< most recent epoch changed, exclusing snapshot changes
1292 epoch_t last_force_op_resend; ///< last epoch that forced clients to resend
1293 /// last epoch that forced clients to resend (pre-luminous clients only)
1294 epoch_t last_force_op_resend_preluminous;
1295 snapid_t snap_seq; ///< seq for per-pool snapshot
1296 epoch_t snap_epoch; ///< osdmap epoch of last snap
1297 uint64_t auid; ///< who owns the pg
1298 __u32 crash_replay_interval; ///< seconds to allow clients to replay ACKed but unCOMMITted requests
1299
1300 uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
1301 uint64_t quota_max_objects; ///< maximum number of objects for this pool
1302
1303 /*
1304 * Pool snaps (global to this pool). These define a SnapContext for
1305 * the pool, unless the client manually specifies an alternate
1306 * context.
1307 */
1308 map<snapid_t, pool_snap_info_t> snaps;
1309 /*
1310 * Alternatively, if we are defining non-pool snaps (e.g. via the
1311 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1312 * used). Snaps and removed_snaps are to be used exclusive of each
1313 * other!
1314 */
1315 interval_set<snapid_t> removed_snaps;
1316
1317 unsigned pg_num_mask, pgp_num_mask;
1318
1319 set<uint64_t> tiers; ///< pools that are tiers of us
1320 int64_t tier_of; ///< pool for which we are a tier
1321 // Note that write wins for read+write ops
1322 int64_t read_tier; ///< pool/tier for objecter to direct reads to
1323 int64_t write_tier; ///< pool/tier for objecter to direct writes to
1324 cache_mode_t cache_mode; ///< cache pool mode
1325
1326 bool is_tier() const { return tier_of >= 0; }
1327 bool has_tiers() const { return !tiers.empty(); }
1328 void clear_tier() {
1329 tier_of = -1;
1330 clear_read_tier();
1331 clear_write_tier();
1332 clear_tier_tunables();
1333 }
1334 bool has_read_tier() const { return read_tier >= 0; }
1335 void clear_read_tier() { read_tier = -1; }
1336 bool has_write_tier() const { return write_tier >= 0; }
1337 void clear_write_tier() { write_tier = -1; }
1338 void clear_tier_tunables() {
1339 if (cache_mode != CACHEMODE_NONE)
1340 flags |= FLAG_INCOMPLETE_CLONES;
1341 cache_mode = CACHEMODE_NONE;
1342
1343 target_max_bytes = 0;
1344 target_max_objects = 0;
1345 cache_target_dirty_ratio_micro = 0;
1346 cache_target_dirty_high_ratio_micro = 0;
1347 cache_target_full_ratio_micro = 0;
1348 hit_set_params = HitSet::Params();
1349 hit_set_period = 0;
1350 hit_set_count = 0;
1351 hit_set_grade_decay_rate = 0;
1352 hit_set_search_last_n = 0;
1353 grade_table.resize(0);
1354 }
1355
1356 uint64_t target_max_bytes; ///< tiering: target max pool size
1357 uint64_t target_max_objects; ///< tiering: target max pool size
1358
1359 uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
1360 uint32_t cache_target_dirty_high_ratio_micro; ///<cache: fraction of target to flush with high speed
1361 uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest
1362
1363 uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush
1364 uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict
1365
1366 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1367 uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
1368 uint32_t hit_set_count; ///< number of periods to retain
1369 bool use_gmt_hitset; ///< use gmt to name the hitset archive object
1370 uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
1371 uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
1372 uint32_t hit_set_grade_decay_rate; ///< current hit_set has highest priority on objects
1373 ///temperature count,the follow hit_set's priority decay
1374 ///by this params than pre hit_set
1375 uint32_t hit_set_search_last_n; ///<accumulate atmost N hit_sets for temperature
1376
1377 uint32_t stripe_width; ///< erasure coded stripe size in bytes
1378
1379 uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates
1380 ///< user does not specify any expected value
1381 bool fast_read; ///< whether turn on fast read on the pool or not
1382
1383 pool_opts_t opts; ///< options
1384
c07f9fc5
FG
1385 /// application -> key/value metadata
1386 map<string, std::map<string, string>> application_metadata;
1387
7c673cae
FG
1388private:
1389 vector<uint32_t> grade_table;
1390
1391public:
1392 uint32_t get_grade(unsigned i) const {
1393 if (grade_table.size() <= i)
1394 return 0;
1395 return grade_table[i];
1396 }
1397 void calc_grade_table() {
1398 unsigned v = 1000000;
1399 grade_table.resize(hit_set_count);
1400 for (unsigned i = 0; i < hit_set_count; i++) {
1401 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1402 grade_table[i] = v;
1403 }
1404 }
1405
1406 pg_pool_t()
1407 : flags(0), type(0), size(0), min_size(0),
31f18b77 1408 crush_rule(0), object_hash(0),
7c673cae
FG
1409 pg_num(0), pgp_num(0),
1410 last_change(0),
1411 last_force_op_resend(0),
1412 last_force_op_resend_preluminous(0),
1413 snap_seq(0), snap_epoch(0),
1414 auid(0),
1415 crash_replay_interval(0),
1416 quota_max_bytes(0), quota_max_objects(0),
1417 pg_num_mask(0), pgp_num_mask(0),
1418 tier_of(-1), read_tier(-1), write_tier(-1),
1419 cache_mode(CACHEMODE_NONE),
1420 target_max_bytes(0), target_max_objects(0),
1421 cache_target_dirty_ratio_micro(0),
1422 cache_target_dirty_high_ratio_micro(0),
1423 cache_target_full_ratio_micro(0),
1424 cache_min_flush_age(0),
1425 cache_min_evict_age(0),
1426 hit_set_params(),
1427 hit_set_period(0),
1428 hit_set_count(0),
1429 use_gmt_hitset(true),
1430 min_read_recency_for_promote(0),
1431 min_write_recency_for_promote(0),
1432 hit_set_grade_decay_rate(0),
1433 hit_set_search_last_n(0),
1434 stripe_width(0),
1435 expected_num_objects(0),
1436 fast_read(false),
1437 opts()
1438 { }
1439
1440 void dump(Formatter *f) const;
1441
1442 uint64_t get_flags() const { return flags; }
1443 bool has_flag(uint64_t f) const { return flags & f; }
1444 void set_flag(uint64_t f) { flags |= f; }
1445 void unset_flag(uint64_t f) { flags &= ~f; }
1446
1447 bool ec_pool() const {
1448 return type == TYPE_ERASURE;
1449 }
1450 bool require_rollback() const {
1451 return ec_pool();
1452 }
1453
1454 /// true if incomplete clones may be present
1455 bool allow_incomplete_clones() const {
1456 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1457 }
1458
1459 unsigned get_type() const { return type; }
1460 unsigned get_size() const { return size; }
1461 unsigned get_min_size() const { return min_size; }
31f18b77 1462 int get_crush_rule() const { return crush_rule; }
7c673cae
FG
1463 int get_object_hash() const { return object_hash; }
1464 const char *get_object_hash_name() const {
1465 return ceph_str_hash_name(get_object_hash());
1466 }
1467 epoch_t get_last_change() const { return last_change; }
1468 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
1469 epoch_t get_last_force_op_resend_preluminous() const {
1470 return last_force_op_resend_preluminous;
1471 }
1472 epoch_t get_snap_epoch() const { return snap_epoch; }
1473 snapid_t get_snap_seq() const { return snap_seq; }
1474 uint64_t get_auid() const { return auid; }
1475 unsigned get_crash_replay_interval() const { return crash_replay_interval; }
1476
1477 void set_snap_seq(snapid_t s) { snap_seq = s; }
1478 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1479
1480 void set_stripe_width(uint32_t s) { stripe_width = s; }
1481 uint32_t get_stripe_width() const { return stripe_width; }
1482
1483 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1484 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1485
1486 bool supports_omap() const {
1487 return !(get_type() == TYPE_ERASURE);
1488 }
1489
1490 bool requires_aligned_append() const {
1491 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1492 }
1493 uint64_t required_alignment() const { return stripe_width; }
1494
1495 bool allows_ecoverwrites() const {
1496 return has_flag(FLAG_EC_OVERWRITES);
1497 }
1498
1499 bool can_shift_osds() const {
1500 switch (get_type()) {
1501 case TYPE_REPLICATED:
1502 return true;
1503 case TYPE_ERASURE:
1504 return false;
1505 default:
1506 assert(0 == "unhandled pool type");
1507 }
1508 }
1509
1510 unsigned get_pg_num() const { return pg_num; }
1511 unsigned get_pgp_num() const { return pgp_num; }
1512
1513 unsigned get_pg_num_mask() const { return pg_num_mask; }
1514 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1515
1516 // if pg_num is not a multiple of two, pgs are not equally sized.
1517 // return, for a given pg, the fraction (denominator) of the total
1518 // pool size that it represents.
1519 unsigned get_pg_num_divisor(pg_t pgid) const;
1520
1521 void set_pg_num(int p) {
1522 pg_num = p;
1523 calc_pg_masks();
1524 }
1525 void set_pgp_num(int p) {
1526 pgp_num = p;
1527 calc_pg_masks();
1528 }
1529
1530 void set_quota_max_bytes(uint64_t m) {
1531 quota_max_bytes = m;
1532 }
1533 uint64_t get_quota_max_bytes() {
1534 return quota_max_bytes;
1535 }
1536
1537 void set_quota_max_objects(uint64_t m) {
1538 quota_max_objects = m;
1539 }
1540 uint64_t get_quota_max_objects() {
1541 return quota_max_objects;
1542 }
1543
1544 void set_last_force_op_resend(uint64_t t) {
1545 last_force_op_resend = t;
1546 last_force_op_resend_preluminous = t;
1547 }
1548
1549 void calc_pg_masks();
1550
1551 /*
1552 * we have two snap modes:
1553 * - pool global snaps
1554 * - snap existence/non-existence defined by snaps[] and snap_seq
1555 * - user managed snaps
1556 * - removal governed by removed_snaps
1557 *
1558 * we know which mode we're using based on whether removed_snaps is empty.
1559 * If nothing has been created, both functions report false.
1560 */
1561 bool is_pool_snaps_mode() const;
1562 bool is_unmanaged_snaps_mode() const;
1563 bool is_removed_snap(snapid_t s) const;
1564
1565 /*
1566 * build set of known-removed sets from either pool snaps or
1567 * explicit removed_snaps set.
1568 */
1569 void build_removed_snaps(interval_set<snapid_t>& rs) const;
1570 snapid_t snap_exists(const char *s) const;
1571 void add_snap(const char *n, utime_t stamp);
1572 void add_unmanaged_snap(uint64_t& snapid);
1573 void remove_snap(snapid_t s);
1574 void remove_unmanaged_snap(snapid_t s);
1575
1576 SnapContext get_snap_context() const;
1577
1578 /// hash a object name+namespace key to a hash position
1579 uint32_t hash_key(const string& key, const string& ns) const;
1580
1581 /// round a hash position down to a pg num
1582 uint32_t raw_hash_to_pg(uint32_t v) const;
1583
1584 /*
1585 * map a raw pg (with full precision ps) into an actual pg, for storage
1586 */
1587 pg_t raw_pg_to_pg(pg_t pg) const;
1588
1589 /*
1590 * map raw pg (full precision ps) into a placement seed. include
1591 * pool id in that value so that different pools don't use the same
1592 * seeds.
1593 */
1594 ps_t raw_pg_to_pps(pg_t pg) const;
1595
1596 /// choose a random hash position within a pg
1597 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1598
1599 void encode(bufferlist& bl, uint64_t features) const;
1600 void decode(bufferlist::iterator& bl);
1601
1602 static void generate_test_instances(list<pg_pool_t*>& o);
1603};
1604WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1605
1606ostream& operator<<(ostream& out, const pg_pool_t& p);
1607
1608
1609/**
1610 * a summation of object stats
1611 *
1612 * This is just a container for object stats; we don't know what for.
1613 *
1614 * If you add members in object_stat_sum_t, you should make sure there are
1615 * not padding among these members.
1616 * You should also modify the padding_check function.
1617
1618 */
1619struct object_stat_sum_t {
1620 /**************************************************************************
1621 * WARNING: be sure to update operator==, floor, and split when
1622 * adding/removing fields!
1623 **************************************************************************/
1624 int64_t num_bytes; // in bytes
1625 int64_t num_objects;
1626 int64_t num_object_clones;
1627 int64_t num_object_copies; // num_objects * num_replicas
1628 int64_t num_objects_missing_on_primary;
1629 int64_t num_objects_degraded;
1630 int64_t num_objects_unfound;
1631 int64_t num_rd;
1632 int64_t num_rd_kb;
1633 int64_t num_wr;
1634 int64_t num_wr_kb;
1635 int64_t num_scrub_errors; // total deep and shallow scrub errors
1636 int64_t num_objects_recovered;
1637 int64_t num_bytes_recovered;
1638 int64_t num_keys_recovered;
1639 int64_t num_shallow_scrub_errors;
1640 int64_t num_deep_scrub_errors;
1641 int64_t num_objects_dirty;
1642 int64_t num_whiteouts;
1643 int64_t num_objects_omap;
1644 int64_t num_objects_hit_set_archive;
1645 int64_t num_objects_misplaced;
1646 int64_t num_bytes_hit_set_archive;
1647 int64_t num_flush;
1648 int64_t num_flush_kb;
1649 int64_t num_evict;
1650 int64_t num_evict_kb;
1651 int64_t num_promote;
1652 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1653 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1654 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1655 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1656 int64_t num_objects_pinned;
1657 int64_t num_objects_missing;
1658 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
28e407b8 1659 int64_t num_large_omap_objects = 0;
7c673cae
FG
1660
1661 object_stat_sum_t()
1662 : num_bytes(0),
1663 num_objects(0), num_object_clones(0), num_object_copies(0),
1664 num_objects_missing_on_primary(0), num_objects_degraded(0),
1665 num_objects_unfound(0),
1666 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1667 num_scrub_errors(0),
1668 num_objects_recovered(0),
1669 num_bytes_recovered(0),
1670 num_keys_recovered(0),
1671 num_shallow_scrub_errors(0),
1672 num_deep_scrub_errors(0),
1673 num_objects_dirty(0),
1674 num_whiteouts(0),
1675 num_objects_omap(0),
1676 num_objects_hit_set_archive(0),
1677 num_objects_misplaced(0),
1678 num_bytes_hit_set_archive(0),
1679 num_flush(0),
1680 num_flush_kb(0),
1681 num_evict(0),
1682 num_evict_kb(0),
1683 num_promote(0),
1684 num_flush_mode_high(0), num_flush_mode_low(0),
1685 num_evict_mode_some(0), num_evict_mode_full(0),
1686 num_objects_pinned(0),
1687 num_objects_missing(0),
1688 num_legacy_snapsets(0)
1689 {}
1690
1691 void floor(int64_t f) {
1692#define FLOOR(x) if (x < f) x = f
1693 FLOOR(num_bytes);
1694 FLOOR(num_objects);
1695 FLOOR(num_object_clones);
1696 FLOOR(num_object_copies);
1697 FLOOR(num_objects_missing_on_primary);
1698 FLOOR(num_objects_missing);
1699 FLOOR(num_objects_degraded);
1700 FLOOR(num_objects_misplaced);
1701 FLOOR(num_objects_unfound);
1702 FLOOR(num_rd);
1703 FLOOR(num_rd_kb);
1704 FLOOR(num_wr);
1705 FLOOR(num_wr_kb);
28e407b8 1706 FLOOR(num_large_omap_objects);
7c673cae
FG
1707 FLOOR(num_shallow_scrub_errors);
1708 FLOOR(num_deep_scrub_errors);
94b18763 1709 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
7c673cae
FG
1710 FLOOR(num_objects_recovered);
1711 FLOOR(num_bytes_recovered);
1712 FLOOR(num_keys_recovered);
1713 FLOOR(num_objects_dirty);
1714 FLOOR(num_whiteouts);
1715 FLOOR(num_objects_omap);
1716 FLOOR(num_objects_hit_set_archive);
1717 FLOOR(num_bytes_hit_set_archive);
1718 FLOOR(num_flush);
1719 FLOOR(num_flush_kb);
1720 FLOOR(num_evict);
1721 FLOOR(num_evict_kb);
1722 FLOOR(num_promote);
1723 FLOOR(num_flush_mode_high);
1724 FLOOR(num_flush_mode_low);
1725 FLOOR(num_evict_mode_some);
1726 FLOOR(num_evict_mode_full);
1727 FLOOR(num_objects_pinned);
1728 FLOOR(num_legacy_snapsets);
1729#undef FLOOR
1730 }
1731
1732 void split(vector<object_stat_sum_t> &out) const {
1733#define SPLIT(PARAM) \
1734 for (unsigned i = 0; i < out.size(); ++i) { \
1735 out[i].PARAM = PARAM / out.size(); \
1736 if (i < (PARAM % out.size())) { \
1737 out[i].PARAM++; \
1738 } \
1739 }
1740#define SPLIT_PRESERVE_NONZERO(PARAM) \
1741 for (unsigned i = 0; i < out.size(); ++i) { \
1742 if (PARAM) \
1743 out[i].PARAM = 1 + PARAM / out.size(); \
1744 else \
1745 out[i].PARAM = 0; \
1746 }
1747
1748 SPLIT(num_bytes);
1749 SPLIT(num_objects);
1750 SPLIT(num_object_clones);
1751 SPLIT(num_object_copies);
1752 SPLIT(num_objects_missing_on_primary);
1753 SPLIT(num_objects_missing);
1754 SPLIT(num_objects_degraded);
1755 SPLIT(num_objects_misplaced);
1756 SPLIT(num_objects_unfound);
1757 SPLIT(num_rd);
1758 SPLIT(num_rd_kb);
1759 SPLIT(num_wr);
1760 SPLIT(num_wr_kb);
94b18763
FG
1761 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1762 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1763 for (unsigned i = 0; i < out.size(); ++i) {
1764 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1765 out[i].num_deep_scrub_errors;
1766 }
28e407b8 1767 SPLIT(num_large_omap_objects);
7c673cae
FG
1768 SPLIT(num_objects_recovered);
1769 SPLIT(num_bytes_recovered);
1770 SPLIT(num_keys_recovered);
1771 SPLIT(num_objects_dirty);
1772 SPLIT(num_whiteouts);
1773 SPLIT(num_objects_omap);
1774 SPLIT(num_objects_hit_set_archive);
1775 SPLIT(num_bytes_hit_set_archive);
1776 SPLIT(num_flush);
1777 SPLIT(num_flush_kb);
1778 SPLIT(num_evict);
1779 SPLIT(num_evict_kb);
1780 SPLIT(num_promote);
1781 SPLIT(num_flush_mode_high);
1782 SPLIT(num_flush_mode_low);
1783 SPLIT(num_evict_mode_some);
1784 SPLIT(num_evict_mode_full);
1785 SPLIT(num_objects_pinned);
1786 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1787#undef SPLIT
1788#undef SPLIT_PRESERVE_NONZERO
1789 }
1790
1791 void clear() {
1792 memset(this, 0, sizeof(*this));
1793 }
1794
1795 void calc_copies(int nrep) {
1796 num_object_copies = nrep * num_objects;
1797 }
1798
1799 bool is_zero() const {
1800 return mem_is_zero((char*)this, sizeof(*this));
1801 }
1802
1803 void add(const object_stat_sum_t& o);
1804 void sub(const object_stat_sum_t& o);
1805
1806 void dump(Formatter *f) const;
1807 void padding_check() {
1808 static_assert(
1809 sizeof(object_stat_sum_t) ==
1810 sizeof(num_bytes) +
1811 sizeof(num_objects) +
1812 sizeof(num_object_clones) +
1813 sizeof(num_object_copies) +
1814 sizeof(num_objects_missing_on_primary) +
1815 sizeof(num_objects_degraded) +
1816 sizeof(num_objects_unfound) +
1817 sizeof(num_rd) +
1818 sizeof(num_rd_kb) +
1819 sizeof(num_wr) +
1820 sizeof(num_wr_kb) +
1821 sizeof(num_scrub_errors) +
28e407b8 1822 sizeof(num_large_omap_objects) +
7c673cae
FG
1823 sizeof(num_objects_recovered) +
1824 sizeof(num_bytes_recovered) +
1825 sizeof(num_keys_recovered) +
1826 sizeof(num_shallow_scrub_errors) +
1827 sizeof(num_deep_scrub_errors) +
1828 sizeof(num_objects_dirty) +
1829 sizeof(num_whiteouts) +
1830 sizeof(num_objects_omap) +
1831 sizeof(num_objects_hit_set_archive) +
1832 sizeof(num_objects_misplaced) +
1833 sizeof(num_bytes_hit_set_archive) +
1834 sizeof(num_flush) +
1835 sizeof(num_flush_kb) +
1836 sizeof(num_evict) +
1837 sizeof(num_evict_kb) +
1838 sizeof(num_promote) +
1839 sizeof(num_flush_mode_high) +
1840 sizeof(num_flush_mode_low) +
1841 sizeof(num_evict_mode_some) +
1842 sizeof(num_evict_mode_full) +
1843 sizeof(num_objects_pinned) +
1844 sizeof(num_objects_missing) +
1845 sizeof(num_legacy_snapsets)
1846 ,
1847 "object_stat_sum_t have padding");
1848 }
1849 void encode(bufferlist& bl) const;
1850 void decode(bufferlist::iterator& bl);
1851 static void generate_test_instances(list<object_stat_sum_t*>& o);
1852};
1853WRITE_CLASS_ENCODER(object_stat_sum_t)
1854
1855bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1856
1857/**
1858 * a collection of object stat sums
1859 *
1860 * This is a collection of stat sums over different categories.
1861 */
1862struct object_stat_collection_t {
1863 /**************************************************************************
1864 * WARNING: be sure to update the operator== when adding/removing fields! *
1865 **************************************************************************/
1866 object_stat_sum_t sum;
1867
1868 void calc_copies(int nrep) {
1869 sum.calc_copies(nrep);
1870 }
1871
1872 void dump(Formatter *f) const;
1873 void encode(bufferlist& bl) const;
1874 void decode(bufferlist::iterator& bl);
1875 static void generate_test_instances(list<object_stat_collection_t*>& o);
1876
1877 bool is_zero() const {
1878 return sum.is_zero();
1879 }
1880
1881 void clear() {
1882 sum.clear();
1883 }
1884
1885 void floor(int64_t f) {
1886 sum.floor(f);
1887 }
1888
1889 void add(const object_stat_sum_t& o) {
1890 sum.add(o);
1891 }
1892
1893 void add(const object_stat_collection_t& o) {
1894 sum.add(o.sum);
1895 }
1896 void sub(const object_stat_collection_t& o) {
1897 sum.sub(o.sum);
1898 }
1899};
1900WRITE_CLASS_ENCODER(object_stat_collection_t)
1901
1902inline bool operator==(const object_stat_collection_t& l,
1903 const object_stat_collection_t& r) {
1904 return l.sum == r.sum;
1905}
1906
1907
1908/** pg_stat
1909 * aggregate stats for a single PG.
1910 */
1911struct pg_stat_t {
1912 /**************************************************************************
1913 * WARNING: be sure to update the operator== when adding/removing fields! *
1914 **************************************************************************/
1915 eversion_t version;
1916 version_t reported_seq; // sequence number
1917 epoch_t reported_epoch; // epoch of this report
1918 __u32 state;
1919 utime_t last_fresh; // last reported
1920 utime_t last_change; // new state != previous state
1921 utime_t last_active; // state & PG_STATE_ACTIVE
1922 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1923 utime_t last_clean; // state & PG_STATE_CLEAN
1924 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
1925 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
1926 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
1927
1928 eversion_t log_start; // (log_start,version]
1929 eversion_t ondisk_log_start; // there may be more on disk
1930
1931 epoch_t created;
1932 epoch_t last_epoch_clean;
1933 pg_t parent;
1934 __u32 parent_split_bits;
1935
1936 eversion_t last_scrub;
1937 eversion_t last_deep_scrub;
1938 utime_t last_scrub_stamp;
1939 utime_t last_deep_scrub_stamp;
1940 utime_t last_clean_scrub_stamp;
1941
1942 object_stat_collection_t stats;
1943
1944 int64_t log_size;
1945 int64_t ondisk_log_size; // >= active_log_size
1946
1947 vector<int32_t> up, acting;
1948 epoch_t mapping_epoch;
1949
1950 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
1951
1952 utime_t last_became_active;
1953 utime_t last_became_peered;
1954
1955 /// up, acting primaries
1956 int32_t up_primary;
1957 int32_t acting_primary;
1958
b32b8144
FG
1959 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
1960 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
1961 uint32_t snaptrimq_len;
1962
7c673cae
FG
1963 bool stats_invalid:1;
1964 /// true if num_objects_dirty is not accurate (because it was not
1965 /// maintained starting from pool creation)
1966 bool dirty_stats_invalid:1;
1967 bool omap_stats_invalid:1;
1968 bool hitset_stats_invalid:1;
1969 bool hitset_bytes_stats_invalid:1;
1970 bool pin_stats_invalid:1;
1971
1972 pg_stat_t()
1973 : reported_seq(0),
1974 reported_epoch(0),
1975 state(0),
1976 created(0), last_epoch_clean(0),
1977 parent_split_bits(0),
1978 log_size(0), ondisk_log_size(0),
1979 mapping_epoch(0),
1980 up_primary(-1),
1981 acting_primary(-1),
b32b8144 1982 snaptrimq_len(0),
7c673cae
FG
1983 stats_invalid(false),
1984 dirty_stats_invalid(false),
1985 omap_stats_invalid(false),
1986 hitset_stats_invalid(false),
1987 hitset_bytes_stats_invalid(false),
1988 pin_stats_invalid(false)
1989 { }
1990
1991 epoch_t get_effective_last_epoch_clean() const {
1992 if (state & PG_STATE_CLEAN) {
1993 // we are clean as of this report, and should thus take the
1994 // reported epoch
1995 return reported_epoch;
1996 } else {
1997 return last_epoch_clean;
1998 }
1999 }
2000
2001 pair<epoch_t, version_t> get_version_pair() const {
2002 return make_pair(reported_epoch, reported_seq);
2003 }
2004
2005 void floor(int64_t f) {
2006 stats.floor(f);
2007 if (log_size < f)
2008 log_size = f;
2009 if (ondisk_log_size < f)
2010 ondisk_log_size = f;
b32b8144
FG
2011 if (snaptrimq_len < f)
2012 snaptrimq_len = f;
7c673cae
FG
2013 }
2014
2015 void add(const pg_stat_t& o) {
2016 stats.add(o.stats);
2017 log_size += o.log_size;
2018 ondisk_log_size += o.ondisk_log_size;
b32b8144
FG
2019 if (((uint64_t)snaptrimq_len + (uint64_t)o.snaptrimq_len) > (uint64_t)(1 << 31)) {
2020 snaptrimq_len = 1 << 31;
2021 } else {
2022 snaptrimq_len += o.snaptrimq_len;
2023 }
7c673cae
FG
2024 }
2025 void sub(const pg_stat_t& o) {
2026 stats.sub(o.stats);
2027 log_size -= o.log_size;
2028 ondisk_log_size -= o.ondisk_log_size;
b32b8144
FG
2029 if (o.snaptrimq_len < snaptrimq_len) {
2030 snaptrimq_len -= o.snaptrimq_len;
2031 } else {
2032 snaptrimq_len = 0;
2033 }
7c673cae
FG
2034 }
2035
2036 bool is_acting_osd(int32_t osd, bool primary) const;
2037 void dump(Formatter *f) const;
2038 void dump_brief(Formatter *f) const;
2039 void encode(bufferlist &bl) const;
2040 void decode(bufferlist::iterator &bl);
2041 static void generate_test_instances(list<pg_stat_t*>& o);
2042};
2043WRITE_CLASS_ENCODER(pg_stat_t)
2044
2045bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2046
2047/*
2048 * summation over an entire pool
2049 */
2050struct pool_stat_t {
2051 object_stat_collection_t stats;
2052 int64_t log_size;
2053 int64_t ondisk_log_size; // >= active_log_size
2054 int32_t up; ///< number of up replicas or shards
2055 int32_t acting; ///< number of acting replicas or shards
2056
2057 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
2058 { }
2059
2060 void floor(int64_t f) {
2061 stats.floor(f);
2062 if (log_size < f)
2063 log_size = f;
2064 if (ondisk_log_size < f)
2065 ondisk_log_size = f;
2066 if (up < f)
2067 up = f;
2068 if (acting < f)
2069 acting = f;
2070 }
2071
2072 void add(const pg_stat_t& o) {
2073 stats.add(o.stats);
2074 log_size += o.log_size;
2075 ondisk_log_size += o.ondisk_log_size;
2076 up += o.up.size();
2077 acting += o.acting.size();
2078 }
2079 void sub(const pg_stat_t& o) {
2080 stats.sub(o.stats);
2081 log_size -= o.log_size;
2082 ondisk_log_size -= o.ondisk_log_size;
2083 up -= o.up.size();
2084 acting -= o.acting.size();
2085 }
2086
2087 bool is_zero() const {
2088 return (stats.is_zero() &&
2089 log_size == 0 &&
2090 ondisk_log_size == 0 &&
2091 up == 0 &&
2092 acting == 0);
2093 }
2094
2095 void dump(Formatter *f) const;
2096 void encode(bufferlist &bl, uint64_t features) const;
2097 void decode(bufferlist::iterator &bl);
2098 static void generate_test_instances(list<pool_stat_t*>& o);
2099};
2100WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2101
2102
2103// -----------------------------------------
2104
2105/**
2106 * pg_hit_set_info_t - information about a single recorded HitSet
2107 *
 * Track basic metadata about a HitSet, like the number of insertions
2109 * and the time range it covers.
2110 */
2111struct pg_hit_set_info_t {
2112 utime_t begin, end; ///< time interval
2113 eversion_t version; ///< version this HitSet object was written
2114 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2115
2116 friend bool operator==(const pg_hit_set_info_t& l,
2117 const pg_hit_set_info_t& r) {
2118 return
2119 l.begin == r.begin &&
2120 l.end == r.end &&
2121 l.version == r.version &&
2122 l.using_gmt == r.using_gmt;
2123 }
2124
2125 explicit pg_hit_set_info_t(bool using_gmt = true)
2126 : using_gmt(using_gmt) {}
2127
2128 void encode(bufferlist &bl) const;
2129 void decode(bufferlist::iterator &bl);
2130 void dump(Formatter *f) const;
2131 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2132};
2133WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2134
2135/**
2136 * pg_hit_set_history_t - information about a history of hitsets
2137 *
2138 * Include information about the currently accumulating hit set as well
2139 * as archived/historical ones.
2140 */
2141struct pg_hit_set_history_t {
2142 eversion_t current_last_update; ///< last version inserted into current set
2143 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2144
2145 friend bool operator==(const pg_hit_set_history_t& l,
2146 const pg_hit_set_history_t& r) {
2147 return
2148 l.current_last_update == r.current_last_update &&
2149 l.history == r.history;
2150 }
2151
2152 void encode(bufferlist &bl) const;
2153 void decode(bufferlist::iterator &bl);
2154 void dump(Formatter *f) const;
2155 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2156};
2157WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2158
2159
2160// -----------------------------------------
2161
2162/**
2163 * pg_history_t - information about recent pg peering/mapping history
2164 *
2165 * This is aggressively shared between OSDs to bound the amount of past
2166 * history they need to worry about.
2167 */
2168struct pg_history_t {
31f18b77
FG
2169 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2170 epoch_t epoch_pool_created; // epoch in which *pool* was created
2171 // (note: may be pg creation epoch for
2172 // pre-luminous clusters)
7c673cae
FG
2173 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2174 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2175 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2176 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2177 epoch_t last_epoch_split; // as parent or child
7c673cae
FG
2178 epoch_t last_epoch_marked_full; // pool or cluster
2179
2180 /**
2181 * In the event of a map discontinuity, same_*_since may reflect the first
2182 * map the osd has seen in the new map sequence rather than the actual start
2183 * of the interval. This is ok since a discontinuity at epoch e means there
2184 * must have been a clean interval between e and now and that we cannot be
2185 * in the active set during the interval containing e.
2186 */
2187 epoch_t same_up_since; // same acting set since
2188 epoch_t same_interval_since; // same acting AND up set since
2189 epoch_t same_primary_since; // same primary at least back through this epoch.
2190
2191 eversion_t last_scrub;
2192 eversion_t last_deep_scrub;
2193 utime_t last_scrub_stamp;
2194 utime_t last_deep_scrub_stamp;
2195 utime_t last_clean_scrub_stamp;
2196
2197 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2198 return
2199 l.epoch_created == r.epoch_created &&
31f18b77 2200 l.epoch_pool_created == r.epoch_pool_created &&
7c673cae
FG
2201 l.last_epoch_started == r.last_epoch_started &&
2202 l.last_interval_started == r.last_interval_started &&
2203 l.last_epoch_clean == r.last_epoch_clean &&
2204 l.last_interval_clean == r.last_interval_clean &&
2205 l.last_epoch_split == r.last_epoch_split &&
2206 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2207 l.same_up_since == r.same_up_since &&
2208 l.same_interval_since == r.same_interval_since &&
2209 l.same_primary_since == r.same_primary_since &&
2210 l.last_scrub == r.last_scrub &&
2211 l.last_deep_scrub == r.last_deep_scrub &&
2212 l.last_scrub_stamp == r.last_scrub_stamp &&
2213 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2214 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2215 }
2216
2217 pg_history_t()
2218 : epoch_created(0),
31f18b77 2219 epoch_pool_created(0),
7c673cae
FG
2220 last_epoch_started(0),
2221 last_interval_started(0),
2222 last_epoch_clean(0),
2223 last_interval_clean(0),
2224 last_epoch_split(0),
2225 last_epoch_marked_full(0),
2226 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2227
2228 bool merge(const pg_history_t &other) {
2229 // Here, we only update the fields which cannot be calculated from the OSDmap.
2230 bool modified = false;
2231 if (epoch_created < other.epoch_created) {
2232 epoch_created = other.epoch_created;
2233 modified = true;
2234 }
31f18b77
FG
2235 if (epoch_pool_created < other.epoch_pool_created) {
2236 // FIXME: for jewel compat only; this should either be 0 or always the
2237 // same value across all pg instances.
2238 epoch_pool_created = other.epoch_pool_created;
2239 modified = true;
2240 }
7c673cae
FG
2241 if (last_epoch_started < other.last_epoch_started) {
2242 last_epoch_started = other.last_epoch_started;
2243 modified = true;
2244 }
2245 if (last_interval_started < other.last_interval_started) {
2246 last_interval_started = other.last_interval_started;
2247 modified = true;
2248 }
2249 if (last_epoch_clean < other.last_epoch_clean) {
2250 last_epoch_clean = other.last_epoch_clean;
2251 modified = true;
2252 }
2253 if (last_interval_clean < other.last_interval_clean) {
2254 last_interval_clean = other.last_interval_clean;
2255 modified = true;
2256 }
2257 if (last_epoch_split < other.last_epoch_split) {
2258 last_epoch_split = other.last_epoch_split;
2259 modified = true;
2260 }
2261 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2262 last_epoch_marked_full = other.last_epoch_marked_full;
2263 modified = true;
2264 }
2265 if (other.last_scrub > last_scrub) {
2266 last_scrub = other.last_scrub;
2267 modified = true;
2268 }
2269 if (other.last_scrub_stamp > last_scrub_stamp) {
2270 last_scrub_stamp = other.last_scrub_stamp;
2271 modified = true;
2272 }
2273 if (other.last_deep_scrub > last_deep_scrub) {
2274 last_deep_scrub = other.last_deep_scrub;
2275 modified = true;
2276 }
2277 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2278 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2279 modified = true;
2280 }
2281 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2282 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2283 modified = true;
2284 }
2285 return modified;
2286 }
2287
2288 void encode(bufferlist& bl) const;
2289 void decode(bufferlist::iterator& p);
2290 void dump(Formatter *f) const;
2291 static void generate_test_instances(list<pg_history_t*>& o);
2292};
2293WRITE_CLASS_ENCODER(pg_history_t)
2294
2295inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2296 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
7c673cae
FG
2297 << " lis/c " << h.last_interval_started
2298 << "/" << h.last_interval_clean
2299 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2300 << "/" << h.last_epoch_marked_full
2301 << " " << h.same_up_since
2302 << "/" << h.same_interval_since
2303 << "/" << h.same_primary_since;
2304}
2305
2306
2307/**
2308 * pg_info_t - summary of PG statistics.
2309 *
2310 * some notes:
2311 * - last_complete implies we have all objects that existed as of that
2312 * stamp, OR a newer object, OR have already applied a later delete.
2313 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2314 * otherwise, we have no idea what the pg is supposed to contain.
2315 */
2316struct pg_info_t {
2317 spg_t pgid;
2318 eversion_t last_update; ///< last object version applied to store.
2319 eversion_t last_complete; ///< last version pg was complete through.
2320 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2321 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2322
2323 version_t last_user_version; ///< last user object version applied to store
2324
2325 eversion_t log_tail; ///< oldest log entry.
2326
2327 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2328 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2329
2330 interval_set<snapid_t> purged_snaps;
2331
2332 pg_stat_t stats;
2333
2334 pg_history_t history;
2335 pg_hit_set_history_t hit_set;
2336
2337 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2338 return
2339 l.pgid == r.pgid &&
2340 l.last_update == r.last_update &&
2341 l.last_complete == r.last_complete &&
2342 l.last_epoch_started == r.last_epoch_started &&
2343 l.last_interval_started == r.last_interval_started &&
2344 l.last_user_version == r.last_user_version &&
2345 l.log_tail == r.log_tail &&
2346 l.last_backfill == r.last_backfill &&
2347 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2348 l.purged_snaps == r.purged_snaps &&
2349 l.stats == r.stats &&
2350 l.history == r.history &&
2351 l.hit_set == r.hit_set;
2352 }
2353
2354 pg_info_t()
2355 : last_epoch_started(0),
2356 last_interval_started(0),
2357 last_user_version(0),
2358 last_backfill(hobject_t::get_max()),
2359 last_backfill_bitwise(false)
2360 { }
2361 // cppcheck-suppress noExplicitConstructor
2362 pg_info_t(spg_t p)
2363 : pgid(p),
2364 last_epoch_started(0),
2365 last_interval_started(0),
2366 last_user_version(0),
2367 last_backfill(hobject_t::get_max()),
2368 last_backfill_bitwise(false)
2369 { }
2370
2371 void set_last_backfill(hobject_t pos) {
2372 last_backfill = pos;
2373 last_backfill_bitwise = true;
2374 }
2375
2376 bool is_empty() const { return last_update.version == 0; }
2377 bool dne() const { return history.epoch_created == 0; }
2378
2379 bool is_incomplete() const { return !last_backfill.is_max(); }
2380
2381 void encode(bufferlist& bl) const;
2382 void decode(bufferlist::iterator& p);
2383 void dump(Formatter *f) const;
2384 bool overlaps_with(const pg_info_t &oinfo) const {
2385 return last_update > oinfo.log_tail ?
2386 oinfo.last_update >= log_tail :
2387 last_update >= oinfo.log_tail;
2388 }
2389 static void generate_test_instances(list<pg_info_t*>& o);
2390};
2391WRITE_CLASS_ENCODER(pg_info_t)
2392
2393inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2394{
2395 out << pgi.pgid << "(";
2396 if (pgi.dne())
2397 out << " DNE";
2398 if (pgi.is_empty())
2399 out << " empty";
2400 else {
2401 out << " v " << pgi.last_update;
2402 if (pgi.last_complete != pgi.last_update)
2403 out << " lc " << pgi.last_complete;
2404 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2405 }
2406 if (pgi.is_incomplete())
2407 out << " lb " << pgi.last_backfill
2408 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2409 //out << " c " << pgi.epoch_created;
2410 out << " local-lis/les=" << pgi.last_interval_started
2411 << "/" << pgi.last_epoch_started;
2412 out << " n=" << pgi.stats.stats.sum.num_objects;
2413 out << " " << pgi.history
2414 << ")";
2415 return out;
2416}
2417
2418/**
2419 * pg_fast_info_t - common pg_info_t fields
2420 *
2421 * These are the fields of pg_info_t (and children) that are updated for
2422 * most IO operations.
2423 *
2424 * ** WARNING **
2425 * Because we rely on these fields to be applied to the normal
2426 * info struct, adding a new field here that is not also new in info
2427 * means that we must set an incompat OSD feature bit!
2428 */
2429struct pg_fast_info_t {
2430 eversion_t last_update;
2431 eversion_t last_complete;
2432 version_t last_user_version;
2433 struct { // pg_stat_t stats
2434 eversion_t version;
2435 version_t reported_seq;
2436 utime_t last_fresh;
2437 utime_t last_active;
2438 utime_t last_peered;
2439 utime_t last_clean;
2440 utime_t last_unstale;
2441 utime_t last_undegraded;
2442 utime_t last_fullsized;
2443 int64_t log_size; // (also ondisk_log_size, which has the same value)
2444 struct { // object_stat_collection_t stats;
2445 struct { // objct_stat_sum_t sum
2446 int64_t num_bytes; // in bytes
2447 int64_t num_objects;
2448 int64_t num_object_copies;
2449 int64_t num_rd;
2450 int64_t num_rd_kb;
2451 int64_t num_wr;
2452 int64_t num_wr_kb;
2453 int64_t num_objects_dirty;
2454 } sum;
2455 } stats;
2456 } stats;
2457
2458 void populate_from(const pg_info_t& info) {
2459 last_update = info.last_update;
2460 last_complete = info.last_complete;
2461 last_user_version = info.last_user_version;
2462 stats.version = info.stats.version;
2463 stats.reported_seq = info.stats.reported_seq;
2464 stats.last_fresh = info.stats.last_fresh;
2465 stats.last_active = info.stats.last_active;
2466 stats.last_peered = info.stats.last_peered;
2467 stats.last_clean = info.stats.last_clean;
2468 stats.last_unstale = info.stats.last_unstale;
2469 stats.last_undegraded = info.stats.last_undegraded;
2470 stats.last_fullsized = info.stats.last_fullsized;
2471 stats.log_size = info.stats.log_size;
2472 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2473 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2474 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2475 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2476 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2477 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2478 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2479 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2480 }
2481
2482 bool try_apply_to(pg_info_t* info) {
2483 if (last_update <= info->last_update)
2484 return false;
2485 info->last_update = last_update;
2486 info->last_complete = last_complete;
2487 info->last_user_version = last_user_version;
2488 info->stats.version = stats.version;
2489 info->stats.reported_seq = stats.reported_seq;
2490 info->stats.last_fresh = stats.last_fresh;
2491 info->stats.last_active = stats.last_active;
2492 info->stats.last_peered = stats.last_peered;
2493 info->stats.last_clean = stats.last_clean;
2494 info->stats.last_unstale = stats.last_unstale;
2495 info->stats.last_undegraded = stats.last_undegraded;
2496 info->stats.last_fullsized = stats.last_fullsized;
2497 info->stats.log_size = stats.log_size;
2498 info->stats.ondisk_log_size = stats.log_size;
2499 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2500 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2501 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2502 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2503 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2504 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2505 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2506 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2507 return true;
2508 }
2509
2510 void encode(bufferlist& bl) const {
2511 ENCODE_START(1, 1, bl);
2512 ::encode(last_update, bl);
2513 ::encode(last_complete, bl);
2514 ::encode(last_user_version, bl);
2515 ::encode(stats.version, bl);
2516 ::encode(stats.reported_seq, bl);
2517 ::encode(stats.last_fresh, bl);
2518 ::encode(stats.last_active, bl);
2519 ::encode(stats.last_peered, bl);
2520 ::encode(stats.last_clean, bl);
2521 ::encode(stats.last_unstale, bl);
2522 ::encode(stats.last_undegraded, bl);
2523 ::encode(stats.last_fullsized, bl);
2524 ::encode(stats.log_size, bl);
2525 ::encode(stats.stats.sum.num_bytes, bl);
2526 ::encode(stats.stats.sum.num_objects, bl);
2527 ::encode(stats.stats.sum.num_object_copies, bl);
2528 ::encode(stats.stats.sum.num_rd, bl);
2529 ::encode(stats.stats.sum.num_rd_kb, bl);
2530 ::encode(stats.stats.sum.num_wr, bl);
2531 ::encode(stats.stats.sum.num_wr_kb, bl);
2532 ::encode(stats.stats.sum.num_objects_dirty, bl);
2533 ENCODE_FINISH(bl);
2534 }
2535 void decode(bufferlist::iterator& p) {
2536 DECODE_START(1, p);
2537 ::decode(last_update, p);
2538 ::decode(last_complete, p);
2539 ::decode(last_user_version, p);
2540 ::decode(stats.version, p);
2541 ::decode(stats.reported_seq, p);
2542 ::decode(stats.last_fresh, p);
2543 ::decode(stats.last_active, p);
2544 ::decode(stats.last_peered, p);
2545 ::decode(stats.last_clean, p);
2546 ::decode(stats.last_unstale, p);
2547 ::decode(stats.last_undegraded, p);
2548 ::decode(stats.last_fullsized, p);
2549 ::decode(stats.log_size, p);
2550 ::decode(stats.stats.sum.num_bytes, p);
2551 ::decode(stats.stats.sum.num_objects, p);
2552 ::decode(stats.stats.sum.num_object_copies, p);
2553 ::decode(stats.stats.sum.num_rd, p);
2554 ::decode(stats.stats.sum.num_rd_kb, p);
2555 ::decode(stats.stats.sum.num_wr, p);
2556 ::decode(stats.stats.sum.num_wr_kb, p);
2557 ::decode(stats.stats.sum.num_objects_dirty, p);
2558 DECODE_FINISH(p);
2559 }
2560};
2561WRITE_CLASS_ENCODER(pg_fast_info_t)
2562
2563
2564struct pg_notify_t {
2565 epoch_t query_epoch;
2566 epoch_t epoch_sent;
2567 pg_info_t info;
2568 shard_id_t to;
2569 shard_id_t from;
2570 pg_notify_t() :
2571 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2572 from(shard_id_t::NO_SHARD) {}
2573 pg_notify_t(
2574 shard_id_t to,
2575 shard_id_t from,
2576 epoch_t query_epoch,
2577 epoch_t epoch_sent,
2578 const pg_info_t &info)
2579 : query_epoch(query_epoch),
2580 epoch_sent(epoch_sent),
2581 info(info), to(to), from(from) {
2582 assert(from == info.pgid.shard);
2583 }
2584 void encode(bufferlist &bl) const;
2585 void decode(bufferlist::iterator &p);
2586 void dump(Formatter *f) const;
2587 static void generate_test_instances(list<pg_notify_t*> &o);
2588};
2589WRITE_CLASS_ENCODER(pg_notify_t)
2590ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2591
2592
2593class OSDMap;
2594/**
2595 * PastIntervals -- information needed to determine the PriorSet and
2596 * the might_have_unfound set
2597 */
2598class PastIntervals {
2599public:
2600 struct pg_interval_t {
2601 vector<int32_t> up, acting;
2602 epoch_t first, last;
2603 bool maybe_went_rw;
2604 int32_t primary;
2605 int32_t up_primary;
2606
2607 pg_interval_t()
2608 : first(0), last(0),
2609 maybe_went_rw(false),
2610 primary(-1),
2611 up_primary(-1)
2612 {}
2613
2614 pg_interval_t(
2615 vector<int32_t> &&up,
2616 vector<int32_t> &&acting,
2617 epoch_t first,
2618 epoch_t last,
2619 bool maybe_went_rw,
2620 int32_t primary,
2621 int32_t up_primary)
2622 : up(up), acting(acting), first(first), last(last),
2623 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
2624 {}
2625
2626 void encode(bufferlist& bl) const;
2627 void decode(bufferlist::iterator& bl);
2628 void dump(Formatter *f) const;
2629 static void generate_test_instances(list<pg_interval_t*>& o);
2630 };
2631
2632 PastIntervals() = default;
2633 PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
2634 update_type_from_map(ec_pool, osdmap);
2635 }
2636 PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
2637 update_type(ec_pool, compact);
2638 }
2639 PastIntervals(PastIntervals &&rhs) = default;
2640 PastIntervals &operator=(PastIntervals &&rhs) = default;
2641
2642 PastIntervals(const PastIntervals &rhs);
2643 PastIntervals &operator=(const PastIntervals &rhs);
2644
2645 class interval_rep {
2646 public:
2647 virtual size_t size() const = 0;
2648 virtual bool empty() const = 0;
2649 virtual void clear() = 0;
2650 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
2651 virtual set<pg_shard_t> get_all_participants(
2652 bool ec_pool) const = 0;
2653 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
2654 virtual unique_ptr<interval_rep> clone() const = 0;
2655 virtual ostream &print(ostream &out) const = 0;
2656 virtual void encode(bufferlist &bl) const = 0;
2657 virtual void decode(bufferlist::iterator &bl) = 0;
2658 virtual void dump(Formatter *f) const = 0;
2659 virtual bool is_classic() const = 0;
2660 virtual void iterate_mayberw_back_to(
2661 bool ec_pool,
2662 epoch_t les,
2663 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
2664
2665 virtual bool has_full_intervals() const { return false; }
2666 virtual void iterate_all_intervals(
2667 std::function<void(const pg_interval_t &)> &&f) const {
2668 assert(!has_full_intervals());
2669 assert(0 == "not valid for this implementation");
2670 }
2671
2672 virtual ~interval_rep() {}
2673 };
2674 friend class pi_simple_rep;
2675 friend class pi_compact_rep;
2676private:
2677
2678 unique_ptr<interval_rep> past_intervals;
2679
2680 PastIntervals(interval_rep *rep) : past_intervals(rep) {}
2681
2682public:
2683 void add_interval(bool ec_pool, const pg_interval_t &interval) {
2684 assert(past_intervals);
2685 return past_intervals->add_interval(ec_pool, interval);
2686 }
2687
2688 bool is_classic() const {
2689 assert(past_intervals);
2690 return past_intervals->is_classic();
2691 }
2692
2693 void encode(bufferlist &bl) const {
2694 ENCODE_START(1, 1, bl);
2695 if (past_intervals) {
2696 __u8 type = is_classic() ? 1 : 2;
2697 ::encode(type, bl);
2698 past_intervals->encode(bl);
2699 } else {
2700 ::encode((__u8)0, bl);
2701 }
2702 ENCODE_FINISH(bl);
2703 }
2704 void encode_classic(bufferlist &bl) const {
2705 if (past_intervals) {
2706 assert(past_intervals->is_classic());
2707 past_intervals->encode(bl);
2708 } else {
2709 // it's a map<>
2710 ::encode((uint32_t)0, bl);
2711 }
2712 }
2713
2714 void decode(bufferlist::iterator &bl);
2715 void decode_classic(bufferlist::iterator &bl);
2716
2717 void dump(Formatter *f) const {
2718 assert(past_intervals);
2719 past_intervals->dump(f);
2720 }
2721 static void generate_test_instances(list<PastIntervals *> & o);
2722
2723 /**
2724 * Determines whether there is an interval change
2725 */
2726 static bool is_new_interval(
2727 int old_acting_primary,
2728 int new_acting_primary,
2729 const vector<int> &old_acting,
2730 const vector<int> &new_acting,
2731 int old_up_primary,
2732 int new_up_primary,
2733 const vector<int> &old_up,
2734 const vector<int> &new_up,
2735 int old_size,
2736 int new_size,
2737 int old_min_size,
2738 int new_min_size,
2739 unsigned old_pg_num,
2740 unsigned new_pg_num,
2741 bool old_sort_bitwise,
2742 bool new_sort_bitwise,
c07f9fc5
FG
2743 bool old_recovery_deletes,
2744 bool new_recovery_deletes,
7c673cae
FG
2745 pg_t pgid
2746 );
2747
2748 /**
2749 * Determines whether there is an interval change
2750 */
2751 static bool is_new_interval(
2752 int old_acting_primary, ///< [in] primary as of lastmap
2753 int new_acting_primary, ///< [in] primary as of lastmap
2754 const vector<int> &old_acting, ///< [in] acting as of lastmap
2755 const vector<int> &new_acting, ///< [in] acting as of osdmap
2756 int old_up_primary, ///< [in] up primary of lastmap
2757 int new_up_primary, ///< [in] up primary of osdmap
2758 const vector<int> &old_up, ///< [in] up as of lastmap
2759 const vector<int> &new_up, ///< [in] up as of osdmap
2760 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2761 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2762 pg_t pgid ///< [in] pgid for pg
2763 );
2764
2765 /**
2766 * Integrates a new map into *past_intervals, returns true
2767 * if an interval was closed out.
2768 */
2769 static bool check_new_interval(
2770 int old_acting_primary, ///< [in] primary as of lastmap
2771 int new_acting_primary, ///< [in] primary as of osdmap
2772 const vector<int> &old_acting, ///< [in] acting as of lastmap
2773 const vector<int> &new_acting, ///< [in] acting as of osdmap
2774 int old_up_primary, ///< [in] up primary of lastmap
2775 int new_up_primary, ///< [in] up primary of osdmap
2776 const vector<int> &old_up, ///< [in] up as of lastmap
2777 const vector<int> &new_up, ///< [in] up as of osdmap
2778 epoch_t same_interval_since, ///< [in] as of osdmap
2779 epoch_t last_epoch_clean, ///< [in] current
2780 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2781 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2782 pg_t pgid, ///< [in] pgid for pg
2783 IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
2784 PastIntervals *past_intervals, ///< [out] intervals
2785 ostream *out = 0 ///< [out] debug ostream
2786 );
c07f9fc5 2787
7c673cae
FG
2788 friend ostream& operator<<(ostream& out, const PastIntervals &i);
2789
2790 template <typename F>
2791 void iterate_mayberw_back_to(
2792 bool ec_pool,
2793 epoch_t les,
2794 F &&f) const {
2795 assert(past_intervals);
2796 past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
2797 }
2798 void clear() {
2799 assert(past_intervals);
2800 past_intervals->clear();
2801 }
2802
2803 /**
2804 * Should return a value which gives an indication of the amount
2805 * of state contained
2806 */
2807 size_t size() const {
2808 assert(past_intervals);
2809 return past_intervals->size();
2810 }
2811
2812 bool empty() const {
2813 assert(past_intervals);
2814 return past_intervals->empty();
2815 }
2816
2817 void swap(PastIntervals &other) {
31f18b77
FG
2818 using std::swap;
2819 swap(other.past_intervals, past_intervals);
7c673cae
FG
2820 }
2821
2822 /**
2823 * Return all shards which have been in the acting set back to the
2824 * latest epoch to which we have trimmed except for pg_whoami
2825 */
2826 set<pg_shard_t> get_might_have_unfound(
2827 pg_shard_t pg_whoami,
2828 bool ec_pool) const {
2829 assert(past_intervals);
2830 auto ret = past_intervals->get_all_participants(ec_pool);
2831 ret.erase(pg_whoami);
2832 return ret;
2833 }
2834
2835 /**
2836 * Return all shards which we might want to talk to for peering
2837 */
2838 set<pg_shard_t> get_all_probe(
2839 bool ec_pool) const {
2840 assert(past_intervals);
2841 return past_intervals->get_all_participants(ec_pool);
2842 }
2843
2844 /* Return the set of epochs [start, end) represented by the
2845 * past_interval set.
2846 */
2847 pair<epoch_t, epoch_t> get_bounds() const {
2848 assert(past_intervals);
2849 return past_intervals->get_bounds();
2850 }
2851
2852 enum osd_state_t {
2853 UP,
2854 DOWN,
2855 DNE,
2856 LOST
2857 };
2858 struct PriorSet {
2859 bool ec_pool = false;
2860 set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
2861 set<int> down; /// down osds that would normally be in @a probe and might be interesting.
2862 map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
2863
2864 bool pg_down = false; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
2865 unique_ptr<IsPGRecoverablePredicate> pcontdec;
2866
2867 PriorSet() = default;
2868 PriorSet(PriorSet &&) = default;
2869 PriorSet &operator=(PriorSet &&) = default;
2870
2871 PriorSet &operator=(const PriorSet &) = delete;
2872 PriorSet(const PriorSet &) = delete;
2873
2874 bool operator==(const PriorSet &rhs) const {
2875 return (ec_pool == rhs.ec_pool) &&
2876 (probe == rhs.probe) &&
2877 (down == rhs.down) &&
2878 (blocked_by == rhs.blocked_by) &&
2879 (pg_down == rhs.pg_down);
2880 }
2881
2882 bool affected_by_map(
2883 const OSDMap &osdmap,
2884 const DoutPrefixProvider *dpp) const;
2885
2886 // For verifying tests
2887 PriorSet(
2888 bool ec_pool,
2889 set<pg_shard_t> probe,
2890 set<int> down,
2891 map<int, epoch_t> blocked_by,
2892 bool pg_down,
2893 IsPGRecoverablePredicate *pcontdec)
2894 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
2895 pg_down(pg_down), pcontdec(pcontdec) {}
2896
2897 private:
2898 template <typename F>
2899 PriorSet(
2900 const PastIntervals &past_intervals,
2901 bool ec_pool,
2902 epoch_t last_epoch_started,
2903 IsPGRecoverablePredicate *c,
2904 F f,
2905 const vector<int> &up,
2906 const vector<int> &acting,
2907 const DoutPrefixProvider *dpp);
2908
2909 friend class PastIntervals;
2910 };
2911
  // Switch this PastIntervals between its internal representations;
  // implementations are out of line (not visible in this header section).
  void update_type(bool ec_pool, bool compact);
  void update_type_from_map(bool ec_pool, const OSDMap &osdmap);

  /// Build a PriorSet for this PastIntervals; arguments are forwarded to
  /// the private PriorSet constructor (PastIntervals is its friend).
  template <typename... Args>
  PriorSet get_prior_set(Args&&... args) const {
    return PriorSet(*this, std::forward<Args>(args)...);
  }
2919};
2920WRITE_CLASS_ENCODER(PastIntervals)
2921
2922ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
2923ostream& operator<<(ostream& out, const PastIntervals &i);
2924ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
2925
/**
 * Construct the prior set used for peering.
 *
 * @param past_intervals  source of prior-interval probe/acting information
 * @param ec_pool         true for erasure-coded pools (shard ids matter)
 * @param last_epoch_started  walk maybe-rw intervals back to this epoch
 * @param c               recoverability predicate; ownership taken by pcontdec
 * @param f               classifier: f(interval_start, osd, &lost_at) returns
 *                        one of UP/DOWN/DNE/LOST (enum defined above)
 * @param up              current up set
 * @param acting          current acting set
 * @param dpp             debug-output prefix provider for ldpp_dout
 */
template <typename F>
PastIntervals::PriorSet::PriorSet(
  const PastIntervals &past_intervals,
  bool ec_pool,
  epoch_t last_epoch_started,
  IsPGRecoverablePredicate *c,
  F f,
  const vector<int> &up,
  const vector<int> &acting,
  const DoutPrefixProvider *dpp)
  : ec_pool(ec_pool), pg_down(false), pcontdec(c)
{
  /*
   * We have to be careful to gracefully deal with situations like
   * so. Say we have a power outage or something that takes out both
   * OSDs, but the monitor doesn't mark them down in the same epoch.
   * The history may look like
   *
   *  1: A B
   *  2:   B
   *  3:       let's say B dies for good, too (say, from the power spike)
   *  4: A
   *
   * which makes it look like B may have applied updates to the PG
   * that we need in order to proceed.  This sucks...
   *
   * To minimize the risk of this happening, we CANNOT go active if
   * _any_ OSDs in the prior set are down until we send an MOSDAlive
   * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
   * Then, we have something like
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:
   *  4: A
   *
   * -> we can ignore B, bc it couldn't have gone active (alive_thru
   *    still 0).
   *
   * or,
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:   B   up_thru[B]=2
   *  4:
   *  5: A
   *
   * -> we must wait for B, bc it was alive through 2, and could have
   *    written to the pg.
   *
   * If B is really dead, then an administrator will need to manually
   * intervene by marking the OSD as "lost."
   */

  // Include current acting and up nodes... not because they may
  // contain old data (this interval hasn't gone active, obviously),
  // but because we want their pg_info to inform choose_acting(), and
  // so that we know what they do/do not have explicitly before
  // sending them any new info/logs/whatever.
  for (unsigned i = 0; i < acting.size(); i++) {
    if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  // It may be possible to exclude the up nodes, but let's keep them in
  // there for now.
  for (unsigned i = 0; i < up.size(); i++) {
    if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }

  // classify every osd seen in any prior interval: up ones are probed,
  // everything else is recorded as down (DNE/LOST included here; they are
  // distinguished again per interval below).
  set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
  ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
  for (auto &&i: all_probe) {
    switch (f(0, i.osd, nullptr)) {
    case UP: {
      probe.insert(i);
      break;
    }
    case DNE:
    case LOST:
    case DOWN: {
      down.insert(i.osd);
      break;
    }
    }
  }

  past_intervals.iterate_mayberw_back_to(
    ec_pool,
    last_epoch_started,
    [&](epoch_t start, const set<pg_shard_t> &acting) {
      ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
			 << ", acting: " << acting << dendl;

      // look at candidate osds during this interval. each falls into
      // one of three categories: up, down (but potentially
      // interesting), or lost (down, but we won't wait for it).
      set<pg_shard_t> up_now;
      map<int, epoch_t> candidate_blocked_by;
      // any candidates down now (that might have useful data)
      bool any_down_now = false;

      // consider ACTING osds
      for (auto &&so: acting) {
	epoch_t lost_at = 0;
	switch (f(start, so.osd, &lost_at)) {
	case UP: {
	  // include past acting osds if they are up.
	  up_now.insert(so);
	  break;
	}
	case DNE: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " no longer exists" << dendl;
	  break;
	}
	case LOST: {
	  // marked lost: treat as if its data survived, so we don't block
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down, but lost_at " << lost_at << dendl;
	  up_now.insert(so);
	  break;
	}
	case DOWN: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down" << dendl;
	  candidate_blocked_by[so.osd] = lost_at;
	  any_down_now = true;
	  break;
	}
	}
      }

      // if not enough osds survived this interval, and we may have gone rw,
      // then we need to wait for one of those osds to recover to
      // ensure that we haven't lost any information.
      if (!(*pcontdec)(up_now) && any_down_now) {
	// fixme: how do we identify a "clean" shutdown anyway?
	ldpp_dout(dpp, 10) << "build_prior  possibly went active+rw,"
			   << " insufficient up; including down osds" << dendl;
	assert(!candidate_blocked_by.empty());
	pg_down = true;
	blocked_by.insert(
	  candidate_blocked_by.begin(),
	  candidate_blocked_by.end());
      }
    });

  ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
	   << " down " << down
	   << " blocked_by " << blocked_by
	   << (pg_down ? " pg_down":"")
	   << dendl;
}
3079
3080/**
3081 * pg_query_t - used to ask a peer for information about a pg.
3082 *
3083 * note: if version=0, type=LOG, then we just provide our full log.
3084 */
3085struct pg_query_t {
3086 enum {
3087 INFO = 0,
3088 LOG = 1,
3089 MISSING = 4,
3090 FULLLOG = 5,
3091 };
3092 const char *get_type_name() const {
3093 switch (type) {
3094 case INFO: return "info";
3095 case LOG: return "log";
3096 case MISSING: return "missing";
3097 case FULLLOG: return "fulllog";
3098 default: return "???";
3099 }
3100 }
3101
3102 __s32 type;
3103 eversion_t since;
3104 pg_history_t history;
3105 epoch_t epoch_sent;
3106 shard_id_t to;
3107 shard_id_t from;
3108
3109 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3110 from(shard_id_t::NO_SHARD) {}
3111 pg_query_t(
3112 int t,
3113 shard_id_t to,
3114 shard_id_t from,
3115 const pg_history_t& h,
3116 epoch_t epoch_sent)
3117 : type(t),
3118 history(h),
3119 epoch_sent(epoch_sent),
3120 to(to), from(from) {
3121 assert(t != LOG);
3122 }
3123 pg_query_t(
3124 int t,
3125 shard_id_t to,
3126 shard_id_t from,
3127 eversion_t s,
3128 const pg_history_t& h,
3129 epoch_t epoch_sent)
3130 : type(t), since(s), history(h),
3131 epoch_sent(epoch_sent), to(to), from(from) {
3132 assert(t == LOG);
3133 }
3134
3135 void encode(bufferlist &bl, uint64_t features) const;
3136 void decode(bufferlist::iterator &bl);
3137
3138 void dump(Formatter *f) const;
3139 static void generate_test_instances(list<pg_query_t*>& o);
3140};
3141WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3142
3143inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3144 out << "query(" << q.get_type_name() << " " << q.since;
3145 if (q.type == pg_query_t::LOG)
3146 out << " " << q.history;
3147 out << ")";
3148 return out;
3149}
3150
3151class PGBackend;
/**
 * ObjectModDesc - compact description of how an operation modified an
 * object, recorded so the modification can be rolled back locally.
 *
 * Rollback steps are appended to @a bl as (ModID, payload) records inside
 * ENCODE_START/ENCODE_FINISH frames; once rollback info is complete or the
 * change is unrollbackable, further appends become no-ops.
 */
class ObjectModDesc {
  bool can_local_rollback;       // false once mark_unrollbackable() is called
  bool rollback_info_completed;  // true once a terminal record (delete/create) is appended

  // version required to decode, reflected in encode/decode version
  __u8 max_required_version = 1;
public:
  /// Callback interface invoked by visit() for each recorded step.
  class Visitor {
  public:
    virtual void append(uint64_t old_offset) {}
    virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
    virtual void rmobject(version_t old_version) {}
    /**
     * Used to support the unfound_lost_delete log event: if the stashed
     * version exists, we unstash it, otherwise, we do nothing. This way
     * each replica rolls back to whatever state it had prior to the attempt
     * at mark unfound lost delete
     */
    virtual void try_rmobject(version_t old_version) {
      rmobject(old_version);
    }
    virtual void create() {}
    virtual void update_snaps(const set<snapid_t> &old_snaps) {}
    virtual void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) {}
    virtual ~Visitor() {}
  };
  /// Replay the recorded steps through @p visitor (defined out of line).
  void visit(Visitor *visitor) const;
  mutable bufferlist bl;  // the encoded rollback records (mutable for trim_bl)
  enum ModID {
    APPEND = 1,
    SETATTRS = 2,
    DELETE = 3,
    CREATE = 4,
    UPDATE_SNAPS = 5,
    TRY_DELETE = 6,
    ROLLBACK_EXTENTS = 7
  };
  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
    // account rollback buffers against the pglog mempool
    bl.reassign_to_mempool(mempool::mempool_osd_pglog);
  }
  /// Take over @p other's buffer and flags (leaves other's bl empty).
  void claim(ObjectModDesc &other) {
    bl.clear();
    bl.claim(other.bl);
    can_local_rollback = other.can_local_rollback;
    rollback_info_completed = other.rollback_info_completed;
  }
  /// Append @p other's records after ours; becomes unrollbackable if
  /// either side is.
  void claim_append(ObjectModDesc &other) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    if (!other.can_local_rollback) {
      mark_unrollbackable();
      return;
    }
    bl.claim_append(other.bl);
    rollback_info_completed = other.rollback_info_completed;
  }
  void swap(ObjectModDesc &other) {
    bl.swap(other.bl);

    using std::swap;
    swap(other.can_local_rollback, can_local_rollback);
    swap(other.rollback_info_completed, rollback_info_completed);
    swap(other.max_required_version, max_required_version);
  }
  /// Append the one-byte record tag @p id to @a bl.
  void append_id(ModID id) {
    uint8_t _id(id);
    ::encode(_id, bl);
  }
  /// Record that the object was appended to beyond @p old_size.
  void append(uint64_t old_size) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(APPEND);
    ::encode(old_size, bl);
    ENCODE_FINISH(bl);
  }
  /// Record the previous values of the attrs being overwritten.
  void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(SETATTRS);
    ::encode(old_attrs, bl);
    ENCODE_FINISH(bl);
  }
  /// Record a delete (terminal: completes the rollback info).
  bool rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// Like rmobject(), but replayed via Visitor::try_rmobject (best-effort).
  bool try_rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(TRY_DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// Record object creation (terminal: rollback is simply removal).
  void create() {
    if (!can_local_rollback || rollback_info_completed)
      return;
    rollback_info_completed = true;
    ENCODE_START(1, 1, bl);
    append_id(CREATE);
    ENCODE_FINISH(bl);
  }
  /// Record the previous snap set.
  void update_snaps(const set<snapid_t> &old_snaps) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(UPDATE_SNAPS);
    ::encode(old_snaps, bl);
    ENCODE_FINISH(bl);
  }
  /// Record rollback extents; bumps the required decode version to 2
  /// (kraken), see requires_kraken().
  void rollback_extents(
    version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
    assert(can_local_rollback);
    assert(!rollback_info_completed);
    if (max_required_version < 2)
      max_required_version = 2;
    ENCODE_START(2, 2, bl);
    append_id(ROLLBACK_EXTENTS);
    ::encode(gen, bl);
    ::encode(extents, bl);
    ENCODE_FINISH(bl);
  }

  // cannot be rolled back
  void mark_unrollbackable() {
    can_local_rollback = false;
    bl.clear();
  }
  bool can_rollback() const {
    return can_local_rollback;
  }
  bool empty() const {
    return can_local_rollback && (bl.length() == 0);
  }

  bool requires_kraken() const {
    return max_required_version >= 2;
  }

  /**
   * Create fresh copy of bl bytes to avoid keeping large buffers around
   * in the case that bl contains ptrs which point into a much larger
   * message buffer
   */
  void trim_bl() const {
    if (bl.length() > 0)
      bl.rebuild();
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ObjectModDesc*>& o);
};
3317WRITE_CLASS_ENCODER(ObjectModDesc)
3318
3319
3320/**
3321 * pg_log_entry_t - single entry/event in pg log
3322 *
3323 */
3324struct pg_log_entry_t {
3325 enum {
3326 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3327 CLONE = 2, // cloned object from head
3328 DELETE = 3, // deleted object
3329 BACKLOG = 4, // event invented by generate_backlog [deprecated]
3330 LOST_REVERT = 5, // lost new version, revert to an older version.
3331 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3332 LOST_MARK = 7, // lost new version, now EIO
3333 PROMOTE = 8, // promoted object from another tier
3334 CLEAN = 9, // mark an object clean
3335 ERROR = 10, // write that returned an error
3336 };
3337 static const char *get_op_name(int op) {
3338 switch (op) {
3339 case MODIFY:
3340 return "modify";
3341 case PROMOTE:
3342 return "promote";
3343 case CLONE:
3344 return "clone";
3345 case DELETE:
3346 return "delete";
3347 case BACKLOG:
3348 return "backlog";
3349 case LOST_REVERT:
3350 return "l_revert";
3351 case LOST_DELETE:
3352 return "l_delete";
3353 case LOST_MARK:
3354 return "l_mark";
3355 case CLEAN:
3356 return "clean";
3357 case ERROR:
3358 return "error";
3359 default:
3360 return "unknown";
3361 }
3362 }
3363 const char *get_op_name() const {
3364 return get_op_name(op);
3365 }
3366
3367 // describes state for a locally-rollbackable entry
3368 ObjectModDesc mod_desc;
3369 bufferlist snaps; // only for clone entries
3370 hobject_t soid;
3371 osd_reqid_t reqid; // caller+tid to uniquely identify request
31f18b77 3372 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
7c673cae
FG
3373 eversion_t version, prior_version, reverting_to;
3374 version_t user_version; // the user version for this entry
3375 utime_t mtime; // this is the _user_ mtime, mind you
3376 int32_t return_code; // only stored for ERRORs for dup detection
3377
3378 __s32 op;
3379 bool invalid_hash; // only when decoding sobject_t based entries
3380 bool invalid_pool; // only when decoding pool-less hobject based entries
3381
3382 pg_log_entry_t()
3383 : user_version(0), return_code(0), op(0),
31f18b77
FG
3384 invalid_hash(false), invalid_pool(false) {
3385 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3386 }
7c673cae
FG
3387 pg_log_entry_t(int _op, const hobject_t& _soid,
3388 const eversion_t& v, const eversion_t& pv,
3389 version_t uv,
3390 const osd_reqid_t& rid, const utime_t& mt,
3391 int return_code)
3392 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3393 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
3394 invalid_hash(false), invalid_pool(false) {
3395 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3396 }
7c673cae
FG
3397
3398 bool is_clone() const { return op == CLONE; }
3399 bool is_modify() const { return op == MODIFY; }
3400 bool is_promote() const { return op == PROMOTE; }
3401 bool is_clean() const { return op == CLEAN; }
3402 bool is_backlog() const { return op == BACKLOG; }
3403 bool is_lost_revert() const { return op == LOST_REVERT; }
3404 bool is_lost_delete() const { return op == LOST_DELETE; }
3405 bool is_lost_mark() const { return op == LOST_MARK; }
3406 bool is_error() const { return op == ERROR; }
3407
3408 bool is_update() const {
3409 return
3410 is_clone() || is_modify() || is_promote() || is_clean() ||
3411 is_backlog() || is_lost_revert() || is_lost_mark();
3412 }
3413 bool is_delete() const {
3414 return op == DELETE || op == LOST_DELETE;
3415 }
3416
3417 bool can_rollback() const {
3418 return mod_desc.can_rollback();
3419 }
3420
3421 void mark_unrollbackable() {
3422 mod_desc.mark_unrollbackable();
3423 }
3424
3425 bool requires_kraken() const {
3426 return mod_desc.requires_kraken();
3427 }
3428
3429 // Errors are only used for dup detection, whereas
3430 // the index by objects is used by recovery, copy_get,
3431 // and other facilities that don't expect or need to
3432 // be aware of error entries.
3433 bool object_is_indexed() const {
3434 return !is_error();
3435 }
3436
3437 bool reqid_is_indexed() const {
3438 return reqid != osd_reqid_t() &&
3439 (op == MODIFY || op == DELETE || op == ERROR);
3440 }
3441
3442 string get_key_name() const;
3443 void encode_with_checksum(bufferlist& bl) const;
3444 void decode_with_checksum(bufferlist::iterator& p);
3445
3446 void encode(bufferlist &bl) const;
3447 void decode(bufferlist::iterator &bl);
3448 void dump(Formatter *f) const;
3449 static void generate_test_instances(list<pg_log_entry_t*>& o);
3450
3451};
3452WRITE_CLASS_ENCODER(pg_log_entry_t)
3453
3454ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3455
c07f9fc5
FG
/**
 * pg_log_dup_t - compact per-request record kept in pg_log_t::dups,
 * used only for duplicate-op detection (no object id is stored).
 */
struct pg_log_dup_t {
  osd_reqid_t reqid;  // caller+tid to uniquely identify request
  eversion_t version;
  version_t user_version; // the user version for this entry
  int32_t return_code; // only stored for ERRORs for dup detection

  pg_log_dup_t()
    : user_version(0), return_code(0)
  {}
  /// Extract the dup-detection fields from a full log entry.
  explicit pg_log_dup_t(const pg_log_entry_t& entry)
    : reqid(entry.reqid), version(entry.version),
      user_version(entry.user_version), return_code(entry.return_code)
  {}
  pg_log_dup_t(const eversion_t& v, version_t uv,
	       const osd_reqid_t& rid, int return_code)
    : reqid(rid), version(v), user_version(uv),
      return_code(return_code)
  {}

  string get_key_name() const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_log_dup_t*>& o);

  bool operator==(const pg_log_dup_t &rhs) const {
    return reqid == rhs.reqid &&
      version == rhs.version &&
      user_version == rhs.user_version &&
      return_code == rhs.return_code;
  }
  bool operator!=(const pg_log_dup_t &rhs) const {
    return !(*this == rhs);
  }

  friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
};
3493WRITE_CLASS_ENCODER(pg_log_dup_t)
3494
3495std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
3496
3497/**
3498 * pg_log_t - incremental log of recent pg changes.
3499 *
3500 * serves as a recovery queue for recent changes.
3501 */
3502struct pg_log_t {
3503 /*
3504 * head - newest entry (update|delete)
3505 * tail - entry previous to oldest (update|delete) for which we have
3506 * complete negative information.
3507 * i.e. we can infer pg contents for any store whose last_update >= tail.
3508 */
3509 eversion_t head; // newest entry
3510 eversion_t tail; // version prior to oldest
3511
3512protected:
3513 // We can rollback rollback-able entries > can_rollback_to
3514 eversion_t can_rollback_to;
3515
3516 // always <= can_rollback_to, indicates how far stashed rollback
3517 // data can be found
3518 eversion_t rollback_info_trimmed_to;
3519
3520public:
c07f9fc5
FG
3521 // the actual log
3522 mempool::osd_pglog::list<pg_log_entry_t> log;
3523
3524 // entries just for dup op detection ordered oldest to newest
3525 mempool::osd_pglog::list<pg_log_dup_t> dups;
3526
7c673cae
FG
3527 pg_log_t() = default;
3528 pg_log_t(const eversion_t &last_update,
3529 const eversion_t &log_tail,
3530 const eversion_t &can_rollback_to,
3531 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3532 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3533 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
3534 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3535 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 3536 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
3537 pg_log_t(const eversion_t &last_update,
3538 const eversion_t &log_tail,
3539 const eversion_t &can_rollback_to,
3540 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3541 const std::list<pg_log_entry_t> &entries,
3542 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
3543 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3544 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3545 for (auto &&entry: entries) {
3546 log.push_back(entry);
3547 }
c07f9fc5
FG
3548 for (auto &&entry: dup_entries) {
3549 dups.push_back(entry);
3550 }
7c673cae
FG
3551 }
3552
3553 void clear() {
3554 eversion_t z;
3555 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3556 log.clear();
c07f9fc5 3557 dups.clear();
7c673cae
FG
3558 }
3559
3560 eversion_t get_rollback_info_trimmed_to() const {
3561 return rollback_info_trimmed_to;
3562 }
3563 eversion_t get_can_rollback_to() const {
3564 return can_rollback_to;
3565 }
3566
3567
3568 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 3569 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
3570 oldlog.swap(log);
3571
3572 eversion_t old_tail;
3573 unsigned mask = ~((~0)<<split_bits);
3574 for (auto i = oldlog.begin();
3575 i != oldlog.end();
3576 ) {
3577 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3578 childlog.push_back(*i);
3579 } else {
3580 log.push_back(*i);
3581 }
3582 oldlog.erase(i++);
3583 }
3584
c07f9fc5
FG
3585 // osd_reqid is unique, so it doesn't matter if there are extra
3586 // dup entries in each pg. To avoid storing oid with the dup
3587 // entries, just copy the whole list.
3588 auto childdups(dups);
3589
7c673cae
FG
3590 return pg_log_t(
3591 head,
3592 tail,
3593 can_rollback_to,
3594 rollback_info_trimmed_to,
c07f9fc5
FG
3595 std::move(childlog),
3596 std::move(childdups));
3597 }
7c673cae 3598
31f18b77 3599 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
7c673cae
FG
3600 assert(newhead >= tail);
3601
31f18b77
FG
3602 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
3603 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
3604 while (true) {
3605 if (p == log.begin()) {
3606 // yikes, the whole thing is divergent!
31f18b77
FG
3607 using std::swap;
3608 swap(divergent, log);
7c673cae
FG
3609 break;
3610 }
3611 --p;
3612 if (p->version.version <= newhead.version) {
3613 /*
3614 * look at eversion.version here. we want to avoid a situation like:
3615 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3616 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3617 * lower_bound = 100'9
3618 * i.e, same request, different version. If the eversion.version is > the
3619 * lower_bound, we it is divergent.
3620 */
3621 ++p;
3622 divergent.splice(divergent.begin(), log, p, log.end());
3623 break;
3624 }
3625 assert(p->version > newhead);
3626 }
3627 head = newhead;
3628
3629 if (can_rollback_to > newhead)
3630 can_rollback_to = newhead;
3631
3632 if (rollback_info_trimmed_to > newhead)
3633 rollback_info_trimmed_to = newhead;
3634
3635 return divergent;
3636 }
3637
3638 bool empty() const {
3639 return log.empty();
3640 }
3641
3642 bool null() const {
3643 return head.version == 0 && head.epoch == 0;
3644 }
3645
3646 size_t approx_size() const {
3647 return head.version - tail.version;
3648 }
3649
3650 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
3651 const string &hit_set_namespace, const pg_log_t &in,
3652 pg_log_t &out, pg_log_t &reject);
3653
3654 /**
3655 * copy entries from the tail of another pg_log_t
3656 *
3657 * @param other pg_log_t to copy from
3658 * @param from copy entries after this version
3659 */
3660 void copy_after(const pg_log_t &other, eversion_t from);
3661
3662 /**
3663 * copy a range of entries from another pg_log_t
3664 *
3665 * @param other pg_log_t to copy from
3666 * @param from copy entries after this version
3667 * @param to up to and including this version
3668 */
3669 void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
3670
3671 /**
3672 * copy up to N entries
3673 *
3674 * @param other source log
3675 * @param max max number of entries to copy
3676 */
3677 void copy_up_to(const pg_log_t &other, int max);
3678
3679 ostream& print(ostream& out) const;
3680
3681 void encode(bufferlist &bl) const;
3682 void decode(bufferlist::iterator &bl, int64_t pool = -1);
3683 void dump(Formatter *f) const;
3684 static void generate_test_instances(list<pg_log_t*>& o);
3685};
3686WRITE_CLASS_ENCODER(pg_log_t)
3687
c07f9fc5 3688inline ostream& operator<<(ostream& out, const pg_log_t& log)
7c673cae
FG
3689{
3690 out << "log((" << log.tail << "," << log.head << "], crt="
3691 << log.get_can_rollback_to() << ")";
3692 return out;
3693}
3694
3695
3696/**
3697 * pg_missing_t - summary of missing objects.
3698 *
3699 * kept in memory, as a supplement to pg_log_t
3700 * also used to pass missing info in messages.
3701 */
3702struct pg_missing_item {
3703 eversion_t need, have;
c07f9fc5
FG
3704 enum missing_flags_t {
3705 FLAG_NONE = 0,
3706 FLAG_DELETE = 1,
3707 } flags;
3708 pg_missing_item() : flags(FLAG_NONE) {}
3709 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
3710 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
3711 set_delete(is_delete);
3712 }
3713
3714 void encode(bufferlist& bl, uint64_t features) const {
3715 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
3716 // encoding a zeroed eversion_t to differentiate between this and
3717 // legacy unversioned encoding - a need value of 0'0 is not
3718 // possible. This can be replaced with the legacy encoding
3719 // macros post-luminous.
3720 eversion_t e;
3721 ::encode(e, bl);
3722 ::encode(need, bl);
3723 ::encode(have, bl);
3724 ::encode(static_cast<uint8_t>(flags), bl);
3725 } else {
3726 // legacy unversioned encoding
3727 ::encode(need, bl);
3728 ::encode(have, bl);
3729 }
7c673cae
FG
3730 }
3731 void decode(bufferlist::iterator& bl) {
c07f9fc5
FG
3732 eversion_t e;
3733 ::decode(e, bl);
3734 if (e != eversion_t()) {
3735 // legacy encoding, this is the need value
3736 need = e;
3737 ::decode(have, bl);
3738 } else {
3739 ::decode(need, bl);
3740 ::decode(have, bl);
3741 uint8_t f;
3742 ::decode(f, bl);
3743 flags = static_cast<missing_flags_t>(f);
3744 }
3745 }
3746
3747 void set_delete(bool is_delete) {
3748 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
3749 }
3750
3751 bool is_delete() const {
3752 return (flags & FLAG_DELETE) == FLAG_DELETE;
3753 }
3754
3755 string flag_str() const {
3756 if (flags == FLAG_NONE) {
3757 return "none";
3758 } else {
3759 return "delete";
3760 }
7c673cae 3761 }
c07f9fc5 3762
7c673cae
FG
3763 void dump(Formatter *f) const {
3764 f->dump_stream("need") << need;
3765 f->dump_stream("have") << have;
c07f9fc5 3766 f->dump_stream("flags") << flag_str();
7c673cae
FG
3767 }
3768 static void generate_test_instances(list<pg_missing_item*>& o) {
3769 o.push_back(new pg_missing_item);
3770 o.push_back(new pg_missing_item);
3771 o.back()->need = eversion_t(1, 2);
3772 o.back()->have = eversion_t(1, 1);
c07f9fc5
FG
3773 o.push_back(new pg_missing_item);
3774 o.back()->need = eversion_t(3, 5);
3775 o.back()->have = eversion_t(3, 4);
3776 o.back()->flags = FLAG_DELETE;
7c673cae
FG
3777 }
3778 bool operator==(const pg_missing_item &rhs) const {
c07f9fc5 3779 return need == rhs.need && have == rhs.have && flags == rhs.flags;
7c673cae
FG
3780 }
3781 bool operator!=(const pg_missing_item &rhs) const {
3782 return !(*this == rhs);
3783 }
3784};
c07f9fc5 3785WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
7c673cae
FG
3786ostream& operator<<(ostream& out, const pg_missing_item &item);
3787
/// Read-only interface over a missing set (implemented by pg_missing_set).
class pg_missing_const_i {
public:
  virtual const map<hobject_t, pg_missing_item> &
    get_items() const = 0;
  /// reverse index: need.version -> oid
  virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
  virtual bool get_may_include_deletes() const = 0;
  virtual unsigned int num_missing() const = 0;
  virtual bool have_missing() const = 0;
  virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
  virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
  virtual eversion_t have_old(const hobject_t& oid) const = 0;
  virtual ~pg_missing_const_i() {}
};
3801
3802
/**
 * ChangeTracker - optionally records which objects were touched.
 *
 * The primary (Track == false) template is a stateless no-op, so all
 * tracking overhead compiles away; see the <true> specialization below.
 */
template <bool Track>
class ChangeTracker {
public:
  void changed(const hobject_t &obj) {}
  template <typename F>
  void get_changed(F &&f) const {}
  void flush() {}
  bool is_clean() const {
    return true;
  }
};
3814template <>
3815class ChangeTracker<true> {
3816 set<hobject_t> _changed;
3817public:
3818 void changed(const hobject_t &obj) {
3819 _changed.insert(obj);
3820 }
3821 template <typename F>
3822 void get_changed(F &&f) const {
3823 for (auto const &i: _changed) {
3824 f(i);
3825 }
3826 }
3827 void flush() {
3828 _changed.clear();
3829 }
3830 bool is_clean() const {
3831 return _changed.empty();
3832 }
3833};
3834
3835template <bool TrackChanges>
3836class pg_missing_set : public pg_missing_const_i {
3837 using item = pg_missing_item;
3838 map<hobject_t, item> missing; // oid -> (need v, have v)
3839 map<version_t, hobject_t> rmissing; // v -> oid
3840 ChangeTracker<TrackChanges> tracker;
3841
3842public:
3843 pg_missing_set() = default;
3844
  /// Copy-construct from any source exposing the pg_missing_const_i
  /// accessors, marking every carried-over object as changed.
  template <typename missing_type>
  pg_missing_set(const missing_type &m) {
    missing = m.get_items();
    rmissing = m.get_rmissing();
    may_include_deletes = m.get_may_include_deletes();
    for (auto &&i: missing)
      tracker.changed(i.first);
  }
3853
c07f9fc5
FG
  // whether FLAG_DELETE entries may appear in this set
  // (NOTE(review): presumably gated on the OSD_RECOVERY_DELETES feature --
  // confirm against pg_missing_item::encode)
  bool may_include_deletes = false;

  const map<hobject_t, item> &get_items() const override {
    return missing;
  }
  const map<version_t, hobject_t> &get_rmissing() const override {
    return rmissing;
  }
  bool get_may_include_deletes() const override {
    return may_include_deletes;
  }
  unsigned int num_missing() const override {
    return missing.size();
  }
  bool have_missing() const override {
    return !missing.empty();
  }
  /// true if @p oid is missing; optionally copies its item into @p out.
  bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
    auto iter = missing.find(oid);
    if (iter == missing.end())
      return false;
    if (out)
      *out = iter->second;
    return true;
  }
  /// true if @p oid is missing with need <= @p v.
  bool is_missing(const hobject_t& oid, eversion_t v) const override {
    map<hobject_t, item>::const_iterator m =
      missing.find(oid);
    if (m == missing.end())
      return false;
    const item &item(m->second);
    if (item.need > v)
      return false;
    return true;
  }
  /// version we still have for @p oid, or 0'0 if oid is not missing.
  eversion_t have_old(const hobject_t& oid) const override {
    map<hobject_t, item>::const_iterator m =
      missing.find(oid);
    if (m == missing.end())
      return eversion_t();
    const item &item(m->second);
    return item.have;
  }
3897
  /// Steal the contents of @p o; only allowed when change tracking is off
  /// (the tracker would otherwise be left inconsistent).
  void claim(pg_missing_set& o) {
    static_assert(!TrackChanges, "Can't use claim with TrackChanges");
    missing.swap(o.missing);
    rmissing.swap(o.rmissing);
  }
3903
  /*
   * this needs to be called in log order as we extend the log.  it
   * assumes missing is accurate up through the previous log entry.
   */
  void add_next_event(const pg_log_entry_t& e) {
    map<hobject_t, item>::iterator missing_it;
    missing_it = missing.find(e.soid);
    bool is_missing_divergent_item = missing_it != missing.end();
    if (e.prior_version == eversion_t() || e.is_clone()) {
      // new object.
      if (is_missing_divergent_item) {  // use iterator
	rmissing.erase((missing_it->second).need.version);
	missing_it->second = item(e.version, eversion_t(), e.is_delete());  // .have = nil
      } else  // create new element in missing map
	missing[e.soid] = item(e.version, eversion_t(), e.is_delete());     // .have = nil
    } else if (is_missing_divergent_item) {
      // already missing (prior).
      rmissing.erase((missing_it->second).need.version);
      (missing_it->second).need = e.version;  // leave .have unchanged.
      missing_it->second.set_delete(e.is_delete());
    } else if (e.is_backlog()) {
      // May not have prior version
      assert(0 == "these don't exist anymore");
    } else {
      // not missing, we must have prior_version (if any)
      assert(!is_missing_divergent_item);
      missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
    }
    // keep the reverse (version -> oid) index in sync
    rmissing[e.version.version] = e.soid;
    tracker.changed(e.soid);
  }
3935
c07f9fc5 3936 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
7c673cae
FG
3937 if (missing.count(oid)) {
3938 rmissing.erase(missing[oid].need.version);
3939 missing[oid].need = need; // no not adjust .have
c07f9fc5 3940 missing[oid].set_delete(is_delete);
7c673cae 3941 } else {
c07f9fc5 3942 missing[oid] = item(need, eversion_t(), is_delete);
7c673cae
FG
3943 }
3944 rmissing[need.version] = oid;
3945
3946 tracker.changed(oid);
3947 }
3948
3949 void revise_have(hobject_t oid, eversion_t have) {
3950 if (missing.count(oid)) {
3951 tracker.changed(oid);
3952 missing[oid].have = have;
3953 }
3954 }
3955
c07f9fc5
FG
  /// Record oid as missing: we need version `need`, we locally have `have`.
  void add(const hobject_t& oid, eversion_t need, eversion_t have,
	   bool is_delete) {
    missing[oid] = item(need, have, is_delete);
    rmissing[need.version] = oid;
    tracker.changed(oid);
  }
3963 void rm(const hobject_t& oid, eversion_t v) {
3964 std::map<hobject_t, item>::iterator p = missing.find(oid);
3965 if (p != missing.end() && p->second.need <= v)
3966 rm(p);
3967 }
3968
  /// Remove the entry at m; tracker is notified before erasure so m->first
  /// is still valid when changed() runs.
  void rm(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }
  /// Mark oid as recovered at version v.  The object must be missing, and v
  /// must satisfy the needed version (deletes are exempt from that check).
  void got(const hobject_t& oid, eversion_t v) {
    std::map<hobject_t, item>::iterator p = missing.find(oid);
    assert(p != missing.end());
    assert(p->second.need <= v || p->second.is_delete());
    got(p);
  }
  /// Mark the entry at m as recovered; tracker is notified before erasure.
  void got(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }
3988 void split_into(
3989 pg_t child_pgid,
3990 unsigned split_bits,
3991 pg_missing_set *omissing) {
c07f9fc5 3992 omissing->may_include_deletes = may_include_deletes;
7c673cae
FG
3993 unsigned mask = ~((~0)<<split_bits);
3994 for (map<hobject_t, item>::iterator i = missing.begin();
3995 i != missing.end();
3996 ) {
3997 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
c07f9fc5
FG
3998 omissing->add(i->first, i->second.need, i->second.have,
3999 i->second.is_delete());
7c673cae
FG
4000 rm(i++);
4001 } else {
4002 ++i;
4003 }
4004 }
4005 }
4006
  /// Forget everything, notifying the change tracker for each object first.
  void clear() {
    for (auto const &i: missing)
      tracker.changed(i.first);
    missing.clear();
    rmissing.clear();
  }
  /// Encode (struct_v 4).  The OSD_RECOVERY_DELETES feature bit is passed to
  /// the item encoding only when may_include_deletes is set.
  void encode(bufferlist &bl) const {
    ENCODE_START(4, 2, bl);
    ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
    ::encode(may_include_deletes, bl);
    ENCODE_FINISH(bl);
  }
  /**
   * Decode.  Every object present before and after decoding is reported to
   * the change tracker, and the rmissing reverse map is rebuilt.  `pool` is
   * used to backfill the pool field of pre-struct_v-3 hobject_t keys.
   */
  void decode(bufferlist::iterator &bl, int64_t pool = -1) {
    // objects about to be replaced count as changed
    for (auto const &i: missing)
      tracker.changed(i.first);
    DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
    ::decode(missing, bl);
    if (struct_v >= 4) {
      ::decode(may_include_deletes, bl);
    }
    DECODE_FINISH(bl);

    if (struct_v < 3) {
      // Handle hobject_t upgrade: legacy keys carry pool == -1 and must be
      // re-keyed with the caller-supplied pool.
      map<hobject_t, item> tmp;
      for (map<hobject_t, item>::iterator i =
	     missing.begin();
	   i != missing.end();
	   ) {
	if (!i->first.is_max() && i->first.pool == -1) {
	  hobject_t to_insert(i->first);
	  to_insert.pool = pool;
	  tmp[to_insert] = i->second;
	  missing.erase(i++);
	} else {
	  ++i;
	}
      }
      missing.insert(tmp.begin(), tmp.end());
    }

    // rebuild the need.version -> object reverse map
    for (map<hobject_t,item>::iterator it =
	   missing.begin();
	 it != missing.end();
	 ++it)
      rmissing[it->second.need.version] = it->first;
    for (auto const &i: missing)
      tracker.changed(i.first);
  }
4057 void dump(Formatter *f) const {
4058 f->open_array_section("missing");
4059 for (map<hobject_t,item>::const_iterator p =
4060 missing.begin(); p != missing.end(); ++p) {
4061 f->open_object_section("item");
4062 f->dump_stream("object") << p->first;
4063 p->second.dump(f);
4064 f->close_section();
4065 }
4066 f->close_section();
c07f9fc5 4067 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4068 }
  /// Remove every missing entry whose object satisfies predicate f.
  template <typename F>
  void filter_objects(F &&f) {
    for (auto i = missing.begin(); i != missing.end();) {
      if (f(i->first)) {
	rm(i++);
      } else {
	++i;
      }
    }
  }
  /// Test fixtures: empty set, one plain missing object, one missing delete.
  static void generate_test_instances(list<pg_missing_set*>& o) {
    o.push_back(new pg_missing_set);
    o.push_back(new pg_missing_set);
    o.back()->add(
      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
      eversion_t(5, 6), eversion_t(5, 1), false);
    o.push_back(new pg_missing_set);
    o.back()->add(
      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
      eversion_t(5, 6), eversion_t(5, 1), true);
    o.back()->may_include_deletes = true;
  }
  /// Invoke f on every object recorded as changed by the tracker.
  template <typename F>
  void get_changed(F &&f) const {
    tracker.get_changed(f);
  }
  /// Flush the change tracker's accumulated state.
  void flush() {
    tracker.flush();
  }
  /// True when the change tracker has no pending changes.
  bool is_clean() const {
    return tracker.is_clean();
  }
  /**
   * Debug check: replay the tracked changes on top of init_missing and
   * verify the result matches the current missing map.  Always true when
   * TrackChanges is off.  Mismatch details are written to *oss if non-null.
   */
  template <typename missing_t>
  bool debug_verify_from_init(
    const missing_t &init_missing,
    ostream *oss) const {
    if (!TrackChanges)
      return true;
    // start from the initial items and apply each tracked change
    auto check_missing(init_missing.get_items());
    tracker.get_changed([&](const hobject_t &hoid) {
      check_missing.erase(hoid);
      if (missing.count(hoid)) {
	check_missing.insert(*(missing.find(hoid)));
      }
    });
    bool ok = true;
    if (check_missing.size() != missing.size()) {
      if (oss) {
	*oss << "Size mismatch, check: " << check_missing.size()
	     << ", actual: " << missing.size() << "\n";
      }
      ok = false;
    }
    for (auto &i: missing) {
      if (!check_missing.count(i.first)) {
	if (oss)
	  *oss << "check_missing missing " << i.first << "\n";
	ok = false;
      } else if (check_missing[i.first] != i.second) {
	if (oss)
	  *oss << "check_missing missing item mismatch on " << i.first
	       << ", check: " << check_missing[i.first]
	       << ", actual: " << i.second << "\n";
	ok = false;
      }
    }
    if (oss && !ok) {
      *oss << "check_missing: " << check_missing << "\n";
      set<hobject_t> changed;
      tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
      *oss << "changed: " << changed << "\n";
    }
    return ok;
  }
4143};
/// Free-function shim so pg_missing_set works with the generic ::encode()
/// machinery; `features` is accepted for signature compatibility but unused.
template <bool TrackChanges>
void encode(
  const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
  ENCODE_DUMP_PRE();
  c.encode(bl);
  ENCODE_DUMP_POST(cl);  // NOTE(review): 'cl' is stringified by the macro — presumably intentional; confirm against encoding.h
}
/// Free-function shim so pg_missing_set works with the generic ::decode().
template <bool TrackChanges>
void decode(pg_missing_set<TrackChanges> &c, bufferlist::iterator &p) {
  c.decode(p);
}
/// Print a brief summary: count of missing objects and the deletes flag.
template <bool TrackChanges>
ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
{
  out << "missing(" << missing.num_missing()
      << " may_include_deletes = " << missing.may_include_deletes;
  //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
  out << ")";
  return out;
}
4164
/// Plain missing set (no change tracking) vs. tracking variant.
using pg_missing_t = pg_missing_set<false>;
using pg_missing_tracker_t = pg_missing_set<true>;

4169/**
4170 * pg list objects response format
4171 *
4172 */
4173struct pg_nls_response_t {
4174 collection_list_handle_t handle;
4175 list<librados::ListObjectImpl> entries;
4176
4177 void encode(bufferlist& bl) const {
4178 ENCODE_START(1, 1, bl);
4179 ::encode(handle, bl);
4180 __u32 n = (__u32)entries.size();
4181 ::encode(n, bl);
4182 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
4183 ::encode(i->nspace, bl);
4184 ::encode(i->oid, bl);
4185 ::encode(i->locator, bl);
4186 }
4187 ENCODE_FINISH(bl);
4188 }
4189 void decode(bufferlist::iterator& bl) {
4190 DECODE_START(1, bl);
4191 ::decode(handle, bl);
4192 __u32 n;
4193 ::decode(n, bl);
4194 entries.clear();
4195 while (n--) {
4196 librados::ListObjectImpl i;
4197 ::decode(i.nspace, bl);
4198 ::decode(i.oid, bl);
4199 ::decode(i.locator, bl);
4200 entries.push_back(i);
4201 }
4202 DECODE_FINISH(bl);
4203 }
4204 void dump(Formatter *f) const {
4205 f->dump_stream("handle") << handle;
4206 f->open_array_section("entries");
4207 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4208 f->open_object_section("object");
4209 f->dump_string("namespace", p->nspace);
4210 f->dump_string("object", p->oid);
4211 f->dump_string("key", p->locator);
4212 f->close_section();
4213 }
4214 f->close_section();
4215 }
4216 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4217 o.push_back(new pg_nls_response_t);
4218 o.push_back(new pg_nls_response_t);
4219 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4220 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4221 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4222 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4223 o.push_back(new pg_nls_response_t);
4224 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4225 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4226 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4227 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4228 o.push_back(new pg_nls_response_t);
4229 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4230 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4231 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4232 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4233 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4234 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4235 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4236 }
4237};
4238
4239WRITE_CLASS_ENCODER(pg_nls_response_t)
4240
4241// For backwards compatibility with older OSD requests
4242struct pg_ls_response_t {
4243 collection_list_handle_t handle;
4244 list<pair<object_t, string> > entries;
4245
4246 void encode(bufferlist& bl) const {
4247 __u8 v = 1;
4248 ::encode(v, bl);
4249 ::encode(handle, bl);
4250 ::encode(entries, bl);
4251 }
4252 void decode(bufferlist::iterator& bl) {
4253 __u8 v;
4254 ::decode(v, bl);
4255 assert(v == 1);
4256 ::decode(handle, bl);
4257 ::decode(entries, bl);
4258 }
4259 void dump(Formatter *f) const {
4260 f->dump_stream("handle") << handle;
4261 f->open_array_section("entries");
4262 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4263 f->open_object_section("object");
4264 f->dump_stream("object") << p->first;
4265 f->dump_string("key", p->second);
4266 f->close_section();
4267 }
4268 f->close_section();
4269 }
4270 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4271 o.push_back(new pg_ls_response_t);
4272 o.push_back(new pg_ls_response_t);
4273 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4274 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4275 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4276 }
4277};
4278
4279WRITE_CLASS_ENCODER(pg_ls_response_t)
4280
/**
 * object_copy_cursor_t
 *
 * Tracks progress through an object copy: byte offset into the data,
 * position in the omap keyspace, and per-section completion flags.
 */
struct object_copy_cursor_t {
  uint64_t data_offset;   // next byte offset to copy
  string omap_offset;     // last omap key copied ("" = not started)
  bool attr_complete;
  bool data_complete;
  bool omap_complete;

  object_copy_cursor_t()
    : data_offset(0),
      attr_complete(false),
      data_complete(false),
      omap_complete(false)
  {}

  /// True if no progress has been made yet.
  bool is_initial() const {
    return !attr_complete && data_offset == 0 && omap_offset.empty();
  }
  /// True once attrs, data, and omap are all done.
  bool is_complete() const {
    return attr_complete && data_complete && omap_complete;
  }

  static void generate_test_instances(list<object_copy_cursor_t*>& o);
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER(object_copy_cursor_t)
4311
/**
 * object_copy_data_t
 *
 * Return data from a copy request. The semantics are a little strange
 * as a result of the encoding's heritage.
 *
 * In particular, the sender unconditionally fills in the cursor (from what
 * it receives and sends), the size, and the mtime, but is responsible for
 * figuring out whether it should put any data in the attrs, data, or
 * omap members (corresponding to xattrs, object data, and the omap entries)
 * based on external data (the client includes a max amount to return with
 * the copy request). The client then looks into the attrs, data, and/or omap
 * based on the contents of the cursor.
 */
struct object_copy_data_t {
  enum {
    FLAG_DATA_DIGEST = 1<<0,
    FLAG_OMAP_DIGEST = 1<<1,
  };
  object_copy_cursor_t cursor;
  uint64_t size;
  utime_t mtime;
  uint32_t data_digest, omap_digest;
  uint32_t flags;
  map<string, bufferlist> attrs;
  bufferlist data;
  bufferlist omap_header;
  bufferlist omap_data;

  /// which snaps we are defined for (if a snap and not the head)
  vector<snapid_t> snaps;
  /// latest snap seq for the object (if head)
  snapid_t snap_seq;

  /// recent reqids on this object
  mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;

  uint64_t truncate_seq;
  uint64_t truncate_size;

public:
  object_copy_data_t() :
    size((uint64_t)-1), data_digest(-1),
    omap_digest(-1), flags(0),
    truncate_seq(0),
    truncate_size(0) {}

  static void generate_test_instances(list<object_copy_data_t*>& o);
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
4365
/**
 * pg creation info
 */
struct pg_create_t {
  epoch_t created;   // epoch pg created
  pg_t parent;       // split from parent (if != pg_t())
  __s32 split_bits;

  pg_create_t()
    : created(0), split_bits(0) {}
  pg_create_t(unsigned c, pg_t p, int s)
    : created(c), parent(p), split_bits(s) {}

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_create_t*>& o);
};
WRITE_CLASS_ENCODER(pg_create_t)
4385
4386// -----------------------------------------
4387
/// Per-peer OSD statistics; currently only carries a timestamp.
struct osd_peer_stat_t {
  utime_t stamp;

  osd_peer_stat_t() { }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_peer_stat_t*>& o);
};
WRITE_CLASS_ENCODER(osd_peer_stat_t)

ostream& operator<<(ostream& out, const osd_peer_stat_t &stat);
4401
4402
4403// -----------------------------------------
4404
class ObjectExtent {
  /**
   * ObjectExtents are used for specifying IO behavior against RADOS
   * objects when one is using the ObjectCacher.
   *
   * To use this in a real system, *every member* must be filled
   * out correctly. In particular, make sure to initialize the
   * oloc correctly, as its default values are deliberate poison
   * and will cause internal ObjectCacher asserts.
   *
   * Similarly, your buffer_extents vector *must* specify a total
   * size equal to your length. If the buffer_extents inadvertently
   * contain less space than the length member specifies, you
   * will get unintelligible asserts deep in the ObjectCacher.
   *
   * If you are trying to do testing and don't care about actual
   * RADOS function, the simplest thing to do is to initialize
   * the ObjectExtent (truncate_size can be 0), create a single entry
   * in buffer_extents matching the length, and set oloc.pool to 0.
   */
 public:
  object_t    oid;       // object id
  uint64_t    objectno;
  uint64_t    offset;    // in object
  uint64_t    length;    // in object
  uint64_t    truncate_size;	// in object

  object_locator_t oloc;   // object locator (pool etc)

  vector<pair<uint64_t,uint64_t> >  buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)

  ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
  ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
    oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
};
4440
4441inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4442{
4443 return out << "extent("
4444 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4445 << " " << ex.offset << "~" << ex.length
4446 << " -> " << ex.buffer_extents
4447 << ")";
4448}
4449
4450
7c673cae
FG
4451// ---------------------------------------
4452
/// Persistent per-OSD identity and map-range bookkeeping.
class OSDSuperblock {
public:
  uuid_d cluster_fsid, osd_fsid;
  int32_t whoami;    // my role in this fs.
  epoch_t current_epoch;             // most recent epoch
  epoch_t oldest_map, newest_map;    // oldest/newest maps we have.
  double weight;

  CompatSet compat_features;

  // last interval over which i mounted and was then active
  epoch_t mounted;     // last epoch i mounted
  epoch_t clean_thru;  // epoch i was active and clean thru

  OSDSuperblock() :
    whoami(-1),
    current_epoch(0), oldest_map(0), newest_map(0), weight(0),
    mounted(0), clean_thru(0) {
  }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<OSDSuperblock*>& o);
};
WRITE_CLASS_ENCODER(OSDSuperblock)
4479
4480inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4481{
4482 return out << "sb(" << sb.cluster_fsid
4483 << " osd." << sb.whoami
4484 << " " << sb.osd_fsid
4485 << " e" << sb.current_epoch
4486 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4487 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4488 << ")";
4489}
4490
4491
4492// -------
4493
4494
4495
4496
4497
4498
4499/*
4500 * attached to object head. describes most recent snap context, and
4501 * set of existing clones.
4502 */
4503struct SnapSet {
4504 snapid_t seq;
4505 bool head_exists;
4506 vector<snapid_t> snaps; // descending
4507 vector<snapid_t> clones; // ascending
4508 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4509 map<snapid_t, uint64_t> clone_size;
4510 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4511
4512 SnapSet() : seq(0), head_exists(false) {}
4513 explicit SnapSet(bufferlist& bl) {
4514 bufferlist::iterator p = bl.begin();
4515 decode(p);
4516 }
4517
4518 bool is_legacy() const {
4519 return clone_snaps.size() < clones.size() || !head_exists;
4520 }
4521
4522 /// populate SnapSet from a librados::snap_set_t
4523 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4524
4525 /// get space accounted to clone
4526 uint64_t get_clone_bytes(snapid_t clone) const;
4527
4528 void encode(bufferlist& bl) const;
4529 void decode(bufferlist::iterator& bl);
4530 void dump(Formatter *f) const;
4531 static void generate_test_instances(list<SnapSet*>& o);
4532
4533 SnapContext get_ssc_as_of(snapid_t as_of) const {
4534 SnapContext out;
4535 out.seq = as_of;
4536 for (vector<snapid_t>::const_iterator i = snaps.begin();
4537 i != snaps.end();
4538 ++i) {
4539 if (*i <= as_of)
4540 out.snaps.push_back(*i);
4541 }
4542 return out;
4543 }
4544
4545 // return min element of snaps > after, return max if no such element
4546 snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
4547 for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
4548 i != snaps.rend();
4549 ++i) {
4550 if (*i > after)
4551 return *i;
4552 }
4553 return max;
4554 }
4555
4556 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4557 void filter(const pg_pool_t &pinfo);
4558};
4559WRITE_CLASS_ENCODER(SnapSet)
4560
4561ostream& operator<<(ostream& out, const SnapSet& cs);
4562
4563
4564
4565#define OI_ATTR "_"
4566#define SS_ATTR "snapset"
4567
/// A registered watch on an object: client cookie, timeout, and address.
struct watch_info_t {
  uint64_t cookie;
  uint32_t timeout_seconds;
  entity_addr_t addr;

  watch_info_t() : cookie(0), timeout_seconds(0) { }
  watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<watch_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4582
4583static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4584 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4585 && l.addr == r.addr;
4586}
4587
/// Debug formatting for a watch_info_t.
static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
  return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
    << " " << w.addr << ")";
}
4592
/// In-flight notify: originating watch cookie, notify id, timeout, payload.
struct notify_info_t {
  uint64_t cookie;
  uint64_t notify_id;
  uint32_t timeout;
  bufferlist bl;
};
4599
/// Debug formatting for a notify_info_t.
static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
  return out << "notify(cookie " << n.cookie
	     << " notify" << n.notify_id
	     << " " << n.timeout << "s)";
}
4605
31f18b77
FG
4606struct object_info_t;
4607struct object_manifest_t {
4608 enum {
4609 TYPE_NONE = 0,
4610 TYPE_REDIRECT = 1, // start with this
4611 TYPE_CHUNKED = 2, // do this later
4612 };
4613 uint8_t type; // redirect, chunked, ...
4614 hobject_t redirect_target;
4615
4616 object_manifest_t() : type(0) { }
4617 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
4618 : type(type), redirect_target(redirect_target) { }
4619
4620 bool is_empty() const {
4621 return type == TYPE_NONE;
4622 }
4623 bool is_redirect() const {
4624 return type == TYPE_REDIRECT;
4625 }
4626 bool is_chunked() const {
4627 return type == TYPE_CHUNKED;
4628 }
4629 static const char *get_type_name(uint8_t m) {
4630 switch (m) {
4631 case TYPE_NONE: return "none";
4632 case TYPE_REDIRECT: return "redirect";
4633 case TYPE_CHUNKED: return "chunked";
4634 default: return "unknown";
4635 }
4636 }
4637 const char *get_type_name() const {
4638 return get_type_name(type);
4639 }
4640 static void generate_test_instances(list<object_manifest_t*>& o);
4641 void encode(bufferlist &bl) const;
4642 void decode(bufferlist::iterator &bl);
4643 void dump(Formatter *f) const;
4644 friend ostream& operator<<(ostream& out, const object_info_t& oi);
4645};
4646WRITE_CLASS_ENCODER(object_manifest_t)
4647ostream& operator<<(ostream& out, const object_manifest_t& oi);
7c673cae
FG
4648
4649struct object_info_t {
4650 hobject_t soid;
4651 eversion_t version, prior_version;
4652 version_t user_version;
4653 osd_reqid_t last_reqid;
4654
4655 uint64_t size;
4656 utime_t mtime;
4657 utime_t local_mtime; // local mtime
4658
4659 // note: these are currently encoded into a total 16 bits; see
4660 // encode()/decode() for the weirdness.
4661 typedef enum {
4662 FLAG_LOST = 1<<0,
4663 FLAG_WHITEOUT = 1<<1, // object logically does not exist
4664 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
4665 FLAG_OMAP = 1 << 3, // has (or may have) some/any omap data
4666 FLAG_DATA_DIGEST = 1 << 4, // has data crc
4667 FLAG_OMAP_DIGEST = 1 << 5, // has omap crc
4668 FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier
31f18b77 4669 FLAG_MANIFEST = 1 << 7, // has manifest
7c673cae
FG
4670 // ...
4671 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
4672 } flag_t;
4673
4674 flag_t flags;
4675
4676 static string get_flag_string(flag_t flags) {
4677 string s;
94b18763
FG
4678 vector<string> sv = get_flag_vector(flags);
4679 for (auto ss : sv) {
4680 s += string("|") + ss;
4681 }
4682 if (s.length())
4683 return s.substr(1);
4684 return s;
4685 }
4686 static vector<string> get_flag_vector(flag_t flags) {
4687 vector<string> sv;
7c673cae 4688 if (flags & FLAG_LOST)
94b18763 4689 sv.insert(sv.end(), "lost");
7c673cae 4690 if (flags & FLAG_WHITEOUT)
94b18763 4691 sv.insert(sv.end(), "whiteout");
7c673cae 4692 if (flags & FLAG_DIRTY)
94b18763 4693 sv.insert(sv.end(), "dirty");
7c673cae 4694 if (flags & FLAG_USES_TMAP)
94b18763 4695 sv.insert(sv.end(), "uses_tmap");
7c673cae 4696 if (flags & FLAG_OMAP)
94b18763 4697 sv.insert(sv.end(), "omap");
7c673cae 4698 if (flags & FLAG_DATA_DIGEST)
94b18763 4699 sv.insert(sv.end(), "data_digest");
7c673cae 4700 if (flags & FLAG_OMAP_DIGEST)
94b18763 4701 sv.insert(sv.end(), "omap_digest");
7c673cae 4702 if (flags & FLAG_CACHE_PIN)
94b18763 4703 sv.insert(sv.end(), "cache_pin");
31f18b77 4704 if (flags & FLAG_MANIFEST)
94b18763
FG
4705 sv.insert(sv.end(), "manifest");
4706 return sv;
7c673cae
FG
4707 }
4708 string get_flag_string() const {
4709 return get_flag_string(flags);
4710 }
4711
4712 /// [clone] descending. pre-luminous; moved to SnapSet
4713 vector<snapid_t> legacy_snaps;
4714
4715 uint64_t truncate_seq, truncate_size;
4716
4717 map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
4718
4719 // opportunistic checksums; may or may not be present
4720 __u32 data_digest; ///< data crc32c
4721 __u32 omap_digest; ///< omap crc32c
4722
4723 // alloc hint attribute
4724 uint64_t expected_object_size, expected_write_size;
4725 uint32_t alloc_hint_flags;
4726
31f18b77
FG
4727 struct object_manifest_t manifest;
4728
7c673cae
FG
4729 void copy_user_bits(const object_info_t& other);
4730
4731 static ps_t legacy_object_locator_to_ps(const object_t &oid,
4732 const object_locator_t &loc);
4733
4734 bool test_flag(flag_t f) const {
4735 return (flags & f) == f;
4736 }
4737 void set_flag(flag_t f) {
4738 flags = (flag_t)(flags | f);
4739 }
4740 void clear_flag(flag_t f) {
4741 flags = (flag_t)(flags & ~f);
4742 }
4743 bool is_lost() const {
4744 return test_flag(FLAG_LOST);
4745 }
4746 bool is_whiteout() const {
4747 return test_flag(FLAG_WHITEOUT);
4748 }
4749 bool is_dirty() const {
4750 return test_flag(FLAG_DIRTY);
4751 }
4752 bool is_omap() const {
4753 return test_flag(FLAG_OMAP);
4754 }
4755 bool is_data_digest() const {
4756 return test_flag(FLAG_DATA_DIGEST);
4757 }
4758 bool is_omap_digest() const {
4759 return test_flag(FLAG_OMAP_DIGEST);
4760 }
4761 bool is_cache_pinned() const {
4762 return test_flag(FLAG_CACHE_PIN);
4763 }
31f18b77
FG
4764 bool has_manifest() const {
4765 return test_flag(FLAG_MANIFEST);
4766 }
7c673cae
FG
4767
4768 void set_data_digest(__u32 d) {
4769 set_flag(FLAG_DATA_DIGEST);
4770 data_digest = d;
4771 }
4772 void set_omap_digest(__u32 d) {
4773 set_flag(FLAG_OMAP_DIGEST);
4774 omap_digest = d;
4775 }
4776 void clear_data_digest() {
4777 clear_flag(FLAG_DATA_DIGEST);
4778 data_digest = -1;
4779 }
4780 void clear_omap_digest() {
4781 clear_flag(FLAG_OMAP_DIGEST);
4782 omap_digest = -1;
4783 }
4784 void new_object() {
28e407b8
AA
4785 clear_data_digest();
4786 clear_omap_digest();
7c673cae
FG
4787 }
4788
4789 void encode(bufferlist& bl, uint64_t features) const;
4790 void decode(bufferlist::iterator& bl);
4791 void decode(bufferlist& bl) {
4792 bufferlist::iterator p = bl.begin();
4793 decode(p);
4794 }
4795 void dump(Formatter *f) const;
4796 static void generate_test_instances(list<object_info_t*>& o);
4797
4798 explicit object_info_t()
4799 : user_version(0), size(0), flags((flag_t)0),
4800 truncate_seq(0), truncate_size(0),
4801 data_digest(-1), omap_digest(-1),
4802 expected_object_size(0), expected_write_size(0),
4803 alloc_hint_flags(0)
4804 {}
4805
4806 explicit object_info_t(const hobject_t& s)
4807 : soid(s),
4808 user_version(0), size(0), flags((flag_t)0),
4809 truncate_seq(0), truncate_size(0),
4810 data_digest(-1), omap_digest(-1),
4811 expected_object_size(0), expected_write_size(0),
4812 alloc_hint_flags(0)
4813 {}
4814
4815 explicit object_info_t(bufferlist& bl) {
4816 decode(bl);
4817 }
4818};
4819WRITE_CLASS_ENCODER_FEATURES(object_info_t)
4820
4821ostream& operator<<(ostream& out, const object_info_t& oi);
4822
4823
4824
// Object recovery
/// What to recover for one object: target version, full oi/snapset, and the
/// byte ranges to copy (plus ranges that can be cloned from local clones).
struct ObjectRecoveryInfo {
  hobject_t soid;
  eversion_t version;
  uint64_t size;
  object_info_t oi;
  SnapSet ss;   // only populated if soid is_snap()
  interval_set<uint64_t> copy_subset;
  map<hobject_t, interval_set<uint64_t>> clone_subset;

  ObjectRecoveryInfo() : size(0) { }

  static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl, int64_t pool = -1);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
4845
/// How far recovery of one object has progressed (data offset, omap key,
/// and per-section completion flags).
struct ObjectRecoveryProgress {
  uint64_t data_recovered_to;
  string omap_recovered_to;
  bool first;
  bool data_complete;
  bool omap_complete;
  bool error = false;

  ObjectRecoveryProgress()
    : data_recovered_to(0),
      first(true),
      data_complete(false), omap_complete(false) { }

  /// Done when data has reached the end of the copy range and omap is done.
  bool is_complete(const ObjectRecoveryInfo& info) const {
    return (data_recovered_to >= (
      info.copy_subset.empty() ?
      0 : info.copy_subset.range_end())) &&
      omap_complete;
  }

  static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
4874
/// Acknowledgement of a push: just names the object that was pushed.
struct PushReplyOp {
  hobject_t soid;

  static void generate_test_instances(list<PushReplyOp*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER(PushReplyOp)
ostream& operator<<(ostream& out, const PushReplyOp &op);
4888
/// Request to pull an object: what to recover and progress so far.
struct PullOp {
  hobject_t soid;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress recovery_progress;

  static void generate_test_instances(list<PullOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PullOp)
ostream& operator<<(ostream& out, const PullOp &op);
4905
/// One pushed chunk of an object: data ranges, omap, attrs, and the
/// recovery progress before/after this push.
struct PushOp {
  hobject_t soid;
  eversion_t version;
  bufferlist data;
  interval_set<uint64_t> data_included;
  bufferlist omap_header;
  map<string, bufferlist> omap_entries;
  map<string, bufferlist> attrset;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;
  ObjectRecoveryProgress after_progress;

  static void generate_test_instances(list<PushOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PushOp)
ostream& operator<<(ostream& out, const PushOp &op);
4929
4930
4931/*
4932 * summarize pg contents for purposes of a scrub
4933 */
4934struct ScrubMap {
4935 struct object {
4936 map<string,bufferptr> attrs;
4937 uint64_t size;
4938 __u32 omap_digest; ///< omap crc32c
4939 __u32 digest; ///< data crc32c
4940 bool negative:1;
4941 bool digest_present:1;
4942 bool omap_digest_present:1;
4943 bool read_error:1;
4944 bool stat_error:1;
4945 bool ec_hash_mismatch:1;
4946 bool ec_size_mismatch:1;
28e407b8
AA
4947 bool large_omap_object_found:1;
4948 uint64_t large_omap_object_key_count = 0;
4949 uint64_t large_omap_object_value_size = 0;
7c673cae
FG
4950
4951 object() :
4952 // Init invalid size so it won't match if we get a stat EIO error
4953 size(-1), omap_digest(0), digest(0),
28e407b8
AA
4954 negative(false), digest_present(false), omap_digest_present(false),
4955 read_error(false), stat_error(false), ec_hash_mismatch(false),
4956 ec_size_mismatch(false), large_omap_object_found(false) {}
7c673cae
FG
4957
4958 void encode(bufferlist& bl) const;
4959 void decode(bufferlist::iterator& bl);
4960 void dump(Formatter *f) const;
4961 static void generate_test_instances(list<object*>& o);
4962 };
4963 WRITE_CLASS_ENCODER(object)
4964
4965 map<hobject_t,object> objects;
4966 eversion_t valid_through;
4967 eversion_t incr_since;
28e407b8 4968 bool has_large_omap_object_errors:1;
7c673cae
FG
4969
4970 void merge_incr(const ScrubMap &l);
28e407b8
AA
4971 void clear_from(const hobject_t& start) {
4972 objects.erase(objects.lower_bound(start), objects.end());
4973 }
7c673cae
FG
4974 void insert(const ScrubMap &r) {
4975 objects.insert(r.objects.begin(), r.objects.end());
4976 }
4977 void swap(ScrubMap &r) {
31f18b77
FG
4978 using std::swap;
4979 swap(objects, r.objects);
4980 swap(valid_through, r.valid_through);
4981 swap(incr_since, r.incr_since);
7c673cae
FG
4982 }
4983
4984 void encode(bufferlist& bl) const;
4985 void decode(bufferlist::iterator& bl, int64_t pool=-1);
4986 void dump(Formatter *f) const;
4987 static void generate_test_instances(list<ScrubMap*>& o);
4988};
4989WRITE_CLASS_ENCODER(ScrubMap::object)
4990WRITE_CLASS_ENCODER(ScrubMap)
4991
28e407b8
AA
4992struct ScrubMapBuilder {
4993 bool deep = false;
4994 vector<hobject_t> ls;
4995 size_t pos = 0;
4996 int64_t data_pos = 0;
4997 string omap_pos;
4998 int ret = 0;
4999 bufferhash data_hash, omap_hash; ///< accumulatinng hash value
5000 uint64_t omap_keys = 0;
5001 uint64_t omap_bytes = 0;
5002
5003 bool empty() {
5004 return ls.empty();
5005 }
5006 bool done() {
5007 return pos >= ls.size();
5008 }
5009 void reset() {
5010 *this = ScrubMapBuilder();
5011 }
5012
5013 bool data_done() {
5014 return data_pos < 0;
5015 }
5016
5017 void next_object() {
5018 ++pos;
5019 data_pos = 0;
5020 omap_pos.clear();
5021 omap_keys = 0;
5022 omap_bytes = 0;
5023 }
5024
5025 friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
5026 out << "(" << pos.pos << "/" << pos.ls.size();
5027 if (pos.pos < pos.ls.size()) {
5028 out << " " << pos.ls[pos.pos];
5029 }
5030 if (pos.data_pos < 0) {
5031 out << " byte " << pos.data_pos;
5032 }
5033 if (!pos.omap_pos.empty()) {
5034 out << " key " << pos.omap_pos;
5035 }
5036 if (pos.deep) {
5037 out << " deep";
5038 }
5039 if (pos.ret) {
5040 out << " ret " << pos.ret;
5041 }
5042 return out << ")";
5043 }
5044};
5045
7c673cae
FG
/**
 * One operation within an OSD request: the wire-format op descriptor
 * plus its input/output buffers and per-op result code.
 */
struct OSDOp {
  ceph_osd_op op;    ///< wire op descriptor; zeroed by the constructor
  sobject_t soid;    ///< object targeted by this op

  bufferlist indata, outdata;  ///< op input and output payloads
  errorcode32_t rval;          ///< per-op result/error code

  OSDOp() : rval(0) {
    // ceph_osd_op is a C wire struct; zero every byte so unused fields
    // encode deterministically.
    memset(&op, 0, sizeof(ceph_osd_op));
  }

  /**
   * split a bufferlist into constituent indata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge indata members of a vector of OSDOp into a single bufferlist
   *
   * Notably this also encodes certain other OSDOp data into the data
   * buffer, including the sobject_t soid.
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * split a bufferlist into constituent outdata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge outdata members of a vector of OSDOps into a single bufferlist
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * Clear data as much as possible, leave minimal data for historical op dump
   *
   * @param ops [in] vector of OSDOps
   */
  static void clear_data(vector<OSDOp>& ops);
};

ostream& operator<<(ostream& out, const OSDOp& op);
5101
/**
 * One watch registration on an object: the watching entity, its watch
 * cookie and timeout, and the watcher's address (added in encoding v2).
 */
struct watch_item_t {
  entity_name_t name;        ///< watching entity
  uint64_t cookie;           ///< watch cookie
  uint32_t timeout_seconds;  ///< watch timeout
  entity_addr_t addr;        ///< watcher address; only present for struct_v >= 2

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
     const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
    addr(addr) { }

  void encode(bufferlist &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    ::encode(name, bl);
    ::encode(cookie, bl);
    ::encode(timeout_seconds, bl);
    ::encode(addr, bl, features);  // address is feature-dependent
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(2, bl);
    ::decode(name, bl);
    ::decode(cookie, bl);
    ::decode(timeout_seconds, bl);
    if (struct_v >= 2) {
      // addr was introduced in encoding version 2; older peers omit it
      ::decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5134
/// A watch record paired with the object it is registered on.
struct obj_watch_item_t {
  hobject_t obj;    ///< watched object
  watch_item_t wi;  ///< watcher details
};
5139
5140/**
5141 * obj list watch response format
5142 *
5143 */
5144struct obj_list_watch_response_t {
5145 list<watch_item_t> entries;
5146
5147 void encode(bufferlist& bl, uint64_t features) const {
5148 ENCODE_START(1, 1, bl);
5149 ::encode(entries, bl, features);
5150 ENCODE_FINISH(bl);
5151 }
5152 void decode(bufferlist::iterator& bl) {
5153 DECODE_START(1, bl);
5154 ::decode(entries, bl);
5155 DECODE_FINISH(bl);
5156 }
5157 void dump(Formatter *f) const {
5158 f->open_array_section("entries");
5159 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5160 f->open_object_section("watch");
5161 f->dump_stream("watcher") << p->name;
5162 f->dump_int("cookie", p->cookie);
5163 f->dump_int("timeout", p->timeout_seconds);
5164 f->open_object_section("addr");
5165 p->addr.dump(f);
5166 f->close_section();
5167 f->close_section();
5168 }
5169 f->close_section();
5170 }
5171 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5172 entity_addr_t ea;
5173 o.push_back(new obj_list_watch_response_t);
5174 o.push_back(new obj_list_watch_response_t);
5175 ea.set_type(entity_addr_t::TYPE_LEGACY);
5176 ea.set_nonce(1000);
5177 ea.set_family(AF_INET);
5178 ea.set_in4_quad(0, 127);
5179 ea.set_in4_quad(1, 0);
5180 ea.set_in4_quad(2, 0);
5181 ea.set_in4_quad(3, 1);
5182 ea.set_port(1024);
5183 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5184 ea.set_nonce(1001);
5185 ea.set_in4_quad(3, 2);
5186 ea.set_port(1025);
5187 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5188 }
5189};
5190WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5191
5192struct clone_info {
5193 snapid_t cloneid;
5194 vector<snapid_t> snaps; // ascending
5195 vector< pair<uint64_t,uint64_t> > overlap;
5196 uint64_t size;
5197
5198 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5199
5200 void encode(bufferlist& bl) const {
5201 ENCODE_START(1, 1, bl);
5202 ::encode(cloneid, bl);
5203 ::encode(snaps, bl);
5204 ::encode(overlap, bl);
5205 ::encode(size, bl);
5206 ENCODE_FINISH(bl);
5207 }
5208 void decode(bufferlist::iterator& bl) {
5209 DECODE_START(1, bl);
5210 ::decode(cloneid, bl);
5211 ::decode(snaps, bl);
5212 ::decode(overlap, bl);
5213 ::decode(size, bl);
5214 DECODE_FINISH(bl);
5215 }
5216 void dump(Formatter *f) const {
5217 if (cloneid == CEPH_NOSNAP)
5218 f->dump_string("cloneid", "HEAD");
5219 else
5220 f->dump_unsigned("cloneid", cloneid.val);
5221 f->open_array_section("snapshots");
5222 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5223 f->open_object_section("snap");
5224 f->dump_unsigned("id", p->val);
5225 f->close_section();
5226 }
5227 f->close_section();
5228 f->open_array_section("overlaps");
5229 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5230 q != overlap.end(); ++q) {
5231 f->open_object_section("overlap");
5232 f->dump_unsigned("offset", q->first);
5233 f->dump_unsigned("length", q->second);
5234 f->close_section();
5235 }
5236 f->close_section();
5237 f->dump_unsigned("size", size);
5238 }
5239 static void generate_test_instances(list<clone_info*>& o) {
5240 o.push_back(new clone_info);
5241 o.push_back(new clone_info);
5242 o.back()->cloneid = 1;
5243 o.back()->snaps.push_back(1);
5244 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5245 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5246 o.back()->size = 16384;
5247 o.push_back(new clone_info);
5248 o.back()->cloneid = CEPH_NOSNAP;
5249 o.back()->size = 32768;
5250 }
5251};
5252WRITE_CLASS_ENCODER(clone_info)
5253
5254/**
5255 * obj list snaps response format
5256 *
5257 */
5258struct obj_list_snap_response_t {
5259 vector<clone_info> clones; // ascending
5260 snapid_t seq;
5261
5262 void encode(bufferlist& bl) const {
5263 ENCODE_START(2, 1, bl);
5264 ::encode(clones, bl);
5265 ::encode(seq, bl);
5266 ENCODE_FINISH(bl);
5267 }
5268 void decode(bufferlist::iterator& bl) {
5269 DECODE_START(2, bl);
5270 ::decode(clones, bl);
5271 if (struct_v >= 2)
5272 ::decode(seq, bl);
5273 else
5274 seq = CEPH_NOSNAP;
5275 DECODE_FINISH(bl);
5276 }
5277 void dump(Formatter *f) const {
5278 f->open_array_section("clones");
5279 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5280 f->open_object_section("clone");
5281 p->dump(f);
5282 f->close_section();
5283 }
5284 f->dump_unsigned("seq", seq);
5285 f->close_section();
5286 }
5287 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5288 o.push_back(new obj_list_snap_response_t);
5289 o.push_back(new obj_list_snap_response_t);
5290 clone_info cl;
5291 cl.cloneid = 1;
5292 cl.snaps.push_back(1);
5293 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5294 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5295 cl.size = 16384;
5296 o.back()->clones.push_back(cl);
5297 cl.cloneid = CEPH_NOSNAP;
5298 cl.snaps.clear();
5299 cl.overlap.clear();
5300 cl.size = 32768;
5301 o.back()->clones.push_back(cl);
5302 o.back()->seq = 123;
5303 }
5304};
5305
5306WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5307
// PromoteCounter

/**
 * Atomic counters for promote activity: attempts started, promotions
 * finished, and bytes promoted.  sample_and_attenuate() reports the
 * current totals and halves each counter so the values decay over time.
 */
struct PromoteCounter {
  std::atomic_ullong attempts{0};
  std::atomic_ullong objects{0};
  std::atomic_ullong bytes{0};

  /// Record one promote attempt.
  void attempt() {
    attempts.fetch_add(1);
  }

  /// Record one finished promotion of `size` bytes.
  void finish(uint64_t size) {
    objects.fetch_add(1);
    bytes.fetch_add(size);
  }

  /// Report current totals via *a/*o/*b, then halve each counter.
  /// (Read-then-store is not a single atomic op; concurrent updates
  /// between the load and the store can be lost, as in the original.)
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts.load();
    *o = objects.load();
    *b = bytes.load();
    attempts.store(*a / 2);
    objects.store(*o / 2);
    bytes.store(*b / 2);
  }
};
5333
/** store_statfs_t
 * ObjectStore full statfs information
 */
struct store_statfs_t
{
  uint64_t total = 0;                  // Total bytes
  uint64_t available = 0;              // Free bytes available

  int64_t allocated = 0;               // Bytes allocated by the store
  int64_t stored = 0;                  // Bytes actually stored by the user
  int64_t compressed = 0;              // Bytes stored after compression
  int64_t compressed_allocated = 0;    // Bytes allocated for compressed data
  int64_t compressed_original = 0;     // Bytes that were successfully compressed

  /// Reset every counter back to its zero default.
  void reset() {
    *this = store_statfs_t();
  }
  // Field-wise equality; defined out of line.
  bool operator ==(const store_statfs_t& other) const;
  void dump(Formatter *f) const;
};
ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
5355
5356#endif