]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_types.h
update sources to 12.2.2
[ceph.git] / ceph / src / osd / osd_types.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
#include <atomic>
#include <memory>
#include <sstream>
#include <stdio.h>
#include <utility>

#include <boost/scoped_ptr.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/variant.hpp>

#include "include/rados/rados_types.hpp"
#include "include/mempool.h"

#include "msg/msg_types.h"
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
#include "common/histogram.h"
#include "include/interval_set.h"
#include "include/inline_memory.h"
#include "common/Formatter.h"
#include "common/bloom_filter.hpp"
#include "common/hobject.h"
#include "common/snap_types.h"
#include "HitSet.h"
#include "Watch.h"
#include "include/cmp.h"
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
48
#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

// On-disk incompat feature bits.  An OSD refuses to start against a store
// whose CompatSet contains a feature it does not understand.
#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")


// Recovery/backfill reservation priority bands.  Higher wins; the bands
// below are bases to which per-PG adjustments are added.

/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 254

/// max recovery priority for MBackfillReserve, only when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255
89
90
typedef hobject_t collection_list_handle_t;

/// convert a single CEPH_OSD_FLAG_* to a string
const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a string
const char *ceph_osd_op_flag_name(unsigned flag);

/// convert CEPH_OSD_FLAG_* op flags to a string
string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a string
string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
string ceph_osd_alloc_hint_flag_string(unsigned flags);
104
105
106/**
107 * osd request identifier
108 *
109 * caller name + incarnation# + tid to unique identify this request.
110 */
111struct osd_reqid_t {
112 entity_name_t name; // who
c07f9fc5 113 ceph_tid_t tid;
7c673cae
FG
114 int32_t inc; // incarnation
115
116 osd_reqid_t()
c07f9fc5
FG
117 : tid(0), inc(0)
118 {}
119 osd_reqid_t(const osd_reqid_t& other)
120 : name(other.name), tid(other.tid), inc(other.inc)
121 {}
7c673cae 122 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
c07f9fc5
FG
123 : name(a), tid(t), inc(i)
124 {}
7c673cae
FG
125
126 DENC(osd_reqid_t, v, p) {
127 DENC_START(2, 2, p);
128 denc(v.name, p);
129 denc(v.tid, p);
130 denc(v.inc, p);
131 DENC_FINISH(p);
132 }
133 void dump(Formatter *f) const;
134 static void generate_test_instances(list<osd_reqid_t*>& o);
135};
136WRITE_CLASS_DENC(osd_reqid_t)
137
138
139
/**
 * pg_shard_t - an (osd, shard) pair identifying one participant in a PG.
 *
 * For replicated pools shard is NO_SHARD; for erasure-coded pools it is
 * the EC shard index this osd holds.
 */
struct pg_shard_t {
  int32_t osd;       ///< osd id, or -1 if undefined
  shard_id_t shard;  ///< EC shard id, or shard_id_t::NO_SHARD
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const {
    // NOTE(review): osd is dumped as unsigned, so an undefined osd (-1)
    // would render as a huge value — presumably only defined shards are
    // dumped; TODO confirm with callers.
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
162
/// Predicate interface: can the PG recover given this set of live shards?
class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};

/// Predicate interface: can the PG serve reads given this set of live shards?
class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};
180
181inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
182 return out << r.name << "." << r.inc << ":" << r.tid;
183}
184
185inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
186 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
187}
188inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
189 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
190}
191inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
192 return (l.name < r.name) || (l.inc < r.inc) ||
193 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
194}
195inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
196 return (l.name < r.name) || (l.inc < r.inc) ||
197 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
198}
199inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
200inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
201
202namespace std {
203 template<> struct hash<osd_reqid_t> {
204 size_t operator()(const osd_reqid_t &r) const {
205 static hash<uint64_t> H;
206 return H(r.name.num() ^ r.tid ^ r.inc);
207 }
208 };
209} // namespace std
210
211
212// -----
213
214// a locator constrains the placement of an object. mainly, which pool
215// does it go in.
216struct object_locator_t {
217 // You specify either the hash or the key -- not both
218 int64_t pool; ///< pool id
219 string key; ///< key string (if non-empty)
220 string nspace; ///< namespace
221 int64_t hash; ///< hash position (if >= 0)
222
223 explicit object_locator_t()
224 : pool(-1), hash(-1) {}
225 explicit object_locator_t(int64_t po)
226 : pool(po), hash(-1) {}
227 explicit object_locator_t(int64_t po, int64_t ps)
228 : pool(po), hash(ps) {}
229 explicit object_locator_t(int64_t po, string ns)
230 : pool(po), nspace(ns), hash(-1) {}
231 explicit object_locator_t(int64_t po, string ns, int64_t ps)
232 : pool(po), nspace(ns), hash(ps) {}
233 explicit object_locator_t(int64_t po, string ns, string s)
234 : pool(po), key(s), nspace(ns), hash(-1) {}
235 explicit object_locator_t(const hobject_t& soid)
236 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
237
238 int64_t get_pool() const {
239 return pool;
240 }
241
242 void clear() {
243 pool = -1;
244 key = "";
245 nspace = "";
246 hash = -1;
247 }
248
249 bool empty() const {
250 return pool == -1;
251 }
252
253 void encode(bufferlist& bl) const;
254 void decode(bufferlist::iterator& p);
255 void dump(Formatter *f) const;
256 static void generate_test_instances(list<object_locator_t*>& o);
257};
258WRITE_CLASS_ENCODER(object_locator_t)
259
260inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
261 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
262}
263inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
264 return !(l == r);
265}
266
267inline ostream& operator<<(ostream& out, const object_locator_t& loc)
268{
269 out << "@" << loc.pool;
270 if (loc.nspace.length())
271 out << ";" << loc.nspace;
272 if (loc.key.length())
273 out << ":" << loc.key;
274 return out;
275}
276
277struct request_redirect_t {
278private:
279 object_locator_t redirect_locator; ///< this is authoritative
280 string redirect_object; ///< If non-empty, the request goes to this object name
281 bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
282
283 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
284public:
285
286 request_redirect_t() {}
287 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
288 redirect_locator(orig) { redirect_locator.pool = rpool; }
289 explicit request_redirect_t(const object_locator_t& rloc) :
290 redirect_locator(rloc) {}
291 explicit request_redirect_t(const object_locator_t& orig,
292 const string& robj) :
293 redirect_locator(orig), redirect_object(robj) {}
294
295 void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
296 const bufferlist& get_instructions() { return osd_instructions; }
297
298 bool empty() const { return redirect_locator.empty() &&
299 redirect_object.empty(); }
300
301 void combine_with_locator(object_locator_t& orig, string& obj) const {
302 orig = redirect_locator;
303 if (!redirect_object.empty())
304 obj = redirect_object;
305 }
306
307 void encode(bufferlist& bl) const;
308 void decode(bufferlist::iterator& bl);
309 void dump(Formatter *f) const;
310 static void generate_test_instances(list<request_redirect_t*>& o);
311};
312WRITE_CLASS_ENCODER(request_redirect_t)
313
314inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
315 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
316 return out;
317}
318
// Internal OSD op flags - set by the OSD based on the op types
// (bit 0 is deliberately unused)
enum {
  CEPH_OSD_RMW_FLAG_READ        = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE       = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ  = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE   = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED         = (1 << 10),
};
332
333
// pg stuff

/// ghobject under which the OSD superblock is stored
#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
typedef uint32_t ps_t;
340
// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  void encode(bufferlist& bl) const {
    // legacy wire format: the raw struct bytes, no framing
    ::encode_raw(v, bl);
  }
  void decode(bufferlist::iterator& bl) {
    ::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)
352
// placement group id
struct pg_t {
  uint64_t m_pool;      ///< pool id
  uint32_t m_seed;      ///< placement seed (raw pg number within the pool)
  int32_t m_preferred;  ///< preferred primary osd, or -1 for none

  pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
  pg_t(ps_t seed, uint64_t pool, int pref=-1) :
    m_pool(pool), m_seed(seed), m_preferred(pref) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps), m_preferred((__s16)cpg.preferred) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  /// convert to the legacy v1 struct; asserts the pool id fits in 32 bits
  old_pg_t get_old_pg() const {
    old_pg_t o;
    assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)m_preferred;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  uint64_t pool() const {
    return m_pool;
  }
  int32_t preferred() const {
    return m_preferred;
  }

  static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }
  void set_preferred(int32_t osd) {
    m_preferred = osd;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  /// true iff this pg splits when pg_num grows old->new; children filled if non-null
  bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;

  /**
   * Returns b such that for all object o:
   * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;

  bool contains(int bits, const ghobject_t& oid) {
    return oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) {
    return oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  // NOTE: version-1 wire format — field order must not change.
  void encode(bufferlist& bl) const {
    __u8 v = 1;
    ::encode(v, bl);
    ::encode(m_pool, bl);
    ::encode(m_seed, bl);
    ::encode(m_preferred, bl);
  }
  void decode(bufferlist::iterator& bl) {
    __u8 v;
    ::decode(v, bl);
    ::decode(m_pool, bl);
    ::decode(m_seed, bl);
    ::decode(m_preferred, bl);
  }
  /// decode the legacy (old_pg_t) encoding
  void decode_old(bufferlist::iterator& bl) {
    old_pg_t opg;
    ::decode(opg, bl);
    *this = opg;
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)
450
451inline bool operator<(const pg_t& l, const pg_t& r) {
452 return l.pool() < r.pool() ||
453 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
454 (l.preferred() == r.preferred() && (l.ps() < r.ps()))));
455}
456inline bool operator<=(const pg_t& l, const pg_t& r) {
457 return l.pool() < r.pool() ||
458 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
459 (l.preferred() == r.preferred() && (l.ps() <= r.ps()))));
460}
461inline bool operator==(const pg_t& l, const pg_t& r) {
462 return l.pool() == r.pool() &&
463 l.preferred() == r.preferred() &&
464 l.ps() == r.ps();
465}
466inline bool operator!=(const pg_t& l, const pg_t& r) {
467 return l.pool() != r.pool() ||
468 l.preferred() != r.preferred() ||
469 l.ps() != r.ps();
470}
471inline bool operator>(const pg_t& l, const pg_t& r) {
472 return l.pool() > r.pool() ||
473 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
474 (l.preferred() == r.preferred() && (l.ps() > r.ps()))));
475}
476inline bool operator>=(const pg_t& l, const pg_t& r) {
477 return l.pool() > r.pool() ||
478 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
479 (l.preferred() == r.preferred() && (l.ps() >= r.ps()))));
480}
481
482ostream& operator<<(ostream& out, const pg_t &pg);
483
namespace std {
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      // fold the 64-bit pool into 32 bits, then xor in seed and preferred
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
    }
  };
} // namespace std
494
/**
 * spg_t - a pg_t plus a shard id; identifies one shard of a (possibly
 * erasure-coded) placement group.  shard is NO_SHARD for replicated pools.
 */
struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  /// parent keeps our shard id
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  int32_t preferred() const {
    return pgid.preferred();
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }
  /// delegate to pg_t::is_split, re-attaching our shard id to each child
  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
		set<spg_t> *pchildren) const {
    set<pg_t> _children;
    set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (set<pg_t>::iterator i = _children.begin();
	   i != _children.end();
	   ++i) {
	pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  /// object that stores this PG's metadata (pg meta object)
  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  // NOTE: version-1 wire format — field order must not change.
  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(pgid, bl);
    ::encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(1, bl);
    ::decode(pgid, bl);
    ::decode(shard, bl);
    DECODE_FINISH(bl);
  }

  /// temp object in this PG's temp pool namespace
  /// (pool id mapped below POOL_TEMP_START — TODO confirm semantics)
  ghobject_t make_temp_ghobject(const string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
		pgid.ps(),
		hobject_t::POOL_TEMP_START - pgid.pool(), ""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
575
namespace std {
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
    {
      static hash<uint32_t> H;
      // reuse the pg_t hash and xor in the shard id
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std

ostream& operator<<(ostream& out, const spg_t &pg);
588
589// ----------------------
590
591class coll_t {
592 enum type_t {
593 TYPE_META = 0,
594 TYPE_LEGACY_TEMP = 1, /* no longer used */
595 TYPE_PG = 2,
596 TYPE_PG_TEMP = 3,
597 };
598 type_t type;
599 spg_t pgid;
600 uint64_t removal_seq; // note: deprecated, not encoded
601
602 char _str_buff[spg_t::calc_name_buf_size];
603 char *_str;
604
605 void calc_str();
606
607 coll_t(type_t t, spg_t p, uint64_t r)
608 : type(t), pgid(p), removal_seq(r) {
609 calc_str();
610 }
611
612public:
613 coll_t() : type(TYPE_META), removal_seq(0)
614 {
615 calc_str();
616 }
617
618 coll_t(const coll_t& other)
619 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
620 calc_str();
621 }
622
623 explicit coll_t(spg_t pgid)
624 : type(TYPE_PG), pgid(pgid), removal_seq(0)
625 {
626 calc_str();
627 }
628
629 coll_t& operator=(const coll_t& rhs)
630 {
631 this->type = rhs.type;
632 this->pgid = rhs.pgid;
633 this->removal_seq = rhs.removal_seq;
634 this->calc_str();
635 return *this;
636 }
637
638 // named constructors
639 static coll_t meta() {
640 return coll_t();
641 }
642 static coll_t pg(spg_t p) {
643 return coll_t(p);
644 }
645
646 const std::string to_str() const {
647 return string(_str);
648 }
649 const char *c_str() const {
650 return _str;
651 }
652
653 bool parse(const std::string& s);
654
655 int operator<(const coll_t &rhs) const {
656 return type < rhs.type ||
657 (type == rhs.type && pgid < rhs.pgid);
658 }
659
660 bool is_meta() const {
661 return type == TYPE_META;
662 }
663 bool is_pg_prefix(spg_t *pgid_) const {
664 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
665 *pgid_ = pgid;
666 return true;
667 }
668 return false;
669 }
670 bool is_pg() const {
671 return type == TYPE_PG;
672 }
673 bool is_pg(spg_t *pgid_) const {
674 if (type == TYPE_PG) {
675 *pgid_ = pgid;
676 return true;
677 }
678 return false;
679 }
680 bool is_temp() const {
681 return type == TYPE_PG_TEMP;
682 }
683 bool is_temp(spg_t *pgid_) const {
684 if (type == TYPE_PG_TEMP) {
685 *pgid_ = pgid;
686 return true;
687 }
688 return false;
689 }
690
691 void encode(bufferlist& bl) const;
692 void decode(bufferlist::iterator& bl);
693 size_t encoded_size() const;
694
695 inline bool operator==(const coll_t& rhs) const {
696 // only compare type if meta
697 if (type != rhs.type)
698 return false;
699 if (type == TYPE_META)
700 return true;
701 return type == rhs.type && pgid == rhs.pgid;
702 }
703 inline bool operator!=(const coll_t& rhs) const {
704 return !(*this == rhs);
705 }
706
707 // get a TEMP collection that corresponds to the current collection,
708 // which we presume is a pg collection.
709 coll_t get_temp() const {
710 assert(type == TYPE_PG);
711 return coll_t(TYPE_PG_TEMP, pgid, 0);
712 }
713
714 ghobject_t get_min_hobj() const {
715 ghobject_t o;
716 switch (type) {
717 case TYPE_PG:
718 o.hobj.pool = pgid.pool();
719 o.set_shard(pgid.shard);
720 break;
721 case TYPE_META:
722 o.hobj.pool = -1;
723 break;
724 default:
725 break;
726 }
727 return o;
728 }
729
730 unsigned hash_to_shard(unsigned num_shards) const {
731 if (type == TYPE_PG)
732 return pgid.hash_to_shard(num_shards);
733 return 0; // whatever.
734 }
735
736 void dump(Formatter *f) const;
737 static void generate_test_instances(list<coll_t*>& o);
738};
739
740WRITE_CLASS_ENCODER(coll_t)
741
742inline ostream& operator<<(ostream& out, const coll_t& c) {
743 out << c.to_str();
744 return out;
745}
746
747namespace std {
748 template<> struct hash<coll_t> {
749 size_t operator()(const coll_t &c) const {
750 size_t h = 0;
751 string str(c.to_str());
752 std::string::const_iterator end(str.end());
753 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
754 h += *s;
755 h += (h << 10);
756 h ^= (h >> 6);
757 }
758 h += (h << 3);
759 h ^= (h >> 11);
760 h += (h << 15);
761 return h;
762 }
763 };
764} // namespace std
765
/// print a legacy ceph_object_layout as "<pgid>[.su=<stripe unit>]"
inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
{
  out << pg_t(ol.ol_pgid);
  int su = ol.ol_stripe_unit;
  if (su)
    out << ".su=" << su;
  return out;
}
774
775
776
// compound rados version type
/* WARNING: if you add a member to eversion_t, make sure encode/decode still
 * work.  The little-endian fast path copies the first
 * sizeof(version_t) + sizeof(epoch_t) bytes of *this verbatim, so there must
 * be no padding before or between `version` and `epoch` on either 32-bit or
 * 64-bit machines.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;  // explicit trailing pad keeps the layout identical everywhere
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }

  /// largest representable eversion (both fields wrap 0-1 to their max)
  static eversion_t max() {
    eversion_t max;
    max.version -= 1;
    max.epoch -= 1;
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  string get_key_name() const;

  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    // fast path: raw copy of the version+epoch bytes (see WARNING above)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    ::encode(version, bl);
    ::encode(epoch, bl);
#endif
  }
  void decode(bufferlist::iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    ::decode(version, bl);
    ::decode(epoch, bl);
#endif
  }
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)
836
837inline bool operator==(const eversion_t& l, const eversion_t& r) {
838 return (l.epoch == r.epoch) && (l.version == r.version);
839}
840inline bool operator!=(const eversion_t& l, const eversion_t& r) {
841 return (l.epoch != r.epoch) || (l.version != r.version);
842}
843inline bool operator<(const eversion_t& l, const eversion_t& r) {
844 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
845}
846inline bool operator<=(const eversion_t& l, const eversion_t& r) {
847 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
848}
849inline bool operator>(const eversion_t& l, const eversion_t& r) {
850 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
851}
852inline bool operator>=(const eversion_t& l, const eversion_t& r) {
853 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
854}
855inline ostream& operator<<(ostream& out, const eversion_t& e) {
856 return out << e.epoch << "'" << e.version;
857}
858
/**
 * objectstore_perf_stat_t
 *
 * current perf information about the osd
 */
struct objectstore_perf_stat_t {
  // cur_op_latency is in ms since double add/sub are not associative
  uint32_t os_commit_latency;
  uint32_t os_apply_latency;

  objectstore_perf_stat_t() :
    os_commit_latency(0), os_apply_latency(0) {}

  bool operator==(const objectstore_perf_stat_t &r) const {
    return os_commit_latency == r.os_commit_latency &&
      os_apply_latency == r.os_apply_latency;
  }

  /// accumulate another sample set (used for aggregation)
  void add(const objectstore_perf_stat_t &o) {
    os_commit_latency += o.os_commit_latency;
    os_apply_latency += o.os_apply_latency;
  }
  void sub(const objectstore_perf_stat_t &o) {
    os_commit_latency -= o.os_commit_latency;
    os_apply_latency -= o.os_apply_latency;
  }
  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
};
WRITE_CLASS_ENCODER(objectstore_perf_stat_t)
891
/** osd_stat
 * aggregate stats for an osd
 */
struct osd_stat_t {
  int64_t kb, kb_used, kb_avail;  ///< capacity stats, in KB
  vector<int> hb_peers;           ///< osds we exchange heartbeats with
  int32_t snap_trim_queue_len, num_snap_trimming;

  pow2_hist_t op_queue_age_hist;  ///< histogram of queued-op ages

  objectstore_perf_stat_t os_perf_stat;

  epoch_t up_from = 0;  // epoch this osd came up — TODO confirm
  uint64_t seq = 0;

  uint32_t num_pgs = 0;

  osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
		 snap_trim_queue_len(0), num_snap_trimming(0) {}

  // add/sub aggregate only the numeric stats; hb_peers, up_from and seq
  // are per-osd identity/state and are deliberately left untouched.
  void add(const osd_stat_t& o) {
    kb += o.kb;
    kb_used += o.kb_used;
    kb_avail += o.kb_avail;
    snap_trim_queue_len += o.snap_trim_queue_len;
    num_snap_trimming += o.num_snap_trimming;
    op_queue_age_hist.add(o.op_queue_age_hist);
    os_perf_stat.add(o.os_perf_stat);
    num_pgs += o.num_pgs;
  }
  void sub(const osd_stat_t& o) {
    kb -= o.kb;
    kb_used -= o.kb_used;
    kb_avail -= o.kb_avail;
    snap_trim_queue_len -= o.snap_trim_queue_len;
    num_snap_trimming -= o.num_snap_trimming;
    op_queue_age_hist.sub(o.op_queue_age_hist);
    os_perf_stat.sub(o.os_perf_stat);
    num_pgs -= o.num_pgs;
  }

  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  static void generate_test_instances(std::list<osd_stat_t*>& o);
};
WRITE_CLASS_ENCODER(osd_stat_t)
939
// NOTE(review): up_from and seq are not part of equality — presumably
// intentional since they change on every report; TODO confirm.
inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
  return l.kb == r.kb &&
    l.kb_used == r.kb_used &&
    l.kb_avail == r.kb_avail &&
    l.snap_trim_queue_len == r.snap_trim_queue_len &&
    l.num_snap_trimming == r.num_snap_trimming &&
    l.hb_peers == r.hb_peers &&
    l.op_queue_age_hist == r.op_queue_age_hist &&
    l.os_perf_stat == r.os_perf_stat &&
    l.num_pgs == r.num_pgs;
}
inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
  return !(l == r);
}



/// human-readable summary; does not print every field
inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
  return out << "osd_stat(" << kb_t(s.kb_used) << " used, "
	     << kb_t(s.kb_avail) << " avail, "
	     << kb_t(s.kb) << " total, "
	     << "peers " << s.hb_peers
	     << " op hist " << s.op_queue_age_hist.h
	     << ")";
}
965
966
/*
 * pg states
 */
#define PG_STATE_CREATING (1<<0)  // creating
#define PG_STATE_ACTIVE (1<<1)    // i am active.  (primary: replicas too)
#define PG_STATE_CLEAN (1<<2)     // peers are complete, clean of stray replicas.
// (1<<3) is currently unused
#define PG_STATE_DOWN (1<<4)      // a needed replica is down, PG offline
//#define PG_STATE_REPLAY (1<<5)  // crashed, waiting for replay
//#define PG_STATE_STRAY (1<<6)   // i must notify the primary i exist.
//#define PG_STATE_SPLITTING (1<<7)  // i am splitting
#define PG_STATE_SCRUBBING (1<<8)    // scrubbing
//#define PG_STATE_SCRUBQ (1<<9)     // queued for scrub
#define PG_STATE_DEGRADED (1<<10)    // pg contains objects with reduced redundancy
#define PG_STATE_INCONSISTENT (1<<11)  // pg replicas are inconsistent (but shouldn't be)
#define PG_STATE_PEERING (1<<12)     // pg is (re)peering
#define PG_STATE_REPAIR (1<<13)      // pg should repair on next scrub
#define PG_STATE_RECOVERING (1<<14)  // pg is recovering/migrating objects
#define PG_STATE_BACKFILL_WAIT (1<<15)  // [active] reserving backfill
#define PG_STATE_INCOMPLETE (1<<16)  // incomplete content, peering failed.
#define PG_STATE_STALE (1<<17)       // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED (1<<18)    // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB (1<<19)  // deep scrub: check CRC32 on files
#define PG_STATE_BACKFILLING (1<<20) // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1<<21)  // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT (1<<22)  // waiting for recovery reservations
#define PG_STATE_UNDERSIZED (1<<23)  // pg acting < pool size
#define PG_STATE_ACTIVATING (1<<24)  // pg is peered but not yet active
#define PG_STATE_PEERED (1<<25)      // peered, cannot go active, can recover
#define PG_STATE_SNAPTRIM (1<<26)    // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT (1<<27)  // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1<<28)  // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR (1<<29)  // error stopped trimming snaps
#define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
#define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
// NOTE(review): (1<<31) overflows a signed int; (1u<<31) would be safer —
// TODO confirm no caller relies on the sign before changing.

std::string pg_state_string(int state);
std::string pg_vector_string(const vector<int32_t> &a);
boost::optional<uint64_t> pg_string_state(const std::string& state);
1005
1006
/*
 * pool_snap_info_t
 *
 * attributes for a single pool snapshot.
 */
struct pool_snap_info_t {
  snapid_t snapid;  ///< snapshot id
  utime_t stamp;    ///< creation time
  string name;      ///< user-visible snapshot name

  void dump(Formatter *f) const;
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)

/// printed as snapid(name stamp)
inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
}
1027
1028
/*
 * pool_opts_t
 *
 * pool options.
 */

class pool_opts_t {
public:
  /// option keys; the enum's numeric value is the key used in @c opts
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
  };

  /// tag describing which alternative of value_t an option holds
  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  /// (key, value-type) descriptor for a named option
  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  /// set (or overwrite) the value for @p key
  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  /// fetch the value for @p key into *val; returns false if unset.
  /// NOTE: boost::get<T> throws if the stored alternative is not T.
  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, Formatter *f) const;

  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
};
WRITE_CLASS_ENCODER(pool_opts_t)
1113
/*
 * pg_pool
 *
 * Per-pool metadata: redundancy type and size, pg counts, flags,
 * snapshot state, tiering configuration, quotas and options.
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  /// pool redundancy type
  enum {
    TYPE_REPLICATED = 1,     // replication
    //TYPE_RAID4 = 2,   // raid4 (never implemented)
    TYPE_ERASURE = 3,      // erasure-coded
  };
  static const char *get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
      //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  const char *get_type_name() const {
    return get_type_name(type);
  }

  /// pool flag bits (stored in @c flags)
  enum {
    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
    FLAG_FULL       = 1<<1, // pool is full
    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
    FLAG_NODELETE = 1<<4, // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8, // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
    FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11, // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
  };

  static const char *get_flag_name(int f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_NO_QUOTA: return "full_no_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    default: return "???";
    }
  }
  /// render a flag bitmask as a comma-separated list of flag names
  static string get_flags_string(uint64_t f) {
    string s;
    for (unsigned n=0; f && n<64; ++n) {
      if (f & (1ull << n)) {
	if (s.length())
	  s += ",";
	s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }
  /// map a flag name back to its bit; 0 if the name is unknown
  static uint64_t get_flag_by_name(const string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_no_quota")
      return FLAG_FULL_NO_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    return 0;
  }

  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;

  /// cache tier modes
  typedef enum {
    CACHEMODE_NONE = 0,                  ///< no caching
    CACHEMODE_WRITEBACK = 1,             ///< write to cache, flush later
    CACHEMODE_FORWARD = 2,               ///< forward if not in cache
    CACHEMODE_READONLY = 3,              ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4,           ///< forward reads, write to cache flush later
    CACHEMODE_READPROXY = 5,             ///< proxy reads, write to cache flush later
    CACHEMODE_PROXY = 6,                 ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  /// parse a cache mode name; (cache_mode_t)-1 on unknown name
  static cache_mode_t get_cache_mode_from_str(const string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  /// true for cache modes that track object temperature via HitSets
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      assert(0 == "implement me");
    }
  }

  uint64_t flags;           ///< FLAG_*
  __u8 type;                ///< TYPE_*
  __u8 size, min_size;      ///< number of osds in each pg
  __u8 crush_rule;          ///< crush placement rule
  __u8 object_hash;         ///< hash mapping object name to ps
private:
  __u32 pg_num, pgp_num;    ///< number of pgs


public:
  map<string,string> properties;  ///< OBSOLETE
  string erasure_code_profile; ///< name of the erasure code profile in OSDMap
  epoch_t last_change;      ///< most recent epoch changed, exclusing snapshot changes
  epoch_t last_force_op_resend; ///< last epoch that forced clients to resend
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous;
  snapid_t snap_seq;        ///< seq for per-pool snapshot
  epoch_t snap_epoch;       ///< osdmap epoch of last snap
  uint64_t auid;            ///< who owns the pg
  __u32 crash_replay_interval; ///< seconds to allow clients to replay ACKed but unCOMMITted requests

  uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
  uint64_t quota_max_objects; ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool).  These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used).  Snaps and removed_snaps are to be used exclusive of each
   * other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask, pgp_num_mask; ///< masks derived from pg_num/pgp_num (see calc_pg_masks)

  set<uint64_t> tiers;      ///< pools that are tiers of us
  int64_t tier_of;         ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier;       ///< pool/tier for objecter to direct reads to
  int64_t write_tier;      ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode;  ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  /// detach from the base pool and reset all tiering state
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  /// reset cache tunables to defaults; remembers possible incomplete clones
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  uint64_t target_max_bytes;   ///< tiering: target max pool size
  uint64_t target_max_objects; ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro; ///<cache: fraction of target to flush with high speed
  uint32_t cache_target_full_ratio_micro;  ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age;  ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age;  ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
  uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count;       ///< number of periods to retain
  bool use_gmt_hitset;	        ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote on read
  uint32_t min_write_recency_for_promote;  ///< minimum number of HitSet to check before promote on write
  uint32_t hit_set_grade_decay_rate;   ///< current hit_set has highest priority on objects
                                       ///temperature count,the follow hit_set's priority decay
                                       ///by this params than pre hit_set
  uint32_t hit_set_search_last_n;   ///<accumulate atmost N hit_sets for temperature

  uint32_t stripe_width;        ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates
                                 ///< user does not specify any expected value
  bool fast_read;            ///< whether turn on fast read on the pool or not

  pool_opts_t opts; ///< options

  /// application -> key/value metadata
  map<string, std::map<string, string>> application_metadata;

private:
  vector<uint32_t> grade_table;  ///< precomputed per-HitSet temperature grades (see calc_grade_table)

public:
  /// grade for HitSet index i; 0 if the table is too short
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  /// rebuild grade_table from hit_set_count and hit_set_grade_decay_rate
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }

  pg_pool_t()
    : flags(0), type(0), size(0), min_size(0),
      crush_rule(0), object_hash(0),
      pg_num(0), pgp_num(0),
      last_change(0),
      last_force_op_resend(0),
      last_force_op_resend_preluminous(0),
      snap_seq(0), snap_epoch(0),
      auid(0),
      crash_replay_interval(0),
      quota_max_bytes(0), quota_max_objects(0),
      pg_num_mask(0), pgp_num_mask(0),
      tier_of(-1), read_tier(-1), write_tier(-1),
      cache_mode(CACHEMODE_NONE),
      target_max_bytes(0), target_max_objects(0),
      cache_target_dirty_ratio_micro(0),
      cache_target_dirty_high_ratio_micro(0),
      cache_target_full_ratio_micro(0),
      cache_min_flush_age(0),
      cache_min_evict_age(0),
      hit_set_params(),
      hit_set_period(0),
      hit_set_count(0),
      use_gmt_hitset(true),
      min_read_recency_for_promote(0),
      min_write_recency_for_promote(0),
      hit_set_grade_decay_rate(0),
      hit_set_search_last_n(0),
      stripe_width(0),
      expected_num_objects(0),
      fast_read(false),
      opts()
  { }

  void dump(Formatter *f) const;

  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool ec_pool() const {
    return type == TYPE_ERASURE;
  }
  bool require_rollback() const {
    return ec_pool();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }
  unsigned get_crash_replay_interval() const { return crash_replay_interval; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated()   const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return !(get_type() == TYPE_ERASURE);
  }

  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  /// whether osds may be shuffled within the up/acting set (replicated only)
  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      assert(0 == "unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

  // if pg_num is not a multiple of two, pgs are not equally sized.
  // return, for a given pg, the fraction (denominator) of the total
  // pool size that it represents.
  unsigned get_pg_num_divisor(pg_t pgid) const;

  void set_pg_num(int p) {
    pg_num = p;
    calc_pg_masks();
  }
  void set_pgp_num(int p) {
    pgp_num = p;
    calc_pg_masks();
  }

  void set_quota_max_bytes(uint64_t m) {
    quota_max_bytes = m;
  }
  uint64_t get_quota_max_bytes() {
    return quota_max_bytes;
  }

  void set_quota_max_objects(uint64_t m) {
    quota_max_objects = m;
  }
  uint64_t get_quota_max_objects() {
    return quota_max_objects;
  }

  /// update both the luminous and pre-luminous resend epochs together
  void set_last_force_op_resend(uint64_t t) {
    last_force_op_resend = t;
    last_force_op_resend_preluminous = t;
  }

  void calc_pg_masks();

  /*
   * we have two snap modes:
   *  - pool global snaps
   *    - snap existence/non-existence defined by snaps[] and snap_seq
   *  - user managed snaps
   *    - removal governed by removed_snaps
   *
   * we know which mode we're using based on whether removed_snaps is empty.
   * If nothing has been created, both functions report false.
   */
  bool is_pool_snaps_mode() const;
  bool is_unmanaged_snaps_mode() const;
  bool is_removed_snap(snapid_t s) const;

  /*
   * build set of known-removed sets from either pool snaps or
   * explicit removed_snaps set.
   */
  void build_removed_snaps(interval_set<snapid_t>& rs) const;
  snapid_t snap_exists(const char *s) const;
  void add_snap(const char *n, utime_t stamp);
  void add_unmanaged_snap(uint64_t& snapid);
  void remove_snap(snapid_t s);
  void remove_unmanaged_snap(snapid_t s);

  SnapContext get_snap_context() const;

  /// hash a object name+namespace key to a hash position
  uint32_t hash_key(const string& key, const string& ns) const;

  /// round a hash position down to a pg num
  uint32_t raw_hash_to_pg(uint32_t v) const;

  /*
   * map a raw pg (with full precision ps) into an actual pg, for storage
   */
  pg_t raw_pg_to_pg(pg_t pg) const;

  /*
   * map raw pg (full precision ps) into a placement seed.  include
   * pool id in that value so that different pools don't use the same
   * seeds.
   */
  ps_t raw_pg_to_pps(pg_t pg) const;

  /// choose a random hash position within a pg
  uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);

  static void generate_test_instances(list<pg_pool_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)

ostream& operator<<(ostream& out, const pg_pool_t& p);
1605
1606
/**
 * a summation of object stats
 *
 * This is just a container for object stats; we don't know what for.
 *
 * If you add members in object_stat_sum_t, you should make sure there are
 * not padding among these members.
 * You should also modify the padding_check function.

 */
struct object_stat_sum_t {
  /**************************************************************************
   * WARNING: be sure to update operator==, floor, and split when
   * adding/removing fields!
   **************************************************************************/
  int64_t num_bytes;    // in bytes
  int64_t num_objects;
  int64_t num_object_clones;
  int64_t num_object_copies;  // num_objects * num_replicas
  int64_t num_objects_missing_on_primary;
  int64_t num_objects_degraded;
  int64_t num_objects_unfound;
  int64_t num_rd;
  int64_t num_rd_kb;
  int64_t num_wr;
  int64_t num_wr_kb;
  int64_t num_scrub_errors;	// total deep and shallow scrub errors
  int64_t num_objects_recovered;
  int64_t num_bytes_recovered;
  int64_t num_keys_recovered;
  int64_t num_shallow_scrub_errors;
  int64_t num_deep_scrub_errors;
  int64_t num_objects_dirty;
  int64_t num_whiteouts;
  int64_t num_objects_omap;
  int64_t num_objects_hit_set_archive;
  int64_t num_objects_misplaced;
  int64_t num_bytes_hit_set_archive;
  int64_t num_flush;
  int64_t num_flush_kb;
  int64_t num_evict;
  int64_t num_evict_kb;
  int64_t num_promote;
  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
  int64_t num_objects_pinned;
  int64_t num_objects_missing;
  int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets

  object_stat_sum_t()
    : num_bytes(0),
      num_objects(0), num_object_clones(0), num_object_copies(0),
      num_objects_missing_on_primary(0), num_objects_degraded(0),
      num_objects_unfound(0),
      num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
      num_scrub_errors(0),
      num_objects_recovered(0),
      num_bytes_recovered(0),
      num_keys_recovered(0),
      num_shallow_scrub_errors(0),
      num_deep_scrub_errors(0),
      num_objects_dirty(0),
      num_whiteouts(0),
      num_objects_omap(0),
      num_objects_hit_set_archive(0),
      num_objects_misplaced(0),
      num_bytes_hit_set_archive(0),
      num_flush(0),
      num_flush_kb(0),
      num_evict(0),
      num_evict_kb(0),
      num_promote(0),
      num_flush_mode_high(0), num_flush_mode_low(0),
      num_evict_mode_some(0), num_evict_mode_full(0),
      num_objects_pinned(0),
      num_objects_missing(0),
      num_legacy_snapsets(0)
  {}

  /// clamp every counter to at least @p f (used to zero out negative sums)
  void floor(int64_t f) {
#define FLOOR(x) if (x < f) x = f
    FLOOR(num_bytes);
    FLOOR(num_objects);
    FLOOR(num_object_clones);
    FLOOR(num_object_copies);
    FLOOR(num_objects_missing_on_primary);
    FLOOR(num_objects_missing);
    FLOOR(num_objects_degraded);
    FLOOR(num_objects_misplaced);
    FLOOR(num_objects_unfound);
    FLOOR(num_rd);
    FLOOR(num_rd_kb);
    FLOOR(num_wr);
    FLOOR(num_wr_kb);
    FLOOR(num_scrub_errors);
    FLOOR(num_shallow_scrub_errors);
    FLOOR(num_deep_scrub_errors);
    FLOOR(num_objects_recovered);
    FLOOR(num_bytes_recovered);
    FLOOR(num_keys_recovered);
    FLOOR(num_objects_dirty);
    FLOOR(num_whiteouts);
    FLOOR(num_objects_omap);
    FLOOR(num_objects_hit_set_archive);
    FLOOR(num_bytes_hit_set_archive);
    FLOOR(num_flush);
    FLOOR(num_flush_kb);
    FLOOR(num_evict);
    FLOOR(num_evict_kb);
    FLOOR(num_promote);
    FLOOR(num_flush_mode_high);
    FLOOR(num_flush_mode_low);
    FLOOR(num_evict_mode_some);
    FLOOR(num_evict_mode_full);
    FLOOR(num_objects_pinned);
    FLOOR(num_legacy_snapsets);
#undef FLOOR
  }

  /// distribute these sums evenly across out.size() children (for pg split);
  /// remainders go to the lower-indexed children one unit each
  void split(vector<object_stat_sum_t> &out) const {
#define SPLIT(PARAM)                            \
    for (unsigned i = 0; i < out.size(); ++i) { \
      out[i].PARAM = PARAM / out.size();        \
      if (i < (PARAM % out.size())) {           \
	out[i].PARAM++;                         \
      }                                         \
    }
#define SPLIT_PRESERVE_NONZERO(PARAM)           \
    for (unsigned i = 0; i < out.size(); ++i) { \
      if (PARAM)                                \
	out[i].PARAM = 1 + PARAM / out.size();  \
      else                                      \
	out[i].PARAM = 0;                       \
    }

    SPLIT(num_bytes);
    SPLIT(num_objects);
    SPLIT(num_object_clones);
    SPLIT(num_object_copies);
    SPLIT(num_objects_missing_on_primary);
    SPLIT(num_objects_missing);
    SPLIT(num_objects_degraded);
    SPLIT(num_objects_misplaced);
    SPLIT(num_objects_unfound);
    SPLIT(num_rd);
    SPLIT(num_rd_kb);
    SPLIT(num_wr);
    SPLIT(num_wr_kb);
    SPLIT(num_scrub_errors);
    SPLIT(num_shallow_scrub_errors);
    SPLIT(num_deep_scrub_errors);
    SPLIT(num_objects_recovered);
    SPLIT(num_bytes_recovered);
    SPLIT(num_keys_recovered);
    SPLIT(num_objects_dirty);
    SPLIT(num_whiteouts);
    SPLIT(num_objects_omap);
    SPLIT(num_objects_hit_set_archive);
    SPLIT(num_bytes_hit_set_archive);
    SPLIT(num_flush);
    SPLIT(num_flush_kb);
    SPLIT(num_evict);
    SPLIT(num_evict_kb);
    SPLIT(num_promote);
    SPLIT(num_flush_mode_high);
    SPLIT(num_flush_mode_low);
    SPLIT(num_evict_mode_some);
    SPLIT(num_evict_mode_full);
    SPLIT(num_objects_pinned);
    SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
#undef SPLIT
#undef SPLIT_PRESERVE_NONZERO
  }

  // NOTE: memset/mem_is_zero below rely on the struct having no padding
  // (enforced by padding_check()).
  void clear() {
    memset(this, 0, sizeof(*this));
  }

  void calc_copies(int nrep) {
    num_object_copies = nrep * num_objects;
  }

  bool is_zero() const {
    return mem_is_zero((char*)this, sizeof(*this));
  }

  void add(const object_stat_sum_t& o);
  void sub(const object_stat_sum_t& o);

  void dump(Formatter *f) const;
  /// compile-time check that the struct has no padding between members
  void padding_check() {
    static_assert(
      sizeof(object_stat_sum_t) ==
        sizeof(num_bytes) +
        sizeof(num_objects) +
        sizeof(num_object_clones) +
        sizeof(num_object_copies) +
        sizeof(num_objects_missing_on_primary) +
        sizeof(num_objects_degraded) +
        sizeof(num_objects_unfound) +
        sizeof(num_rd) +
        sizeof(num_rd_kb) +
        sizeof(num_wr) +
        sizeof(num_wr_kb) +
        sizeof(num_scrub_errors) +
        sizeof(num_objects_recovered) +
        sizeof(num_bytes_recovered) +
        sizeof(num_keys_recovered) +
        sizeof(num_shallow_scrub_errors) +
        sizeof(num_deep_scrub_errors) +
        sizeof(num_objects_dirty) +
        sizeof(num_whiteouts) +
        sizeof(num_objects_omap) +
        sizeof(num_objects_hit_set_archive) +
        sizeof(num_objects_misplaced) +
        sizeof(num_bytes_hit_set_archive) +
        sizeof(num_flush) +
        sizeof(num_flush_kb) +
        sizeof(num_evict) +
        sizeof(num_evict_kb) +
        sizeof(num_promote) +
        sizeof(num_flush_mode_high) +
        sizeof(num_flush_mode_low) +
        sizeof(num_evict_mode_some) +
        sizeof(num_evict_mode_full) +
        sizeof(num_objects_pinned) +
        sizeof(num_objects_missing) +
        sizeof(num_legacy_snapsets)
      ,
      "object_stat_sum_t have padding");
  }
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<object_stat_sum_t*>& o);
};
WRITE_CLASS_ENCODER(object_stat_sum_t)

bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1847
/**
 * a collection of object stat sums
 *
 * This is a collection of stat sums over different categories.
 */
struct object_stat_collection_t {
  /**************************************************************************
   * WARNING: be sure to update the operator== when adding/removing fields! *
   **************************************************************************/
  object_stat_sum_t sum;  ///< the (single) aggregated sum

  void calc_copies(int nrep) {
    sum.calc_copies(nrep);
  }

  void dump(Formatter *f) const;
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<object_stat_collection_t*>& o);

  bool is_zero() const {
    return sum.is_zero();
  }

  void clear() {
    sum.clear();
  }

  void floor(int64_t f) {
    sum.floor(f);
  }

  void add(const object_stat_sum_t& o) {
    sum.add(o);
  }

  void add(const object_stat_collection_t& o) {
    sum.add(o.sum);
  }
  void sub(const object_stat_collection_t& o) {
    sum.sub(o.sum);
  }
};
WRITE_CLASS_ENCODER(object_stat_collection_t)
1892
1893inline bool operator==(const object_stat_collection_t& l,
1894 const object_stat_collection_t& r) {
1895 return l.sum == r.sum;
1896}
1897
1898
1899/** pg_stat
1900 * aggregate stats for a single PG.
1901 */
1902struct pg_stat_t {
1903 /**************************************************************************
1904 * WARNING: be sure to update the operator== when adding/removing fields! *
1905 **************************************************************************/
1906 eversion_t version;
1907 version_t reported_seq; // sequence number
1908 epoch_t reported_epoch; // epoch of this report
1909 __u32 state;
1910 utime_t last_fresh; // last reported
1911 utime_t last_change; // new state != previous state
1912 utime_t last_active; // state & PG_STATE_ACTIVE
1913 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1914 utime_t last_clean; // state & PG_STATE_CLEAN
1915 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
1916 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
1917 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
1918
1919 eversion_t log_start; // (log_start,version]
1920 eversion_t ondisk_log_start; // there may be more on disk
1921
1922 epoch_t created;
1923 epoch_t last_epoch_clean;
1924 pg_t parent;
1925 __u32 parent_split_bits;
1926
1927 eversion_t last_scrub;
1928 eversion_t last_deep_scrub;
1929 utime_t last_scrub_stamp;
1930 utime_t last_deep_scrub_stamp;
1931 utime_t last_clean_scrub_stamp;
1932
1933 object_stat_collection_t stats;
1934
1935 int64_t log_size;
1936 int64_t ondisk_log_size; // >= active_log_size
1937
1938 vector<int32_t> up, acting;
1939 epoch_t mapping_epoch;
1940
1941 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
1942
1943 utime_t last_became_active;
1944 utime_t last_became_peered;
1945
1946 /// up, acting primaries
1947 int32_t up_primary;
1948 int32_t acting_primary;
1949
1950 bool stats_invalid:1;
1951 /// true if num_objects_dirty is not accurate (because it was not
1952 /// maintained starting from pool creation)
1953 bool dirty_stats_invalid:1;
1954 bool omap_stats_invalid:1;
1955 bool hitset_stats_invalid:1;
1956 bool hitset_bytes_stats_invalid:1;
1957 bool pin_stats_invalid:1;
1958
1959 pg_stat_t()
1960 : reported_seq(0),
1961 reported_epoch(0),
1962 state(0),
1963 created(0), last_epoch_clean(0),
1964 parent_split_bits(0),
1965 log_size(0), ondisk_log_size(0),
1966 mapping_epoch(0),
1967 up_primary(-1),
1968 acting_primary(-1),
1969 stats_invalid(false),
1970 dirty_stats_invalid(false),
1971 omap_stats_invalid(false),
1972 hitset_stats_invalid(false),
1973 hitset_bytes_stats_invalid(false),
1974 pin_stats_invalid(false)
1975 { }
1976
1977 epoch_t get_effective_last_epoch_clean() const {
1978 if (state & PG_STATE_CLEAN) {
1979 // we are clean as of this report, and should thus take the
1980 // reported epoch
1981 return reported_epoch;
1982 } else {
1983 return last_epoch_clean;
1984 }
1985 }
1986
1987 pair<epoch_t, version_t> get_version_pair() const {
1988 return make_pair(reported_epoch, reported_seq);
1989 }
1990
1991 void floor(int64_t f) {
1992 stats.floor(f);
1993 if (log_size < f)
1994 log_size = f;
1995 if (ondisk_log_size < f)
1996 ondisk_log_size = f;
1997 }
1998
1999 void add(const pg_stat_t& o) {
2000 stats.add(o.stats);
2001 log_size += o.log_size;
2002 ondisk_log_size += o.ondisk_log_size;
2003 }
2004 void sub(const pg_stat_t& o) {
2005 stats.sub(o.stats);
2006 log_size -= o.log_size;
2007 ondisk_log_size -= o.ondisk_log_size;
2008 }
2009
2010 bool is_acting_osd(int32_t osd, bool primary) const;
2011 void dump(Formatter *f) const;
2012 void dump_brief(Formatter *f) const;
2013 void encode(bufferlist &bl) const;
2014 void decode(bufferlist::iterator &bl);
2015 static void generate_test_instances(list<pg_stat_t*>& o);
2016};
2017WRITE_CLASS_ENCODER(pg_stat_t)
2018
2019bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2020
2021/*
2022 * summation over an entire pool
2023 */
2024struct pool_stat_t {
2025 object_stat_collection_t stats;
2026 int64_t log_size;
2027 int64_t ondisk_log_size; // >= active_log_size
2028 int32_t up; ///< number of up replicas or shards
2029 int32_t acting; ///< number of acting replicas or shards
2030
2031 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
2032 { }
2033
2034 void floor(int64_t f) {
2035 stats.floor(f);
2036 if (log_size < f)
2037 log_size = f;
2038 if (ondisk_log_size < f)
2039 ondisk_log_size = f;
2040 if (up < f)
2041 up = f;
2042 if (acting < f)
2043 acting = f;
2044 }
2045
2046 void add(const pg_stat_t& o) {
2047 stats.add(o.stats);
2048 log_size += o.log_size;
2049 ondisk_log_size += o.ondisk_log_size;
2050 up += o.up.size();
2051 acting += o.acting.size();
2052 }
2053 void sub(const pg_stat_t& o) {
2054 stats.sub(o.stats);
2055 log_size -= o.log_size;
2056 ondisk_log_size -= o.ondisk_log_size;
2057 up -= o.up.size();
2058 acting -= o.acting.size();
2059 }
2060
2061 bool is_zero() const {
2062 return (stats.is_zero() &&
2063 log_size == 0 &&
2064 ondisk_log_size == 0 &&
2065 up == 0 &&
2066 acting == 0);
2067 }
2068
2069 void dump(Formatter *f) const;
2070 void encode(bufferlist &bl, uint64_t features) const;
2071 void decode(bufferlist::iterator &bl);
2072 static void generate_test_instances(list<pool_stat_t*>& o);
2073};
2074WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2075
2076
2077// -----------------------------------------
2078
2079/**
2080 * pg_hit_set_info_t - information about a single recorded HitSet
2081 *
2082 * Track basic metadata about a HitSet, like the number of insertions
2083 * and the time range it covers.
2084 */
2085struct pg_hit_set_info_t {
2086 utime_t begin, end; ///< time interval
2087 eversion_t version; ///< version this HitSet object was written
2088 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2089
2090 friend bool operator==(const pg_hit_set_info_t& l,
2091 const pg_hit_set_info_t& r) {
2092 return
2093 l.begin == r.begin &&
2094 l.end == r.end &&
2095 l.version == r.version &&
2096 l.using_gmt == r.using_gmt;
2097 }
2098
2099 explicit pg_hit_set_info_t(bool using_gmt = true)
2100 : using_gmt(using_gmt) {}
2101
2102 void encode(bufferlist &bl) const;
2103 void decode(bufferlist::iterator &bl);
2104 void dump(Formatter *f) const;
2105 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2106};
2107WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2108
2109/**
2110 * pg_hit_set_history_t - information about a history of hitsets
2111 *
2112 * Include information about the currently accumulating hit set as well
2113 * as archived/historical ones.
2114 */
2115struct pg_hit_set_history_t {
2116 eversion_t current_last_update; ///< last version inserted into current set
2117 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2118
2119 friend bool operator==(const pg_hit_set_history_t& l,
2120 const pg_hit_set_history_t& r) {
2121 return
2122 l.current_last_update == r.current_last_update &&
2123 l.history == r.history;
2124 }
2125
2126 void encode(bufferlist &bl) const;
2127 void decode(bufferlist::iterator &bl);
2128 void dump(Formatter *f) const;
2129 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2130};
2131WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2132
2133
2134// -----------------------------------------
2135
2136/**
2137 * pg_history_t - information about recent pg peering/mapping history
2138 *
2139 * This is aggressively shared between OSDs to bound the amount of past
2140 * history they need to worry about.
2141 */
2142struct pg_history_t {
31f18b77
FG
2143 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2144 epoch_t epoch_pool_created; // epoch in which *pool* was created
2145 // (note: may be pg creation epoch for
2146 // pre-luminous clusters)
7c673cae
FG
2147 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2148 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2149 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2150 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2151 epoch_t last_epoch_split; // as parent or child
7c673cae
FG
2152 epoch_t last_epoch_marked_full; // pool or cluster
2153
2154 /**
2155 * In the event of a map discontinuity, same_*_since may reflect the first
2156 * map the osd has seen in the new map sequence rather than the actual start
2157 * of the interval. This is ok since a discontinuity at epoch e means there
2158 * must have been a clean interval between e and now and that we cannot be
2159 * in the active set during the interval containing e.
2160 */
2161 epoch_t same_up_since; // same acting set since
2162 epoch_t same_interval_since; // same acting AND up set since
2163 epoch_t same_primary_since; // same primary at least back through this epoch.
2164
2165 eversion_t last_scrub;
2166 eversion_t last_deep_scrub;
2167 utime_t last_scrub_stamp;
2168 utime_t last_deep_scrub_stamp;
2169 utime_t last_clean_scrub_stamp;
2170
2171 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2172 return
2173 l.epoch_created == r.epoch_created &&
31f18b77 2174 l.epoch_pool_created == r.epoch_pool_created &&
7c673cae
FG
2175 l.last_epoch_started == r.last_epoch_started &&
2176 l.last_interval_started == r.last_interval_started &&
2177 l.last_epoch_clean == r.last_epoch_clean &&
2178 l.last_interval_clean == r.last_interval_clean &&
2179 l.last_epoch_split == r.last_epoch_split &&
2180 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2181 l.same_up_since == r.same_up_since &&
2182 l.same_interval_since == r.same_interval_since &&
2183 l.same_primary_since == r.same_primary_since &&
2184 l.last_scrub == r.last_scrub &&
2185 l.last_deep_scrub == r.last_deep_scrub &&
2186 l.last_scrub_stamp == r.last_scrub_stamp &&
2187 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2188 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2189 }
2190
2191 pg_history_t()
2192 : epoch_created(0),
31f18b77 2193 epoch_pool_created(0),
7c673cae
FG
2194 last_epoch_started(0),
2195 last_interval_started(0),
2196 last_epoch_clean(0),
2197 last_interval_clean(0),
2198 last_epoch_split(0),
2199 last_epoch_marked_full(0),
2200 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2201
2202 bool merge(const pg_history_t &other) {
2203 // Here, we only update the fields which cannot be calculated from the OSDmap.
2204 bool modified = false;
2205 if (epoch_created < other.epoch_created) {
2206 epoch_created = other.epoch_created;
2207 modified = true;
2208 }
31f18b77
FG
2209 if (epoch_pool_created < other.epoch_pool_created) {
2210 // FIXME: for jewel compat only; this should either be 0 or always the
2211 // same value across all pg instances.
2212 epoch_pool_created = other.epoch_pool_created;
2213 modified = true;
2214 }
7c673cae
FG
2215 if (last_epoch_started < other.last_epoch_started) {
2216 last_epoch_started = other.last_epoch_started;
2217 modified = true;
2218 }
2219 if (last_interval_started < other.last_interval_started) {
2220 last_interval_started = other.last_interval_started;
2221 modified = true;
2222 }
2223 if (last_epoch_clean < other.last_epoch_clean) {
2224 last_epoch_clean = other.last_epoch_clean;
2225 modified = true;
2226 }
2227 if (last_interval_clean < other.last_interval_clean) {
2228 last_interval_clean = other.last_interval_clean;
2229 modified = true;
2230 }
2231 if (last_epoch_split < other.last_epoch_split) {
2232 last_epoch_split = other.last_epoch_split;
2233 modified = true;
2234 }
2235 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2236 last_epoch_marked_full = other.last_epoch_marked_full;
2237 modified = true;
2238 }
2239 if (other.last_scrub > last_scrub) {
2240 last_scrub = other.last_scrub;
2241 modified = true;
2242 }
2243 if (other.last_scrub_stamp > last_scrub_stamp) {
2244 last_scrub_stamp = other.last_scrub_stamp;
2245 modified = true;
2246 }
2247 if (other.last_deep_scrub > last_deep_scrub) {
2248 last_deep_scrub = other.last_deep_scrub;
2249 modified = true;
2250 }
2251 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2252 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2253 modified = true;
2254 }
2255 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2256 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2257 modified = true;
2258 }
2259 return modified;
2260 }
2261
2262 void encode(bufferlist& bl) const;
2263 void decode(bufferlist::iterator& p);
2264 void dump(Formatter *f) const;
2265 static void generate_test_instances(list<pg_history_t*>& o);
2266};
2267WRITE_CLASS_ENCODER(pg_history_t)
2268
2269inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2270 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
7c673cae
FG
2271 << " lis/c " << h.last_interval_started
2272 << "/" << h.last_interval_clean
2273 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2274 << "/" << h.last_epoch_marked_full
2275 << " " << h.same_up_since
2276 << "/" << h.same_interval_since
2277 << "/" << h.same_primary_since;
2278}
2279
2280
2281/**
2282 * pg_info_t - summary of PG statistics.
2283 *
2284 * some notes:
2285 * - last_complete implies we have all objects that existed as of that
2286 * stamp, OR a newer object, OR have already applied a later delete.
2287 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2288 * otherwise, we have no idea what the pg is supposed to contain.
2289 */
2290struct pg_info_t {
2291 spg_t pgid;
2292 eversion_t last_update; ///< last object version applied to store.
2293 eversion_t last_complete; ///< last version pg was complete through.
2294 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2295 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2296
2297 version_t last_user_version; ///< last user object version applied to store
2298
2299 eversion_t log_tail; ///< oldest log entry.
2300
2301 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2302 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2303
2304 interval_set<snapid_t> purged_snaps;
2305
2306 pg_stat_t stats;
2307
2308 pg_history_t history;
2309 pg_hit_set_history_t hit_set;
2310
2311 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2312 return
2313 l.pgid == r.pgid &&
2314 l.last_update == r.last_update &&
2315 l.last_complete == r.last_complete &&
2316 l.last_epoch_started == r.last_epoch_started &&
2317 l.last_interval_started == r.last_interval_started &&
2318 l.last_user_version == r.last_user_version &&
2319 l.log_tail == r.log_tail &&
2320 l.last_backfill == r.last_backfill &&
2321 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2322 l.purged_snaps == r.purged_snaps &&
2323 l.stats == r.stats &&
2324 l.history == r.history &&
2325 l.hit_set == r.hit_set;
2326 }
2327
2328 pg_info_t()
2329 : last_epoch_started(0),
2330 last_interval_started(0),
2331 last_user_version(0),
2332 last_backfill(hobject_t::get_max()),
2333 last_backfill_bitwise(false)
2334 { }
2335 // cppcheck-suppress noExplicitConstructor
2336 pg_info_t(spg_t p)
2337 : pgid(p),
2338 last_epoch_started(0),
2339 last_interval_started(0),
2340 last_user_version(0),
2341 last_backfill(hobject_t::get_max()),
2342 last_backfill_bitwise(false)
2343 { }
2344
2345 void set_last_backfill(hobject_t pos) {
2346 last_backfill = pos;
2347 last_backfill_bitwise = true;
2348 }
2349
2350 bool is_empty() const { return last_update.version == 0; }
2351 bool dne() const { return history.epoch_created == 0; }
2352
2353 bool is_incomplete() const { return !last_backfill.is_max(); }
2354
2355 void encode(bufferlist& bl) const;
2356 void decode(bufferlist::iterator& p);
2357 void dump(Formatter *f) const;
2358 bool overlaps_with(const pg_info_t &oinfo) const {
2359 return last_update > oinfo.log_tail ?
2360 oinfo.last_update >= log_tail :
2361 last_update >= oinfo.log_tail;
2362 }
2363 static void generate_test_instances(list<pg_info_t*>& o);
2364};
2365WRITE_CLASS_ENCODER(pg_info_t)
2366
2367inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2368{
2369 out << pgi.pgid << "(";
2370 if (pgi.dne())
2371 out << " DNE";
2372 if (pgi.is_empty())
2373 out << " empty";
2374 else {
2375 out << " v " << pgi.last_update;
2376 if (pgi.last_complete != pgi.last_update)
2377 out << " lc " << pgi.last_complete;
2378 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2379 }
2380 if (pgi.is_incomplete())
2381 out << " lb " << pgi.last_backfill
2382 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2383 //out << " c " << pgi.epoch_created;
2384 out << " local-lis/les=" << pgi.last_interval_started
2385 << "/" << pgi.last_epoch_started;
2386 out << " n=" << pgi.stats.stats.sum.num_objects;
2387 out << " " << pgi.history
2388 << ")";
2389 return out;
2390}
2391
2392/**
2393 * pg_fast_info_t - common pg_info_t fields
2394 *
2395 * These are the fields of pg_info_t (and children) that are updated for
2396 * most IO operations.
2397 *
2398 * ** WARNING **
2399 * Because we rely on these fields to be applied to the normal
2400 * info struct, adding a new field here that is not also new in info
2401 * means that we must set an incompat OSD feature bit!
2402 */
struct pg_fast_info_t {
  eversion_t last_update;
  eversion_t last_complete;
  version_t last_user_version;
  struct { // pg_stat_t stats
    eversion_t version;
    version_t reported_seq;
    utime_t last_fresh;
    utime_t last_active;
    utime_t last_peered;
    utime_t last_clean;
    utime_t last_unstale;
    utime_t last_undegraded;
    utime_t last_fullsized;
    int64_t log_size;  // (also ondisk_log_size, which has the same value)
    struct { // object_stat_collection_t stats;
      struct { // object_stat_sum_t sum
	int64_t num_bytes;    // in bytes
	int64_t num_objects;
	int64_t num_object_copies;
	int64_t num_rd;
	int64_t num_rd_kb;
	int64_t num_wr;
	int64_t num_wr_kb;
	int64_t num_objects_dirty;
      } sum;
    } stats;
  } stats;

  /// Copy the fast-path subset of fields out of a full pg_info_t.
  /// Must mirror try_apply_to() field-for-field.
  void populate_from(const pg_info_t& info) {
    last_update = info.last_update;
    last_complete = info.last_complete;
    last_user_version = info.last_user_version;
    stats.version = info.stats.version;
    stats.reported_seq = info.stats.reported_seq;
    stats.last_fresh = info.stats.last_fresh;
    stats.last_active = info.stats.last_active;
    stats.last_peered = info.stats.last_peered;
    stats.last_clean = info.stats.last_clean;
    stats.last_unstale = info.stats.last_unstale;
    stats.last_undegraded = info.stats.last_undegraded;
    stats.last_fullsized = info.stats.last_fullsized;
    stats.log_size = info.stats.log_size;
    stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
    stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
    stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
    stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
    stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
    stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
    stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
    stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
  }

  /// Apply our fields onto *info, but only if our last_update is strictly
  /// newer; returns false (leaving *info untouched) otherwise.  Note that
  /// ondisk_log_size is set from log_size — the two track the same value.
  bool try_apply_to(pg_info_t* info) {
    if (last_update <= info->last_update)
      return false;
    info->last_update = last_update;
    info->last_complete = last_complete;
    info->last_user_version = last_user_version;
    info->stats.version = stats.version;
    info->stats.reported_seq = stats.reported_seq;
    info->stats.last_fresh = stats.last_fresh;
    info->stats.last_active = stats.last_active;
    info->stats.last_peered = stats.last_peered;
    info->stats.last_clean = stats.last_clean;
    info->stats.last_unstale = stats.last_unstale;
    info->stats.last_undegraded = stats.last_undegraded;
    info->stats.last_fullsized = stats.last_fullsized;
    info->stats.log_size = stats.log_size;
    info->stats.ondisk_log_size = stats.log_size;
    info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
    info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
    info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
    info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
    info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
    info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
    info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
    info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
    return true;
  }

  // Wire format v1.  Field order below is the on-wire layout; encode and
  // decode must stay in lockstep — never reorder one without the other.
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(last_update, bl);
    ::encode(last_complete, bl);
    ::encode(last_user_version, bl);
    ::encode(stats.version, bl);
    ::encode(stats.reported_seq, bl);
    ::encode(stats.last_fresh, bl);
    ::encode(stats.last_active, bl);
    ::encode(stats.last_peered, bl);
    ::encode(stats.last_clean, bl);
    ::encode(stats.last_unstale, bl);
    ::encode(stats.last_undegraded, bl);
    ::encode(stats.last_fullsized, bl);
    ::encode(stats.log_size, bl);
    ::encode(stats.stats.sum.num_bytes, bl);
    ::encode(stats.stats.sum.num_objects, bl);
    ::encode(stats.stats.sum.num_object_copies, bl);
    ::encode(stats.stats.sum.num_rd, bl);
    ::encode(stats.stats.sum.num_rd_kb, bl);
    ::encode(stats.stats.sum.num_wr, bl);
    ::encode(stats.stats.sum.num_wr_kb, bl);
    ::encode(stats.stats.sum.num_objects_dirty, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator& p) {
    DECODE_START(1, p);
    ::decode(last_update, p);
    ::decode(last_complete, p);
    ::decode(last_user_version, p);
    ::decode(stats.version, p);
    ::decode(stats.reported_seq, p);
    ::decode(stats.last_fresh, p);
    ::decode(stats.last_active, p);
    ::decode(stats.last_peered, p);
    ::decode(stats.last_clean, p);
    ::decode(stats.last_unstale, p);
    ::decode(stats.last_undegraded, p);
    ::decode(stats.last_fullsized, p);
    ::decode(stats.log_size, p);
    ::decode(stats.stats.sum.num_bytes, p);
    ::decode(stats.stats.sum.num_objects, p);
    ::decode(stats.stats.sum.num_object_copies, p);
    ::decode(stats.stats.sum.num_rd, p);
    ::decode(stats.stats.sum.num_rd_kb, p);
    ::decode(stats.stats.sum.num_wr, p);
    ::decode(stats.stats.sum.num_wr_kb, p);
    ::decode(stats.stats.sum.num_objects_dirty, p);
    DECODE_FINISH(p);
  }
};
2535WRITE_CLASS_ENCODER(pg_fast_info_t)
2536
2537
2538struct pg_notify_t {
2539 epoch_t query_epoch;
2540 epoch_t epoch_sent;
2541 pg_info_t info;
2542 shard_id_t to;
2543 shard_id_t from;
2544 pg_notify_t() :
2545 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2546 from(shard_id_t::NO_SHARD) {}
2547 pg_notify_t(
2548 shard_id_t to,
2549 shard_id_t from,
2550 epoch_t query_epoch,
2551 epoch_t epoch_sent,
2552 const pg_info_t &info)
2553 : query_epoch(query_epoch),
2554 epoch_sent(epoch_sent),
2555 info(info), to(to), from(from) {
2556 assert(from == info.pgid.shard);
2557 }
2558 void encode(bufferlist &bl) const;
2559 void decode(bufferlist::iterator &p);
2560 void dump(Formatter *f) const;
2561 static void generate_test_instances(list<pg_notify_t*> &o);
2562};
2563WRITE_CLASS_ENCODER(pg_notify_t)
2564ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2565
2566
2567class OSDMap;
2568/**
2569 * PastIntervals -- information needed to determine the PriorSet and
2570 * the might_have_unfound set
2571 */
/**
 * PastIntervals -- information needed to determine the PriorSet and
 * the might_have_unfound set
 *
 * The actual interval storage is hidden behind the interval_rep interface;
 * two implementations exist elsewhere in the tree (pi_simple_rep, the
 * "classic" form, and pi_compact_rep).
 */
class PastIntervals {
public:
  struct pg_interval_t {
    vector<int32_t> up, acting; // up/acting sets during the interval
    epoch_t first, last;        // inclusive epoch bounds of the interval
    bool maybe_went_rw;         // interval may have served reads/writes
    int32_t primary;            // acting primary (-1 if none)
    int32_t up_primary;         // up primary (-1 if none)

    pg_interval_t()
      : first(0), last(0),
	maybe_went_rw(false),
	primary(-1),
	up_primary(-1)
      {}

    pg_interval_t(
      vector<int32_t> &&up,
      vector<int32_t> &&acting,
      epoch_t first,
      epoch_t last,
      bool maybe_went_rw,
      int32_t primary,
      int32_t up_primary)
      : up(up), acting(acting), first(first), last(last),
	maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
      {}

    void encode(bufferlist& bl) const;
    void decode(bufferlist::iterator& bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<pg_interval_t*>& o);
  };

  PastIntervals() = default;
  PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
    update_type_from_map(ec_pool, osdmap);
  }
  PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
    update_type(ec_pool, compact);
  }
  PastIntervals(PastIntervals &&rhs) = default;
  PastIntervals &operator=(PastIntervals &&rhs) = default;

  // copy operations are out-of-line (they must clone the pluggable rep)
  PastIntervals(const PastIntervals &rhs);
  PastIntervals &operator=(const PastIntervals &rhs);

  /// Abstract storage backend for the interval set.
  class interval_rep {
  public:
    virtual size_t size() const = 0;
    virtual bool empty() const = 0;
    virtual void clear() = 0;
    /// [start, end) epoch range covered by the stored intervals
    virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
    virtual set<pg_shard_t> get_all_participants(
      bool ec_pool) const = 0;
    virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
    virtual unique_ptr<interval_rep> clone() const = 0;
    virtual ostream &print(ostream &out) const = 0;
    virtual void encode(bufferlist &bl) const = 0;
    virtual void decode(bufferlist::iterator &bl) = 0;
    virtual void dump(Formatter *f) const = 0;
    virtual bool is_classic() const = 0;
    virtual void iterate_mayberw_back_to(
      bool ec_pool,
      epoch_t les,
      std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;

    /// Only some implementations retain full pg_interval_t records;
    /// iterate_all_intervals() asserts unless has_full_intervals() is true.
    virtual bool has_full_intervals() const { return false; }
    virtual void iterate_all_intervals(
      std::function<void(const pg_interval_t &)> &&f) const {
      assert(!has_full_intervals());
      assert(0 == "not valid for this implementation");
    }

    virtual ~interval_rep() {}
  };
  friend class pi_simple_rep;
  friend class pi_compact_rep;
private:

  unique_ptr<interval_rep> past_intervals;  // may be null until a type is set

  PastIntervals(interval_rep *rep) : past_intervals(rep) {}

public:
  void add_interval(bool ec_pool, const pg_interval_t &interval) {
    assert(past_intervals);
    return past_intervals->add_interval(ec_pool, interval);
  }

  bool is_classic() const {
    assert(past_intervals);
    return past_intervals->is_classic();
  }

  // Encoding writes a one-byte type tag: 0 = no rep, 1 = classic, 2 = compact,
  // followed by the rep's own encoding.  decode() (out-of-line) must agree.
  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    if (past_intervals) {
      __u8 type = is_classic() ? 1 : 2;
      ::encode(type, bl);
      past_intervals->encode(bl);
    } else {
      ::encode((__u8)0, bl);
    }
    ENCODE_FINISH(bl);
  }
  /// Pre-luminous encoding: bare classic rep (a map<>), no type tag.
  void encode_classic(bufferlist &bl) const {
    if (past_intervals) {
      assert(past_intervals->is_classic());
      past_intervals->encode(bl);
    } else {
      // it's a map<>
      ::encode((uint32_t)0, bl);
    }
  }

  void decode(bufferlist::iterator &bl);
  void decode_classic(bufferlist::iterator &bl);

  void dump(Formatter *f) const {
    assert(past_intervals);
    past_intervals->dump(f);
  }
  static void generate_test_instances(list<PastIntervals *> & o);

  /**
   * Determines whether there is an interval change
   */
  static bool is_new_interval(
    int old_acting_primary,
    int new_acting_primary,
    const vector<int> &old_acting,
    const vector<int> &new_acting,
    int old_up_primary,
    int new_up_primary,
    const vector<int> &old_up,
    const vector<int> &new_up,
    int old_size,
    int new_size,
    int old_min_size,
    int new_min_size,
    unsigned old_pg_num,
    unsigned new_pg_num,
    bool old_sort_bitwise,
    bool new_sort_bitwise,
    bool old_recovery_deletes,
    bool new_recovery_deletes,
    pg_t pgid
    );

  /**
   * Determines whether there is an interval change
   * (convenience overload which extracts pool parameters from the two maps)
   */
  static bool is_new_interval(
    int old_acting_primary,                     ///< [in] primary as of lastmap
    int new_acting_primary,                     ///< [in] primary as of lastmap
    const vector<int> &old_acting,              ///< [in] acting as of lastmap
    const vector<int> &new_acting,              ///< [in] acting as of osdmap
    int old_up_primary,                         ///< [in] up primary of lastmap
    int new_up_primary,                         ///< [in] up primary of osdmap
    const vector<int> &old_up,                  ///< [in] up as of lastmap
    const vector<int> &new_up,                  ///< [in] up as of osdmap
    ceph::shared_ptr<const OSDMap> osdmap,  ///< [in] current map
    ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
    pg_t pgid                                   ///< [in] pgid for pg
    );

  /**
   * Integrates a new map into *past_intervals, returns true
   * if an interval was closed out.
   */
  static bool check_new_interval(
    int old_acting_primary,                     ///< [in] primary as of lastmap
    int new_acting_primary,                     ///< [in] primary as of osdmap
    const vector<int> &old_acting,              ///< [in] acting as of lastmap
    const vector<int> &new_acting,              ///< [in] acting as of osdmap
    int old_up_primary,                         ///< [in] up primary of lastmap
    int new_up_primary,                         ///< [in] up primary of osdmap
    const vector<int> &old_up,                  ///< [in] up as of lastmap
    const vector<int> &new_up,                  ///< [in] up as of osdmap
    epoch_t same_interval_since,                ///< [in] as of osdmap
    epoch_t last_epoch_clean,                   ///< [in] current
    ceph::shared_ptr<const OSDMap> osdmap,      ///< [in] current map
    ceph::shared_ptr<const OSDMap> lastmap,     ///< [in] last map
    pg_t pgid,                                  ///< [in] pgid for pg
    IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
    PastIntervals *past_intervals,              ///< [out] intervals
    ostream *out = 0                            ///< [out] debug ostream
    );

  friend ostream& operator<<(ostream& out, const PastIntervals &i);

  /// Forward to the rep: visit (epoch, acting shards) for every interval
  /// that may have gone rw, newest first, back to epoch @p les.
  template <typename F>
  void iterate_mayberw_back_to(
    bool ec_pool,
    epoch_t les,
    F &&f) const {
    assert(past_intervals);
    past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
  }
  void clear() {
    assert(past_intervals);
    past_intervals->clear();
  }

  /**
   * Should return a value which gives an indication of the amount
   * of state contained
   */
  size_t size() const {
    assert(past_intervals);
    return past_intervals->size();
  }

  bool empty() const {
    assert(past_intervals);
    return past_intervals->empty();
  }

  void swap(PastIntervals &other) {
    using std::swap;
    swap(other.past_intervals, past_intervals);
  }

  /**
   * Return all shards which have been in the acting set back to the
   * latest epoch to which we have trimmed except for pg_whoami
   */
  set<pg_shard_t> get_might_have_unfound(
    pg_shard_t pg_whoami,
    bool ec_pool) const {
    assert(past_intervals);
    auto ret = past_intervals->get_all_participants(ec_pool);
    ret.erase(pg_whoami);
    return ret;
  }

  /**
   * Return all shards which we might want to talk to for peering
   */
  set<pg_shard_t> get_all_probe(
    bool ec_pool) const {
    assert(past_intervals);
    return past_intervals->get_all_participants(ec_pool);
  }

  /* Return the set of epochs [start, end) represented by the
   * past_interval set.
   */
  pair<epoch_t, epoch_t> get_bounds() const {
    assert(past_intervals);
    return past_intervals->get_bounds();
  }

  /// Classification of an osd's state as seen by PriorSet construction.
  enum osd_state_t {
    UP,
    DOWN,
    DNE,
    LOST
  };

  /// The set of OSDs we must hear from (or rule out) before going active.
  /// Move-only: it owns the recoverability predicate.
  struct PriorSet {
    bool ec_pool = false;
    set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
    set<int> down;  /// down osds that would normally be in @a probe and might be interesting.
    map<int, epoch_t> blocked_by;  /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set

    bool pg_down = false;   /// some down osds are included in @a cur; the DOWN pg state bit should be set.
    unique_ptr<IsPGRecoverablePredicate> pcontdec;

    PriorSet() = default;
    PriorSet(PriorSet &&) = default;
    PriorSet &operator=(PriorSet &&) = default;

    PriorSet &operator=(const PriorSet &) = delete;
    PriorSet(const PriorSet &) = delete;

    // NOTE: pcontdec is deliberately excluded from equality.
    bool operator==(const PriorSet &rhs) const {
      return (ec_pool == rhs.ec_pool) &&
	(probe == rhs.probe) &&
	(down == rhs.down) &&
	(blocked_by == rhs.blocked_by) &&
	(pg_down == rhs.pg_down);
    }

    bool affected_by_map(
      const OSDMap &osdmap,
      const DoutPrefixProvider *dpp) const;

    // For verifying tests
    PriorSet(
      bool ec_pool,
      set<pg_shard_t> probe,
      set<int> down,
      map<int, epoch_t> blocked_by,
      bool pg_down,
      IsPGRecoverablePredicate *pcontdec)
      : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
	pg_down(pg_down), pcontdec(pcontdec) {}

  private:
    // Real construction path; only reachable via PastIntervals::get_prior_set.
    template <typename F>
    PriorSet(
      const PastIntervals &past_intervals,
      bool ec_pool,
      epoch_t last_epoch_started,
      IsPGRecoverablePredicate *c,
      F f,
      const vector<int> &up,
      const vector<int> &acting,
      const DoutPrefixProvider *dpp);

    friend class PastIntervals;
  };

  void update_type(bool ec_pool, bool compact);
  void update_type_from_map(bool ec_pool, const OSDMap &osdmap);

  /// Build a PriorSet from this interval set (forwards to the private ctor).
  template <typename... Args>
  PriorSet get_prior_set(Args&&... args) const {
    return PriorSet(*this, std::forward<Args>(args)...);
  }
};
2894WRITE_CLASS_ENCODER(PastIntervals)
2895
2896ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
2897ostream& operator<<(ostream& out, const PastIntervals &i);
2898ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
2899
/**
 * Build the prior set for peering: the set of shards we must probe
 * (probe), the osds known to be unavailable (down), and the osds
 * whose absence blocks going active (blocked_by).  Sets pg_down when
 * some past maybe-went-rw interval cannot be recovered from the
 * currently-up shards.
 *
 * @param past_intervals     past intervals to mine for candidates
 * @param ec_pool            if true, shard ids are significant
 * @param last_epoch_started iterate maybe-rw intervals back to this epoch
 * @param c                  recoverability predicate over a set of up shards
 * @param f                  functor f(epoch, osd, epoch_t *lost_at)
 *                           returning UP/DNE/LOST/DOWN for that osd
 * @param up                 current up set
 * @param acting             current acting set
 * @param dpp                log-prefix provider for ldpp_dout
 */
template <typename F>
PastIntervals::PriorSet::PriorSet(
  const PastIntervals &past_intervals,
  bool ec_pool,
  epoch_t last_epoch_started,
  IsPGRecoverablePredicate *c,
  F f,
  const vector<int> &up,
  const vector<int> &acting,
  const DoutPrefixProvider *dpp)
  : ec_pool(ec_pool), pg_down(false), pcontdec(c)
{
  /*
   * We have to be careful to gracefully deal with situations like
   * so. Say we have a power outage or something that takes out both
   * OSDs, but the monitor doesn't mark them down in the same epoch.
   * The history may look like
   *
   *  1: A B
   *  2:   B
   *  3:       let's say B dies for good, too (say, from the power spike)
   *  4: A
   *
   * which makes it look like B may have applied updates to the PG
   * that we need in order to proceed.  This sucks...
   *
   * To minimize the risk of this happening, we CANNOT go active if
   * _any_ OSDs in the prior set are down until we send an MOSDAlive
   * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
   * Then, we have something like
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:
   *  4: A
   *
   * -> we can ignore B, bc it couldn't have gone active (alive_thru
   *    still 0).
   *
   * or,
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:   B   up_thru[B]=2
   *  4:
   *  5: A
   *
   * -> we must wait for B, bc it was alive through 2, and could have
   *    written to the pg.
   *
   * If B is really dead, then an administrator will need to manually
   * intervene by marking the OSD as "lost."
   */

  // Include current acting and up nodes... not because they may
  // contain old data (this interval hasn't gone active, obviously),
  // but because we want their pg_info to inform choose_acting(), and
  // so that we know what they do/do not have explicitly before
  // sending them any new info/logs/whatever.
  for (unsigned i = 0; i < acting.size(); i++) {
    if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  // It may be possible to exclude the up nodes, but let's keep them in
  // there for now.
  for (unsigned i = 0; i < up.size(); i++) {
    if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }

  // partition every shard that ever participated into probe (up now)
  // vs down (DNE/LOST/DOWN now).
  set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
  ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
  for (auto &&i: all_probe) {
    switch (f(0, i.osd, nullptr)) {
    case UP: {
      probe.insert(i);
      break;
    }
    case DNE:
    case LOST:
    case DOWN: {
      down.insert(i.osd);
      break;
    }
    }
  }

  past_intervals.iterate_mayberw_back_to(
    ec_pool,
    last_epoch_started,
    [&](epoch_t start, const set<pg_shard_t> &acting) {
      ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
			 << ", acting: " << acting << dendl;

      // look at candidate osds during this interval. each falls into
      // one of three categories: up, down (but potentially
      // interesting), or lost (down, but we won't wait for it).
      set<pg_shard_t> up_now;
      map<int, epoch_t> candidate_blocked_by;
      // any candidates down now (that might have useful data)
      bool any_down_now = false;

      // consider ACTING osds
      for (auto &&so: acting) {
	epoch_t lost_at = 0;
	switch (f(start, so.osd, &lost_at)) {
	case UP: {
	  // include past acting osds if they are up.
	  up_now.insert(so);
	  break;
	}
	case DNE: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " no longer exists" << dendl;
	  break;
	}
	case LOST: {
	  // LOST counts as "available" here: an admin declared its
	  // data unrecoverable, so we do not wait for it.
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down, but lost_at " << lost_at << dendl;
	  up_now.insert(so);
	  break;
	}
	case DOWN: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down" << dendl;
	  candidate_blocked_by[so.osd] = lost_at;
	  any_down_now = true;
	  break;
	}
	}
      }

      // if not enough osds survived this interval, and we may have gone rw,
      // then we need to wait for one of those osds to recover to
      // ensure that we haven't lost any information.
      if (!(*pcontdec)(up_now) && any_down_now) {
	// fixme: how do we identify a "clean" shutdown anyway?
	ldpp_dout(dpp, 10) << "build_prior  possibly went active+rw,"
			   << " insufficient up; including down osds" << dendl;
	assert(!candidate_blocked_by.empty());
	pg_down = true;
	blocked_by.insert(
	  candidate_blocked_by.begin(),
	  candidate_blocked_by.end());
      }
    });

  ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
	   << " down " << down
	   << " blocked_by " << blocked_by
	   << (pg_down ? " pg_down":"")
	   << dendl;
}
3053
3054/**
3055 * pg_query_t - used to ask a peer for information about a pg.
3056 *
3057 * note: if version=0, type=LOG, then we just provide our full log.
3058 */
struct pg_query_t {
  // query types.  Values 2 and 3 are intentionally skipped —
  // presumably retired encodings; confirm in git history.
  enum {
    INFO = 0,
    LOG = 1,
    MISSING = 4,
    FULLLOG = 5,
  };
  /// human-readable name of this query's type (used by operator<<)
  const char *get_type_name() const {
    switch (type) {
    case INFO: return "info";
    case LOG: return "log";
    case MISSING: return "missing";
    case FULLLOG: return "fulllog";
    default: return "???";
    }
  }

  __s32 type;            // one of the enum values above; -1 == unset
  eversion_t since;      // LOG only: request entries newer than this
  pg_history_t history;  // sender's view of the pg history
  epoch_t epoch_sent;    // epoch in which this query was sent
  shard_id_t to;         // destination shard
  shard_id_t from;       // originating shard

  pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
		 from(shard_id_t::NO_SHARD) {}
  /// constructor for every type except LOG (enforced by the assert)
  pg_query_t(
    int t,
    shard_id_t to,
    shard_id_t from,
    const pg_history_t& h,
    epoch_t epoch_sent)
    : type(t),
      history(h),
      epoch_sent(epoch_sent),
      to(to), from(from) {
    assert(t != LOG);
  }
  /// constructor for LOG queries, which carry the 'since' bound
  pg_query_t(
    int t,
    shard_id_t to,
    shard_id_t from,
    eversion_t s,
    const pg_history_t& h,
    epoch_t epoch_sent)
    : type(t), since(s), history(h),
      epoch_sent(epoch_sent), to(to), from(from) {
    assert(t == LOG);
  }

  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);

  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_query_t*>& o);
};
3115WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3116
3117inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3118 out << "query(" << q.get_type_name() << " " << q.since;
3119 if (q.type == pg_query_t::LOG)
3120 out << " " << q.history;
3121 out << ")";
3122 return out;
3123}
3124
3125class PGBackend;
/**
 * ObjectModDesc - compact encoded description of how an object was
 * modified, kept so the modification can later be rolled back
 * locally.  The description is a sequence of ModID-tagged records
 * appended to 'bl'; visit() replays them against a Visitor.
 *
 * Once rollback_info_completed is set (the object was deleted or
 * created), further appenders are no-ops; mark_unrollbackable()
 * discards everything and disables local rollback.
 */
class ObjectModDesc {
  bool can_local_rollback;
  bool rollback_info_completed;

  // version required to decode, reflected in encode/decode version
  __u8 max_required_version = 1;
public:
  /// Callback interface for visit(); one method per record type.
  /// Default implementations are no-ops.
  class Visitor {
  public:
    virtual void append(uint64_t old_offset) {}
    virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
    virtual void rmobject(version_t old_version) {}
    /**
     * Used to support the unfound_lost_delete log event: if the stashed
     * version exists, we unstash it, otherwise, we do nothing.  This way
     * each replica rolls back to whatever state it had prior to the attempt
     * at mark unfound lost delete
     */
    virtual void try_rmobject(version_t old_version) {
      rmobject(old_version);
    }
    virtual void create() {}
    virtual void update_snaps(const set<snapid_t> &old_snaps) {}
    virtual void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) {}
    virtual ~Visitor() {}
  };
  void visit(Visitor *visitor) const;
  // mutable so trim_bl() can rebuild the buffer on a const object
  mutable bufferlist bl;
  /// record tags stored in bl ahead of each encoded record
  enum ModID {
    APPEND = 1,
    SETATTRS = 2,
    DELETE = 3,
    CREATE = 4,
    UPDATE_SNAPS = 5,
    TRY_DELETE = 6,
    ROLLBACK_EXTENTS = 7
  };
  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
    bl.reassign_to_mempool(mempool::mempool_osd_pglog);
  }
  /// take over other's contents, leaving other's bl empty
  void claim(ObjectModDesc &other) {
    bl.clear();
    bl.claim(other.bl);
    can_local_rollback = other.can_local_rollback;
    rollback_info_completed = other.rollback_info_completed;
  }
  /// append other's records after ours; becomes unrollbackable if
  /// other is
  void claim_append(ObjectModDesc &other) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    if (!other.can_local_rollback) {
      mark_unrollbackable();
      return;
    }
    bl.claim_append(other.bl);
    rollback_info_completed = other.rollback_info_completed;
  }
  void swap(ObjectModDesc &other) {
    bl.swap(other.bl);

    using std::swap;
    swap(other.can_local_rollback, can_local_rollback);
    swap(other.rollback_info_completed, rollback_info_completed);
    swap(other.max_required_version, max_required_version);
  }
  /// write the one-byte record tag ahead of a record body
  void append_id(ModID id) {
    uint8_t _id(id);
    ::encode(_id, bl);
  }
  /// record an append: remember the pre-append object size
  void append(uint64_t old_size) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(APPEND);
    ::encode(old_size, bl);
    ENCODE_FINISH(bl);
  }
  /// record an attr change: remember the previous attr values
  void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(SETATTRS);
    ::encode(old_attrs, bl);
    ENCODE_FINISH(bl);
  }
  /// record a delete; completes the rollback info (terminal record)
  bool rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// like rmobject(), but replays as try_rmobject (see Visitor)
  bool try_rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(TRY_DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// record a create; completes the rollback info (terminal record)
  void create() {
    if (!can_local_rollback || rollback_info_completed)
      return;
    rollback_info_completed = true;
    ENCODE_START(1, 1, bl);
    append_id(CREATE);
    ENCODE_FINISH(bl);
  }
  /// record a snap-set change: remember the previous snaps
  void update_snaps(const set<snapid_t> &old_snaps) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(UPDATE_SNAPS);
    ::encode(old_snaps, bl);
    ENCODE_FINISH(bl);
  }
  /// record rollback extents; bumps the decode version requirement
  /// to 2 (hence requires_kraken())
  void rollback_extents(
    version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
    assert(can_local_rollback);
    assert(!rollback_info_completed);
    if (max_required_version < 2)
      max_required_version = 2;
    ENCODE_START(2, 2, bl);
    append_id(ROLLBACK_EXTENTS);
    ::encode(gen, bl);
    ::encode(extents, bl);
    ENCODE_FINISH(bl);
  }

  // cannot be rolled back
  void mark_unrollbackable() {
    can_local_rollback = false;
    bl.clear();
  }
  bool can_rollback() const {
    return can_local_rollback;
  }
  bool empty() const {
    return can_local_rollback && (bl.length() == 0);
  }

  bool requires_kraken() const {
    return max_required_version >= 2;
  }

  /**
   * Create fresh copy of bl bytes to avoid keeping large buffers around
   * in the case that bl contains ptrs which point into a much larger
   * message buffer
   */
  void trim_bl() const {
    if (bl.length() > 0)
      bl.rebuild();
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ObjectModDesc*>& o);
};
3291WRITE_CLASS_ENCODER(ObjectModDesc)
3292
3293
3294/**
3295 * pg_log_entry_t - single entry/event in pg log
3296 *
3297 */
/**
 * pg_log_entry_t - single entry/event in pg log
 */
struct pg_log_entry_t {
  // event/op codes
  enum {
    MODIFY = 1,       // some unspecified modification (but not *all* modifications)
    CLONE = 2,        // cloned object from head
    DELETE = 3,       // deleted object
    BACKLOG = 4,      // event invented by generate_backlog [deprecated]
    LOST_REVERT = 5,  // lost new version, revert to an older version.
    LOST_DELETE = 6,  // lost new version, revert to no object (deleted).
    LOST_MARK = 7,    // lost new version, now EIO
    PROMOTE = 8,      // promoted object from another tier
    CLEAN = 9,        // mark an object clean
    ERROR = 10,       // write that returned an error
  };
  /// short name for an op code (for logs/dumps); "unknown" if out of range
  static const char *get_op_name(int op) {
    switch (op) {
    case MODIFY:
      return "modify";
    case PROMOTE:
      return "promote";
    case CLONE:
      return "clone";
    case DELETE:
      return "delete";
    case BACKLOG:
      return "backlog";
    case LOST_REVERT:
      return "l_revert";
    case LOST_DELETE:
      return "l_delete";
    case LOST_MARK:
      return "l_mark";
    case CLEAN:
      return "clean";
    case ERROR:
      return "error";
    default:
      return "unknown";
    }
  }
  const char *get_op_name() const {
    return get_op_name(op);
  }

  // describes state for a locally-rollbackable entry
  ObjectModDesc mod_desc;
  bufferlist snaps;   // only for clone entries
  hobject_t soid;     // object this entry applies to
  osd_reqid_t reqid;  // caller+tid to uniquely identify request
  mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
  eversion_t version, prior_version, reverting_to;
  version_t user_version; // the user version for this entry
  utime_t mtime;          // this is the _user_ mtime, mind you
  int32_t return_code;    // only stored for ERRORs for dup detection

  __s32 op;           // one of the enum values above
  bool invalid_hash;  // only when decoding sobject_t based entries
  bool invalid_pool;  // only when decoding pool-less hobject based entries

  pg_log_entry_t()
   : user_version(0), return_code(0), op(0),
     invalid_hash(false), invalid_pool(false) {
    // keep the snaps buffer accounted to the pglog mempool
    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
  }
  pg_log_entry_t(int _op, const hobject_t& _soid,
                 const eversion_t& v, const eversion_t& pv,
                 version_t uv,
                 const osd_reqid_t& rid, const utime_t& mt,
                 int return_code)
   : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
     mtime(mt), return_code(return_code), op(_op),
     invalid_hash(false), invalid_pool(false) {
    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
  }

  bool is_clone() const { return op == CLONE; }
  bool is_modify() const { return op == MODIFY; }
  bool is_promote() const { return op == PROMOTE; }
  bool is_clean() const { return op == CLEAN; }
  bool is_backlog() const { return op == BACKLOG; }
  bool is_lost_revert() const { return op == LOST_REVERT; }
  bool is_lost_delete() const { return op == LOST_DELETE; }
  bool is_lost_mark() const { return op == LOST_MARK; }
  bool is_error() const { return op == ERROR; }

  /// true for any op that creates/updates the object (not deletes/errors)
  bool is_update() const {
    return
      is_clone() || is_modify() || is_promote() || is_clean() ||
      is_backlog() || is_lost_revert() || is_lost_mark();
  }
  bool is_delete() const {
    return op == DELETE || op == LOST_DELETE;
  }

  bool can_rollback() const {
    return mod_desc.can_rollback();
  }

  void mark_unrollbackable() {
    mod_desc.mark_unrollbackable();
  }

  bool requires_kraken() const {
    return mod_desc.requires_kraken();
  }

  // Errors are only used for dup detection, whereas
  // the index by objects is used by recovery, copy_get,
  // and other facilities that don't expect or need to
  // be aware of error entries.
  bool object_is_indexed() const {
    return !is_error();
  }

  /// whether this entry participates in the by-reqid dup index
  bool reqid_is_indexed() const {
    return reqid != osd_reqid_t() &&
      (op == MODIFY || op == DELETE || op == ERROR);
  }

  string get_key_name() const;
  void encode_with_checksum(bufferlist& bl) const;
  void decode_with_checksum(bufferlist::iterator& p);

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_log_entry_t*>& o);

};
3426WRITE_CLASS_ENCODER(pg_log_entry_t)
3427
3428ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3429
c07f9fc5
FG
/**
 * pg_log_dup_t - compact record kept (in pg_log_t::dups) purely for
 * duplicate-op detection; it carries just enough of a trimmed
 * pg_log_entry_t to recognize a replayed request.
 */
struct pg_log_dup_t {
  osd_reqid_t reqid;      // caller+tid to uniquely identify request
  eversion_t version;
  version_t user_version; // the user version for this entry
  int32_t return_code;    // only stored for ERRORs for dup detection

  pg_log_dup_t()
    : user_version(0), return_code(0)
  {}
  /// distill a full log entry down to its dup-detection fields
  explicit pg_log_dup_t(const pg_log_entry_t& entry)
    : reqid(entry.reqid), version(entry.version),
      user_version(entry.user_version), return_code(entry.return_code)
  {}
  pg_log_dup_t(const eversion_t& v, version_t uv,
	       const osd_reqid_t& rid, int return_code)
    : reqid(rid), version(v), user_version(uv),
      return_code(return_code)
  {}

  string get_key_name() const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_log_dup_t*>& o);

  bool operator==(const pg_log_dup_t &rhs) const {
    return reqid == rhs.reqid &&
      version == rhs.version &&
      user_version == rhs.user_version &&
      return_code == rhs.return_code;
  }
  bool operator!=(const pg_log_dup_t &rhs) const {
    return !(*this == rhs);
  }

  friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
};
3467WRITE_CLASS_ENCODER(pg_log_dup_t)
3468
3469std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
3470
3471/**
3472 * pg_log_t - incremental log of recent pg changes.
3473 *
3474 * serves as a recovery queue for recent changes.
3475 */
3476struct pg_log_t {
3477 /*
3478 * head - newest entry (update|delete)
3479 * tail - entry previous to oldest (update|delete) for which we have
3480 * complete negative information.
3481 * i.e. we can infer pg contents for any store whose last_update >= tail.
3482 */
3483 eversion_t head; // newest entry
3484 eversion_t tail; // version prior to oldest
3485
3486protected:
3487 // We can rollback rollback-able entries > can_rollback_to
3488 eversion_t can_rollback_to;
3489
3490 // always <= can_rollback_to, indicates how far stashed rollback
3491 // data can be found
3492 eversion_t rollback_info_trimmed_to;
3493
3494public:
c07f9fc5
FG
3495 // the actual log
3496 mempool::osd_pglog::list<pg_log_entry_t> log;
3497
3498 // entries just for dup op detection ordered oldest to newest
3499 mempool::osd_pglog::list<pg_log_dup_t> dups;
3500
7c673cae
FG
3501 pg_log_t() = default;
3502 pg_log_t(const eversion_t &last_update,
3503 const eversion_t &log_tail,
3504 const eversion_t &can_rollback_to,
3505 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3506 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3507 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
3508 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3509 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 3510 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
3511 pg_log_t(const eversion_t &last_update,
3512 const eversion_t &log_tail,
3513 const eversion_t &can_rollback_to,
3514 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
3515 const std::list<pg_log_entry_t> &entries,
3516 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
3517 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3518 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3519 for (auto &&entry: entries) {
3520 log.push_back(entry);
3521 }
c07f9fc5
FG
3522 for (auto &&entry: dup_entries) {
3523 dups.push_back(entry);
3524 }
7c673cae
FG
3525 }
3526
3527 void clear() {
3528 eversion_t z;
3529 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3530 log.clear();
c07f9fc5 3531 dups.clear();
7c673cae
FG
3532 }
3533
3534 eversion_t get_rollback_info_trimmed_to() const {
3535 return rollback_info_trimmed_to;
3536 }
3537 eversion_t get_can_rollback_to() const {
3538 return can_rollback_to;
3539 }
3540
3541
3542 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 3543 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
3544 oldlog.swap(log);
3545
3546 eversion_t old_tail;
3547 unsigned mask = ~((~0)<<split_bits);
3548 for (auto i = oldlog.begin();
3549 i != oldlog.end();
3550 ) {
3551 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3552 childlog.push_back(*i);
3553 } else {
3554 log.push_back(*i);
3555 }
3556 oldlog.erase(i++);
3557 }
3558
c07f9fc5
FG
3559 // osd_reqid is unique, so it doesn't matter if there are extra
3560 // dup entries in each pg. To avoid storing oid with the dup
3561 // entries, just copy the whole list.
3562 auto childdups(dups);
3563
7c673cae
FG
3564 return pg_log_t(
3565 head,
3566 tail,
3567 can_rollback_to,
3568 rollback_info_trimmed_to,
c07f9fc5
FG
3569 std::move(childlog),
3570 std::move(childdups));
3571 }
7c673cae 3572
31f18b77 3573 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
7c673cae
FG
3574 assert(newhead >= tail);
3575
31f18b77
FG
3576 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
3577 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
3578 while (true) {
3579 if (p == log.begin()) {
3580 // yikes, the whole thing is divergent!
31f18b77
FG
3581 using std::swap;
3582 swap(divergent, log);
7c673cae
FG
3583 break;
3584 }
3585 --p;
3586 if (p->version.version <= newhead.version) {
3587 /*
3588 * look at eversion.version here. we want to avoid a situation like:
3589 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3590 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3591 * lower_bound = 100'9
3592 * i.e, same request, different version. If the eversion.version is > the
3593 * lower_bound, we it is divergent.
3594 */
3595 ++p;
3596 divergent.splice(divergent.begin(), log, p, log.end());
3597 break;
3598 }
3599 assert(p->version > newhead);
3600 }
3601 head = newhead;
3602
3603 if (can_rollback_to > newhead)
3604 can_rollback_to = newhead;
3605
3606 if (rollback_info_trimmed_to > newhead)
3607 rollback_info_trimmed_to = newhead;
3608
3609 return divergent;
3610 }
3611
3612 bool empty() const {
3613 return log.empty();
3614 }
3615
3616 bool null() const {
3617 return head.version == 0 && head.epoch == 0;
3618 }
3619
3620 size_t approx_size() const {
3621 return head.version - tail.version;
3622 }
3623
3624 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
3625 const string &hit_set_namespace, const pg_log_t &in,
3626 pg_log_t &out, pg_log_t &reject);
3627
3628 /**
3629 * copy entries from the tail of another pg_log_t
3630 *
3631 * @param other pg_log_t to copy from
3632 * @param from copy entries after this version
3633 */
3634 void copy_after(const pg_log_t &other, eversion_t from);
3635
3636 /**
3637 * copy a range of entries from another pg_log_t
3638 *
3639 * @param other pg_log_t to copy from
3640 * @param from copy entries after this version
3641 * @param to up to and including this version
3642 */
3643 void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
3644
3645 /**
3646 * copy up to N entries
3647 *
3648 * @param other source log
3649 * @param max max number of entries to copy
3650 */
3651 void copy_up_to(const pg_log_t &other, int max);
3652
3653 ostream& print(ostream& out) const;
3654
3655 void encode(bufferlist &bl) const;
3656 void decode(bufferlist::iterator &bl, int64_t pool = -1);
3657 void dump(Formatter *f) const;
3658 static void generate_test_instances(list<pg_log_t*>& o);
3659};
3660WRITE_CLASS_ENCODER(pg_log_t)
3661
c07f9fc5 3662inline ostream& operator<<(ostream& out, const pg_log_t& log)
7c673cae
FG
3663{
3664 out << "log((" << log.tail << "," << log.head << "], crt="
3665 << log.get_can_rollback_to() << ")";
3666 return out;
3667}
3668
3669
3670/**
3671 * pg_missing_t - summary of missing objects.
3672 *
3673 * kept in memory, as a supplement to pg_log_t
3674 * also used to pass missing info in messages.
3675 */
/**
 * pg_missing_item - what we need vs. what we have for one object.
 *
 * kept in memory, as a supplement to pg_log_t
 * also used to pass missing info in messages.
 */
struct pg_missing_item {
  eversion_t need, have;
  enum missing_flags_t {
    FLAG_NONE = 0,
    FLAG_DELETE = 1,   // the "missing" op is actually a delete
  } flags;
  pg_missing_item() : flags(FLAG_NONE) {}
  explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {}  // have no old version
  pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
    set_delete(is_delete);
  }

  /// encoding is feature-dependent: peers with OSD_RECOVERY_DELETES
  /// get the new (flagged) format, others the legacy one
  void encode(bufferlist& bl, uint64_t features) const {
    if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
      // encoding a zeroed eversion_t to differentiate between this and
      // legacy unversioned encoding - a need value of 0'0 is not
      // possible. This can be replaced with the legacy encoding
      // macros post-luminous.
      eversion_t e;
      ::encode(e, bl);
      ::encode(need, bl);
      ::encode(have, bl);
      ::encode(static_cast<uint8_t>(flags), bl);
    } else {
      // legacy unversioned encoding
      ::encode(need, bl);
      ::encode(have, bl);
    }
  }
  /// mirror of encode(): a leading zero eversion_t marks the new format
  void decode(bufferlist::iterator& bl) {
    eversion_t e;
    ::decode(e, bl);
    if (e != eversion_t()) {
      // legacy encoding, this is the need value
      need = e;
      ::decode(have, bl);
    } else {
      ::decode(need, bl);
      ::decode(have, bl);
      uint8_t f;
      ::decode(f, bl);
      flags = static_cast<missing_flags_t>(f);
    }
  }

  void set_delete(bool is_delete) {
    flags = is_delete ? FLAG_DELETE : FLAG_NONE;
  }

  bool is_delete() const {
    return (flags & FLAG_DELETE) == FLAG_DELETE;
  }

  string flag_str() const {
    if (flags == FLAG_NONE) {
      return "none";
    } else {
      return "delete";
    }
  }

  void dump(Formatter *f) const {
    f->dump_stream("need") << need;
    f->dump_stream("have") << have;
    f->dump_stream("flags") << flag_str();
  }
  static void generate_test_instances(list<pg_missing_item*>& o) {
    o.push_back(new pg_missing_item);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(1, 2);
    o.back()->have = eversion_t(1, 1);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(3, 5);
    o.back()->have = eversion_t(3, 4);
    o.back()->flags = FLAG_DELETE;
  }
  bool operator==(const pg_missing_item &rhs) const {
    return need == rhs.need && have == rhs.have && flags == rhs.flags;
  }
  bool operator!=(const pg_missing_item &rhs) const {
    return !(*this == rhs);
  }
};
c07f9fc5 3759WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
7c673cae
FG
3760ostream& operator<<(ostream& out, const pg_missing_item &item);
3761
/// Read-only interface over a missing set, implemented by
/// pg_missing_set below.
class pg_missing_const_i {
public:
  virtual const map<hobject_t, pg_missing_item> &
    get_items() const = 0;
  virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
  virtual bool get_may_include_deletes() const = 0;
  virtual unsigned int num_missing() const = 0;
  virtual bool have_missing() const = 0;
  virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
  virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
  virtual eversion_t have_old(const hobject_t& oid) const = 0;
  virtual ~pg_missing_const_i() {}
};
3775
3776
/**
 * ChangeTracker - records which objects were touched in a missing
 * set.  This primary template is the non-tracking flavor: every
 * operation is a no-op and is_clean() always reports true.  The
 * Track == true specialization below actually remembers objects.
 */
template <bool Track>
class ChangeTracker {
public:
  void changed(const hobject_t &obj) {}
  template <typename F>
  void get_changed(F &&f) const {}
  void flush() {}
  bool is_clean() const {
    return true;
  }
};
3788template <>
3789class ChangeTracker<true> {
3790 set<hobject_t> _changed;
3791public:
3792 void changed(const hobject_t &obj) {
3793 _changed.insert(obj);
3794 }
3795 template <typename F>
3796 void get_changed(F &&f) const {
3797 for (auto const &i: _changed) {
3798 f(i);
3799 }
3800 }
3801 void flush() {
3802 _changed.clear();
3803 }
3804 bool is_clean() const {
3805 return _changed.empty();
3806 }
3807};
3808
3809template <bool TrackChanges>
3810class pg_missing_set : public pg_missing_const_i {
3811 using item = pg_missing_item;
3812 map<hobject_t, item> missing; // oid -> (need v, have v)
3813 map<version_t, hobject_t> rmissing; // v -> oid
3814 ChangeTracker<TrackChanges> tracker;
3815
3816public:
3817 pg_missing_set() = default;
3818
  /// Construct from any missing-set-like object (e.g. a
  /// pg_missing_set with a different TrackChanges parameter),
  /// marking every copied entry as changed in the tracker.
  template <typename missing_type>
  pg_missing_set(const missing_type &m) {
    missing = m.get_items();
    rmissing = m.get_rmissing();
    may_include_deletes = m.get_may_include_deletes();
    for (auto &&i: missing)
      tracker.changed(i.first);
  }
3827
c07f9fc5
FG
3828 bool may_include_deletes = false;
3829
7c673cae
FG
  /// full oid -> (need, have) map
  const map<hobject_t, item> &get_items() const override {
    return missing;
  }
  /// reverse index: needed version -> oid
  const map<version_t, hobject_t> &get_rmissing() const override {
    return rmissing;
  }
  bool get_may_include_deletes() const override {
    return may_include_deletes;
  }
  unsigned int num_missing() const override {
    return missing.size();
  }
  bool have_missing() const override {
    return !missing.empty();
  }
3845 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
3846 auto iter = missing.find(oid);
3847 if (iter == missing.end())
3848 return false;
3849 if (out)
3850 *out = iter->second;
3851 return true;
3852 }
3853 bool is_missing(const hobject_t& oid, eversion_t v) const override {
3854 map<hobject_t, item>::const_iterator m =
3855 missing.find(oid);
3856 if (m == missing.end())
3857 return false;
3858 const item &item(m->second);
3859 if (item.need > v)
3860 return false;
3861 return true;
3862 }
3863 eversion_t have_old(const hobject_t& oid) const override {
3864 map<hobject_t, item>::const_iterator m =
3865 missing.find(oid);
3866 if (m == missing.end())
3867 return eversion_t();
3868 const item &item(m->second);
3869 return item.have;
3870 }
3871
  /// Steal o's contents via swap.  Disallowed when change tracking is
  /// enabled, since the tracker would not reflect the swapped state.
  void claim(pg_missing_set& o) {
    static_assert(!TrackChanges, "Can't use claim with TrackChanges");
    missing.swap(o.missing);
    rmissing.swap(o.rmissing);
  }
3877
3878 /*
3879 * this needs to be called in log order as we extend the log. it
3880 * assumes missing is accurate up through the previous log entry.
3881 */
  /// Fold one new log entry into the missing set.  Must be called in
  /// log order; assumes the set is accurate through the previous
  /// entry (see the comment above this method).
  void add_next_event(const pg_log_entry_t& e) {
    map<hobject_t, item>::iterator missing_it;
    missing_it = missing.find(e.soid);
    bool is_missing_divergent_item = missing_it != missing.end();
    if (e.prior_version == eversion_t() || e.is_clone()) {
      // new object.
      if (is_missing_divergent_item) {  // use iterator
	rmissing.erase((missing_it->second).need.version);
	missing_it->second = item(e.version, eversion_t(), e.is_delete());  // .have = nil
      } else  // create new element in missing map
	missing[e.soid] = item(e.version, eversion_t(), e.is_delete());     // .have = nil
    } else if (is_missing_divergent_item) {
      // already missing (prior).
      rmissing.erase((missing_it->second).need.version);
      (missing_it->second).need = e.version;  // leave .have unchanged.
      missing_it->second.set_delete(e.is_delete());
    } else if (e.is_backlog()) {
      // May not have prior version
      assert(0 == "these don't exist anymore");
    } else {
      // not missing, we must have prior_version (if any)
      assert(!is_missing_divergent_item);
      missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
    }
    // (re)index the new needed version
    rmissing[e.version.version] = e.soid;
    tracker.changed(e.soid);
  }
3909
c07f9fc5 3910 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
7c673cae
FG
3911 if (missing.count(oid)) {
3912 rmissing.erase(missing[oid].need.version);
3913 missing[oid].need = need; // no not adjust .have
c07f9fc5 3914 missing[oid].set_delete(is_delete);
7c673cae 3915 } else {
c07f9fc5 3916 missing[oid] = item(need, eversion_t(), is_delete);
7c673cae
FG
3917 }
3918 rmissing[need.version] = oid;
3919
3920 tracker.changed(oid);
3921 }
3922
3923 void revise_have(hobject_t oid, eversion_t have) {
3924 if (missing.count(oid)) {
3925 tracker.changed(oid);
3926 missing[oid].have = have;
3927 }
3928 }
3929
c07f9fc5
FG
  /// Record oid as missing: we need version 'need' and locally have
  /// version 'have' (eversion_t() when we have nothing at all).
  void add(const hobject_t& oid, eversion_t need, eversion_t have,
	   bool is_delete) {
    missing[oid] = item(need, have, is_delete);
    rmissing[need.version] = oid;  // reverse index: version -> object
    tracker.changed(oid);
  }
3936
3937 void rm(const hobject_t& oid, eversion_t v) {
3938 std::map<hobject_t, item>::iterator p = missing.find(oid);
3939 if (p != missing.end() && p->second.need <= v)
3940 rm(p);
3941 }
3942
  /// Remove the entry at m from both indexes, noting the change.
  void rm(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }
3948
3949 void got(const hobject_t& oid, eversion_t v) {
3950 std::map<hobject_t, item>::iterator p = missing.find(oid);
3951 assert(p != missing.end());
c07f9fc5 3952 assert(p->second.need <= v || p->second.is_delete());
7c673cae
FG
3953 got(p);
3954 }
3955
  /// Mark the entry at m as recovered: drop it from both indexes.
  void got(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }
3961
  /// Move every entry whose hash maps to child_pgid (under split_bits)
  /// out of this set and into *omissing.
  void split_into(
    pg_t child_pgid,
    unsigned split_bits,
    pg_missing_set *omissing) {
    omissing->may_include_deletes = may_include_deletes;
    unsigned mask = ~((~0)<<split_bits);  // low split_bits bits set
    for (map<hobject_t, item>::iterator i = missing.begin();
	 i != missing.end();
	 ) {
      if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
	omissing->add(i->first, i->second.need, i->second.have,
		      i->second.is_delete());
	rm(i++);  // post-increment: advance before rm() erases the entry
      } else {
	++i;
      }
    }
  }
3980
3981 void clear() {
3982 for (auto const &i: missing)
3983 tracker.changed(i.first);
3984 missing.clear();
3985 rmissing.clear();
3986 }
3987
  /// Encode to bl.  struct_v 4 added may_include_deletes; delete info
  /// inside the items is only encoded when the recovery-deletes
  /// feature is being used.
  void encode(bufferlist &bl) const {
    ENCODE_START(4, 2, bl);
    ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
    ::encode(may_include_deletes, bl);
    ENCODE_FINISH(bl);
  }
  /// Decode from bl.  'pool' is used to repair pre-v3 hobject_t keys
  /// that were encoded without a pool.  Rebuilds rmissing from scratch
  /// and marks every object (old and new) changed in the tracker.
  void decode(bufferlist::iterator &bl, int64_t pool = -1) {
    // objects currently in the set are about to be replaced: track them
    for (auto const &i: missing)
      tracker.changed(i.first);
    DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
    ::decode(missing, bl);
    if (struct_v >= 4) {
      ::decode(may_include_deletes, bl);
    }
    DECODE_FINISH(bl);

    if (struct_v < 3) {
      // Handle hobject_t upgrade: re-key entries that predate
      // pool-aware hobject_t (pool == -1) into the caller's pool.
      map<hobject_t, item> tmp;
      for (map<hobject_t, item>::iterator i =
	     missing.begin();
	   i != missing.end();
	   ) {
	if (!i->first.is_max() && i->first.pool == -1) {
	  hobject_t to_insert(i->first);
	  to_insert.pool = pool;
	  tmp[to_insert] = i->second;
	  missing.erase(i++);
	} else {
	  ++i;
	}
      }
      missing.insert(tmp.begin(), tmp.end());
    }

    // rebuild the reverse (version -> object) index
    for (map<hobject_t,item>::iterator it =
	   missing.begin();
	 it != missing.end();
	 ++it)
      rmissing[it->second.need.version] = it->first;
    for (auto const &i: missing)
      tracker.changed(i.first);
  }
4031 void dump(Formatter *f) const {
4032 f->open_array_section("missing");
4033 for (map<hobject_t,item>::const_iterator p =
4034 missing.begin(); p != missing.end(); ++p) {
4035 f->open_object_section("item");
4036 f->dump_stream("object") << p->first;
4037 p->second.dump(f);
4038 f->close_section();
4039 }
4040 f->close_section();
c07f9fc5 4041 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4042 }
  /// Remove every entry whose object satisfies predicate f.
  template <typename F>
  void filter_objects(F &&f) {
    for (auto i = missing.begin(); i != missing.end();) {
      if (f(i->first)) {
	rm(i++);  // post-increment: advance before rm() erases the entry
      } else {
	++i;
      }
    }
  }
  /// Instances for encode/decode tests: empty set, one plain missing
  /// object, and one missing delete (with may_include_deletes set).
  static void generate_test_instances(list<pg_missing_set*>& o) {
    o.push_back(new pg_missing_set);
    o.push_back(new pg_missing_set);
    o.back()->add(
      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
      eversion_t(5, 6), eversion_t(5, 1), false);
    o.push_back(new pg_missing_set);
    o.back()->add(
      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
      eversion_t(5, 6), eversion_t(5, 1), true);
    o.back()->may_include_deletes = true;
  }
  /// Invoke f on every object recorded as changed since the last flush.
  template <typename F>
  void get_changed(F &&f) const {
    tracker.get_changed(f);
  }
  /// Discard the tracker's pending change set.
  void flush() {
    tracker.flush();
  }
  /// True when no changes are pending in the tracker.
  bool is_clean() const {
    return tracker.is_clean();
  }
  /// Debug check: replay the tracked changes on top of init_missing's
  /// items and verify the result matches our current contents.
  /// Trivially true when TrackChanges is off.  On mismatch, details
  /// are written to *oss (if non-null) and false is returned.
  template <typename missing_t>
  bool debug_verify_from_init(
    const missing_t &init_missing,
    ostream *oss) const {
    if (!TrackChanges)
      return true;
    auto check_missing(init_missing.get_items());
    // apply each tracked change to the baseline copy: erase, then
    // re-insert the current value if the object is still missing
    tracker.get_changed([&](const hobject_t &hoid) {
      check_missing.erase(hoid);
      if (missing.count(hoid)) {
	check_missing.insert(*(missing.find(hoid)));
      }
    });
    bool ok = true;
    if (check_missing.size() != missing.size()) {
      if (oss) {
	*oss << "Size mismatch, check: " << check_missing.size()
	     << ", actual: " << missing.size() << "\n";
      }
      ok = false;
    }
    for (auto &i: missing) {
      if (!check_missing.count(i.first)) {
	if (oss)
	  *oss << "check_missing missing " << i.first << "\n";
	ok = false;
      } else if (check_missing[i.first] != i.second) {
	if (oss)
	  *oss << "check_missing missing item mismatch on " << i.first
	       << ", check: " << check_missing[i.first]
	       << ", actual: " << i.second << "\n";
	ok = false;
      }
    }
    if (oss && !ok) {
      *oss << "check_missing: " << check_missing << "\n";
      set<hobject_t> changed;
      tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
      *oss << "changed: " << changed << "\n";
    }
    return ok;
  }
4117};
// Free-function encode wrapper for pg_missing_set.
// NOTE(review): 'cl' in ENCODE_DUMP_POST(cl) is not a variable in this
// scope; it appears to be a name token consumed by the macro — confirm
// against the ENCODE_DUMP_POST definition before "fixing" it.
template <bool TrackChanges>
void encode(
  const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
  ENCODE_DUMP_PRE();
  c.encode(bl);
  ENCODE_DUMP_POST(cl);
}
// Free-function decode wrapper for pg_missing_set.
template <bool TrackChanges>
void decode(pg_missing_set<TrackChanges> &c, bufferlist::iterator &p) {
  c.decode(p);
}
// Print a one-line summary: count of missing objects and the
// may_include_deletes flag.
template <bool TrackChanges>
ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
{
  out << "missing(" << missing.num_missing()
      << " may_include_deletes = " << missing.may_include_deletes;
  //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
  out << ")";
  return out;
}
4138
4139using pg_missing_t = pg_missing_set<false>;
4140using pg_missing_tracker_t = pg_missing_set<true>;
4141
4142
4143/**
4144 * pg list objects response format
4145 *
4146 */
struct pg_nls_response_t {
  collection_list_handle_t handle;  ///< cursor for resuming the listing
  list<librados::ListObjectImpl> entries;

  /// Entries are encoded field-by-field (nspace, oid, locator) with an
  /// explicit count, rather than via ListObjectImpl's own encoder.
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(handle, bl);
    __u32 n = (__u32)entries.size();
    ::encode(n, bl);
    for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
      ::encode(i->nspace, bl);
      ::encode(i->oid, bl);
      ::encode(i->locator, bl);
    }
    ENCODE_FINISH(bl);
  }
  /// Mirror of encode(): replaces any existing entries.
  void decode(bufferlist::iterator& bl) {
    DECODE_START(1, bl);
    ::decode(handle, bl);
    __u32 n;
    ::decode(n, bl);
    entries.clear();
    while (n--) {
      librados::ListObjectImpl i;
      ::decode(i.nspace, bl);
      ::decode(i.oid, bl);
      ::decode(i.locator, bl);
      entries.push_back(i);
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const {
    f->dump_stream("handle") << handle;
    f->open_array_section("entries");
    for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
      f->open_object_section("object");
      f->dump_string("namespace", p->nspace);
      f->dump_string("object", p->oid);
      f->dump_string("key", p->locator);
      f->close_section();
    }
    f->close_section();
  }
  /// Test instances: empty, default-namespace-only, "n1"-namespace-only,
  /// and a mix of both namespaces.
  static void generate_test_instances(list<pg_nls_response_t*>& o) {
    o.push_back(new pg_nls_response_t);
    o.push_back(new pg_nls_response_t);
    o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
    o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
    o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
    o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
    o.push_back(new pg_nls_response_t);
    o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
    o.push_back(new pg_nls_response_t);
    o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
    o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
    o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
    o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
  }
};
4212
4213WRITE_CLASS_ENCODER(pg_nls_response_t)
4214
4215// For backwards compatibility with older OSD requests
struct pg_ls_response_t {
  collection_list_handle_t handle;  ///< cursor for resuming the listing
  list<pair<object_t, string> > entries;  ///< (object, locator key) pairs

  /// Legacy hand-rolled versioning: a single __u8 version byte rather
  /// than the ENCODE_START framing used elsewhere in this file.
  void encode(bufferlist& bl) const {
    __u8 v = 1;
    ::encode(v, bl);
    ::encode(handle, bl);
    ::encode(entries, bl);
  }
  /// Only version 1 is supported; anything else trips the assert.
  void decode(bufferlist::iterator& bl) {
    __u8 v;
    ::decode(v, bl);
    assert(v == 1);
    ::decode(handle, bl);
    ::decode(entries, bl);
  }
  void dump(Formatter *f) const {
    f->dump_stream("handle") << handle;
    f->open_array_section("entries");
    for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
      f->open_object_section("object");
      f->dump_stream("object") << p->first;
      f->dump_string("key", p->second);
      f->close_section();
    }
    f->close_section();
  }
  static void generate_test_instances(list<pg_ls_response_t*>& o) {
    o.push_back(new pg_ls_response_t);
    o.push_back(new pg_ls_response_t);
    o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
    o.back()->entries.push_back(make_pair(object_t("one"), string()));
    o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
  }
};
4252
4253WRITE_CLASS_ENCODER(pg_ls_response_t)
4254
4255/**
4256 * object_copy_cursor_t
4257 */
struct object_copy_cursor_t {
  uint64_t data_offset;   ///< next data byte to copy
  string omap_offset;     ///< next omap key to copy ("" = from start)
  bool attr_complete;
  bool data_complete;
  bool omap_complete;

  object_copy_cursor_t()
    : data_offset(0),
      attr_complete(false),
      data_complete(false),
      omap_complete(false)
  {}

  /// True before any progress: attrs not done, data and omap at start.
  bool is_initial() const {
    return !attr_complete && data_offset == 0 && omap_offset.empty();
  }
  /// True once attrs, data and omap have all been copied.
  bool is_complete() const {
    return attr_complete && data_complete && omap_complete;
  }

  static void generate_test_instances(list<object_copy_cursor_t*>& o);
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
};
4284WRITE_CLASS_ENCODER(object_copy_cursor_t)
4285
4286/**
4287 * object_copy_data_t
4288 *
4289 * Return data from a copy request. The semantics are a little strange
4290 * as a result of the encoding's heritage.
4291 *
4292 * In particular, the sender unconditionally fills in the cursor (from what
4293 * it receives and sends), the size, and the mtime, but is responsible for
4294 * figuring out whether it should put any data in the attrs, data, or
4295 * omap members (corresponding to xattrs, object data, and the omap entries)
4296 * based on external data (the client includes a max amount to return with
4297 * the copy request). The client then looks into the attrs, data, and/or omap
4298 * based on the contents of the cursor.
4299 */
struct object_copy_data_t {
  enum {
    FLAG_DATA_DIGEST = 1<<0,  ///< data_digest field is valid
    FLAG_OMAP_DIGEST = 1<<1,  ///< omap_digest field is valid
  };
  object_copy_cursor_t cursor;
  uint64_t size;
  utime_t mtime;
  uint32_t data_digest, omap_digest;
  uint32_t flags;
  map<string, bufferlist> attrs;
  bufferlist data;
  bufferlist omap_header;
  bufferlist omap_data;

  /// which snaps we are defined for (if a snap and not the head)
  vector<snapid_t> snaps;
  /// latest snap seq for the object (if head)
  snapid_t snap_seq;

  /// recent reqids on this object
  mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;

  uint64_t truncate_seq;
  uint64_t truncate_size;

public:
  object_copy_data_t() :
    size((uint64_t)-1), data_digest(-1),
    omap_digest(-1), flags(0),
    truncate_seq(0),
    truncate_size(0) {}

  static void generate_test_instances(list<object_copy_data_t*>& o);
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
};
4338WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
4339
4340/**
4341 * pg creation info
4342 */
struct pg_create_t {
  epoch_t created;   // epoch pg created
  pg_t parent;       // split from parent (if != pg_t())
  __s32 split_bits;  // number of split bits (meaningful with parent)

  pg_create_t()
    : created(0), split_bits(0) {}
  pg_create_t(unsigned c, pg_t p, int s)
    : created(c), parent(p), split_bits(s) {}

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_create_t*>& o);
};
4358WRITE_CLASS_ENCODER(pg_create_t)
4359
4360// -----------------------------------------
4361
/// Per-peer statistics; currently carries only a timestamp.
struct osd_peer_stat_t {
  utime_t stamp;

  osd_peer_stat_t() { }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_peer_stat_t*>& o);
};
4372WRITE_CLASS_ENCODER(osd_peer_stat_t)
4373
4374ostream& operator<<(ostream& out, const osd_peer_stat_t &stat);
4375
4376
4377// -----------------------------------------
4378
class ObjectExtent {
  /**
   * ObjectExtents are used for specifying IO behavior against RADOS
   * objects when one is using the ObjectCacher.
   *
   * To use this in a real system, *every member* must be filled
   * out correctly. In particular, make sure to initialize the
   * oloc correctly, as its default values are deliberate poison
   * and will cause internal ObjectCacher asserts.
   *
   * Similarly, your buffer_extents vector *must* specify a total
   * size equal to your length. If the buffer_extents inadvertently
   * contain less space than the length member specifies, you
   * will get unintelligible asserts deep in the ObjectCacher.
   *
   * If you are trying to do testing and don't care about actual
   * RADOS function, the simplest thing to do is to initialize
   * the ObjectExtent (truncate_size can be 0), create a single entry
   * in buffer_extents matching the length, and set oloc.pool to 0.
   */
 public:
  object_t oid;            // object id
  uint64_t objectno;       // object index within the striped file
  uint64_t offset;         // in object
  uint64_t length;         // in object
  uint64_t truncate_size;  // in object

  object_locator_t oloc;   // object locator (pool etc)

  vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)

  ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
  ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
    oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
};
4414
4415inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4416{
4417 return out << "extent("
4418 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4419 << " " << ex.offset << "~" << ex.length
4420 << " -> " << ex.buffer_extents
4421 << ")";
4422}
4423
4424
7c673cae
FG
4425// ---------------------------------------
4426
/// Persistent per-OSD identity and map-range bookkeeping.
class OSDSuperblock {
public:
  uuid_d cluster_fsid, osd_fsid;
  int32_t whoami;          // my role in this fs.
  epoch_t current_epoch;   // most recent epoch
  epoch_t oldest_map, newest_map;  // oldest/newest maps we have.
  double weight;

  CompatSet compat_features;

  // last interval over which i mounted and was then active
  epoch_t mounted;     // last epoch i mounted
  epoch_t clean_thru;  // epoch i was active and clean thru

  OSDSuperblock() :
    whoami(-1),
    current_epoch(0), oldest_map(0), newest_map(0), weight(0),
    mounted(0), clean_thru(0) {
  }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<OSDSuperblock*>& o);
};
4452WRITE_CLASS_ENCODER(OSDSuperblock)
4453
4454inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4455{
4456 return out << "sb(" << sb.cluster_fsid
4457 << " osd." << sb.whoami
4458 << " " << sb.osd_fsid
4459 << " e" << sb.current_epoch
4460 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4461 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4462 << ")";
4463}
4464
4465
4466// -------
4467
4468
4469
4470
4471
4472
4473/*
4474 * attached to object head. describes most recent snap context, and
4475 * set of existing clones.
4476 */
struct SnapSet {
  snapid_t seq;
  bool head_exists;
  vector<snapid_t> snaps;    // descending
  vector<snapid_t> clones;   // ascending
  map<snapid_t, interval_set<uint64_t> > clone_overlap;  // overlap w/ next newest
  map<snapid_t, uint64_t> clone_size;
  map<snapid_t, vector<snapid_t>> clone_snaps; // descending

  SnapSet() : seq(0), head_exists(false) {}
  explicit SnapSet(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }

  /// Legacy (pre-luminous) form: per-clone snaps not fully recorded
  /// here, or no head object.
  bool is_legacy() const {
    return clone_snaps.size() < clones.size() || !head_exists;
  }

  /// populate SnapSet from a librados::snap_set_t
  void from_snap_set(const librados::snap_set_t& ss, bool legacy);

  /// get space accounted to clone
  uint64_t get_clone_bytes(snapid_t clone) const;

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<SnapSet*>& o);

  /// Build a SnapContext containing only snaps <= as_of; snaps is
  /// kept in descending order so the result stays descending too.
  SnapContext get_ssc_as_of(snapid_t as_of) const {
    SnapContext out;
    out.seq = as_of;
    for (vector<snapid_t>::const_iterator i = snaps.begin();
	 i != snaps.end();
	 ++i) {
      if (*i <= as_of)
	out.snaps.push_back(*i);
    }
    return out;
  }

  // return min element of snaps > after, return max if no such element
  snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
    // reverse iteration walks snaps in ascending order (snaps is
    // descending), so the first hit is the minimum qualifying snap
    for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
	 i != snaps.rend();
	 ++i) {
      if (*i > after)
	return *i;
    }
    return max;
  }

  SnapSet get_filtered(const pg_pool_t &pinfo) const;
  void filter(const pg_pool_t &pinfo);
};
4533WRITE_CLASS_ENCODER(SnapSet)
4534
4535ostream& operator<<(ostream& out, const SnapSet& cs);
4536
4537
4538
4539#define OI_ATTR "_"
4540#define SS_ATTR "snapset"
4541
/// State of a single watch registration on an object.
struct watch_info_t {
  uint64_t cookie;           ///< client-chosen watch identifier
  uint32_t timeout_seconds;
  entity_addr_t addr;        ///< address of the watching client

  watch_info_t() : cookie(0), timeout_seconds(0) { }
  watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<watch_info_t*>& o);
};
4555WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4556
4557static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4558 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4559 && l.addr == r.addr;
4560}
4561
/// Print as "watch(cookie C Ns addr)".
static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
  return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
    << " " << w.addr << ")";
}
4566
/// In-memory description of a pending notify (not encoded on disk).
struct notify_info_t {
  uint64_t cookie;     ///< watch cookie the notify is associated with
  uint64_t notify_id;
  uint32_t timeout;    ///< seconds
  bufferlist bl;       ///< opaque notify payload
};
4573
/// Print as "notify(cookie C notifyN Ts)".
static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
  return out << "notify(cookie " << n.cookie
      << " notify" << n.notify_id
      << " " << n.timeout << "s)";
}
4579
31f18b77
FG
4580struct object_info_t;
/// Describes where an object's data actually lives when it is not
/// stored inline: redirected whole to another object, or (later)
/// split into chunks.
struct object_manifest_t {
  enum {
    TYPE_NONE = 0,
    TYPE_REDIRECT = 1,  // start with this
    TYPE_CHUNKED = 2,   // do this later
  };
  uint8_t type;     // redirect, chunked, ...
  hobject_t redirect_target;  ///< valid when type == TYPE_REDIRECT

  object_manifest_t() : type(0) { }
  object_manifest_t(uint8_t type, const hobject_t& redirect_target)
    : type(type), redirect_target(redirect_target) { }

  bool is_empty() const {
    return type == TYPE_NONE;
  }
  bool is_redirect() const {
    return type == TYPE_REDIRECT;
  }
  bool is_chunked() const {
    return type == TYPE_CHUNKED;
  }
  static const char *get_type_name(uint8_t m) {
    switch (m) {
    case TYPE_NONE: return "none";
    case TYPE_REDIRECT: return "redirect";
    case TYPE_CHUNKED: return "chunked";
    default: return "unknown";
    }
  }
  const char *get_type_name() const {
    return get_type_name(type);
  }
  static void generate_test_instances(list<object_manifest_t*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  // NOTE(review): this friend names the object_info_t printer, not an
  // object_manifest_t one — looks like a copy-paste; verify intent.
  friend ostream& operator<<(ostream& out, const object_info_t& oi);
};
4620WRITE_CLASS_ENCODER(object_manifest_t)
4621ostream& operator<<(ostream& out, const object_manifest_t& oi);
7c673cae
FG
4622
/// Per-object metadata stored in the OI_ATTR xattr: versions, size,
/// times, flags, digests, watches, and the optional manifest.
struct object_info_t {
  hobject_t soid;
  eversion_t version, prior_version;
  version_t user_version;
  osd_reqid_t last_reqid;   ///< reqid of the last modifying op

  uint64_t size;
  utime_t mtime;
  utime_t local_mtime; // local mtime

  // note: these are currently encoded into a total 16 bits; see
  // encode()/decode() for the weirdness.
  typedef enum {
    FLAG_LOST = 1<<0,
    FLAG_WHITEOUT = 1<<1, // object logically does not exist
    FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
    FLAG_OMAP = 1 << 3,  // has (or may have) some/any omap data
    FLAG_DATA_DIGEST = 1 << 4,  // has data crc
    FLAG_OMAP_DIGEST = 1 << 5,  // has omap crc
    FLAG_CACHE_PIN = 1 << 6,    // pin the object in cache tier
    FLAG_MANIFEST = 1 << 7,     // has manifest
    // ...
    FLAG_USES_TMAP = 1<<8,  // deprecated; no longer used.
  } flag_t;

  flag_t flags;

  /// Render flags as a '|'-separated string (empty when no flags set).
  static string get_flag_string(flag_t flags) {
    string s;
    if (flags & FLAG_LOST)
      s += "|lost";
    if (flags & FLAG_WHITEOUT)
      s += "|whiteout";
    if (flags & FLAG_DIRTY)
      s += "|dirty";
    if (flags & FLAG_USES_TMAP)
      s += "|uses_tmap";
    if (flags & FLAG_OMAP)
      s += "|omap";
    if (flags & FLAG_DATA_DIGEST)
      s += "|data_digest";
    if (flags & FLAG_OMAP_DIGEST)
      s += "|omap_digest";
    if (flags & FLAG_CACHE_PIN)
      s += "|cache_pin";
    if (flags & FLAG_MANIFEST)
      s += "|manifest";
    if (s.length())
      return s.substr(1);  // drop the leading '|'
    return s;
  }
  string get_flag_string() const {
    return get_flag_string(flags);
  }

  /// [clone] descending.  pre-luminous; moved to SnapSet
  vector<snapid_t> legacy_snaps;

  uint64_t truncate_seq, truncate_size;

  /// active watches, keyed by (cookie, watcher entity)
  map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;

  // opportunistic checksums; may or may not be present
  __u32 data_digest;  ///< data crc32c
  __u32 omap_digest;  ///< omap crc32c

  // alloc hint attribute
  uint64_t expected_object_size, expected_write_size;
  uint32_t alloc_hint_flags;

  struct object_manifest_t manifest;  ///< valid when FLAG_MANIFEST set

  void copy_user_bits(const object_info_t& other);

  static ps_t legacy_object_locator_to_ps(const object_t &oid,
					  const object_locator_t &loc);

  /// True when every bit in f is set.
  bool test_flag(flag_t f) const {
    return (flags & f) == f;
  }
  void set_flag(flag_t f) {
    flags = (flag_t)(flags | f);
  }
  void clear_flag(flag_t f) {
    flags = (flag_t)(flags & ~f);
  }
  bool is_lost() const {
    return test_flag(FLAG_LOST);
  }
  bool is_whiteout() const {
    return test_flag(FLAG_WHITEOUT);
  }
  bool is_dirty() const {
    return test_flag(FLAG_DIRTY);
  }
  bool is_omap() const {
    return test_flag(FLAG_OMAP);
  }
  bool is_data_digest() const {
    return test_flag(FLAG_DATA_DIGEST);
  }
  bool is_omap_digest() const {
    return test_flag(FLAG_OMAP_DIGEST);
  }
  bool is_cache_pinned() const {
    return test_flag(FLAG_CACHE_PIN);
  }
  bool has_manifest() const {
    return test_flag(FLAG_MANIFEST);
  }

  void set_data_digest(__u32 d) {
    set_flag(FLAG_DATA_DIGEST);
    data_digest = d;
  }
  void set_omap_digest(__u32 d) {
    set_flag(FLAG_OMAP_DIGEST);
    omap_digest = d;
  }
  void clear_data_digest() {
    clear_flag(FLAG_DATA_DIGEST);
    data_digest = -1;
  }
  void clear_omap_digest() {
    clear_flag(FLAG_OMAP_DIGEST);
    omap_digest = -1;
  }
  /// Fresh object: both digests known (crc of empty content is -1 here).
  void new_object() {
    set_data_digest(-1);
    set_omap_digest(-1);
  }

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<object_info_t*>& o);

  explicit object_info_t()
    : user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(const hobject_t& s)
    : soid(s),
      user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(bufferlist& bl) {
    decode(bl);
  }
};
4785WRITE_CLASS_ENCODER_FEATURES(object_info_t)
4786
4787ostream& operator<<(ostream& out, const object_info_t& oi);
4788
4789
4790
4791// Object recovery
/// What needs to be recovered for one object, and from where.
struct ObjectRecoveryInfo {
  hobject_t soid;
  eversion_t version;
  uint64_t size;
  object_info_t oi;
  SnapSet ss;   // only populated if soid is_snap()
  interval_set<uint64_t> copy_subset;  ///< ranges to copy from the source
  map<hobject_t, interval_set<uint64_t>> clone_subset;  ///< ranges clonable from local clones

  ObjectRecoveryInfo() : size(0) { }

  static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl, int64_t pool = -1);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
4809WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
4810ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
4811
/// Progress cursor for a single object's recovery (push/pull).
struct ObjectRecoveryProgress {
  uint64_t data_recovered_to;  ///< data copied up to this offset
  string omap_recovered_to;    ///< omap copied up to this key
  bool first;                  ///< no progress made yet
  bool data_complete;
  bool omap_complete;
  bool error = false;

  ObjectRecoveryProgress()
    : data_recovered_to(0),
      first(true),
      data_complete(false), omap_complete(false) { }

  /// Done when data is recovered past the end of the copy range
  /// (trivially so for an empty range) and omap is complete.
  bool is_complete(const ObjectRecoveryInfo& info) const {
    return (data_recovered_to >= (
	info.copy_subset.empty() ?
	0 : info.copy_subset.range_end())) &&
      omap_complete;
  }

  static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
4838WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
4839ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
4840
/// Acknowledgement for a completed push of one object.
struct PushReplyOp {
  hobject_t soid;

  static void generate_test_instances(list<PushReplyOp*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
4852WRITE_CLASS_ENCODER(PushReplyOp)
4853ostream& operator<<(ostream& out, const PushReplyOp &op);
4854
/// Request to pull (part of) an object from a peer.
struct PullOp {
  hobject_t soid;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress recovery_progress;

  static void generate_test_instances(list<PullOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
4869WRITE_CLASS_ENCODER_FEATURES(PullOp)
4870ostream& operator<<(ostream& out, const PullOp &op);
4871
/// One chunk of recovery data pushed to a peer: data ranges, omap
/// entries, attrs, plus before/after progress cursors.
struct PushOp {
  hobject_t soid;
  eversion_t version;
  bufferlist data;
  interval_set<uint64_t> data_included;  ///< object ranges covered by 'data'
  bufferlist omap_header;
  map<string, bufferlist> omap_entries;
  map<string, bufferlist> attrset;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;
  ObjectRecoveryProgress after_progress;

  static void generate_test_instances(list<PushOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  uint64_t cost(CephContext *cct) const;
};
4893WRITE_CLASS_ENCODER_FEATURES(PushOp)
4894ostream& operator<<(ostream& out, const PushOp &op);
4895
4896
4897/*
4898 * summarize pg contents for purposes of a scrub
4899 */
/*
 * summarize pg contents for purposes of a scrub
 */
struct ScrubMap {
  /// Per-object scrub observations (sizes, digests, error bits).
  struct object {
    map<string,bufferptr> attrs;
    uint64_t size;
    __u32 omap_digest; ///< omap crc32c
    __u32 digest;      ///< data crc32c
    bool negative:1;   ///< object is known absent
    bool digest_present:1;
    bool omap_digest_present:1;
    bool read_error:1;
    bool stat_error:1;
    bool ec_hash_mismatch:1;
    bool ec_size_mismatch:1;

    object() :
      // Init invalid size so it won't match if we get a stat EIO error
      size(-1), omap_digest(0), digest(0),
      negative(false), digest_present(false), omap_digest_present(false),
      read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}

    void encode(bufferlist& bl) const;
    void decode(bufferlist::iterator& bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<object*>& o);
  };
  WRITE_CLASS_ENCODER(object)

  map<hobject_t,object> objects;
  eversion_t valid_through;
  eversion_t incr_since;

  void merge_incr(const ScrubMap &l);
  void insert(const ScrubMap &r) {
    objects.insert(r.objects.begin(), r.objects.end());
  }
  void swap(ScrubMap &r) {
    using std::swap;   // enable ADL swap for members
    swap(objects, r.objects);
    swap(valid_through, r.valid_through);
    swap(incr_since, r.incr_since);
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl, int64_t pool=-1);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ScrubMap*>& o);
};
4947WRITE_CLASS_ENCODER(ScrubMap::object)
4948WRITE_CLASS_ENCODER(ScrubMap)
4949
7c673cae
FG
// A single op within an OSD request: the wire-format op descriptor plus
// its target object and input/output payload buffers.
struct OSDOp {
  ceph_osd_op op;    // raw wire-format op descriptor; zeroed by the ctor
  sobject_t soid;    // target object; carried in the merged data buffer
                     // by merge_osd_op_vector_in_data (see below)

  bufferlist indata, outdata;  // per-op input and output payloads
  errorcode32_t rval;          // per-op result code

  OSDOp() : rval(0) {
    // ceph_osd_op is a plain C struct, so zero it wholesale.
    memset(&op, 0, sizeof(ceph_osd_op));
  }

  /**
   * split a bufferlist into constituent indata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge indata members of a vector of OSDOp into a single bufferlist
   *
   * Notably this also encodes certain other OSDOp data into the data
   * buffer, including the sobject_t soid.
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * split a bufferlist into constituent outdata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge outdata members of a vector of OSDOps into a single bufferlist
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * Clear data as much as possible, leave minimal data for historical op dump
   *
   * @param ops [in] vector of OSDOps
   */
  static void clear_data(vector<OSDOp>& ops);
};

ostream& operator<<(ostream& out, const OSDOp& op);
5005
// A single watch registration on an object: the watching entity, its
// watch cookie and timeout, and the address it registered from.
struct watch_item_t {
  entity_name_t name;        // watching entity
  uint64_t cookie;           // cookie identifying this watch
  uint32_t timeout_seconds;  // watch timeout
  entity_addr_t addr;        // watcher address; only present in encoding v2+

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
     const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
    addr(addr) { }

  // Wire format: v1 = name/cookie/timeout; v2 appended addr (the addr
  // encoding itself is feature-dependent, hence the features arg).
  void encode(bufferlist &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    ::encode(name, bl);
    ::encode(cookie, bl);
    ::encode(timeout_seconds, bl);
    ::encode(addr, bl, features);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(2, bl);
    ::decode(name, bl);
    ::decode(cookie, bl);
    ::decode(timeout_seconds, bl);
    if (struct_v >= 2) {
      // v1 encodings have no addr; leave it default-constructed.
      ::decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5038
// A watch_item_t paired with the object it applies to.
struct obj_watch_item_t {
  hobject_t obj;    // watched object
  watch_item_t wi;  // the watch registered on it
};
5043
5044/**
5045 * obj list watch response format
5046 *
5047 */
5048struct obj_list_watch_response_t {
5049 list<watch_item_t> entries;
5050
5051 void encode(bufferlist& bl, uint64_t features) const {
5052 ENCODE_START(1, 1, bl);
5053 ::encode(entries, bl, features);
5054 ENCODE_FINISH(bl);
5055 }
5056 void decode(bufferlist::iterator& bl) {
5057 DECODE_START(1, bl);
5058 ::decode(entries, bl);
5059 DECODE_FINISH(bl);
5060 }
5061 void dump(Formatter *f) const {
5062 f->open_array_section("entries");
5063 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5064 f->open_object_section("watch");
5065 f->dump_stream("watcher") << p->name;
5066 f->dump_int("cookie", p->cookie);
5067 f->dump_int("timeout", p->timeout_seconds);
5068 f->open_object_section("addr");
5069 p->addr.dump(f);
5070 f->close_section();
5071 f->close_section();
5072 }
5073 f->close_section();
5074 }
5075 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5076 entity_addr_t ea;
5077 o.push_back(new obj_list_watch_response_t);
5078 o.push_back(new obj_list_watch_response_t);
5079 ea.set_type(entity_addr_t::TYPE_LEGACY);
5080 ea.set_nonce(1000);
5081 ea.set_family(AF_INET);
5082 ea.set_in4_quad(0, 127);
5083 ea.set_in4_quad(1, 0);
5084 ea.set_in4_quad(2, 0);
5085 ea.set_in4_quad(3, 1);
5086 ea.set_port(1024);
5087 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5088 ea.set_nonce(1001);
5089 ea.set_in4_quad(3, 2);
5090 ea.set_port(1025);
5091 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5092 }
5093};
5094WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5095
5096struct clone_info {
5097 snapid_t cloneid;
5098 vector<snapid_t> snaps; // ascending
5099 vector< pair<uint64_t,uint64_t> > overlap;
5100 uint64_t size;
5101
5102 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5103
5104 void encode(bufferlist& bl) const {
5105 ENCODE_START(1, 1, bl);
5106 ::encode(cloneid, bl);
5107 ::encode(snaps, bl);
5108 ::encode(overlap, bl);
5109 ::encode(size, bl);
5110 ENCODE_FINISH(bl);
5111 }
5112 void decode(bufferlist::iterator& bl) {
5113 DECODE_START(1, bl);
5114 ::decode(cloneid, bl);
5115 ::decode(snaps, bl);
5116 ::decode(overlap, bl);
5117 ::decode(size, bl);
5118 DECODE_FINISH(bl);
5119 }
5120 void dump(Formatter *f) const {
5121 if (cloneid == CEPH_NOSNAP)
5122 f->dump_string("cloneid", "HEAD");
5123 else
5124 f->dump_unsigned("cloneid", cloneid.val);
5125 f->open_array_section("snapshots");
5126 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5127 f->open_object_section("snap");
5128 f->dump_unsigned("id", p->val);
5129 f->close_section();
5130 }
5131 f->close_section();
5132 f->open_array_section("overlaps");
5133 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5134 q != overlap.end(); ++q) {
5135 f->open_object_section("overlap");
5136 f->dump_unsigned("offset", q->first);
5137 f->dump_unsigned("length", q->second);
5138 f->close_section();
5139 }
5140 f->close_section();
5141 f->dump_unsigned("size", size);
5142 }
5143 static void generate_test_instances(list<clone_info*>& o) {
5144 o.push_back(new clone_info);
5145 o.push_back(new clone_info);
5146 o.back()->cloneid = 1;
5147 o.back()->snaps.push_back(1);
5148 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5149 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5150 o.back()->size = 16384;
5151 o.push_back(new clone_info);
5152 o.back()->cloneid = CEPH_NOSNAP;
5153 o.back()->size = 32768;
5154 }
5155};
5156WRITE_CLASS_ENCODER(clone_info)
5157
5158/**
5159 * obj list snaps response format
5160 *
5161 */
5162struct obj_list_snap_response_t {
5163 vector<clone_info> clones; // ascending
5164 snapid_t seq;
5165
5166 void encode(bufferlist& bl) const {
5167 ENCODE_START(2, 1, bl);
5168 ::encode(clones, bl);
5169 ::encode(seq, bl);
5170 ENCODE_FINISH(bl);
5171 }
5172 void decode(bufferlist::iterator& bl) {
5173 DECODE_START(2, bl);
5174 ::decode(clones, bl);
5175 if (struct_v >= 2)
5176 ::decode(seq, bl);
5177 else
5178 seq = CEPH_NOSNAP;
5179 DECODE_FINISH(bl);
5180 }
5181 void dump(Formatter *f) const {
5182 f->open_array_section("clones");
5183 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5184 f->open_object_section("clone");
5185 p->dump(f);
5186 f->close_section();
5187 }
5188 f->dump_unsigned("seq", seq);
5189 f->close_section();
5190 }
5191 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5192 o.push_back(new obj_list_snap_response_t);
5193 o.push_back(new obj_list_snap_response_t);
5194 clone_info cl;
5195 cl.cloneid = 1;
5196 cl.snaps.push_back(1);
5197 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5198 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5199 cl.size = 16384;
5200 o.back()->clones.push_back(cl);
5201 cl.cloneid = CEPH_NOSNAP;
5202 cl.snaps.clear();
5203 cl.overlap.clear();
5204 cl.size = 32768;
5205 o.back()->clones.push_back(cl);
5206 o.back()->seq = 123;
5207 }
5208};
5209
5210WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5211
// PromoteCounter

// Thread-safe running totals of cache promotions: attempts started,
// objects completed, and bytes promoted.
struct PromoteCounter {
  std::atomic_ullong attempts{0};
  std::atomic_ullong objects{0};
  std::atomic_ullong bytes{0};

  // Record one promotion attempt.
  void attempt() {
    ++attempts;
  }

  // Record one completed promotion of `size` bytes.
  void finish(uint64_t size) {
    ++objects;
    bytes += size;
  }

  // Report the current totals into *a/*o/*b, then halve each counter so
  // the totals decay toward recent activity.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    const uint64_t cur_attempts = attempts;
    const uint64_t cur_objects = objects;
    const uint64_t cur_bytes = bytes;
    *a = cur_attempts;
    *o = cur_objects;
    *b = cur_bytes;
    attempts = cur_attempts / 2;
    objects = cur_objects / 2;
    bytes = cur_bytes / 2;
  }
};
5237
/** store_statfs_t
 * ObjectStore full statfs information
 *
 * Raw-space totals plus store-level allocation/compression accounting.
 */
struct store_statfs_t
{
  uint64_t total = 0;                  // Total bytes
  uint64_t available = 0;              // Free bytes available

  int64_t allocated = 0;               // Bytes allocated by the store
  int64_t stored = 0;                  // Bytes actually stored by the user
  int64_t compressed = 0;              // Bytes stored after compression
  int64_t compressed_allocated = 0;    // Bytes allocated for compressed data
  int64_t compressed_original = 0;     // Bytes that were successfully compressed

  // Zero all counters by reassigning a default-constructed instance.
  void reset() {
    *this = store_statfs_t();
  }
  bool operator ==(const store_statfs_t& other) const;
  void dump(Formatter *f) const;
};
ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
5259
5260#endif