]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_types.h
update sources to v12.1.1
[ceph.git] / ceph / src / osd / osd_types.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
21#include <sstream>
22#include <stdio.h>
23#include <memory>
24#include <boost/scoped_ptr.hpp>
25#include <boost/optional/optional_io.hpp>
26#include <boost/variant.hpp>
27
28#include "include/rados/rados_types.hpp"
29#include "include/mempool.h"
30
31#include "msg/msg_types.h"
32#include "include/types.h"
33#include "include/utime.h"
34#include "include/CompatSet.h"
35#include "common/histogram.h"
36#include "include/interval_set.h"
37#include "include/inline_memory.h"
38#include "common/Formatter.h"
39#include "common/bloom_filter.hpp"
40#include "common/hobject.h"
41#include "common/snap_types.h"
42#include "HitSet.h"
43#include "Watch.h"
44#include "include/cmp.h"
45#include "librados/ListObjectImpl.h"
46#include "compressor/Compressor.h"
47#include <atomic>
48
49#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
50
51#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
52#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
53#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
54#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
55#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
56#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
57#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
58#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
59#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
60#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
61#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
62#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
63#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
64#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
65#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
66
67
68/// min recovery priority for MBackfillReserve
69#define OSD_RECOVERY_PRIORITY_MIN 0
70
71/// base backfill priority for MBackfillReserve
72#define OSD_BACKFILL_PRIORITY_BASE 100
73
74/// base backfill priority for MBackfillReserve (degraded PG)
75#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
76
77/// base recovery priority for MBackfillReserve
78#define OSD_RECOVERY_PRIORITY_BASE 180
79
80/// base backfill priority for MBackfillReserve (inactive PG)
81#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
82
83/// max recovery priority for MBackfillReserve
84#define OSD_RECOVERY_PRIORITY_MAX 255
85
86
87typedef hobject_t collection_list_handle_t;
88
/// convert a single CEPH_OSD_FLAG_* to a string
90const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a string
92const char *ceph_osd_op_flag_name(unsigned flag);
93
94/// convert CEPH_OSD_FLAG_* op flags to a string
95string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a string
97string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
99string ceph_osd_alloc_hint_flag_string(unsigned flags);
100
101
102/**
103 * osd request identifier
104 *
105 * caller name + incarnation# + tid to unique identify this request.
106 */
struct osd_reqid_t {
  entity_name_t name; // who issued the request (client or osd entity)
  ceph_tid_t tid;     // transaction id from that entity
  int32_t inc;        // incarnation number of the sender

  osd_reqid_t()
    : tid(0), inc(0) {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i) {}

  // new-style (denc) encoding; struct version 2, compat 2
  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_reqid_t*>& o);
};
127WRITE_CLASS_DENC(osd_reqid_t)
128
129
130
// an (osd id, shard id) pair identifying one participant of a PG;
// shard is NO_SHARD for non-sharded (replicated) pools.
struct pg_shard_t {
  int32_t osd;      // osd id; -1 means undefined
  shard_id_t shard; // EC shard id, or NO_SHARD
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const {
    // NOTE(review): osd can be -1 (undefined); dump_unsigned would then
    // emit a huge wrapped value -- confirm undefined shards are never dumped
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
149WRITE_CLASS_ENCODER(pg_shard_t)
150WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
151WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
152ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
153
// Abstract predicate: can the PG recover given a set of available shards?
class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   *
   * @return true if the PG is recoverable with exactly those shards
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};
162
// Abstract predicate: can the PG serve reads given a set of available shards?
class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   *
   * @return true if the PG is readable with exactly those shards
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};
171
172inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
173 return out << r.name << "." << r.inc << ":" << r.tid;
174}
175
176inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
177 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
178}
179inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
180 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
181}
182inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
183 return (l.name < r.name) || (l.inc < r.inc) ||
184 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
185}
186inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
187 return (l.name < r.name) || (l.inc < r.inc) ||
188 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
189}
190inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
191inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
192
193namespace std {
194 template<> struct hash<osd_reqid_t> {
195 size_t operator()(const osd_reqid_t &r) const {
196 static hash<uint64_t> H;
197 return H(r.name.num() ^ r.tid ^ r.inc);
198 }
199 };
200} // namespace std
201
202
203// -----
204
205// a locator constrains the placement of an object. mainly, which pool
206// does it go in.
207struct object_locator_t {
208 // You specify either the hash or the key -- not both
209 int64_t pool; ///< pool id
210 string key; ///< key string (if non-empty)
211 string nspace; ///< namespace
212 int64_t hash; ///< hash position (if >= 0)
213
214 explicit object_locator_t()
215 : pool(-1), hash(-1) {}
216 explicit object_locator_t(int64_t po)
217 : pool(po), hash(-1) {}
218 explicit object_locator_t(int64_t po, int64_t ps)
219 : pool(po), hash(ps) {}
220 explicit object_locator_t(int64_t po, string ns)
221 : pool(po), nspace(ns), hash(-1) {}
222 explicit object_locator_t(int64_t po, string ns, int64_t ps)
223 : pool(po), nspace(ns), hash(ps) {}
224 explicit object_locator_t(int64_t po, string ns, string s)
225 : pool(po), key(s), nspace(ns), hash(-1) {}
226 explicit object_locator_t(const hobject_t& soid)
227 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
228
229 int64_t get_pool() const {
230 return pool;
231 }
232
233 void clear() {
234 pool = -1;
235 key = "";
236 nspace = "";
237 hash = -1;
238 }
239
240 bool empty() const {
241 return pool == -1;
242 }
243
244 void encode(bufferlist& bl) const;
245 void decode(bufferlist::iterator& p);
246 void dump(Formatter *f) const;
247 static void generate_test_instances(list<object_locator_t*>& o);
248};
249WRITE_CLASS_ENCODER(object_locator_t)
250
251inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
252 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
253}
254inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
255 return !(l == r);
256}
257
258inline ostream& operator<<(ostream& out, const object_locator_t& loc)
259{
260 out << "@" << loc.pool;
261 if (loc.nspace.length())
262 out << ";" << loc.nspace;
263 if (loc.key.length())
264 out << ":" << loc.key;
265 return out;
266}
267
268struct request_redirect_t {
269private:
270 object_locator_t redirect_locator; ///< this is authoritative
271 string redirect_object; ///< If non-empty, the request goes to this object name
272 bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
273
274 friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
275public:
276
277 request_redirect_t() {}
278 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
279 redirect_locator(orig) { redirect_locator.pool = rpool; }
280 explicit request_redirect_t(const object_locator_t& rloc) :
281 redirect_locator(rloc) {}
282 explicit request_redirect_t(const object_locator_t& orig,
283 const string& robj) :
284 redirect_locator(orig), redirect_object(robj) {}
285
286 void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
287 const bufferlist& get_instructions() { return osd_instructions; }
288
289 bool empty() const { return redirect_locator.empty() &&
290 redirect_object.empty(); }
291
292 void combine_with_locator(object_locator_t& orig, string& obj) const {
293 orig = redirect_locator;
294 if (!redirect_object.empty())
295 obj = redirect_object;
296 }
297
298 void encode(bufferlist& bl) const;
299 void decode(bufferlist::iterator& bl);
300 void dump(Formatter *f) const;
301 static void generate_test_instances(list<request_redirect_t*>& o);
302};
303WRITE_CLASS_ENCODER(request_redirect_t)
304
305inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
306 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
307 return out;
308}
309
310// Internal OSD op flags - set by the OSD based on the op types
enum {
  // NB: bit 0 is not used; the flags start at (1 << 1)
  CEPH_OSD_RMW_FLAG_READ = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
};
323
324
325// pg stuff
326
327#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
328
329// placement seed (a hash value)
330typedef uint32_t ps_t;
331
332// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;   // the legacy wire struct, stored verbatim
  // raw byte-for-byte encoding of the wire struct (no versioning)
  void encode(bufferlist& bl) const {
    ::encode_raw(v, bl);
  }
  void decode(bufferlist::iterator& bl) {
    ::decode_raw(v, bl);
  }
};
342WRITE_CLASS_ENCODER(old_pg_t)
343
344// placement group id
345struct pg_t {
346 uint64_t m_pool;
347 uint32_t m_seed;
348 int32_t m_preferred;
349
350 pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
351 pg_t(ps_t seed, uint64_t pool, int pref=-1) :
352 m_pool(pool), m_seed(seed), m_preferred(pref) {}
353 // cppcheck-suppress noExplicitConstructor
354 pg_t(const ceph_pg& cpg) :
355 m_pool(cpg.pool), m_seed(cpg.ps), m_preferred((__s16)cpg.preferred) {}
356
357 // cppcheck-suppress noExplicitConstructor
358 pg_t(const old_pg_t& opg) {
359 *this = opg.v;
360 }
361
362 old_pg_t get_old_pg() const {
363 old_pg_t o;
364 assert(m_pool < 0xffffffffull);
365 o.v.pool = m_pool;
366 o.v.ps = m_seed;
367 o.v.preferred = (__s16)m_preferred;
368 return o;
369 }
370
371 ps_t ps() const {
372 return m_seed;
373 }
374 uint64_t pool() const {
375 return m_pool;
376 }
377 int32_t preferred() const {
378 return m_preferred;
379 }
380
381 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
382 char *calc_name(char *buf, const char *suffix_backwords) const;
383
384 void set_ps(ps_t p) {
385 m_seed = p;
386 }
387 void set_pool(uint64_t p) {
388 m_pool = p;
389 }
390 void set_preferred(int32_t osd) {
391 m_preferred = osd;
392 }
393
394 pg_t get_parent() const;
395 pg_t get_ancestor(unsigned old_pg_num) const;
396
397 int print(char *o, int maxlen) const;
398 bool parse(const char *s);
399
400 bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;
401
402 /**
403 * Returns b such that for all object o:
404 * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
405 */
406 unsigned get_split_bits(unsigned pg_num) const;
407
408 bool contains(int bits, const ghobject_t& oid) {
409 return oid.match(bits, ps());
410 }
411 bool contains(int bits, const hobject_t& oid) {
412 return oid.match(bits, ps());
413 }
414
415 hobject_t get_hobj_start() const;
416 hobject_t get_hobj_end(unsigned pg_num) const;
417
418 void encode(bufferlist& bl) const {
419 __u8 v = 1;
420 ::encode(v, bl);
421 ::encode(m_pool, bl);
422 ::encode(m_seed, bl);
423 ::encode(m_preferred, bl);
424 }
425 void decode(bufferlist::iterator& bl) {
426 __u8 v;
427 ::decode(v, bl);
428 ::decode(m_pool, bl);
429 ::decode(m_seed, bl);
430 ::decode(m_preferred, bl);
431 }
432 void decode_old(bufferlist::iterator& bl) {
433 old_pg_t opg;
434 ::decode(opg, bl);
435 *this = opg;
436 }
437 void dump(Formatter *f) const;
438 static void generate_test_instances(list<pg_t*>& o);
439};
440WRITE_CLASS_ENCODER(pg_t)
441
442inline bool operator<(const pg_t& l, const pg_t& r) {
443 return l.pool() < r.pool() ||
444 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
445 (l.preferred() == r.preferred() && (l.ps() < r.ps()))));
446}
447inline bool operator<=(const pg_t& l, const pg_t& r) {
448 return l.pool() < r.pool() ||
449 (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
450 (l.preferred() == r.preferred() && (l.ps() <= r.ps()))));
451}
452inline bool operator==(const pg_t& l, const pg_t& r) {
453 return l.pool() == r.pool() &&
454 l.preferred() == r.preferred() &&
455 l.ps() == r.ps();
456}
457inline bool operator!=(const pg_t& l, const pg_t& r) {
458 return l.pool() != r.pool() ||
459 l.preferred() != r.preferred() ||
460 l.ps() != r.ps();
461}
462inline bool operator>(const pg_t& l, const pg_t& r) {
463 return l.pool() > r.pool() ||
464 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
465 (l.preferred() == r.preferred() && (l.ps() > r.ps()))));
466}
467inline bool operator>=(const pg_t& l, const pg_t& r) {
468 return l.pool() > r.pool() ||
469 (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
470 (l.preferred() == r.preferred() && (l.ps() >= r.ps()))));
471}
472
473ostream& operator<<(ostream& out, const pg_t &pg);
474
475namespace std {
476 template<> struct hash< pg_t >
477 {
478 size_t operator()( const pg_t& x ) const
479 {
480 static hash<uint32_t> H;
481 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
482 }
483 };
484} // namespace std
485
486struct spg_t {
487 pg_t pgid;
488 shard_id_t shard;
489 spg_t() : shard(shard_id_t::NO_SHARD) {}
490 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
491 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
492 unsigned get_split_bits(unsigned pg_num) const {
493 return pgid.get_split_bits(pg_num);
494 }
495 spg_t get_parent() const {
496 return spg_t(pgid.get_parent(), shard);
497 }
498 ps_t ps() const {
499 return pgid.ps();
500 }
501 uint64_t pool() const {
502 return pgid.pool();
503 }
504 int32_t preferred() const {
505 return pgid.preferred();
506 }
507
508 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
509 char *calc_name(char *buf, const char *suffix_backwords) const;
510
511 bool parse(const char *s);
512 bool parse(const std::string& s) {
513 return parse(s.c_str());
514 }
515 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
516 set<spg_t> *pchildren) const {
517 set<pg_t> _children;
518 set<pg_t> *children = pchildren ? &_children : NULL;
519 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
520 if (pchildren && is_split) {
521 for (set<pg_t>::iterator i = _children.begin();
522 i != _children.end();
523 ++i) {
524 pchildren->insert(spg_t(*i, shard));
525 }
526 }
527 return is_split;
528 }
529 bool is_no_shard() const {
530 return shard == shard_id_t::NO_SHARD;
531 }
532
533 ghobject_t make_pgmeta_oid() const {
534 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
535 }
536
537 void encode(bufferlist &bl) const {
538 ENCODE_START(1, 1, bl);
539 ::encode(pgid, bl);
540 ::encode(shard, bl);
541 ENCODE_FINISH(bl);
542 }
543 void decode(bufferlist::iterator &bl) {
544 DECODE_START(1, bl);
545 ::decode(pgid, bl);
546 ::decode(shard, bl);
547 DECODE_FINISH(bl);
548 }
549
550 ghobject_t make_temp_ghobject(const string& name) const {
551 return ghobject_t(
552 hobject_t(object_t(name), "", CEPH_NOSNAP,
553 pgid.ps(),
554 hobject_t::POOL_TEMP_START - pgid.pool(), ""),
555 ghobject_t::NO_GEN,
556 shard);
557 }
558
559 unsigned hash_to_shard(unsigned num_shards) const {
560 return ps() % num_shards;
561 }
562};
563WRITE_CLASS_ENCODER(spg_t)
564WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
565WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
566
567namespace std {
568 template<> struct hash< spg_t >
569 {
570 size_t operator()( const spg_t& x ) const
571 {
572 static hash<uint32_t> H;
573 return H(hash<pg_t>()(x.pgid) ^ x.shard);
574 }
575 };
576} // namespace std
577
578ostream& operator<<(ostream& out, const spg_t &pg);
579
580// ----------------------
581
582class coll_t {
583 enum type_t {
584 TYPE_META = 0,
585 TYPE_LEGACY_TEMP = 1, /* no longer used */
586 TYPE_PG = 2,
587 TYPE_PG_TEMP = 3,
588 };
589 type_t type;
590 spg_t pgid;
591 uint64_t removal_seq; // note: deprecated, not encoded
592
593 char _str_buff[spg_t::calc_name_buf_size];
594 char *_str;
595
596 void calc_str();
597
598 coll_t(type_t t, spg_t p, uint64_t r)
599 : type(t), pgid(p), removal_seq(r) {
600 calc_str();
601 }
602
603public:
604 coll_t() : type(TYPE_META), removal_seq(0)
605 {
606 calc_str();
607 }
608
609 coll_t(const coll_t& other)
610 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
611 calc_str();
612 }
613
614 explicit coll_t(spg_t pgid)
615 : type(TYPE_PG), pgid(pgid), removal_seq(0)
616 {
617 calc_str();
618 }
619
620 coll_t& operator=(const coll_t& rhs)
621 {
622 this->type = rhs.type;
623 this->pgid = rhs.pgid;
624 this->removal_seq = rhs.removal_seq;
625 this->calc_str();
626 return *this;
627 }
628
629 // named constructors
630 static coll_t meta() {
631 return coll_t();
632 }
633 static coll_t pg(spg_t p) {
634 return coll_t(p);
635 }
636
637 const std::string to_str() const {
638 return string(_str);
639 }
640 const char *c_str() const {
641 return _str;
642 }
643
644 bool parse(const std::string& s);
645
646 int operator<(const coll_t &rhs) const {
647 return type < rhs.type ||
648 (type == rhs.type && pgid < rhs.pgid);
649 }
650
651 bool is_meta() const {
652 return type == TYPE_META;
653 }
654 bool is_pg_prefix(spg_t *pgid_) const {
655 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
656 *pgid_ = pgid;
657 return true;
658 }
659 return false;
660 }
661 bool is_pg() const {
662 return type == TYPE_PG;
663 }
664 bool is_pg(spg_t *pgid_) const {
665 if (type == TYPE_PG) {
666 *pgid_ = pgid;
667 return true;
668 }
669 return false;
670 }
671 bool is_temp() const {
672 return type == TYPE_PG_TEMP;
673 }
674 bool is_temp(spg_t *pgid_) const {
675 if (type == TYPE_PG_TEMP) {
676 *pgid_ = pgid;
677 return true;
678 }
679 return false;
680 }
681
682 void encode(bufferlist& bl) const;
683 void decode(bufferlist::iterator& bl);
684 size_t encoded_size() const;
685
686 inline bool operator==(const coll_t& rhs) const {
687 // only compare type if meta
688 if (type != rhs.type)
689 return false;
690 if (type == TYPE_META)
691 return true;
692 return type == rhs.type && pgid == rhs.pgid;
693 }
694 inline bool operator!=(const coll_t& rhs) const {
695 return !(*this == rhs);
696 }
697
698 // get a TEMP collection that corresponds to the current collection,
699 // which we presume is a pg collection.
700 coll_t get_temp() const {
701 assert(type == TYPE_PG);
702 return coll_t(TYPE_PG_TEMP, pgid, 0);
703 }
704
705 ghobject_t get_min_hobj() const {
706 ghobject_t o;
707 switch (type) {
708 case TYPE_PG:
709 o.hobj.pool = pgid.pool();
710 o.set_shard(pgid.shard);
711 break;
712 case TYPE_META:
713 o.hobj.pool = -1;
714 break;
715 default:
716 break;
717 }
718 return o;
719 }
720
721 unsigned hash_to_shard(unsigned num_shards) const {
722 if (type == TYPE_PG)
723 return pgid.hash_to_shard(num_shards);
724 return 0; // whatever.
725 }
726
727 void dump(Formatter *f) const;
728 static void generate_test_instances(list<coll_t*>& o);
729};
730
731WRITE_CLASS_ENCODER(coll_t)
732
733inline ostream& operator<<(ostream& out, const coll_t& c) {
734 out << c.to_str();
735 return out;
736}
737
738namespace std {
739 template<> struct hash<coll_t> {
740 size_t operator()(const coll_t &c) const {
741 size_t h = 0;
742 string str(c.to_str());
743 std::string::const_iterator end(str.end());
744 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
745 h += *s;
746 h += (h << 10);
747 h ^= (h >> 6);
748 }
749 h += (h << 3);
750 h ^= (h >> 11);
751 h += (h << 15);
752 return h;
753 }
754 };
755} // namespace std
756
757inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
758{
759 out << pg_t(ol.ol_pgid);
760 int su = ol.ol_stripe_unit;
761 if (su)
762 out << ".su=" << su;
763 return out;
764}
765
766
767
768// compound rados version type
769/* WARNING: If add member in eversion_t, please make sure the encode/decode function
770 * work well. For little-endian machine, we should make sure there is no padding
771 * in 32-bit machine and 64-bit machine.
772 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;   // explicit padding so the layout is identical on 32- and 64-bit builds
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }

  // largest representable eversion: unsigned wrap-around of 0 - 1
  // yields the max value for both fields
  static eversion_t max() {
    eversion_t max;
    max.version -= 1;
    max.epoch -= 1;
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  string get_key_name() const;

  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    // fast path: version and epoch sit contiguously at the start of the
    // object, so their raw bytes match the little-endian wire format
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    ::encode(version, bl);
    ::encode(epoch, bl);
#endif
  }
  void decode(bufferlist::iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    // mirror of the raw-byte fast path in encode()
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    ::decode(version, bl);
    ::decode(epoch, bl);
#endif
  }
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
};
826WRITE_CLASS_ENCODER(eversion_t)
827
828inline bool operator==(const eversion_t& l, const eversion_t& r) {
829 return (l.epoch == r.epoch) && (l.version == r.version);
830}
831inline bool operator!=(const eversion_t& l, const eversion_t& r) {
832 return (l.epoch != r.epoch) || (l.version != r.version);
833}
834inline bool operator<(const eversion_t& l, const eversion_t& r) {
835 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
836}
837inline bool operator<=(const eversion_t& l, const eversion_t& r) {
838 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
839}
840inline bool operator>(const eversion_t& l, const eversion_t& r) {
841 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
842}
843inline bool operator>=(const eversion_t& l, const eversion_t& r) {
844 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
845}
846inline ostream& operator<<(ostream& out, const eversion_t& e) {
847 return out << e.epoch << "'" << e.version;
848}
849
850/**
851 * objectstore_perf_stat_t
852 *
853 * current perf information about the osd
854 */
855struct objectstore_perf_stat_t {
856 // cur_op_latency is in ms since double add/sub are not associative
857 uint32_t os_commit_latency;
858 uint32_t os_apply_latency;
859
860 objectstore_perf_stat_t() :
861 os_commit_latency(0), os_apply_latency(0) {}
862
863 bool operator==(const objectstore_perf_stat_t &r) const {
864 return os_commit_latency == r.os_commit_latency &&
865 os_apply_latency == r.os_apply_latency;
866 }
867
868 void add(const objectstore_perf_stat_t &o) {
869 os_commit_latency += o.os_commit_latency;
870 os_apply_latency += o.os_apply_latency;
871 }
872 void sub(const objectstore_perf_stat_t &o) {
873 os_commit_latency -= o.os_commit_latency;
874 os_apply_latency -= o.os_apply_latency;
875 }
876 void dump(Formatter *f) const;
877 void encode(bufferlist &bl) const;
878 void decode(bufferlist::iterator &bl);
879 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
880};
881WRITE_CLASS_ENCODER(objectstore_perf_stat_t)
882
883/** osd_stat
884 * aggregate stats for an osd
885 */
886struct osd_stat_t {
887 int64_t kb, kb_used, kb_avail;
888 vector<int> hb_peers;
889 int32_t snap_trim_queue_len, num_snap_trimming;
890
891 pow2_hist_t op_queue_age_hist;
892
893 objectstore_perf_stat_t os_perf_stat;
894
31f18b77
FG
895 epoch_t up_from = 0;
896 uint64_t seq = 0;
897
7c673cae
FG
898 osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
899 snap_trim_queue_len(0), num_snap_trimming(0) {}
900
901 void add(const osd_stat_t& o) {
902 kb += o.kb;
903 kb_used += o.kb_used;
904 kb_avail += o.kb_avail;
905 snap_trim_queue_len += o.snap_trim_queue_len;
906 num_snap_trimming += o.num_snap_trimming;
907 op_queue_age_hist.add(o.op_queue_age_hist);
908 os_perf_stat.add(o.os_perf_stat);
909 }
910 void sub(const osd_stat_t& o) {
911 kb -= o.kb;
912 kb_used -= o.kb_used;
913 kb_avail -= o.kb_avail;
914 snap_trim_queue_len -= o.snap_trim_queue_len;
915 num_snap_trimming -= o.num_snap_trimming;
916 op_queue_age_hist.sub(o.op_queue_age_hist);
917 os_perf_stat.sub(o.os_perf_stat);
918 }
919
920 void dump(Formatter *f) const;
921 void encode(bufferlist &bl) const;
922 void decode(bufferlist::iterator &bl);
923 static void generate_test_instances(std::list<osd_stat_t*>& o);
924};
925WRITE_CLASS_ENCODER(osd_stat_t)
926
927inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
928 return l.kb == r.kb &&
929 l.kb_used == r.kb_used &&
930 l.kb_avail == r.kb_avail &&
931 l.snap_trim_queue_len == r.snap_trim_queue_len &&
932 l.num_snap_trimming == r.num_snap_trimming &&
933 l.hb_peers == r.hb_peers &&
934 l.op_queue_age_hist == r.op_queue_age_hist &&
935 l.os_perf_stat == r.os_perf_stat;
936}
937inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
938 return !(l == r);
939}
940
941
942
943inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
944 return out << "osd_stat(" << kb_t(s.kb_used) << " used, "
945 << kb_t(s.kb_avail) << " avail, "
946 << kb_t(s.kb) << " total, "
947 << "peers " << s.hb_peers
948 << " op hist " << s.op_queue_age_hist.h
949 << ")";
950}
951
952
953/*
954 * pg states
955 */
956#define PG_STATE_CREATING (1<<0) // creating
957#define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
958#define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
959#define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
960//#define PG_STATE_REPLAY (1<<5) // crashed, waiting for replay
961//#define PG_STATE_STRAY (1<<6) // i must notify the primary i exist.
962//#define PG_STATE_SPLITTING (1<<7) // i am splitting
963#define PG_STATE_SCRUBBING (1<<8) // scrubbing
964//#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
965#define PG_STATE_DEGRADED (1<<10) // pg contains objects with reduced redundancy
966#define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be)
967#define PG_STATE_PEERING (1<<12) // pg is (re)peering
968#define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub
969#define PG_STATE_RECOVERING (1<<14) // pg is recovering/migrating objects
970#define PG_STATE_BACKFILL_WAIT (1<<15) // [active] reserving backfill
971#define PG_STATE_INCOMPLETE (1<<16) // incomplete content, peering failed.
972#define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
973#define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
974#define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
975#define PG_STATE_BACKFILL (1<<20) // [active] backfilling pg content
976#define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
977#define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
978#define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
979#define PG_STATE_ACTIVATING (1<<24) // pg is peered but not yet active
980#define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover
981#define PG_STATE_SNAPTRIM (1<<26) // trimming snaps
982#define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
983#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps

986std::string pg_state_string(int state);
987std::string pg_vector_string(const vector<int32_t> &a);
988int pg_string_state(const std::string& state);
989
990
991/*
992 * pool_snap_info_t
993 *
994 * attributes for a single pool snapshot.
995 */
struct pool_snap_info_t {
  snapid_t snapid;  // snapshot id
  utime_t stamp;    // timestamp (printed alongside the name by operator<<)
  string name;      // user-visible snapshot name

  void dump(Formatter *f) const;
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<pool_snap_info_t*>& o);
};
1006WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1007
1008inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
1009 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1010}
1011
1012
1013/*
1014 * pool_opts_t
1015 *
1016 * pool options.
1017 */
1018
1019class pool_opts_t {
1020public:
1021 enum key_t {
1022 SCRUB_MIN_INTERVAL,
1023 SCRUB_MAX_INTERVAL,
1024 DEEP_SCRUB_INTERVAL,
1025 RECOVERY_PRIORITY,
1026 RECOVERY_OP_PRIORITY,
1027 SCRUB_PRIORITY,
1028 COMPRESSION_MODE,
1029 COMPRESSION_ALGORITHM,
1030 COMPRESSION_REQUIRED_RATIO,
1031 COMPRESSION_MAX_BLOB_SIZE,
1032 COMPRESSION_MIN_BLOB_SIZE,
1033 CSUM_TYPE,
1034 CSUM_MAX_BLOCK,
1035 CSUM_MIN_BLOCK,
1036 };
1037
1038 enum type_t {
1039 STR,
1040 INT,
1041 DOUBLE,
1042 };
1043
1044 struct opt_desc_t {
1045 key_t key;
1046 type_t type;
1047
1048 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1049
1050 bool operator==(const opt_desc_t& rhs) const {
1051 return key == rhs.key && type == rhs.type;
1052 }
1053 };
1054
1055 typedef boost::variant<std::string,int,double> value_t;
1056
1057 static bool is_opt_name(const std::string& name);
1058 static opt_desc_t get_opt_desc(const std::string& name);
1059
1060 pool_opts_t() : opts() {}
1061
1062 bool is_set(key_t key) const;
1063
1064 template<typename T>
1065 void set(key_t key, const T &val) {
1066 value_t value = val;
1067 opts[key] = value;
1068 }
1069
1070 template<typename T>
1071 bool get(key_t key, T *val) const {
1072 opts_t::const_iterator i = opts.find(key);
1073 if (i == opts.end()) {
1074 return false;
1075 }
1076 *val = boost::get<T>(i->second);
1077 return true;
1078 }
1079
1080 const value_t& get(key_t key) const;
1081
1082 bool unset(key_t key);
1083
1084 void dump(const std::string& name, Formatter *f) const;
1085
1086 void dump(Formatter *f) const;
1087 void encode(bufferlist &bl) const;
1088 void decode(bufferlist::iterator &bl);
1089
1090private:
1091 typedef std::map<key_t, value_t> opts_t;
1092 opts_t opts;
1093
1094 friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
1095};
1096WRITE_CLASS_ENCODER(pool_opts_t)
1097
1098/*
1099 * pg_pool
1100 */
1101struct pg_pool_t {
1102 enum {
1103 TYPE_REPLICATED = 1, // replication
1104 //TYPE_RAID4 = 2, // raid4 (never implemented)
1105 TYPE_ERASURE = 3, // erasure-coded
1106 };
1107 static const char *get_type_name(int t) {
1108 switch (t) {
1109 case TYPE_REPLICATED: return "replicated";
1110 //case TYPE_RAID4: return "raid4";
1111 case TYPE_ERASURE: return "erasure";
1112 default: return "???";
1113 }
1114 }
1115 const char *get_type_name() const {
1116 return get_type_name(type);
1117 }
7c673cae
FG
1118
1119 enum {
1120 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1121 FLAG_FULL = 1<<1, // pool is full
1122 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1123 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1124 FLAG_NODELETE = 1<<4, // pool can't be deleted
1125 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1126 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1127 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1128 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1129 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
1130 };
1131
1132 static const char *get_flag_name(int f) {
1133 switch (f) {
1134 case FLAG_HASHPSPOOL: return "hashpspool";
1135 case FLAG_FULL: return "full";
1136 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1137 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1138 case FLAG_NODELETE: return "nodelete";
1139 case FLAG_NOPGCHANGE: return "nopgchange";
1140 case FLAG_NOSIZECHANGE: return "nosizechange";
1141 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1142 case FLAG_NOSCRUB: return "noscrub";
1143 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
1144 default: return "???";
1145 }
1146 }
1147 static string get_flags_string(uint64_t f) {
1148 string s;
1149 for (unsigned n=0; f && n<64; ++n) {
1150 if (f & (1ull << n)) {
1151 if (s.length())
1152 s += ",";
1153 s += get_flag_name(1ull << n);
1154 }
1155 }
1156 return s;
1157 }
1158 string get_flags_string() const {
1159 return get_flags_string(flags);
1160 }
1161 static uint64_t get_flag_by_name(const string& name) {
1162 if (name == "hashpspool")
1163 return FLAG_HASHPSPOOL;
1164 if (name == "full")
1165 return FLAG_FULL;
1166 if (name == "ec_overwrites")
1167 return FLAG_EC_OVERWRITES;
1168 if (name == "incomplete_clones")
1169 return FLAG_INCOMPLETE_CLONES;
1170 if (name == "nodelete")
1171 return FLAG_NODELETE;
1172 if (name == "nopgchange")
1173 return FLAG_NOPGCHANGE;
1174 if (name == "nosizechange")
1175 return FLAG_NOSIZECHANGE;
1176 if (name == "write_fadvise_dontneed")
1177 return FLAG_WRITE_FADVISE_DONTNEED;
1178 if (name == "noscrub")
1179 return FLAG_NOSCRUB;
1180 if (name == "nodeep-scrub")
1181 return FLAG_NODEEP_SCRUB;
1182 return 0;
1183 }
1184
1185 /// converts the acting/up vector to a set of pg shards
1186 void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
1187
1188 typedef enum {
1189 CACHEMODE_NONE = 0, ///< no caching
1190 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1191 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1192 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1193 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
1194 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
1195 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1196 } cache_mode_t;
1197 static const char *get_cache_mode_name(cache_mode_t m) {
1198 switch (m) {
1199 case CACHEMODE_NONE: return "none";
1200 case CACHEMODE_WRITEBACK: return "writeback";
1201 case CACHEMODE_FORWARD: return "forward";
1202 case CACHEMODE_READONLY: return "readonly";
1203 case CACHEMODE_READFORWARD: return "readforward";
1204 case CACHEMODE_READPROXY: return "readproxy";
1205 case CACHEMODE_PROXY: return "proxy";
1206 default: return "unknown";
1207 }
1208 }
1209 static cache_mode_t get_cache_mode_from_str(const string& s) {
1210 if (s == "none")
1211 return CACHEMODE_NONE;
1212 if (s == "writeback")
1213 return CACHEMODE_WRITEBACK;
1214 if (s == "forward")
1215 return CACHEMODE_FORWARD;
1216 if (s == "readonly")
1217 return CACHEMODE_READONLY;
1218 if (s == "readforward")
1219 return CACHEMODE_READFORWARD;
1220 if (s == "readproxy")
1221 return CACHEMODE_READPROXY;
1222 if (s == "proxy")
1223 return CACHEMODE_PROXY;
1224 return (cache_mode_t)-1;
1225 }
1226 const char *get_cache_mode_name() const {
1227 return get_cache_mode_name(cache_mode);
1228 }
1229 bool cache_mode_requires_hit_set() const {
1230 switch (cache_mode) {
1231 case CACHEMODE_NONE:
1232 case CACHEMODE_FORWARD:
1233 case CACHEMODE_READONLY:
1234 case CACHEMODE_PROXY:
1235 return false;
1236 case CACHEMODE_WRITEBACK:
1237 case CACHEMODE_READFORWARD:
1238 case CACHEMODE_READPROXY:
1239 return true;
1240 default:
1241 assert(0 == "implement me");
1242 }
1243 }
1244
1245 uint64_t flags; ///< FLAG_*
1246 __u8 type; ///< TYPE_*
1247 __u8 size, min_size; ///< number of osds in each pg
31f18b77 1248 __u8 crush_rule; ///< crush placement rule
7c673cae
FG
1249 __u8 object_hash; ///< hash mapping object name to ps
1250private:
1251 __u32 pg_num, pgp_num; ///< number of pgs
1252
1253
1254public:
1255 map<string,string> properties; ///< OBSOLETE
1256 string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1257 epoch_t last_change; ///< most recent epoch changed, exclusing snapshot changes
1258 epoch_t last_force_op_resend; ///< last epoch that forced clients to resend
1259 /// last epoch that forced clients to resend (pre-luminous clients only)
1260 epoch_t last_force_op_resend_preluminous;
1261 snapid_t snap_seq; ///< seq for per-pool snapshot
1262 epoch_t snap_epoch; ///< osdmap epoch of last snap
1263 uint64_t auid; ///< who owns the pg
1264 __u32 crash_replay_interval; ///< seconds to allow clients to replay ACKed but unCOMMITted requests
1265
1266 uint64_t quota_max_bytes; ///< maximum number of bytes for this pool
1267 uint64_t quota_max_objects; ///< maximum number of objects for this pool
1268
1269 /*
1270 * Pool snaps (global to this pool). These define a SnapContext for
1271 * the pool, unless the client manually specifies an alternate
1272 * context.
1273 */
1274 map<snapid_t, pool_snap_info_t> snaps;
1275 /*
1276 * Alternatively, if we are defining non-pool snaps (e.g. via the
1277 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1278 * used). Snaps and removed_snaps are to be used exclusive of each
1279 * other!
1280 */
1281 interval_set<snapid_t> removed_snaps;
1282
1283 unsigned pg_num_mask, pgp_num_mask;
1284
1285 set<uint64_t> tiers; ///< pools that are tiers of us
1286 int64_t tier_of; ///< pool for which we are a tier
1287 // Note that write wins for read+write ops
1288 int64_t read_tier; ///< pool/tier for objecter to direct reads to
1289 int64_t write_tier; ///< pool/tier for objecter to direct writes to
1290 cache_mode_t cache_mode; ///< cache pool mode
1291
1292 bool is_tier() const { return tier_of >= 0; }
1293 bool has_tiers() const { return !tiers.empty(); }
1294 void clear_tier() {
1295 tier_of = -1;
1296 clear_read_tier();
1297 clear_write_tier();
1298 clear_tier_tunables();
1299 }
1300 bool has_read_tier() const { return read_tier >= 0; }
1301 void clear_read_tier() { read_tier = -1; }
1302 bool has_write_tier() const { return write_tier >= 0; }
1303 void clear_write_tier() { write_tier = -1; }
1304 void clear_tier_tunables() {
1305 if (cache_mode != CACHEMODE_NONE)
1306 flags |= FLAG_INCOMPLETE_CLONES;
1307 cache_mode = CACHEMODE_NONE;
1308
1309 target_max_bytes = 0;
1310 target_max_objects = 0;
1311 cache_target_dirty_ratio_micro = 0;
1312 cache_target_dirty_high_ratio_micro = 0;
1313 cache_target_full_ratio_micro = 0;
1314 hit_set_params = HitSet::Params();
1315 hit_set_period = 0;
1316 hit_set_count = 0;
1317 hit_set_grade_decay_rate = 0;
1318 hit_set_search_last_n = 0;
1319 grade_table.resize(0);
1320 }
1321
1322 uint64_t target_max_bytes; ///< tiering: target max pool size
1323 uint64_t target_max_objects; ///< tiering: target max pool size
1324
1325 uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
1326 uint32_t cache_target_dirty_high_ratio_micro; ///<cache: fraction of target to flush with high speed
1327 uint32_t cache_target_full_ratio_micro; ///< cache: fraction of target to fill before we evict in earnest
1328
1329 uint32_t cache_min_flush_age; ///< minimum age (seconds) before we can flush
1330 uint32_t cache_min_evict_age; ///< minimum age (seconds) before we can evict
1331
1332 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1333 uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
1334 uint32_t hit_set_count; ///< number of periods to retain
1335 bool use_gmt_hitset; ///< use gmt to name the hitset archive object
1336 uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
1337 uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
1338 uint32_t hit_set_grade_decay_rate; ///< current hit_set has highest priority on objects
1339 ///temperature count,the follow hit_set's priority decay
1340 ///by this params than pre hit_set
1341 uint32_t hit_set_search_last_n; ///<accumulate atmost N hit_sets for temperature
1342
1343 uint32_t stripe_width; ///< erasure coded stripe size in bytes
1344
1345 uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates
1346 ///< user does not specify any expected value
1347 bool fast_read; ///< whether turn on fast read on the pool or not
1348
1349 pool_opts_t opts; ///< options
1350
1351private:
1352 vector<uint32_t> grade_table;
1353
1354public:
1355 uint32_t get_grade(unsigned i) const {
1356 if (grade_table.size() <= i)
1357 return 0;
1358 return grade_table[i];
1359 }
1360 void calc_grade_table() {
1361 unsigned v = 1000000;
1362 grade_table.resize(hit_set_count);
1363 for (unsigned i = 0; i < hit_set_count; i++) {
1364 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1365 grade_table[i] = v;
1366 }
1367 }
1368
1369 pg_pool_t()
1370 : flags(0), type(0), size(0), min_size(0),
31f18b77 1371 crush_rule(0), object_hash(0),
7c673cae
FG
1372 pg_num(0), pgp_num(0),
1373 last_change(0),
1374 last_force_op_resend(0),
1375 last_force_op_resend_preluminous(0),
1376 snap_seq(0), snap_epoch(0),
1377 auid(0),
1378 crash_replay_interval(0),
1379 quota_max_bytes(0), quota_max_objects(0),
1380 pg_num_mask(0), pgp_num_mask(0),
1381 tier_of(-1), read_tier(-1), write_tier(-1),
1382 cache_mode(CACHEMODE_NONE),
1383 target_max_bytes(0), target_max_objects(0),
1384 cache_target_dirty_ratio_micro(0),
1385 cache_target_dirty_high_ratio_micro(0),
1386 cache_target_full_ratio_micro(0),
1387 cache_min_flush_age(0),
1388 cache_min_evict_age(0),
1389 hit_set_params(),
1390 hit_set_period(0),
1391 hit_set_count(0),
1392 use_gmt_hitset(true),
1393 min_read_recency_for_promote(0),
1394 min_write_recency_for_promote(0),
1395 hit_set_grade_decay_rate(0),
1396 hit_set_search_last_n(0),
1397 stripe_width(0),
1398 expected_num_objects(0),
1399 fast_read(false),
1400 opts()
1401 { }
1402
1403 void dump(Formatter *f) const;
1404
1405 uint64_t get_flags() const { return flags; }
1406 bool has_flag(uint64_t f) const { return flags & f; }
1407 void set_flag(uint64_t f) { flags |= f; }
1408 void unset_flag(uint64_t f) { flags &= ~f; }
1409
1410 bool ec_pool() const {
1411 return type == TYPE_ERASURE;
1412 }
1413 bool require_rollback() const {
1414 return ec_pool();
1415 }
1416
1417 /// true if incomplete clones may be present
1418 bool allow_incomplete_clones() const {
1419 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1420 }
1421
1422 unsigned get_type() const { return type; }
1423 unsigned get_size() const { return size; }
1424 unsigned get_min_size() const { return min_size; }
31f18b77 1425 int get_crush_rule() const { return crush_rule; }
7c673cae
FG
1426 int get_object_hash() const { return object_hash; }
1427 const char *get_object_hash_name() const {
1428 return ceph_str_hash_name(get_object_hash());
1429 }
1430 epoch_t get_last_change() const { return last_change; }
1431 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
1432 epoch_t get_last_force_op_resend_preluminous() const {
1433 return last_force_op_resend_preluminous;
1434 }
1435 epoch_t get_snap_epoch() const { return snap_epoch; }
1436 snapid_t get_snap_seq() const { return snap_seq; }
1437 uint64_t get_auid() const { return auid; }
1438 unsigned get_crash_replay_interval() const { return crash_replay_interval; }
1439
1440 void set_snap_seq(snapid_t s) { snap_seq = s; }
1441 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1442
1443 void set_stripe_width(uint32_t s) { stripe_width = s; }
1444 uint32_t get_stripe_width() const { return stripe_width; }
1445
1446 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1447 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1448
1449 bool supports_omap() const {
1450 return !(get_type() == TYPE_ERASURE);
1451 }
1452
1453 bool requires_aligned_append() const {
1454 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1455 }
1456 uint64_t required_alignment() const { return stripe_width; }
1457
1458 bool allows_ecoverwrites() const {
1459 return has_flag(FLAG_EC_OVERWRITES);
1460 }
1461
1462 bool can_shift_osds() const {
1463 switch (get_type()) {
1464 case TYPE_REPLICATED:
1465 return true;
1466 case TYPE_ERASURE:
1467 return false;
1468 default:
1469 assert(0 == "unhandled pool type");
1470 }
1471 }
1472
1473 unsigned get_pg_num() const { return pg_num; }
1474 unsigned get_pgp_num() const { return pgp_num; }
1475
1476 unsigned get_pg_num_mask() const { return pg_num_mask; }
1477 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1478
1479 // if pg_num is not a multiple of two, pgs are not equally sized.
1480 // return, for a given pg, the fraction (denominator) of the total
1481 // pool size that it represents.
1482 unsigned get_pg_num_divisor(pg_t pgid) const;
1483
1484 void set_pg_num(int p) {
1485 pg_num = p;
1486 calc_pg_masks();
1487 }
1488 void set_pgp_num(int p) {
1489 pgp_num = p;
1490 calc_pg_masks();
1491 }
1492
1493 void set_quota_max_bytes(uint64_t m) {
1494 quota_max_bytes = m;
1495 }
1496 uint64_t get_quota_max_bytes() {
1497 return quota_max_bytes;
1498 }
1499
1500 void set_quota_max_objects(uint64_t m) {
1501 quota_max_objects = m;
1502 }
1503 uint64_t get_quota_max_objects() {
1504 return quota_max_objects;
1505 }
1506
1507 void set_last_force_op_resend(uint64_t t) {
1508 last_force_op_resend = t;
1509 last_force_op_resend_preluminous = t;
1510 }
1511
1512 void calc_pg_masks();
1513
1514 /*
1515 * we have two snap modes:
1516 * - pool global snaps
1517 * - snap existence/non-existence defined by snaps[] and snap_seq
1518 * - user managed snaps
1519 * - removal governed by removed_snaps
1520 *
1521 * we know which mode we're using based on whether removed_snaps is empty.
1522 * If nothing has been created, both functions report false.
1523 */
1524 bool is_pool_snaps_mode() const;
1525 bool is_unmanaged_snaps_mode() const;
1526 bool is_removed_snap(snapid_t s) const;
1527
1528 /*
1529 * build set of known-removed sets from either pool snaps or
1530 * explicit removed_snaps set.
1531 */
1532 void build_removed_snaps(interval_set<snapid_t>& rs) const;
1533 snapid_t snap_exists(const char *s) const;
1534 void add_snap(const char *n, utime_t stamp);
1535 void add_unmanaged_snap(uint64_t& snapid);
1536 void remove_snap(snapid_t s);
1537 void remove_unmanaged_snap(snapid_t s);
1538
1539 SnapContext get_snap_context() const;
1540
1541 /// hash a object name+namespace key to a hash position
1542 uint32_t hash_key(const string& key, const string& ns) const;
1543
1544 /// round a hash position down to a pg num
1545 uint32_t raw_hash_to_pg(uint32_t v) const;
1546
1547 /*
1548 * map a raw pg (with full precision ps) into an actual pg, for storage
1549 */
1550 pg_t raw_pg_to_pg(pg_t pg) const;
1551
1552 /*
1553 * map raw pg (full precision ps) into a placement seed. include
1554 * pool id in that value so that different pools don't use the same
1555 * seeds.
1556 */
1557 ps_t raw_pg_to_pps(pg_t pg) const;
1558
1559 /// choose a random hash position within a pg
1560 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1561
1562 void encode(bufferlist& bl, uint64_t features) const;
1563 void decode(bufferlist::iterator& bl);
1564
1565 static void generate_test_instances(list<pg_pool_t*>& o);
1566};
1567WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1568
1569ostream& operator<<(ostream& out, const pg_pool_t& p);
1570
1571
1572/**
1573 * a summation of object stats
1574 *
1575 * This is just a container for object stats; we don't know what for.
1576 *
1577 * If you add members in object_stat_sum_t, you should make sure there are
1578 * not padding among these members.
1579 * You should also modify the padding_check function.
1580
1581 */
1582struct object_stat_sum_t {
1583 /**************************************************************************
1584 * WARNING: be sure to update operator==, floor, and split when
1585 * adding/removing fields!
1586 **************************************************************************/
1587 int64_t num_bytes; // in bytes
1588 int64_t num_objects;
1589 int64_t num_object_clones;
1590 int64_t num_object_copies; // num_objects * num_replicas
1591 int64_t num_objects_missing_on_primary;
1592 int64_t num_objects_degraded;
1593 int64_t num_objects_unfound;
1594 int64_t num_rd;
1595 int64_t num_rd_kb;
1596 int64_t num_wr;
1597 int64_t num_wr_kb;
1598 int64_t num_scrub_errors; // total deep and shallow scrub errors
1599 int64_t num_objects_recovered;
1600 int64_t num_bytes_recovered;
1601 int64_t num_keys_recovered;
1602 int64_t num_shallow_scrub_errors;
1603 int64_t num_deep_scrub_errors;
1604 int64_t num_objects_dirty;
1605 int64_t num_whiteouts;
1606 int64_t num_objects_omap;
1607 int64_t num_objects_hit_set_archive;
1608 int64_t num_objects_misplaced;
1609 int64_t num_bytes_hit_set_archive;
1610 int64_t num_flush;
1611 int64_t num_flush_kb;
1612 int64_t num_evict;
1613 int64_t num_evict_kb;
1614 int64_t num_promote;
1615 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1616 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1617 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1618 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1619 int64_t num_objects_pinned;
1620 int64_t num_objects_missing;
1621 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
1622
1623 object_stat_sum_t()
1624 : num_bytes(0),
1625 num_objects(0), num_object_clones(0), num_object_copies(0),
1626 num_objects_missing_on_primary(0), num_objects_degraded(0),
1627 num_objects_unfound(0),
1628 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1629 num_scrub_errors(0),
1630 num_objects_recovered(0),
1631 num_bytes_recovered(0),
1632 num_keys_recovered(0),
1633 num_shallow_scrub_errors(0),
1634 num_deep_scrub_errors(0),
1635 num_objects_dirty(0),
1636 num_whiteouts(0),
1637 num_objects_omap(0),
1638 num_objects_hit_set_archive(0),
1639 num_objects_misplaced(0),
1640 num_bytes_hit_set_archive(0),
1641 num_flush(0),
1642 num_flush_kb(0),
1643 num_evict(0),
1644 num_evict_kb(0),
1645 num_promote(0),
1646 num_flush_mode_high(0), num_flush_mode_low(0),
1647 num_evict_mode_some(0), num_evict_mode_full(0),
1648 num_objects_pinned(0),
1649 num_objects_missing(0),
1650 num_legacy_snapsets(0)
1651 {}
1652
1653 void floor(int64_t f) {
1654#define FLOOR(x) if (x < f) x = f
1655 FLOOR(num_bytes);
1656 FLOOR(num_objects);
1657 FLOOR(num_object_clones);
1658 FLOOR(num_object_copies);
1659 FLOOR(num_objects_missing_on_primary);
1660 FLOOR(num_objects_missing);
1661 FLOOR(num_objects_degraded);
1662 FLOOR(num_objects_misplaced);
1663 FLOOR(num_objects_unfound);
1664 FLOOR(num_rd);
1665 FLOOR(num_rd_kb);
1666 FLOOR(num_wr);
1667 FLOOR(num_wr_kb);
1668 FLOOR(num_scrub_errors);
1669 FLOOR(num_shallow_scrub_errors);
1670 FLOOR(num_deep_scrub_errors);
1671 FLOOR(num_objects_recovered);
1672 FLOOR(num_bytes_recovered);
1673 FLOOR(num_keys_recovered);
1674 FLOOR(num_objects_dirty);
1675 FLOOR(num_whiteouts);
1676 FLOOR(num_objects_omap);
1677 FLOOR(num_objects_hit_set_archive);
1678 FLOOR(num_bytes_hit_set_archive);
1679 FLOOR(num_flush);
1680 FLOOR(num_flush_kb);
1681 FLOOR(num_evict);
1682 FLOOR(num_evict_kb);
1683 FLOOR(num_promote);
1684 FLOOR(num_flush_mode_high);
1685 FLOOR(num_flush_mode_low);
1686 FLOOR(num_evict_mode_some);
1687 FLOOR(num_evict_mode_full);
1688 FLOOR(num_objects_pinned);
1689 FLOOR(num_legacy_snapsets);
1690#undef FLOOR
1691 }
1692
1693 void split(vector<object_stat_sum_t> &out) const {
1694#define SPLIT(PARAM) \
1695 for (unsigned i = 0; i < out.size(); ++i) { \
1696 out[i].PARAM = PARAM / out.size(); \
1697 if (i < (PARAM % out.size())) { \
1698 out[i].PARAM++; \
1699 } \
1700 }
1701#define SPLIT_PRESERVE_NONZERO(PARAM) \
1702 for (unsigned i = 0; i < out.size(); ++i) { \
1703 if (PARAM) \
1704 out[i].PARAM = 1 + PARAM / out.size(); \
1705 else \
1706 out[i].PARAM = 0; \
1707 }
1708
1709 SPLIT(num_bytes);
1710 SPLIT(num_objects);
1711 SPLIT(num_object_clones);
1712 SPLIT(num_object_copies);
1713 SPLIT(num_objects_missing_on_primary);
1714 SPLIT(num_objects_missing);
1715 SPLIT(num_objects_degraded);
1716 SPLIT(num_objects_misplaced);
1717 SPLIT(num_objects_unfound);
1718 SPLIT(num_rd);
1719 SPLIT(num_rd_kb);
1720 SPLIT(num_wr);
1721 SPLIT(num_wr_kb);
1722 SPLIT(num_scrub_errors);
1723 SPLIT(num_shallow_scrub_errors);
1724 SPLIT(num_deep_scrub_errors);
1725 SPLIT(num_objects_recovered);
1726 SPLIT(num_bytes_recovered);
1727 SPLIT(num_keys_recovered);
1728 SPLIT(num_objects_dirty);
1729 SPLIT(num_whiteouts);
1730 SPLIT(num_objects_omap);
1731 SPLIT(num_objects_hit_set_archive);
1732 SPLIT(num_bytes_hit_set_archive);
1733 SPLIT(num_flush);
1734 SPLIT(num_flush_kb);
1735 SPLIT(num_evict);
1736 SPLIT(num_evict_kb);
1737 SPLIT(num_promote);
1738 SPLIT(num_flush_mode_high);
1739 SPLIT(num_flush_mode_low);
1740 SPLIT(num_evict_mode_some);
1741 SPLIT(num_evict_mode_full);
1742 SPLIT(num_objects_pinned);
1743 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1744#undef SPLIT
1745#undef SPLIT_PRESERVE_NONZERO
1746 }
1747
1748 void clear() {
1749 memset(this, 0, sizeof(*this));
1750 }
1751
1752 void calc_copies(int nrep) {
1753 num_object_copies = nrep * num_objects;
1754 }
1755
1756 bool is_zero() const {
1757 return mem_is_zero((char*)this, sizeof(*this));
1758 }
1759
1760 void add(const object_stat_sum_t& o);
1761 void sub(const object_stat_sum_t& o);
1762
1763 void dump(Formatter *f) const;
1764 void padding_check() {
1765 static_assert(
1766 sizeof(object_stat_sum_t) ==
1767 sizeof(num_bytes) +
1768 sizeof(num_objects) +
1769 sizeof(num_object_clones) +
1770 sizeof(num_object_copies) +
1771 sizeof(num_objects_missing_on_primary) +
1772 sizeof(num_objects_degraded) +
1773 sizeof(num_objects_unfound) +
1774 sizeof(num_rd) +
1775 sizeof(num_rd_kb) +
1776 sizeof(num_wr) +
1777 sizeof(num_wr_kb) +
1778 sizeof(num_scrub_errors) +
1779 sizeof(num_objects_recovered) +
1780 sizeof(num_bytes_recovered) +
1781 sizeof(num_keys_recovered) +
1782 sizeof(num_shallow_scrub_errors) +
1783 sizeof(num_deep_scrub_errors) +
1784 sizeof(num_objects_dirty) +
1785 sizeof(num_whiteouts) +
1786 sizeof(num_objects_omap) +
1787 sizeof(num_objects_hit_set_archive) +
1788 sizeof(num_objects_misplaced) +
1789 sizeof(num_bytes_hit_set_archive) +
1790 sizeof(num_flush) +
1791 sizeof(num_flush_kb) +
1792 sizeof(num_evict) +
1793 sizeof(num_evict_kb) +
1794 sizeof(num_promote) +
1795 sizeof(num_flush_mode_high) +
1796 sizeof(num_flush_mode_low) +
1797 sizeof(num_evict_mode_some) +
1798 sizeof(num_evict_mode_full) +
1799 sizeof(num_objects_pinned) +
1800 sizeof(num_objects_missing) +
1801 sizeof(num_legacy_snapsets)
1802 ,
1803 "object_stat_sum_t have padding");
1804 }
1805 void encode(bufferlist& bl) const;
1806 void decode(bufferlist::iterator& bl);
1807 static void generate_test_instances(list<object_stat_sum_t*>& o);
1808};
1809WRITE_CLASS_ENCODER(object_stat_sum_t)
1810
1811bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1812
1813/**
1814 * a collection of object stat sums
1815 *
1816 * This is a collection of stat sums over different categories.
1817 */
1818struct object_stat_collection_t {
1819 /**************************************************************************
1820 * WARNING: be sure to update the operator== when adding/removing fields! *
1821 **************************************************************************/
1822 object_stat_sum_t sum;
1823
1824 void calc_copies(int nrep) {
1825 sum.calc_copies(nrep);
1826 }
1827
1828 void dump(Formatter *f) const;
1829 void encode(bufferlist& bl) const;
1830 void decode(bufferlist::iterator& bl);
1831 static void generate_test_instances(list<object_stat_collection_t*>& o);
1832
1833 bool is_zero() const {
1834 return sum.is_zero();
1835 }
1836
1837 void clear() {
1838 sum.clear();
1839 }
1840
1841 void floor(int64_t f) {
1842 sum.floor(f);
1843 }
1844
1845 void add(const object_stat_sum_t& o) {
1846 sum.add(o);
1847 }
1848
1849 void add(const object_stat_collection_t& o) {
1850 sum.add(o.sum);
1851 }
1852 void sub(const object_stat_collection_t& o) {
1853 sum.sub(o.sum);
1854 }
1855};
1856WRITE_CLASS_ENCODER(object_stat_collection_t)
1857
1858inline bool operator==(const object_stat_collection_t& l,
1859 const object_stat_collection_t& r) {
1860 return l.sum == r.sum;
1861}
1862
1863
1864/** pg_stat
1865 * aggregate stats for a single PG.
1866 */
1867struct pg_stat_t {
1868 /**************************************************************************
1869 * WARNING: be sure to update the operator== when adding/removing fields! *
1870 **************************************************************************/
1871 eversion_t version;
1872 version_t reported_seq; // sequence number
1873 epoch_t reported_epoch; // epoch of this report
1874 __u32 state;
1875 utime_t last_fresh; // last reported
1876 utime_t last_change; // new state != previous state
1877 utime_t last_active; // state & PG_STATE_ACTIVE
1878 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1879 utime_t last_clean; // state & PG_STATE_CLEAN
1880 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
1881 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
1882 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
1883
1884 eversion_t log_start; // (log_start,version]
1885 eversion_t ondisk_log_start; // there may be more on disk
1886
1887 epoch_t created;
1888 epoch_t last_epoch_clean;
1889 pg_t parent;
1890 __u32 parent_split_bits;
1891
1892 eversion_t last_scrub;
1893 eversion_t last_deep_scrub;
1894 utime_t last_scrub_stamp;
1895 utime_t last_deep_scrub_stamp;
1896 utime_t last_clean_scrub_stamp;
1897
1898 object_stat_collection_t stats;
1899
1900 int64_t log_size;
1901 int64_t ondisk_log_size; // >= active_log_size
1902
1903 vector<int32_t> up, acting;
1904 epoch_t mapping_epoch;
1905
1906 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
1907
1908 utime_t last_became_active;
1909 utime_t last_became_peered;
1910
1911 /// up, acting primaries
1912 int32_t up_primary;
1913 int32_t acting_primary;
1914
1915 bool stats_invalid:1;
1916 /// true if num_objects_dirty is not accurate (because it was not
1917 /// maintained starting from pool creation)
1918 bool dirty_stats_invalid:1;
1919 bool omap_stats_invalid:1;
1920 bool hitset_stats_invalid:1;
1921 bool hitset_bytes_stats_invalid:1;
1922 bool pin_stats_invalid:1;
1923
1924 pg_stat_t()
1925 : reported_seq(0),
1926 reported_epoch(0),
1927 state(0),
1928 created(0), last_epoch_clean(0),
1929 parent_split_bits(0),
1930 log_size(0), ondisk_log_size(0),
1931 mapping_epoch(0),
1932 up_primary(-1),
1933 acting_primary(-1),
1934 stats_invalid(false),
1935 dirty_stats_invalid(false),
1936 omap_stats_invalid(false),
1937 hitset_stats_invalid(false),
1938 hitset_bytes_stats_invalid(false),
1939 pin_stats_invalid(false)
1940 { }
1941
1942 epoch_t get_effective_last_epoch_clean() const {
1943 if (state & PG_STATE_CLEAN) {
1944 // we are clean as of this report, and should thus take the
1945 // reported epoch
1946 return reported_epoch;
1947 } else {
1948 return last_epoch_clean;
1949 }
1950 }
1951
1952 pair<epoch_t, version_t> get_version_pair() const {
1953 return make_pair(reported_epoch, reported_seq);
1954 }
1955
1956 void floor(int64_t f) {
1957 stats.floor(f);
1958 if (log_size < f)
1959 log_size = f;
1960 if (ondisk_log_size < f)
1961 ondisk_log_size = f;
1962 }
1963
1964 void add(const pg_stat_t& o) {
1965 stats.add(o.stats);
1966 log_size += o.log_size;
1967 ondisk_log_size += o.ondisk_log_size;
1968 }
1969 void sub(const pg_stat_t& o) {
1970 stats.sub(o.stats);
1971 log_size -= o.log_size;
1972 ondisk_log_size -= o.ondisk_log_size;
1973 }
1974
1975 bool is_acting_osd(int32_t osd, bool primary) const;
1976 void dump(Formatter *f) const;
1977 void dump_brief(Formatter *f) const;
1978 void encode(bufferlist &bl) const;
1979 void decode(bufferlist::iterator &bl);
1980 static void generate_test_instances(list<pg_stat_t*>& o);
1981};
1982WRITE_CLASS_ENCODER(pg_stat_t)
1983
1984bool operator==(const pg_stat_t& l, const pg_stat_t& r);
1985
1986/*
1987 * summation over an entire pool
1988 */
1989struct pool_stat_t {
1990 object_stat_collection_t stats;
1991 int64_t log_size;
1992 int64_t ondisk_log_size; // >= active_log_size
1993 int32_t up; ///< number of up replicas or shards
1994 int32_t acting; ///< number of acting replicas or shards
1995
1996 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
1997 { }
1998
1999 void floor(int64_t f) {
2000 stats.floor(f);
2001 if (log_size < f)
2002 log_size = f;
2003 if (ondisk_log_size < f)
2004 ondisk_log_size = f;
2005 if (up < f)
2006 up = f;
2007 if (acting < f)
2008 acting = f;
2009 }
2010
2011 void add(const pg_stat_t& o) {
2012 stats.add(o.stats);
2013 log_size += o.log_size;
2014 ondisk_log_size += o.ondisk_log_size;
2015 up += o.up.size();
2016 acting += o.acting.size();
2017 }
2018 void sub(const pg_stat_t& o) {
2019 stats.sub(o.stats);
2020 log_size -= o.log_size;
2021 ondisk_log_size -= o.ondisk_log_size;
2022 up -= o.up.size();
2023 acting -= o.acting.size();
2024 }
2025
2026 bool is_zero() const {
2027 return (stats.is_zero() &&
2028 log_size == 0 &&
2029 ondisk_log_size == 0 &&
2030 up == 0 &&
2031 acting == 0);
2032 }
2033
2034 void dump(Formatter *f) const;
2035 void encode(bufferlist &bl, uint64_t features) const;
2036 void decode(bufferlist::iterator &bl);
2037 static void generate_test_instances(list<pool_stat_t*>& o);
2038};
2039WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2040
2041
2042// -----------------------------------------
2043
2044/**
2045 * pg_hit_set_info_t - information about a single recorded HitSet
2046 *
2047 * Track basic metadata about a HitSet, like the nubmer of insertions
2048 * and the time range it covers.
2049 */
2050struct pg_hit_set_info_t {
2051 utime_t begin, end; ///< time interval
2052 eversion_t version; ///< version this HitSet object was written
2053 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2054
2055 friend bool operator==(const pg_hit_set_info_t& l,
2056 const pg_hit_set_info_t& r) {
2057 return
2058 l.begin == r.begin &&
2059 l.end == r.end &&
2060 l.version == r.version &&
2061 l.using_gmt == r.using_gmt;
2062 }
2063
2064 explicit pg_hit_set_info_t(bool using_gmt = true)
2065 : using_gmt(using_gmt) {}
2066
2067 void encode(bufferlist &bl) const;
2068 void decode(bufferlist::iterator &bl);
2069 void dump(Formatter *f) const;
2070 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2071};
2072WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2073
2074/**
2075 * pg_hit_set_history_t - information about a history of hitsets
2076 *
2077 * Include information about the currently accumulating hit set as well
2078 * as archived/historical ones.
2079 */
2080struct pg_hit_set_history_t {
2081 eversion_t current_last_update; ///< last version inserted into current set
2082 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2083
2084 friend bool operator==(const pg_hit_set_history_t& l,
2085 const pg_hit_set_history_t& r) {
2086 return
2087 l.current_last_update == r.current_last_update &&
2088 l.history == r.history;
2089 }
2090
2091 void encode(bufferlist &bl) const;
2092 void decode(bufferlist::iterator &bl);
2093 void dump(Formatter *f) const;
2094 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2095};
2096WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2097
2098
2099// -----------------------------------------
2100
2101/**
2102 * pg_history_t - information about recent pg peering/mapping history
2103 *
2104 * This is aggressively shared between OSDs to bound the amount of past
2105 * history they need to worry about.
2106 */
2107struct pg_history_t {
31f18b77
FG
2108 epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
2109 epoch_t epoch_pool_created; // epoch in which *pool* was created
2110 // (note: may be pg creation epoch for
2111 // pre-luminous clusters)
7c673cae
FG
2112 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2113 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2114 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2115 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
31f18b77 2116 epoch_t last_epoch_split; // as parent or child
7c673cae
FG
2117 epoch_t last_epoch_marked_full; // pool or cluster
2118
2119 /**
2120 * In the event of a map discontinuity, same_*_since may reflect the first
2121 * map the osd has seen in the new map sequence rather than the actual start
2122 * of the interval. This is ok since a discontinuity at epoch e means there
2123 * must have been a clean interval between e and now and that we cannot be
2124 * in the active set during the interval containing e.
2125 */
2126 epoch_t same_up_since; // same acting set since
2127 epoch_t same_interval_since; // same acting AND up set since
2128 epoch_t same_primary_since; // same primary at least back through this epoch.
2129
2130 eversion_t last_scrub;
2131 eversion_t last_deep_scrub;
2132 utime_t last_scrub_stamp;
2133 utime_t last_deep_scrub_stamp;
2134 utime_t last_clean_scrub_stamp;
2135
2136 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2137 return
2138 l.epoch_created == r.epoch_created &&
31f18b77 2139 l.epoch_pool_created == r.epoch_pool_created &&
7c673cae
FG
2140 l.last_epoch_started == r.last_epoch_started &&
2141 l.last_interval_started == r.last_interval_started &&
2142 l.last_epoch_clean == r.last_epoch_clean &&
2143 l.last_interval_clean == r.last_interval_clean &&
2144 l.last_epoch_split == r.last_epoch_split &&
2145 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2146 l.same_up_since == r.same_up_since &&
2147 l.same_interval_since == r.same_interval_since &&
2148 l.same_primary_since == r.same_primary_since &&
2149 l.last_scrub == r.last_scrub &&
2150 l.last_deep_scrub == r.last_deep_scrub &&
2151 l.last_scrub_stamp == r.last_scrub_stamp &&
2152 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2153 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2154 }
2155
2156 pg_history_t()
2157 : epoch_created(0),
31f18b77 2158 epoch_pool_created(0),
7c673cae
FG
2159 last_epoch_started(0),
2160 last_interval_started(0),
2161 last_epoch_clean(0),
2162 last_interval_clean(0),
2163 last_epoch_split(0),
2164 last_epoch_marked_full(0),
2165 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2166
2167 bool merge(const pg_history_t &other) {
2168 // Here, we only update the fields which cannot be calculated from the OSDmap.
2169 bool modified = false;
2170 if (epoch_created < other.epoch_created) {
2171 epoch_created = other.epoch_created;
2172 modified = true;
2173 }
31f18b77
FG
2174 if (epoch_pool_created < other.epoch_pool_created) {
2175 // FIXME: for jewel compat only; this should either be 0 or always the
2176 // same value across all pg instances.
2177 epoch_pool_created = other.epoch_pool_created;
2178 modified = true;
2179 }
7c673cae
FG
2180 if (last_epoch_started < other.last_epoch_started) {
2181 last_epoch_started = other.last_epoch_started;
2182 modified = true;
2183 }
2184 if (last_interval_started < other.last_interval_started) {
2185 last_interval_started = other.last_interval_started;
2186 modified = true;
2187 }
2188 if (last_epoch_clean < other.last_epoch_clean) {
2189 last_epoch_clean = other.last_epoch_clean;
2190 modified = true;
2191 }
2192 if (last_interval_clean < other.last_interval_clean) {
2193 last_interval_clean = other.last_interval_clean;
2194 modified = true;
2195 }
2196 if (last_epoch_split < other.last_epoch_split) {
2197 last_epoch_split = other.last_epoch_split;
2198 modified = true;
2199 }
2200 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2201 last_epoch_marked_full = other.last_epoch_marked_full;
2202 modified = true;
2203 }
2204 if (other.last_scrub > last_scrub) {
2205 last_scrub = other.last_scrub;
2206 modified = true;
2207 }
2208 if (other.last_scrub_stamp > last_scrub_stamp) {
2209 last_scrub_stamp = other.last_scrub_stamp;
2210 modified = true;
2211 }
2212 if (other.last_deep_scrub > last_deep_scrub) {
2213 last_deep_scrub = other.last_deep_scrub;
2214 modified = true;
2215 }
2216 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2217 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2218 modified = true;
2219 }
2220 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2221 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2222 modified = true;
2223 }
2224 return modified;
2225 }
2226
2227 void encode(bufferlist& bl) const;
2228 void decode(bufferlist::iterator& p);
2229 void dump(Formatter *f) const;
2230 static void generate_test_instances(list<pg_history_t*>& o);
2231};
2232WRITE_CLASS_ENCODER(pg_history_t)
2233
2234inline ostream& operator<<(ostream& out, const pg_history_t& h) {
31f18b77 2235 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
7c673cae
FG
2236 << " lis/c " << h.last_interval_started
2237 << "/" << h.last_interval_clean
2238 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2239 << "/" << h.last_epoch_marked_full
2240 << " " << h.same_up_since
2241 << "/" << h.same_interval_since
2242 << "/" << h.same_primary_since;
2243}
2244
2245
2246/**
2247 * pg_info_t - summary of PG statistics.
2248 *
2249 * some notes:
2250 * - last_complete implies we have all objects that existed as of that
2251 * stamp, OR a newer object, OR have already applied a later delete.
2252 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2253 * otherwise, we have no idea what the pg is supposed to contain.
2254 */
2255struct pg_info_t {
2256 spg_t pgid;
2257 eversion_t last_update; ///< last object version applied to store.
2258 eversion_t last_complete; ///< last version pg was complete through.
2259 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2260 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2261
2262 version_t last_user_version; ///< last user object version applied to store
2263
2264 eversion_t log_tail; ///< oldest log entry.
2265
2266 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2267 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2268
2269 interval_set<snapid_t> purged_snaps;
2270
2271 pg_stat_t stats;
2272
2273 pg_history_t history;
2274 pg_hit_set_history_t hit_set;
2275
2276 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2277 return
2278 l.pgid == r.pgid &&
2279 l.last_update == r.last_update &&
2280 l.last_complete == r.last_complete &&
2281 l.last_epoch_started == r.last_epoch_started &&
2282 l.last_interval_started == r.last_interval_started &&
2283 l.last_user_version == r.last_user_version &&
2284 l.log_tail == r.log_tail &&
2285 l.last_backfill == r.last_backfill &&
2286 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2287 l.purged_snaps == r.purged_snaps &&
2288 l.stats == r.stats &&
2289 l.history == r.history &&
2290 l.hit_set == r.hit_set;
2291 }
2292
2293 pg_info_t()
2294 : last_epoch_started(0),
2295 last_interval_started(0),
2296 last_user_version(0),
2297 last_backfill(hobject_t::get_max()),
2298 last_backfill_bitwise(false)
2299 { }
2300 // cppcheck-suppress noExplicitConstructor
2301 pg_info_t(spg_t p)
2302 : pgid(p),
2303 last_epoch_started(0),
2304 last_interval_started(0),
2305 last_user_version(0),
2306 last_backfill(hobject_t::get_max()),
2307 last_backfill_bitwise(false)
2308 { }
2309
2310 void set_last_backfill(hobject_t pos) {
2311 last_backfill = pos;
2312 last_backfill_bitwise = true;
2313 }
2314
2315 bool is_empty() const { return last_update.version == 0; }
2316 bool dne() const { return history.epoch_created == 0; }
2317
2318 bool is_incomplete() const { return !last_backfill.is_max(); }
2319
2320 void encode(bufferlist& bl) const;
2321 void decode(bufferlist::iterator& p);
2322 void dump(Formatter *f) const;
2323 bool overlaps_with(const pg_info_t &oinfo) const {
2324 return last_update > oinfo.log_tail ?
2325 oinfo.last_update >= log_tail :
2326 last_update >= oinfo.log_tail;
2327 }
2328 static void generate_test_instances(list<pg_info_t*>& o);
2329};
2330WRITE_CLASS_ENCODER(pg_info_t)
2331
2332inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2333{
2334 out << pgi.pgid << "(";
2335 if (pgi.dne())
2336 out << " DNE";
2337 if (pgi.is_empty())
2338 out << " empty";
2339 else {
2340 out << " v " << pgi.last_update;
2341 if (pgi.last_complete != pgi.last_update)
2342 out << " lc " << pgi.last_complete;
2343 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2344 }
2345 if (pgi.is_incomplete())
2346 out << " lb " << pgi.last_backfill
2347 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2348 //out << " c " << pgi.epoch_created;
2349 out << " local-lis/les=" << pgi.last_interval_started
2350 << "/" << pgi.last_epoch_started;
2351 out << " n=" << pgi.stats.stats.sum.num_objects;
2352 out << " " << pgi.history
2353 << ")";
2354 return out;
2355}
2356
2357/**
2358 * pg_fast_info_t - common pg_info_t fields
2359 *
2360 * These are the fields of pg_info_t (and children) that are updated for
2361 * most IO operations.
2362 *
2363 * ** WARNING **
2364 * Because we rely on these fields to be applied to the normal
2365 * info struct, adding a new field here that is not also new in info
2366 * means that we must set an incompat OSD feature bit!
2367 */
2368struct pg_fast_info_t {
2369 eversion_t last_update;
2370 eversion_t last_complete;
2371 version_t last_user_version;
2372 struct { // pg_stat_t stats
2373 eversion_t version;
2374 version_t reported_seq;
2375 utime_t last_fresh;
2376 utime_t last_active;
2377 utime_t last_peered;
2378 utime_t last_clean;
2379 utime_t last_unstale;
2380 utime_t last_undegraded;
2381 utime_t last_fullsized;
2382 int64_t log_size; // (also ondisk_log_size, which has the same value)
2383 struct { // object_stat_collection_t stats;
2384 struct { // objct_stat_sum_t sum
2385 int64_t num_bytes; // in bytes
2386 int64_t num_objects;
2387 int64_t num_object_copies;
2388 int64_t num_rd;
2389 int64_t num_rd_kb;
2390 int64_t num_wr;
2391 int64_t num_wr_kb;
2392 int64_t num_objects_dirty;
2393 } sum;
2394 } stats;
2395 } stats;
2396
2397 void populate_from(const pg_info_t& info) {
2398 last_update = info.last_update;
2399 last_complete = info.last_complete;
2400 last_user_version = info.last_user_version;
2401 stats.version = info.stats.version;
2402 stats.reported_seq = info.stats.reported_seq;
2403 stats.last_fresh = info.stats.last_fresh;
2404 stats.last_active = info.stats.last_active;
2405 stats.last_peered = info.stats.last_peered;
2406 stats.last_clean = info.stats.last_clean;
2407 stats.last_unstale = info.stats.last_unstale;
2408 stats.last_undegraded = info.stats.last_undegraded;
2409 stats.last_fullsized = info.stats.last_fullsized;
2410 stats.log_size = info.stats.log_size;
2411 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2412 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2413 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2414 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2415 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2416 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2417 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2418 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2419 }
2420
2421 bool try_apply_to(pg_info_t* info) {
2422 if (last_update <= info->last_update)
2423 return false;
2424 info->last_update = last_update;
2425 info->last_complete = last_complete;
2426 info->last_user_version = last_user_version;
2427 info->stats.version = stats.version;
2428 info->stats.reported_seq = stats.reported_seq;
2429 info->stats.last_fresh = stats.last_fresh;
2430 info->stats.last_active = stats.last_active;
2431 info->stats.last_peered = stats.last_peered;
2432 info->stats.last_clean = stats.last_clean;
2433 info->stats.last_unstale = stats.last_unstale;
2434 info->stats.last_undegraded = stats.last_undegraded;
2435 info->stats.last_fullsized = stats.last_fullsized;
2436 info->stats.log_size = stats.log_size;
2437 info->stats.ondisk_log_size = stats.log_size;
2438 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2439 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2440 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2441 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2442 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2443 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2444 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2445 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2446 return true;
2447 }
2448
2449 void encode(bufferlist& bl) const {
2450 ENCODE_START(1, 1, bl);
2451 ::encode(last_update, bl);
2452 ::encode(last_complete, bl);
2453 ::encode(last_user_version, bl);
2454 ::encode(stats.version, bl);
2455 ::encode(stats.reported_seq, bl);
2456 ::encode(stats.last_fresh, bl);
2457 ::encode(stats.last_active, bl);
2458 ::encode(stats.last_peered, bl);
2459 ::encode(stats.last_clean, bl);
2460 ::encode(stats.last_unstale, bl);
2461 ::encode(stats.last_undegraded, bl);
2462 ::encode(stats.last_fullsized, bl);
2463 ::encode(stats.log_size, bl);
2464 ::encode(stats.stats.sum.num_bytes, bl);
2465 ::encode(stats.stats.sum.num_objects, bl);
2466 ::encode(stats.stats.sum.num_object_copies, bl);
2467 ::encode(stats.stats.sum.num_rd, bl);
2468 ::encode(stats.stats.sum.num_rd_kb, bl);
2469 ::encode(stats.stats.sum.num_wr, bl);
2470 ::encode(stats.stats.sum.num_wr_kb, bl);
2471 ::encode(stats.stats.sum.num_objects_dirty, bl);
2472 ENCODE_FINISH(bl);
2473 }
2474 void decode(bufferlist::iterator& p) {
2475 DECODE_START(1, p);
2476 ::decode(last_update, p);
2477 ::decode(last_complete, p);
2478 ::decode(last_user_version, p);
2479 ::decode(stats.version, p);
2480 ::decode(stats.reported_seq, p);
2481 ::decode(stats.last_fresh, p);
2482 ::decode(stats.last_active, p);
2483 ::decode(stats.last_peered, p);
2484 ::decode(stats.last_clean, p);
2485 ::decode(stats.last_unstale, p);
2486 ::decode(stats.last_undegraded, p);
2487 ::decode(stats.last_fullsized, p);
2488 ::decode(stats.log_size, p);
2489 ::decode(stats.stats.sum.num_bytes, p);
2490 ::decode(stats.stats.sum.num_objects, p);
2491 ::decode(stats.stats.sum.num_object_copies, p);
2492 ::decode(stats.stats.sum.num_rd, p);
2493 ::decode(stats.stats.sum.num_rd_kb, p);
2494 ::decode(stats.stats.sum.num_wr, p);
2495 ::decode(stats.stats.sum.num_wr_kb, p);
2496 ::decode(stats.stats.sum.num_objects_dirty, p);
2497 DECODE_FINISH(p);
2498 }
2499};
2500WRITE_CLASS_ENCODER(pg_fast_info_t)
2501
2502
2503struct pg_notify_t {
2504 epoch_t query_epoch;
2505 epoch_t epoch_sent;
2506 pg_info_t info;
2507 shard_id_t to;
2508 shard_id_t from;
2509 pg_notify_t() :
2510 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2511 from(shard_id_t::NO_SHARD) {}
2512 pg_notify_t(
2513 shard_id_t to,
2514 shard_id_t from,
2515 epoch_t query_epoch,
2516 epoch_t epoch_sent,
2517 const pg_info_t &info)
2518 : query_epoch(query_epoch),
2519 epoch_sent(epoch_sent),
2520 info(info), to(to), from(from) {
2521 assert(from == info.pgid.shard);
2522 }
2523 void encode(bufferlist &bl) const;
2524 void decode(bufferlist::iterator &p);
2525 void dump(Formatter *f) const;
2526 static void generate_test_instances(list<pg_notify_t*> &o);
2527};
2528WRITE_CLASS_ENCODER(pg_notify_t)
2529ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2530
2531
2532class OSDMap;
2533/**
2534 * PastIntervals -- information needed to determine the PriorSet and
2535 * the might_have_unfound set
2536 */
2537class PastIntervals {
2538public:
2539 struct pg_interval_t {
2540 vector<int32_t> up, acting;
2541 epoch_t first, last;
2542 bool maybe_went_rw;
2543 int32_t primary;
2544 int32_t up_primary;
2545
2546 pg_interval_t()
2547 : first(0), last(0),
2548 maybe_went_rw(false),
2549 primary(-1),
2550 up_primary(-1)
2551 {}
2552
2553 pg_interval_t(
2554 vector<int32_t> &&up,
2555 vector<int32_t> &&acting,
2556 epoch_t first,
2557 epoch_t last,
2558 bool maybe_went_rw,
2559 int32_t primary,
2560 int32_t up_primary)
2561 : up(up), acting(acting), first(first), last(last),
2562 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
2563 {}
2564
2565 void encode(bufferlist& bl) const;
2566 void decode(bufferlist::iterator& bl);
2567 void dump(Formatter *f) const;
2568 static void generate_test_instances(list<pg_interval_t*>& o);
2569 };
2570
2571 PastIntervals() = default;
2572 PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
2573 update_type_from_map(ec_pool, osdmap);
2574 }
2575 PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
2576 update_type(ec_pool, compact);
2577 }
2578 PastIntervals(PastIntervals &&rhs) = default;
2579 PastIntervals &operator=(PastIntervals &&rhs) = default;
2580
2581 PastIntervals(const PastIntervals &rhs);
2582 PastIntervals &operator=(const PastIntervals &rhs);
2583
2584 class interval_rep {
2585 public:
2586 virtual size_t size() const = 0;
2587 virtual bool empty() const = 0;
2588 virtual void clear() = 0;
2589 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
2590 virtual set<pg_shard_t> get_all_participants(
2591 bool ec_pool) const = 0;
2592 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
2593 virtual unique_ptr<interval_rep> clone() const = 0;
2594 virtual ostream &print(ostream &out) const = 0;
2595 virtual void encode(bufferlist &bl) const = 0;
2596 virtual void decode(bufferlist::iterator &bl) = 0;
2597 virtual void dump(Formatter *f) const = 0;
2598 virtual bool is_classic() const = 0;
2599 virtual void iterate_mayberw_back_to(
2600 bool ec_pool,
2601 epoch_t les,
2602 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
2603
2604 virtual bool has_full_intervals() const { return false; }
2605 virtual void iterate_all_intervals(
2606 std::function<void(const pg_interval_t &)> &&f) const {
2607 assert(!has_full_intervals());
2608 assert(0 == "not valid for this implementation");
2609 }
2610
2611 virtual ~interval_rep() {}
2612 };
2613 friend class pi_simple_rep;
2614 friend class pi_compact_rep;
2615private:
2616
2617 unique_ptr<interval_rep> past_intervals;
2618
2619 PastIntervals(interval_rep *rep) : past_intervals(rep) {}
2620
2621public:
2622 void add_interval(bool ec_pool, const pg_interval_t &interval) {
2623 assert(past_intervals);
2624 return past_intervals->add_interval(ec_pool, interval);
2625 }
2626
2627 bool is_classic() const {
2628 assert(past_intervals);
2629 return past_intervals->is_classic();
2630 }
2631
2632 void encode(bufferlist &bl) const {
2633 ENCODE_START(1, 1, bl);
2634 if (past_intervals) {
2635 __u8 type = is_classic() ? 1 : 2;
2636 ::encode(type, bl);
2637 past_intervals->encode(bl);
2638 } else {
2639 ::encode((__u8)0, bl);
2640 }
2641 ENCODE_FINISH(bl);
2642 }
2643 void encode_classic(bufferlist &bl) const {
2644 if (past_intervals) {
2645 assert(past_intervals->is_classic());
2646 past_intervals->encode(bl);
2647 } else {
2648 // it's a map<>
2649 ::encode((uint32_t)0, bl);
2650 }
2651 }
2652
2653 void decode(bufferlist::iterator &bl);
2654 void decode_classic(bufferlist::iterator &bl);
2655
2656 void dump(Formatter *f) const {
2657 assert(past_intervals);
2658 past_intervals->dump(f);
2659 }
2660 static void generate_test_instances(list<PastIntervals *> & o);
2661
2662 /**
2663 * Determines whether there is an interval change
2664 */
2665 static bool is_new_interval(
2666 int old_acting_primary,
2667 int new_acting_primary,
2668 const vector<int> &old_acting,
2669 const vector<int> &new_acting,
2670 int old_up_primary,
2671 int new_up_primary,
2672 const vector<int> &old_up,
2673 const vector<int> &new_up,
2674 int old_size,
2675 int new_size,
2676 int old_min_size,
2677 int new_min_size,
2678 unsigned old_pg_num,
2679 unsigned new_pg_num,
2680 bool old_sort_bitwise,
2681 bool new_sort_bitwise,
2682 pg_t pgid
2683 );
2684
2685 /**
2686 * Determines whether there is an interval change
2687 */
2688 static bool is_new_interval(
2689 int old_acting_primary, ///< [in] primary as of lastmap
2690 int new_acting_primary, ///< [in] primary as of lastmap
2691 const vector<int> &old_acting, ///< [in] acting as of lastmap
2692 const vector<int> &new_acting, ///< [in] acting as of osdmap
2693 int old_up_primary, ///< [in] up primary of lastmap
2694 int new_up_primary, ///< [in] up primary of osdmap
2695 const vector<int> &old_up, ///< [in] up as of lastmap
2696 const vector<int> &new_up, ///< [in] up as of osdmap
2697 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2698 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2699 pg_t pgid ///< [in] pgid for pg
2700 );
2701
2702 /**
2703 * Integrates a new map into *past_intervals, returns true
2704 * if an interval was closed out.
2705 */
2706 static bool check_new_interval(
2707 int old_acting_primary, ///< [in] primary as of lastmap
2708 int new_acting_primary, ///< [in] primary as of osdmap
2709 const vector<int> &old_acting, ///< [in] acting as of lastmap
2710 const vector<int> &new_acting, ///< [in] acting as of osdmap
2711 int old_up_primary, ///< [in] up primary of lastmap
2712 int new_up_primary, ///< [in] up primary of osdmap
2713 const vector<int> &old_up, ///< [in] up as of lastmap
2714 const vector<int> &new_up, ///< [in] up as of osdmap
2715 epoch_t same_interval_since, ///< [in] as of osdmap
2716 epoch_t last_epoch_clean, ///< [in] current
2717 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2718 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2719 pg_t pgid, ///< [in] pgid for pg
2720 IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
2721 PastIntervals *past_intervals, ///< [out] intervals
2722 ostream *out = 0 ///< [out] debug ostream
2723 );
2724 friend ostream& operator<<(ostream& out, const PastIntervals &i);
2725
2726 template <typename F>
2727 void iterate_mayberw_back_to(
2728 bool ec_pool,
2729 epoch_t les,
2730 F &&f) const {
2731 assert(past_intervals);
2732 past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
2733 }
2734 void clear() {
2735 assert(past_intervals);
2736 past_intervals->clear();
2737 }
2738
2739 /**
2740 * Should return a value which gives an indication of the amount
2741 * of state contained
2742 */
2743 size_t size() const {
2744 assert(past_intervals);
2745 return past_intervals->size();
2746 }
2747
2748 bool empty() const {
2749 assert(past_intervals);
2750 return past_intervals->empty();
2751 }
2752
2753 void swap(PastIntervals &other) {
31f18b77
FG
2754 using std::swap;
2755 swap(other.past_intervals, past_intervals);
7c673cae
FG
2756 }
2757
2758 /**
2759 * Return all shards which have been in the acting set back to the
2760 * latest epoch to which we have trimmed except for pg_whoami
2761 */
2762 set<pg_shard_t> get_might_have_unfound(
2763 pg_shard_t pg_whoami,
2764 bool ec_pool) const {
2765 assert(past_intervals);
2766 auto ret = past_intervals->get_all_participants(ec_pool);
2767 ret.erase(pg_whoami);
2768 return ret;
2769 }
2770
2771 /**
2772 * Return all shards which we might want to talk to for peering
2773 */
2774 set<pg_shard_t> get_all_probe(
2775 bool ec_pool) const {
2776 assert(past_intervals);
2777 return past_intervals->get_all_participants(ec_pool);
2778 }
2779
2780 /* Return the set of epochs [start, end) represented by the
2781 * past_interval set.
2782 */
2783 pair<epoch_t, epoch_t> get_bounds() const {
2784 assert(past_intervals);
2785 return past_intervals->get_bounds();
2786 }
2787
2788 enum osd_state_t {
2789 UP,
2790 DOWN,
2791 DNE,
2792 LOST
2793 };
2794 struct PriorSet {
2795 bool ec_pool = false;
2796 set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
2797 set<int> down; /// down osds that would normally be in @a probe and might be interesting.
2798 map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
2799
2800 bool pg_down = false; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
2801 unique_ptr<IsPGRecoverablePredicate> pcontdec;
2802
2803 PriorSet() = default;
2804 PriorSet(PriorSet &&) = default;
2805 PriorSet &operator=(PriorSet &&) = default;
2806
2807 PriorSet &operator=(const PriorSet &) = delete;
2808 PriorSet(const PriorSet &) = delete;
2809
2810 bool operator==(const PriorSet &rhs) const {
2811 return (ec_pool == rhs.ec_pool) &&
2812 (probe == rhs.probe) &&
2813 (down == rhs.down) &&
2814 (blocked_by == rhs.blocked_by) &&
2815 (pg_down == rhs.pg_down);
2816 }
2817
2818 bool affected_by_map(
2819 const OSDMap &osdmap,
2820 const DoutPrefixProvider *dpp) const;
2821
2822 // For verifying tests
2823 PriorSet(
2824 bool ec_pool,
2825 set<pg_shard_t> probe,
2826 set<int> down,
2827 map<int, epoch_t> blocked_by,
2828 bool pg_down,
2829 IsPGRecoverablePredicate *pcontdec)
2830 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
2831 pg_down(pg_down), pcontdec(pcontdec) {}
2832
2833 private:
2834 template <typename F>
2835 PriorSet(
2836 const PastIntervals &past_intervals,
2837 bool ec_pool,
2838 epoch_t last_epoch_started,
2839 IsPGRecoverablePredicate *c,
2840 F f,
2841 const vector<int> &up,
2842 const vector<int> &acting,
2843 const DoutPrefixProvider *dpp);
2844
2845 friend class PastIntervals;
2846 };
2847
2848 void update_type(bool ec_pool, bool compact);
2849 void update_type_from_map(bool ec_pool, const OSDMap &osdmap);
2850
2851 template <typename... Args>
2852 PriorSet get_prior_set(Args&&... args) const {
2853 return PriorSet(*this, std::forward<Args>(args)...);
2854 }
2855};
2856WRITE_CLASS_ENCODER(PastIntervals)
2857
2858ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
2859ostream& operator<<(ostream& out, const PastIntervals &i);
2860ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
2861
2862template <typename F>
2863PastIntervals::PriorSet::PriorSet(
2864 const PastIntervals &past_intervals,
2865 bool ec_pool,
2866 epoch_t last_epoch_started,
2867 IsPGRecoverablePredicate *c,
2868 F f,
2869 const vector<int> &up,
2870 const vector<int> &acting,
2871 const DoutPrefixProvider *dpp)
2872 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
2873{
2874 /*
2875 * We have to be careful to gracefully deal with situations like
2876 * so. Say we have a power outage or something that takes out both
2877 * OSDs, but the monitor doesn't mark them down in the same epoch.
2878 * The history may look like
2879 *
2880 * 1: A B
2881 * 2: B
2882 * 3: let's say B dies for good, too (say, from the power spike)
2883 * 4: A
2884 *
2885 * which makes it look like B may have applied updates to the PG
2886 * that we need in order to proceed. This sucks...
2887 *
2888 * To minimize the risk of this happening, we CANNOT go active if
2889 * _any_ OSDs in the prior set are down until we send an MOSDAlive
2890 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
2891 * Then, we have something like
2892 *
2893 * 1: A B
2894 * 2: B up_thru[B]=0
2895 * 3:
2896 * 4: A
2897 *
2898 * -> we can ignore B, bc it couldn't have gone active (alive_thru
2899 * still 0).
2900 *
2901 * or,
2902 *
2903 * 1: A B
2904 * 2: B up_thru[B]=0
2905 * 3: B up_thru[B]=2
2906 * 4:
2907 * 5: A
2908 *
2909 * -> we must wait for B, bc it was alive through 2, and could have
2910 * written to the pg.
2911 *
2912 * If B is really dead, then an administrator will need to manually
2913 * intervene by marking the OSD as "lost."
2914 */
2915
2916 // Include current acting and up nodes... not because they may
2917 // contain old data (this interval hasn't gone active, obviously),
2918 // but because we want their pg_info to inform choose_acting(), and
2919 // so that we know what they do/do not have explicitly before
2920 // sending them any new info/logs/whatever.
2921 for (unsigned i = 0; i < acting.size(); i++) {
2922 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2923 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
2924 }
2925 // It may be possible to exclude the up nodes, but let's keep them in
2926 // there for now.
2927 for (unsigned i = 0; i < up.size(); i++) {
2928 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2929 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
2930 }
2931
2932 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
2933 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
2934 for (auto &&i: all_probe) {
2935 switch (f(0, i.osd, nullptr)) {
2936 case UP: {
2937 probe.insert(i);
2938 break;
2939 }
2940 case DNE:
2941 case LOST:
2942 case DOWN: {
2943 down.insert(i.osd);
2944 break;
2945 }
2946 }
2947 }
2948
2949 past_intervals.iterate_mayberw_back_to(
2950 ec_pool,
2951 last_epoch_started,
2952 [&](epoch_t start, const set<pg_shard_t> &acting) {
2953 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
2954 << ", acting: " << acting << dendl;
2955
2956 // look at candidate osds during this interval. each falls into
2957 // one of three categories: up, down (but potentially
2958 // interesting), or lost (down, but we won't wait for it).
2959 set<pg_shard_t> up_now;
2960 map<int, epoch_t> candidate_blocked_by;
2961 // any candidates down now (that might have useful data)
2962 bool any_down_now = false;
2963
2964 // consider ACTING osds
2965 for (auto &&so: acting) {
2966 epoch_t lost_at = 0;
2967 switch (f(start, so.osd, &lost_at)) {
2968 case UP: {
2969 // include past acting osds if they are up.
2970 up_now.insert(so);
2971 break;
2972 }
2973 case DNE: {
2974 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
2975 << " no longer exists" << dendl;
2976 break;
2977 }
2978 case LOST: {
2979 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
2980 << " is down, but lost_at " << lost_at << dendl;
2981 up_now.insert(so);
2982 break;
2983 }
2984 case DOWN: {
2985 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
2986 << " is down" << dendl;
2987 candidate_blocked_by[so.osd] = lost_at;
2988 any_down_now = true;
2989 break;
2990 }
2991 }
2992 }
2993
2994 // if not enough osds survived this interval, and we may have gone rw,
2995 // then we need to wait for one of those osds to recover to
2996 // ensure that we haven't lost any information.
2997 if (!(*pcontdec)(up_now) && any_down_now) {
2998 // fixme: how do we identify a "clean" shutdown anyway?
2999 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3000 << " insufficient up; including down osds" << dendl;
3001 assert(!candidate_blocked_by.empty());
3002 pg_down = true;
3003 blocked_by.insert(
3004 candidate_blocked_by.begin(),
3005 candidate_blocked_by.end());
3006 }
3007 });
3008
3009 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3010 << " down " << down
3011 << " blocked_by " << blocked_by
3012 << (pg_down ? " pg_down":"")
3013 << dendl;
3014}
3015
3016/**
3017 * pg_query_t - used to ask a peer for information about a pg.
3018 *
3019 * note: if version=0, type=LOG, then we just provide our full log.
3020 */
3021struct pg_query_t {
3022 enum {
3023 INFO = 0,
3024 LOG = 1,
3025 MISSING = 4,
3026 FULLLOG = 5,
3027 };
3028 const char *get_type_name() const {
3029 switch (type) {
3030 case INFO: return "info";
3031 case LOG: return "log";
3032 case MISSING: return "missing";
3033 case FULLLOG: return "fulllog";
3034 default: return "???";
3035 }
3036 }
3037
3038 __s32 type;
3039 eversion_t since;
3040 pg_history_t history;
3041 epoch_t epoch_sent;
3042 shard_id_t to;
3043 shard_id_t from;
3044
3045 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3046 from(shard_id_t::NO_SHARD) {}
3047 pg_query_t(
3048 int t,
3049 shard_id_t to,
3050 shard_id_t from,
3051 const pg_history_t& h,
3052 epoch_t epoch_sent)
3053 : type(t),
3054 history(h),
3055 epoch_sent(epoch_sent),
3056 to(to), from(from) {
3057 assert(t != LOG);
3058 }
3059 pg_query_t(
3060 int t,
3061 shard_id_t to,
3062 shard_id_t from,
3063 eversion_t s,
3064 const pg_history_t& h,
3065 epoch_t epoch_sent)
3066 : type(t), since(s), history(h),
3067 epoch_sent(epoch_sent), to(to), from(from) {
3068 assert(t == LOG);
3069 }
3070
3071 void encode(bufferlist &bl, uint64_t features) const;
3072 void decode(bufferlist::iterator &bl);
3073
3074 void dump(Formatter *f) const;
3075 static void generate_test_instances(list<pg_query_t*>& o);
3076};
3077WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3078
3079inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3080 out << "query(" << q.get_type_name() << " " << q.since;
3081 if (q.type == pg_query_t::LOG)
3082 out << " " << q.history;
3083 out << ")";
3084 return out;
3085}
3086
3087class PGBackend;
/**
 * ObjectModDesc - compact encoded description of how to roll a single
 * object modification back.  Rollback ops are appended to bl in the
 * order they are recorded and later replayed in that order by visit().
 */
class ObjectModDesc {
  bool can_local_rollback;       // false once mark_unrollbackable() is called
  bool rollback_info_completed;  // true once a terminal op (create/delete) is recorded

  // version required to decode, reflected in encode/decode version
  __u8 max_required_version = 1;
public:
  /**
   * Visitor - callback interface for visit(); one method per ModID.
   * Default implementations are no-ops, so subclasses override only
   * the ops they care about.
   */
  class Visitor {
  public:
    virtual void append(uint64_t old_offset) {}
    virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
    virtual void rmobject(version_t old_version) {}
    /**
     * Used to support the unfound_lost_delete log event: if the stashed
     * version exists, we unstash it, otherwise, we do nothing. This way
     * each replica rolls back to whatever state it had prior to the attempt
     * at mark unfound lost delete
     */
    virtual void try_rmobject(version_t old_version) {
      rmobject(old_version);
    }
    virtual void create() {}
    virtual void update_snaps(const set<snapid_t> &old_snaps) {}
    virtual void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) {}
    virtual ~Visitor() {}
  };
  /// Replay the recorded ops against visitor, in recorded order.
  void visit(Visitor *visitor) const;
  // encoded rollback ops; mutable so trim_bl() can rebuild it on a const object
  mutable bufferlist bl;
  /// Tag identifying each op recorded in bl.
  enum ModID {
    APPEND = 1,
    SETATTRS = 2,
    DELETE = 3,
    CREATE = 4,
    UPDATE_SNAPS = 5,
    TRY_DELETE = 6,
    ROLLBACK_EXTENTS = 7
  };

  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
    // account this buffer in the osd_pglog mempool
    bl.reassign_to_mempool(mempool::mempool_osd_pglog);
  }

  /// Replace our state with other's; other's bl is drained (claimed).
  void claim(ObjectModDesc &other) {
    bl.clear();
    bl.claim(other.bl);
    can_local_rollback = other.can_local_rollback;
    rollback_info_completed = other.rollback_info_completed;
  }
  /**
   * Append other's recorded ops after ours.  No-op if we are already
   * complete or unrollbackable; becomes unrollbackable if other is.
   */
  void claim_append(ObjectModDesc &other) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    if (!other.can_local_rollback) {
      mark_unrollbackable();
      return;
    }
    bl.claim_append(other.bl);
    rollback_info_completed = other.rollback_info_completed;
  }
  /// Exchange complete state with other.
  void swap(ObjectModDesc &other) {
    bl.swap(other.bl);

    using std::swap;
    swap(other.can_local_rollback, can_local_rollback);
    swap(other.rollback_info_completed, rollback_info_completed);
    swap(other.max_required_version, max_required_version);
  }
  /// Append a one-byte ModID tag to bl.
  void append_id(ModID id) {
    uint8_t _id(id);
    ::encode(_id, bl);
  }
  /// Record the pre-append object size so an append can be undone.
  void append(uint64_t old_size) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(APPEND);
    ::encode(old_size, bl);
    ENCODE_FINISH(bl);
  }
  /// Record the attrs' previous values so a setattrs can be undone.
  void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(SETATTRS);
    ::encode(old_attrs, bl);
    ENCODE_FINISH(bl);
  }
  /**
   * Record a delete.  Terminal: rollback_info_completed is set, so no
   * further ops will be recorded.  Returns false if rollback info could
   * not be recorded.
   */
  bool rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// Like rmobject(), but replayed as a best-effort unstash (see Visitor).
  bool try_rmobject(version_t deletion_version) {
    if (!can_local_rollback || rollback_info_completed)
      return false;
    ENCODE_START(1, 1, bl);
    append_id(TRY_DELETE);
    ::encode(deletion_version, bl);
    ENCODE_FINISH(bl);
    rollback_info_completed = true;
    return true;
  }
  /// Record a create.  Terminal: no prior state is needed for rollback.
  void create() {
    if (!can_local_rollback || rollback_info_completed)
      return;
    rollback_info_completed = true;
    ENCODE_START(1, 1, bl);
    append_id(CREATE);
    ENCODE_FINISH(bl);
  }
  /// Record the prior snap set so a snap update can be undone.
  void update_snaps(const set<snapid_t> &old_snaps) {
    if (!can_local_rollback || rollback_info_completed)
      return;
    ENCODE_START(1, 1, bl);
    append_id(UPDATE_SNAPS);
    ::encode(old_snaps, bl);
    ENCODE_FINISH(bl);
  }
  /**
   * Record rollback extents for generation gen.  Unlike the ops above,
   * the caller must ensure this is still rollbackable (asserts).  Bumps
   * the required decode version to 2 (kraken).
   */
  void rollback_extents(
    version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
    assert(can_local_rollback);
    assert(!rollback_info_completed);
    if (max_required_version < 2)
      max_required_version = 2;
    ENCODE_START(2, 2, bl);
    append_id(ROLLBACK_EXTENTS);
    ::encode(gen, bl);
    ::encode(extents, bl);
    ENCODE_FINISH(bl);
  }

  // cannot be rolled back
  void mark_unrollbackable() {
    can_local_rollback = false;
    bl.clear();
  }
  bool can_rollback() const {
    return can_local_rollback;
  }
  /// True if still rollbackable and no ops have been recorded.
  bool empty() const {
    return can_local_rollback && (bl.length() == 0);
  }

  /// True if any recorded op needs the v2 (kraken) encoding to decode.
  bool requires_kraken() const {
    return max_required_version >= 2;
  }

  /**
   * Create fresh copy of bl bytes to avoid keeping large buffers around
   * in the case that bl contains ptrs which point into a much larger
   * message buffer
   */
  void trim_bl() const {
    if (bl.length() > 0)
      bl.rebuild();
  }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ObjectModDesc*>& o);
};
WRITE_CLASS_ENCODER(ObjectModDesc)
3254
3255
3256/**
3257 * pg_log_entry_t - single entry/event in pg log
3258 *
3259 */
3260struct pg_log_entry_t {
3261 enum {
3262 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3263 CLONE = 2, // cloned object from head
3264 DELETE = 3, // deleted object
3265 BACKLOG = 4, // event invented by generate_backlog [deprecated]
3266 LOST_REVERT = 5, // lost new version, revert to an older version.
3267 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3268 LOST_MARK = 7, // lost new version, now EIO
3269 PROMOTE = 8, // promoted object from another tier
3270 CLEAN = 9, // mark an object clean
3271 ERROR = 10, // write that returned an error
3272 };
3273 static const char *get_op_name(int op) {
3274 switch (op) {
3275 case MODIFY:
3276 return "modify";
3277 case PROMOTE:
3278 return "promote";
3279 case CLONE:
3280 return "clone";
3281 case DELETE:
3282 return "delete";
3283 case BACKLOG:
3284 return "backlog";
3285 case LOST_REVERT:
3286 return "l_revert";
3287 case LOST_DELETE:
3288 return "l_delete";
3289 case LOST_MARK:
3290 return "l_mark";
3291 case CLEAN:
3292 return "clean";
3293 case ERROR:
3294 return "error";
3295 default:
3296 return "unknown";
3297 }
3298 }
3299 const char *get_op_name() const {
3300 return get_op_name(op);
3301 }
3302
3303 // describes state for a locally-rollbackable entry
3304 ObjectModDesc mod_desc;
3305 bufferlist snaps; // only for clone entries
3306 hobject_t soid;
3307 osd_reqid_t reqid; // caller+tid to uniquely identify request
31f18b77 3308 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
7c673cae
FG
3309 eversion_t version, prior_version, reverting_to;
3310 version_t user_version; // the user version for this entry
3311 utime_t mtime; // this is the _user_ mtime, mind you
3312 int32_t return_code; // only stored for ERRORs for dup detection
3313
3314 __s32 op;
3315 bool invalid_hash; // only when decoding sobject_t based entries
3316 bool invalid_pool; // only when decoding pool-less hobject based entries
3317
3318 pg_log_entry_t()
3319 : user_version(0), return_code(0), op(0),
31f18b77
FG
3320 invalid_hash(false), invalid_pool(false) {
3321 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3322 }
7c673cae
FG
3323 pg_log_entry_t(int _op, const hobject_t& _soid,
3324 const eversion_t& v, const eversion_t& pv,
3325 version_t uv,
3326 const osd_reqid_t& rid, const utime_t& mt,
3327 int return_code)
3328 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3329 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
3330 invalid_hash(false), invalid_pool(false) {
3331 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3332 }
7c673cae
FG
3333
3334 bool is_clone() const { return op == CLONE; }
3335 bool is_modify() const { return op == MODIFY; }
3336 bool is_promote() const { return op == PROMOTE; }
3337 bool is_clean() const { return op == CLEAN; }
3338 bool is_backlog() const { return op == BACKLOG; }
3339 bool is_lost_revert() const { return op == LOST_REVERT; }
3340 bool is_lost_delete() const { return op == LOST_DELETE; }
3341 bool is_lost_mark() const { return op == LOST_MARK; }
3342 bool is_error() const { return op == ERROR; }
3343
3344 bool is_update() const {
3345 return
3346 is_clone() || is_modify() || is_promote() || is_clean() ||
3347 is_backlog() || is_lost_revert() || is_lost_mark();
3348 }
3349 bool is_delete() const {
3350 return op == DELETE || op == LOST_DELETE;
3351 }
3352
3353 bool can_rollback() const {
3354 return mod_desc.can_rollback();
3355 }
3356
3357 void mark_unrollbackable() {
3358 mod_desc.mark_unrollbackable();
3359 }
3360
3361 bool requires_kraken() const {
3362 return mod_desc.requires_kraken();
3363 }
3364
3365 // Errors are only used for dup detection, whereas
3366 // the index by objects is used by recovery, copy_get,
3367 // and other facilities that don't expect or need to
3368 // be aware of error entries.
3369 bool object_is_indexed() const {
3370 return !is_error();
3371 }
3372
3373 bool reqid_is_indexed() const {
3374 return reqid != osd_reqid_t() &&
3375 (op == MODIFY || op == DELETE || op == ERROR);
3376 }
3377
3378 string get_key_name() const;
3379 void encode_with_checksum(bufferlist& bl) const;
3380 void decode_with_checksum(bufferlist::iterator& p);
3381
3382 void encode(bufferlist &bl) const;
3383 void decode(bufferlist::iterator &bl);
3384 void dump(Formatter *f) const;
3385 static void generate_test_instances(list<pg_log_entry_t*>& o);
3386
3387};
3388WRITE_CLASS_ENCODER(pg_log_entry_t)
3389
3390ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3391
3392
3393
3394/**
3395 * pg_log_t - incremental log of recent pg changes.
3396 *
3397 * serves as a recovery queue for recent changes.
3398 */
struct pg_log_t {
  /*
   * head - newest entry (update|delete)
   * tail - entry previous to oldest (update|delete) for which we have
   *        complete negative information.
   * i.e. we can infer pg contents for any store whose last_update >= tail.
   */
  eversion_t head;    // newest entry
  eversion_t tail;    // version prior to oldest

protected:
  // We can rollback rollback-able entries > can_rollback_to
  eversion_t can_rollback_to;

  // always <= can_rollback_to, indicates how far stashed rollback
  // data can be found
  eversion_t rollback_info_trimmed_to;

public:
  mempool::osd_pglog::list<pg_log_entry_t> log;  // the actual log.

  pg_log_t() = default;
  /// Construct from an rvalue entry list (entries are moved in).
  pg_log_t(const eversion_t &last_update,
	   const eversion_t &log_tail,
	   const eversion_t &can_rollback_to,
	   const eversion_t &rollback_info_trimmed_to,
	   mempool::osd_pglog::list<pg_log_entry_t> &&entries)
    : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
      rollback_info_trimmed_to(rollback_info_trimmed_to),
      log(std::move(entries)) {}
  /// Construct from an lvalue entry list (entries are copied in).
  pg_log_t(const eversion_t &last_update,
	   const eversion_t &log_tail,
	   const eversion_t &can_rollback_to,
	   const eversion_t &rollback_info_trimmed_to,
	   const std::list<pg_log_entry_t> &entries)
    : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
      rollback_info_trimmed_to(rollback_info_trimmed_to) {
    for (auto &&entry: entries) {
      log.push_back(entry);
    }
  }

  /// Reset to an empty, null log.
  void clear() {
    eversion_t z;
    rollback_info_trimmed_to = can_rollback_to = head = tail = z;
    log.clear();
  }

  eversion_t get_rollback_info_trimmed_to() const {
    return rollback_info_trimmed_to;
  }
  eversion_t get_can_rollback_to() const {
    return can_rollback_to;
  }


  /**
   * Split for pg split: entries whose object hash maps (under split_bits)
   * to child_pgid's seed move into the returned child log; the rest stay
   * in this log.  Both logs keep the same head/tail and rollback bounds.
   */
  pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
    mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
    oldlog.swap(log);

    eversion_t old_tail;  // NOTE(review): unused
    unsigned mask = ~((~0)<<split_bits);
    for (auto i = oldlog.begin();
	 i != oldlog.end();
	 ) {
      if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
	childlog.push_back(*i);
      } else {
	log.push_back(*i);
      }
      oldlog.erase(i++);
    }

    return pg_log_t(
      head,
      tail,
      can_rollback_to,
      rollback_info_trimmed_to,
      std::move(childlog));
  }

  /**
   * Move head back to newhead, returning the divergent entries that
   * were dropped (possibly the entire log).  can_rollback_to and
   * rollback_info_trimmed_to are clamped to the new head.
   */
  mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
    assert(newhead >= tail);

    mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
    mempool::osd_pglog::list<pg_log_entry_t> divergent;
    while (true) {
      if (p == log.begin()) {
	// yikes, the whole thing is divergent!
	using std::swap;
	swap(divergent, log);
	break;
      }
      --p;
      if (p->version.version <= newhead.version) {
	/*
	 * look at eversion.version here. we want to avoid a situation like:
	 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
	 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
	 * lower_bound = 100'9
	 * i.e., same request, different version. If the eversion.version is > the
	 * lower_bound, it is divergent.
	 */
	++p;
	divergent.splice(divergent.begin(), log, p, log.end());
	break;
      }
      assert(p->version > newhead);
    }
    head = newhead;

    if (can_rollback_to > newhead)
      can_rollback_to = newhead;

    if (rollback_info_trimmed_to > newhead)
      rollback_info_trimmed_to = newhead;

    return divergent;
  }

  bool empty() const {
    return log.empty();
  }

  /// True if the log has never had a head set.
  bool null() const {
    return head.version == 0 && head.epoch == 0;
  }

  /// Approximate entry count (version distance between head and tail).
  size_t approx_size() const {
    return head.version - tail.version;
  }

  static void filter_log(spg_t import_pgid, const OSDMap &curmap,
			 const string &hit_set_namespace, const pg_log_t &in,
			 pg_log_t &out, pg_log_t &reject);

  /**
   * copy entries from the tail of another pg_log_t
   *
   * @param other pg_log_t to copy from
   * @param from copy entries after this version
   */
  void copy_after(const pg_log_t &other, eversion_t from);

  /**
   * copy a range of entries from another pg_log_t
   *
   * @param other pg_log_t to copy from
   * @param from copy entries after this version
   * @param to up to and including this version
   */
  void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);

  /**
   * copy up to N entries
   *
   * @param other source log
   * @param max max number of entries to copy
   */
  void copy_up_to(const pg_log_t &other, int max);

  ostream& print(ostream& out) const;

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl, int64_t pool = -1);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_log_t*>& o);
};
WRITE_CLASS_ENCODER(pg_log_t)
3568
3569inline ostream& operator<<(ostream& out, const pg_log_t& log)
3570{
3571 out << "log((" << log.tail << "," << log.head << "], crt="
3572 << log.get_can_rollback_to() << ")";
3573 return out;
3574}
3575
3576
3577/**
3578 * pg_missing_t - summary of missing objects.
3579 *
3580 * kept in memory, as a supplement to pg_log_t
3581 * also used to pass missing info in messages.
3582 */
3583struct pg_missing_item {
3584 eversion_t need, have;
3585 pg_missing_item() {}
3586 explicit pg_missing_item(eversion_t n) : need(n) {} // have no old version
3587 pg_missing_item(eversion_t n, eversion_t h) : need(n), have(h) {}
3588
3589 void encode(bufferlist& bl) const {
3590 ::encode(need, bl);
3591 ::encode(have, bl);
3592 }
3593 void decode(bufferlist::iterator& bl) {
3594 ::decode(need, bl);
3595 ::decode(have, bl);
3596 }
3597 void dump(Formatter *f) const {
3598 f->dump_stream("need") << need;
3599 f->dump_stream("have") << have;
3600 }
3601 static void generate_test_instances(list<pg_missing_item*>& o) {
3602 o.push_back(new pg_missing_item);
3603 o.push_back(new pg_missing_item);
3604 o.back()->need = eversion_t(1, 2);
3605 o.back()->have = eversion_t(1, 1);
3606 }
3607 bool operator==(const pg_missing_item &rhs) const {
3608 return need == rhs.need && have == rhs.have;
3609 }
3610 bool operator!=(const pg_missing_item &rhs) const {
3611 return !(*this == rhs);
3612 }
3613};
3614WRITE_CLASS_ENCODER(pg_missing_item)
3615ostream& operator<<(ostream& out, const pg_missing_item &item);
3616
/**
 * pg_missing_const_i - read-only interface implemented by the
 * pg_missing_set instantiations below.
 */
class pg_missing_const_i {
public:
  /// oid -> (need, have) for every missing object
  virtual const map<hobject_t, pg_missing_item> &
    get_items() const = 0;
  /// reverse index: need-version -> oid
  virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
  virtual unsigned int num_missing() const = 0;
  virtual bool have_missing() const = 0;
  /// is oid missing?  optionally copy its item into *out
  virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
  /// is oid missing with need <= v?
  virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
  /// version we do have for oid (eversion_t() if not missing)
  virtual eversion_t have_old(const hobject_t& oid) const = 0;
  virtual ~pg_missing_const_i() {}
};
3629
3630
/**
 * ChangeTracker - records which objects have been modified.  This
 * primary template (Track == false) is the disabled variant: every
 * operation is a no-op and is_clean() is always true, so tracking
 * costs nothing.  The Track == true specialization below actually
 * remembers the objects.
 */
template <bool Track>
class ChangeTracker {
public:
  /// Record that obj was modified (no-op here).
  void changed(const hobject_t &obj) {}
  /// Invoke f on each recorded object (nothing is recorded here).
  template <typename F>
  void get_changed(F &&f) const {}
  /// Drop all recorded changes (no-op here).
  void flush() {}
  /// Always clean when tracking is disabled.
  bool is_clean() const {
    return true;
  }
};
3642template <>
3643class ChangeTracker<true> {
3644 set<hobject_t> _changed;
3645public:
3646 void changed(const hobject_t &obj) {
3647 _changed.insert(obj);
3648 }
3649 template <typename F>
3650 void get_changed(F &&f) const {
3651 for (auto const &i: _changed) {
3652 f(i);
3653 }
3654 }
3655 void flush() {
3656 _changed.clear();
3657 }
3658 bool is_clean() const {
3659 return _changed.empty();
3660 }
3661};
3662
/**
 * pg_missing_set - objects this shard is missing, with the version we
 * need and the newest version we already have for each.  A reverse
 * index (rmissing: need-version -> oid) is kept in sync with the
 * primary map.  When TrackChanges is true, every mutated oid is also
 * recorded in tracker so callers can observe deltas (get_changed()).
 */
template <bool TrackChanges>
class pg_missing_set : public pg_missing_const_i {
  using item = pg_missing_item;
  map<hobject_t, item> missing;  // oid -> (need v, have v)
  map<version_t, hobject_t> rmissing;  // v -> oid
  ChangeTracker<TrackChanges> tracker;

public:
  pg_missing_set() = default;

  /// Copy-construct from any pg_missing_const_i-like source.
  template <typename missing_type>
  pg_missing_set(const missing_type &m) {
    missing = m.get_items();
    rmissing = m.get_rmissing();
    for (auto &&i: missing)
      tracker.changed(i.first);
  }

  const map<hobject_t, item> &get_items() const override {
    return missing;
  }
  const map<version_t, hobject_t> &get_rmissing() const override {
    return rmissing;
  }
  unsigned int num_missing() const override {
    return missing.size();
  }
  bool have_missing() const override {
    return !missing.empty();
  }
  /// Is oid missing?  If so and out != nullptr, copy its item into *out.
  bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
    auto iter = missing.find(oid);
    if (iter == missing.end())
      return false;
    if (out)
      *out = iter->second;
    return true;
  }
  /// Is oid missing with need <= v?
  bool is_missing(const hobject_t& oid, eversion_t v) const override {
    map<hobject_t, item>::const_iterator m =
      missing.find(oid);
    if (m == missing.end())
      return false;
    const item &item(m->second);
    if (item.need > v)
      return false;
    return true;
  }
  /// Version we do have for oid (eversion_t() if oid isn't missing).
  eversion_t have_old(const hobject_t& oid) const override {
    map<hobject_t, item>::const_iterator m =
      missing.find(oid);
    if (m == missing.end())
      return eversion_t();
    const item &item(m->second);
    return item.have;
  }

  /// Steal o's contents; only valid when change tracking is disabled.
  void claim(pg_missing_set& o) {
    static_assert(!TrackChanges, "Can't use claim with TrackChanges");
    missing.swap(o.missing);
    rmissing.swap(o.rmissing);
  }

  /*
   * this needs to be called in log order as we extend the log. it
   * assumes missing is accurate up through the previous log entry.
   */
  void add_next_event(const pg_log_entry_t& e) {
    if (e.is_update()) {
      map<hobject_t, item>::iterator missing_it;
      missing_it = missing.find(e.soid);
      bool is_missing_divergent_item = missing_it != missing.end();
      if (e.prior_version == eversion_t() || e.is_clone()) {
	// new object.
	if (is_missing_divergent_item) {  // use iterator
	  rmissing.erase((missing_it->second).need.version);
	  missing_it->second = item(e.version, eversion_t());  // .have = nil
	} else  // create new element in missing map
	  missing[e.soid] = item(e.version, eversion_t());     // .have = nil
      } else if (is_missing_divergent_item) {
	// already missing (prior).
	rmissing.erase((missing_it->second).need.version);
	(missing_it->second).need = e.version;  // leave .have unchanged.
      } else if (e.is_backlog()) {
	// May not have prior version
	assert(0 == "these don't exist anymore");
      } else {
	// not missing, we must have prior_version (if any)
	assert(!is_missing_divergent_item);
	missing[e.soid] = item(e.version, e.prior_version);
      }
      rmissing[e.version.version] = e.soid;
    } else if (e.is_delete()) {
      rm(e.soid, e.version);
    }

    tracker.changed(e.soid);
  }

  /// Change (or set) the needed version for oid, keeping rmissing in sync.
  void revise_need(hobject_t oid, eversion_t need) {
    if (missing.count(oid)) {
      rmissing.erase(missing[oid].need.version);
      missing[oid].need = need;  // do not adjust .have
    } else {
      missing[oid] = item(need, eversion_t());
    }
    rmissing[need.version] = oid;

    tracker.changed(oid);
  }

  /// Update the version we have for oid (no-op if oid isn't missing).
  void revise_have(hobject_t oid, eversion_t have) {
    if (missing.count(oid)) {
      tracker.changed(oid);
      missing[oid].have = have;
    }
  }

  /// Insert (or overwrite) a missing entry and its reverse index.
  void add(const hobject_t& oid, eversion_t need, eversion_t have) {
    missing[oid] = item(need, have);
    rmissing[need.version] = oid;
    tracker.changed(oid);
  }

  /// Remove oid if it is missing with need <= v.
  void rm(const hobject_t& oid, eversion_t v) {
    std::map<hobject_t, item>::iterator p = missing.find(oid);
    if (p != missing.end() && p->second.need <= v)
      rm(p);
  }

  /// Remove the entry at m (updates rmissing and the change tracker).
  void rm(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }

  /// Mark oid recovered; asserts it was missing with need <= v.
  void got(const hobject_t& oid, eversion_t v) {
    std::map<hobject_t, item>::iterator p = missing.find(oid);
    assert(p != missing.end());
    assert(p->second.need <= v);
    got(p);
  }

  /// Mark the entry at m recovered (same bookkeeping as rm(iterator)).
  void got(std::map<hobject_t, item>::const_iterator m) {
    tracker.changed(m->first);
    rmissing.erase(m->second.need.version);
    missing.erase(m);
  }

  /// Move entries hashing to child_pgid (under split_bits) into *omissing.
  void split_into(
    pg_t child_pgid,
    unsigned split_bits,
    pg_missing_set *omissing) {
    unsigned mask = ~((~0)<<split_bits);
    for (map<hobject_t, item>::iterator i = missing.begin();
	 i != missing.end();
	 ) {
      if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
	omissing->add(i->first, i->second.need, i->second.have);
	rm(i++);
      } else {
	++i;
      }
    }
  }

  /// Drop everything (each dropped oid is reported to the tracker).
  void clear() {
    for (auto const &i: missing)
      tracker.changed(i.first);
    missing.clear();
    rmissing.clear();
  }

  void encode(bufferlist &bl) const {
    ENCODE_START(3, 2, bl);
    ::encode(missing, bl);
    ENCODE_FINISH(bl);
  }
  /// Decode; pool is used to repair pre-v3 (pool-less) hobject_t keys.
  void decode(bufferlist::iterator &bl, int64_t pool = -1) {
    // everything currently here is about to be replaced; record it
    for (auto const &i: missing)
      tracker.changed(i.first);
    DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
    ::decode(missing, bl);
    DECODE_FINISH(bl);

    if (struct_v < 3) {
      // Handle hobject_t upgrade
      map<hobject_t, item> tmp;
      for (map<hobject_t, item>::iterator i =
	     missing.begin();
	   i != missing.end();
	) {
	if (!i->first.is_max() && i->first.pool == -1) {
	  hobject_t to_insert(i->first);
	  to_insert.pool = pool;
	  tmp[to_insert] = i->second;
	  missing.erase(i++);
	} else {
	  ++i;
	}
      }
      missing.insert(tmp.begin(), tmp.end());
    }

    // rebuild the reverse index from the decoded map
    for (map<hobject_t,item>::iterator it =
	   missing.begin();
	 it != missing.end();
	 ++it)
      rmissing[it->second.need.version] = it->first;
    for (auto const &i: missing)
      tracker.changed(i.first);
  }
  void dump(Formatter *f) const {
    f->open_array_section("missing");
    for (map<hobject_t,item>::const_iterator p =
	   missing.begin(); p != missing.end(); ++p) {
      f->open_object_section("item");
      f->dump_stream("object") << p->first;
      p->second.dump(f);
      f->close_section();
    }
    f->close_section();
  }
  /// Remove every entry whose oid satisfies predicate f.
  template <typename F>
  void filter_objects(F &&f) {
    for (auto i = missing.begin(); i != missing.end();) {
      if (f(i->first)) {
	rm(i++);
      } else {
        ++i;
      }
    }
  }
  static void generate_test_instances(list<pg_missing_set*>& o) {
    o.push_back(new pg_missing_set);
    o.push_back(new pg_missing_set);
    o.back()->add(
      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
      eversion_t(5, 6), eversion_t(5, 1));
  }
  /// Apply f to every oid changed since the last flush() (if tracking).
  template <typename F>
  void get_changed(F &&f) const {
    tracker.get_changed(f);
  }
  void flush() {
    tracker.flush();
  }
  bool is_clean() const {
    return tracker.is_clean();
  }
  /**
   * Debug check (TrackChanges only): replaying our tracked changes on
   * top of init_missing must reproduce exactly this missing set.
   * Returns false (and explains via *oss) on mismatch.
   */
  template <typename missing_t>
  bool debug_verify_from_init(
    const missing_t &init_missing,
    ostream *oss) const {
    if (!TrackChanges)
      return true;
    auto check_missing(init_missing.get_items());
    tracker.get_changed([&](const hobject_t &hoid) {
      check_missing.erase(hoid);
      if (missing.count(hoid)) {
	check_missing.insert(*(missing.find(hoid)));
      }
    });
    bool ok = true;
    if (check_missing.size() != missing.size()) {
      if (oss) {
	*oss << "Size mismatch, check: " << check_missing.size()
	     << ", actual: " << missing.size() << "\n";
      }
      ok = false;
    }
    for (auto &i: missing) {
      if (!check_missing.count(i.first)) {
	if (oss)
	  *oss << "check_missing missing " << i.first << "\n";
	ok = false;
      } else if (check_missing[i.first] != i.second) {
	if (oss)
	  *oss << "check_missing missing item mismatch on " << i.first
	       << ", check: " << check_missing[i.first]
	       << ", actual: " << i.second << "\n";
	ok = false;
      }
    }
    if (oss && !ok) {
      *oss << "check_missing: " << check_missing << "\n";
      set<hobject_t> changed;
      tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
      *oss << "changed: " << changed << "\n";
    }
    return ok;
  }
};
/// Free-function encode shim for pg_missing_set.  'features' is accepted
/// for signature compatibility with feature-aware encoders but is unused.
/// NOTE(review): ENCODE_DUMP_PRE/POST appear to be debug hooks from
/// encoding.h (no-ops unless enabled at build time) — confirm there.
template <bool TrackChanges>
void encode(
  const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
  ENCODE_DUMP_PRE();
  c.encode(bl);
  ENCODE_DUMP_POST(cl);
}
/// Free-function decode shim for pg_missing_set (pairs with encode() above).
template <bool TrackChanges>
void decode(pg_missing_set<TrackChanges> &c, bufferlist::iterator &p) {
  c.decode(p);
}
3967template <bool TrackChanges>
3968ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
3969{
3970 out << "missing(" << missing.num_missing();
3971 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
3972 out << ")";
3973 return out;
3974}
3975
3976using pg_missing_t = pg_missing_set<false>;
3977using pg_missing_tracker_t = pg_missing_set<true>;
3978
3979
3980/**
3981 * pg list objects response format
3982 *
3983 */
3984struct pg_nls_response_t {
3985 collection_list_handle_t handle;
3986 list<librados::ListObjectImpl> entries;
3987
3988 void encode(bufferlist& bl) const {
3989 ENCODE_START(1, 1, bl);
3990 ::encode(handle, bl);
3991 __u32 n = (__u32)entries.size();
3992 ::encode(n, bl);
3993 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
3994 ::encode(i->nspace, bl);
3995 ::encode(i->oid, bl);
3996 ::encode(i->locator, bl);
3997 }
3998 ENCODE_FINISH(bl);
3999 }
4000 void decode(bufferlist::iterator& bl) {
4001 DECODE_START(1, bl);
4002 ::decode(handle, bl);
4003 __u32 n;
4004 ::decode(n, bl);
4005 entries.clear();
4006 while (n--) {
4007 librados::ListObjectImpl i;
4008 ::decode(i.nspace, bl);
4009 ::decode(i.oid, bl);
4010 ::decode(i.locator, bl);
4011 entries.push_back(i);
4012 }
4013 DECODE_FINISH(bl);
4014 }
4015 void dump(Formatter *f) const {
4016 f->dump_stream("handle") << handle;
4017 f->open_array_section("entries");
4018 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4019 f->open_object_section("object");
4020 f->dump_string("namespace", p->nspace);
4021 f->dump_string("object", p->oid);
4022 f->dump_string("key", p->locator);
4023 f->close_section();
4024 }
4025 f->close_section();
4026 }
4027 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4028 o.push_back(new pg_nls_response_t);
4029 o.push_back(new pg_nls_response_t);
4030 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4031 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4032 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4033 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4034 o.push_back(new pg_nls_response_t);
4035 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4036 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4037 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4038 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4039 o.push_back(new pg_nls_response_t);
4040 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4041 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4042 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4043 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4044 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4045 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4046 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4047 }
4048};
4049
4050WRITE_CLASS_ENCODER(pg_nls_response_t)
4051
4052// For backwards compatibility with older OSD requests
4053struct pg_ls_response_t {
4054 collection_list_handle_t handle;
4055 list<pair<object_t, string> > entries;
4056
4057 void encode(bufferlist& bl) const {
4058 __u8 v = 1;
4059 ::encode(v, bl);
4060 ::encode(handle, bl);
4061 ::encode(entries, bl);
4062 }
4063 void decode(bufferlist::iterator& bl) {
4064 __u8 v;
4065 ::decode(v, bl);
4066 assert(v == 1);
4067 ::decode(handle, bl);
4068 ::decode(entries, bl);
4069 }
4070 void dump(Formatter *f) const {
4071 f->dump_stream("handle") << handle;
4072 f->open_array_section("entries");
4073 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4074 f->open_object_section("object");
4075 f->dump_stream("object") << p->first;
4076 f->dump_string("key", p->second);
4077 f->close_section();
4078 }
4079 f->close_section();
4080 }
4081 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4082 o.push_back(new pg_ls_response_t);
4083 o.push_back(new pg_ls_response_t);
4084 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4085 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4086 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4087 }
4088};
4089
4090WRITE_CLASS_ENCODER(pg_ls_response_t)
4091
4092/**
4093 * object_copy_cursor_t
4094 */
4095struct object_copy_cursor_t {
4096 uint64_t data_offset;
4097 string omap_offset;
4098 bool attr_complete;
4099 bool data_complete;
4100 bool omap_complete;
4101
4102 object_copy_cursor_t()
4103 : data_offset(0),
4104 attr_complete(false),
4105 data_complete(false),
4106 omap_complete(false)
4107 {}
4108
4109 bool is_initial() const {
4110 return !attr_complete && data_offset == 0 && omap_offset.empty();
4111 }
4112 bool is_complete() const {
4113 return attr_complete && data_complete && omap_complete;
4114 }
4115
4116 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4117 void encode(bufferlist& bl) const;
4118 void decode(bufferlist::iterator &bl);
4119 void dump(Formatter *f) const;
4120};
4121WRITE_CLASS_ENCODER(object_copy_cursor_t)
4122
/**
 * object_copy_data_t
 *
 * Return data from a copy request. The semantics are a little strange
 * as a result of the encoding's heritage.
 *
 * In particular, the sender unconditionally fills in the cursor (from what
 * it receives and sends), the size, and the mtime, but is responsible for
 * figuring out whether it should put any data in the attrs, data, or
 * omap members (corresponding to xattrs, object data, and the omap entries)
 * based on external data (the client includes a max amount to return with
 * the copy request). The client then looks into the attrs, data, and/or omap
 * based on the contents of the cursor.
 */
struct object_copy_data_t {
  enum {
    FLAG_DATA_DIGEST = 1<<0,  ///< data_digest field is valid
    FLAG_OMAP_DIGEST = 1<<1,  ///< omap_digest field is valid
  };
  object_copy_cursor_t cursor;
  uint64_t size;
  utime_t mtime;
  uint32_t data_digest, omap_digest;  ///< crc32c digests; validity gated by flags
  uint32_t flags;
  map<string, bufferlist> attrs;
  bufferlist data;
  bufferlist omap_header;
  bufferlist omap_data;

  /// which snaps we are defined for (if a snap and not the head)
  vector<snapid_t> snaps;
  /// latest snap seq for the object (if head)
  snapid_t snap_seq;

  /// recent reqids on this object
  mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;

  uint64_t truncate_seq;
  uint64_t truncate_size;

public:
  // size/digests start at (uint64_t)-1 / -1 sentinels ("unknown") until
  // the sender fills them in.
  object_copy_data_t() :
    size((uint64_t)-1), data_digest(-1),
    omap_digest(-1), flags(0),
    truncate_seq(0),
    truncate_size(0) {}

  static void generate_test_instances(list<object_copy_data_t*>& o);
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
4176
/**
 * pg creation info
 *
 * Parameters describing how/when a PG was created; the out-of-line
 * encode/decode live in osd_types.cc.
 */
struct pg_create_t {
  epoch_t created;   // epoch pg created
  pg_t parent;       // split from parent (if != pg_t())
  __s32 split_bits;  // number of split bits (meaningful only with a parent)

  pg_create_t()
    : created(0), split_bits(0) {}
  pg_create_t(unsigned c, pg_t p, int s)
    : created(c), parent(p), split_bits(s) {}

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_create_t*>& o);
};
WRITE_CLASS_ENCODER(pg_create_t)
4196
4197// -----------------------------------------
4198
// Per-peer statistics exchanged between OSDs; currently carries only a
// timestamp.
struct osd_peer_stat_t {
  utime_t stamp;

  osd_peer_stat_t() { }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_peer_stat_t*>& o);
};
WRITE_CLASS_ENCODER(osd_peer_stat_t)

ostream& operator<<(ostream& out, const osd_peer_stat_t &stat);
4212
4213
4214// -----------------------------------------
4215
class ObjectExtent {
  /**
   * ObjectExtents are used for specifying IO behavior against RADOS
   * objects when one is using the ObjectCacher.
   *
   * To use this in a real system, *every member* must be filled
   * out correctly. In particular, make sure to initialize the
   * oloc correctly, as its default values are deliberate poison
   * and will cause internal ObjectCacher asserts.
   *
   * Similarly, your buffer_extents vector *must* specify a total
   * size equal to your length. If the buffer_extents inadvertently
   * contain less space than the length member specifies, you
   * will get unintelligible asserts deep in the ObjectCacher.
   *
   * If you are trying to do testing and don't care about actual
   * RADOS function, the simplest thing to do is to initialize
   * the ObjectExtent (truncate_size can be 0), create a single entry
   * in buffer_extents matching the length, and set oloc.pool to 0.
   */
 public:
  object_t oid;            // object id
  uint64_t objectno;       // object sequence number (position in the stripe sequence)
  uint64_t offset;         // in object
  uint64_t length;         // in object
  uint64_t truncate_size;  // in object

  object_locator_t oloc;   // object locator (pool etc)

  vector<pair<uint64_t,uint64_t> > buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)

  ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
  ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
    oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
};
4251
4252inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4253{
4254 return out << "extent("
4255 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4256 << " " << ex.offset << "~" << ex.length
4257 << " -> " << ex.buffer_extents
4258 << ")";
4259}
4260
4261
4262
4263
4264
4265
4266// ---------------------------------------
4267
// Per-OSD on-disk superblock: identity, map-epoch bookkeeping, and
// feature compatibility info.
class OSDSuperblock {
public:
  uuid_d cluster_fsid, osd_fsid;   // cluster and per-OSD identities
  int32_t whoami;                  // my role in this fs.
  epoch_t current_epoch;           // most recent epoch
  epoch_t oldest_map, newest_map;  // oldest/newest maps we have.
  double weight;

  CompatSet compat_features;

  // last interval over which i mounted and was then active
  epoch_t mounted;     // last epoch i mounted
  epoch_t clean_thru;  // epoch i was active and clean thru

  OSDSuperblock() :
    whoami(-1),
    current_epoch(0), oldest_map(0), newest_map(0), weight(0),
    mounted(0), clean_thru(0) {
  }

  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<OSDSuperblock*>& o);
};
WRITE_CLASS_ENCODER(OSDSuperblock)
4294
4295inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4296{
4297 return out << "sb(" << sb.cluster_fsid
4298 << " osd." << sb.whoami
4299 << " " << sb.osd_fsid
4300 << " e" << sb.current_epoch
4301 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4302 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4303 << ")";
4304}
4305
4306
4307// -------
4308
4309
4310
4311
4312
4313
/*
 * attached to object head. describes most recent snap context, and
 * set of existing clones.
 */
struct SnapSet {
  snapid_t seq;
  bool head_exists;
  vector<snapid_t> snaps;    // descending
  vector<snapid_t> clones;   // ascending
  map<snapid_t, interval_set<uint64_t> > clone_overlap;  // overlap w/ next newest
  map<snapid_t, uint64_t> clone_size;
  map<snapid_t, vector<snapid_t>> clone_snaps;  // descending

  SnapSet() : seq(0), head_exists(false) {}
  explicit SnapSet(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }

  // A pre-luminous ("legacy") SnapSet lacks per-clone snap lists (or a
  // head object); such snap info lives on the clones themselves.
  bool is_legacy() const {
    return clone_snaps.size() < clones.size() || !head_exists;
  }

  /// populate SnapSet from a librados::snap_set_t
  void from_snap_set(const librados::snap_set_t& ss, bool legacy);

  /// get space accounted to clone
  uint64_t get_clone_bytes(snapid_t clone) const;

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<SnapSet*>& o);

  // Build a SnapContext as it would have looked at snap epoch `as_of`:
  // seq = as_of, snaps = all recorded snaps <= as_of (order preserved,
  // i.e. still descending).
  SnapContext get_ssc_as_of(snapid_t as_of) const {
    SnapContext out;
    out.seq = as_of;
    for (vector<snapid_t>::const_iterator i = snaps.begin();
	 i != snaps.end();
	 ++i) {
      if (*i <= as_of)
	out.snaps.push_back(*i);
    }
    return out;
  }

  // return min element of snaps > after, return max if no such element
  // (snaps is stored descending, so reverse iteration scans ascending)
  snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
    for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
	 i != snaps.rend();
	 ++i) {
      if (*i > after)
	return *i;
    }
    return max;
  }

  SnapSet get_filtered(const pg_pool_t &pinfo) const;
  void filter(const pg_pool_t &pinfo);
};
WRITE_CLASS_ENCODER(SnapSet)

ostream& operator<<(ostream& out, const SnapSet& cs);
4377
4378
4379
4380#define OI_ATTR "_"
4381#define SS_ATTR "snapset"
4382
// Persistent description of a single watch registered on an object.
struct watch_info_t {
  uint64_t cookie;           // client-provided watch identifier
  uint32_t timeout_seconds;  // watch timeout, in seconds
  entity_addr_t addr;        // watcher's network address

  watch_info_t() : cookie(0), timeout_seconds(0) { }
  watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<watch_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(watch_info_t)

// Equality compares all three fields.
static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
  return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
	    && l.addr == r.addr;
}

static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
  return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
    << " " << w.addr << ")";
}
4407
// In-memory description of a pending notify (no encoder: never persisted
// or sent in this form).
struct notify_info_t {
  uint64_t cookie;     // cookie of the watch being notified
  uint64_t notify_id;  // unique id of this notify
  uint32_t timeout;    // notify timeout, in seconds
  bufferlist bl;       // opaque payload delivered to watchers
};

static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
  return out << "notify(cookie " << n.cookie
	     << " notify" << n.notify_id
	     << " " << n.timeout << "s)";
}
4420
31f18b77
FG
4421struct object_info_t;
4422struct object_manifest_t {
4423 enum {
4424 TYPE_NONE = 0,
4425 TYPE_REDIRECT = 1, // start with this
4426 TYPE_CHUNKED = 2, // do this later
4427 };
4428 uint8_t type; // redirect, chunked, ...
4429 hobject_t redirect_target;
4430
4431 object_manifest_t() : type(0) { }
4432 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
4433 : type(type), redirect_target(redirect_target) { }
4434
4435 bool is_empty() const {
4436 return type == TYPE_NONE;
4437 }
4438 bool is_redirect() const {
4439 return type == TYPE_REDIRECT;
4440 }
4441 bool is_chunked() const {
4442 return type == TYPE_CHUNKED;
4443 }
4444 static const char *get_type_name(uint8_t m) {
4445 switch (m) {
4446 case TYPE_NONE: return "none";
4447 case TYPE_REDIRECT: return "redirect";
4448 case TYPE_CHUNKED: return "chunked";
4449 default: return "unknown";
4450 }
4451 }
4452 const char *get_type_name() const {
4453 return get_type_name(type);
4454 }
4455 static void generate_test_instances(list<object_manifest_t*>& o);
4456 void encode(bufferlist &bl) const;
4457 void decode(bufferlist::iterator &bl);
4458 void dump(Formatter *f) const;
4459 friend ostream& operator<<(ostream& out, const object_info_t& oi);
4460};
4461WRITE_CLASS_ENCODER(object_manifest_t)
4462ostream& operator<<(ostream& out, const object_manifest_t& oi);
7c673cae
FG
4463
// Per-object metadata: versioning, size/mtime, state flags, watchers,
// opportunistic digests, alloc hints, and (optionally) a manifest.
// NOTE(review): presumably persisted in the object's OI_ATTR ("_")
// xattr — confirm against the osd code that reads/writes it.
struct object_info_t {
  hobject_t soid;
  eversion_t version, prior_version;  // current / previous pg versions of this object
  version_t user_version;             // client-visible version
  osd_reqid_t last_reqid;             // request that last modified the object

  uint64_t size;
  utime_t mtime;
  utime_t local_mtime; // local mtime

  // note: these are currently encoded into a total 16 bits; see
  // encode()/decode() for the weirdness.
  typedef enum {
    FLAG_LOST     = 1<<0,
    FLAG_WHITEOUT = 1<<1, // object logically does not exist
    FLAG_DIRTY    = 1<<2, // object has been modified since last flushed or undirtied
    FLAG_OMAP     = 1 << 3,  // has (or may have) some/any omap data
    FLAG_DATA_DIGEST = 1 << 4,  // has data crc
    FLAG_OMAP_DIGEST = 1 << 5,  // has omap crc
    FLAG_CACHE_PIN = 1 << 6,    // pin the object in cache tier
    FLAG_MANIFEST = 1 << 7,	// has manifest
    // ...
    FLAG_USES_TMAP = 1<<8,  // deprecated; no longer used.
  } flag_t;

  flag_t flags;

  // Render set flags as "a|b|c" (fixed order, empty string when none set).
  static string get_flag_string(flag_t flags) {
    string s;
    if (flags & FLAG_LOST)
      s += "|lost";
    if (flags & FLAG_WHITEOUT)
      s += "|whiteout";
    if (flags & FLAG_DIRTY)
      s += "|dirty";
    if (flags & FLAG_USES_TMAP)
      s += "|uses_tmap";
    if (flags & FLAG_OMAP)
      s += "|omap";
    if (flags & FLAG_DATA_DIGEST)
      s += "|data_digest";
    if (flags & FLAG_OMAP_DIGEST)
      s += "|omap_digest";
    if (flags & FLAG_CACHE_PIN)
      s += "|cache_pin";
    if (flags & FLAG_MANIFEST)
      s += "|manifest";
    if (s.length())
      return s.substr(1);  // drop the leading '|'
    return s;
  }
  string get_flag_string() const {
    return get_flag_string(flags);
  }

  /// [clone] descending.  pre-luminous; moved to SnapSet
  vector<snapid_t> legacy_snaps;

  uint64_t truncate_seq, truncate_size;

  // active watches, keyed by (cookie, watcher entity)
  map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;

  // opportunistic checksums; may or may not be present
  __u32 data_digest;  ///< data crc32c
  __u32 omap_digest;  ///< omap crc32c

  // alloc hint attribute
  uint64_t expected_object_size, expected_write_size;
  uint32_t alloc_hint_flags;

  // valid only when FLAG_MANIFEST is set
  struct object_manifest_t manifest;

  void copy_user_bits(const object_info_t& other);

  static ps_t legacy_object_locator_to_ps(const object_t &oid,
					  const object_locator_t &loc);

  // true iff *all* bits of f are set
  bool test_flag(flag_t f) const {
    return (flags & f) == f;
  }
  void set_flag(flag_t f) {
    flags = (flag_t)(flags | f);
  }
  void clear_flag(flag_t f) {
    flags = (flag_t)(flags & ~f);
  }
  bool is_lost() const {
    return test_flag(FLAG_LOST);
  }
  bool is_whiteout() const {
    return test_flag(FLAG_WHITEOUT);
  }
  bool is_dirty() const {
    return test_flag(FLAG_DIRTY);
  }
  bool is_omap() const {
    return test_flag(FLAG_OMAP);
  }
  bool is_data_digest() const {
    return test_flag(FLAG_DATA_DIGEST);
  }
  bool is_omap_digest() const {
    return test_flag(FLAG_OMAP_DIGEST);
  }
  bool is_cache_pinned() const {
    return test_flag(FLAG_CACHE_PIN);
  }
  bool has_manifest() const {
    return test_flag(FLAG_MANIFEST);
  }

  void set_data_digest(__u32 d) {
    set_flag(FLAG_DATA_DIGEST);
    data_digest = d;
  }
  void set_omap_digest(__u32 d) {
    set_flag(FLAG_OMAP_DIGEST);
    omap_digest = d;
  }
  void clear_data_digest() {
    clear_flag(FLAG_DATA_DIGEST);
    data_digest = -1;
  }
  void clear_omap_digest() {
    clear_flag(FLAG_OMAP_DIGEST);
    omap_digest = -1;
  }
  // For a brand-new object: mark both digests present with the -1
  // sentinel value.
  void new_object() {
    set_data_digest(-1);
    set_omap_digest(-1);
  }

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<object_info_t*>& o);

  explicit object_info_t()
    : user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(const hobject_t& s)
    : soid(s),
      user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(bufferlist& bl) {
    decode(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(object_info_t)

ostream& operator<<(ostream& out, const object_info_t& oi);
4629
4630
4631
// Object recovery
// Describes what needs to be recovered for one object: which byte
// ranges to copy and which ranges can be cloned from other objects.
struct ObjectRecoveryInfo {
  hobject_t soid;
  eversion_t version;
  uint64_t size;
  object_info_t oi;
  SnapSet ss;   // only populated if soid is_snap()
  interval_set<uint64_t> copy_subset;               // ranges to copy from the source
  map<hobject_t, interval_set<uint64_t>> clone_subset;  // ranges obtainable by cloning, per source object

  ObjectRecoveryInfo() : size(0) { }

  static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl, int64_t pool = -1);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
4652
// Progress marker for an in-flight object recovery (see PushOp/PullOp).
struct ObjectRecoveryProgress {
  uint64_t data_recovered_to;   // data recovered up to this offset
  string omap_recovered_to;     // omap recovered up to this key
  bool first;                   // no recovery messages processed yet
  bool data_complete;
  bool omap_complete;
  bool error = false;           // recovery of this object hit an error

  ObjectRecoveryProgress()
    : data_recovered_to(0),
      first(true),
      data_complete(false), omap_complete(false) { }

  // Complete when data has reached the end of the ranges to copy (or
  // there were none) and omap is done.  Note: attrs are not tracked here.
  bool is_complete(const ObjectRecoveryInfo& info) const {
    return (data_recovered_to >= (
      info.copy_subset.empty() ?
      0 : info.copy_subset.range_end())) &&
      omap_complete;
  }

  static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
4681
// Acknowledgement for a PushOp: names the object whose push was applied.
struct PushReplyOp {
  hobject_t soid;

  static void generate_test_instances(list<PushReplyOp*>& o);
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  /// message cost for op queueing purposes
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER(PushReplyOp)
ostream& operator<<(ostream& out, const PushReplyOp &op);
4695
// Request to pull (fetch) object data from a peer during recovery.
struct PullOp {
  hobject_t soid;

  ObjectRecoveryInfo recovery_info;        // what to recover
  ObjectRecoveryProgress recovery_progress;  // how far we already are

  static void generate_test_instances(list<PullOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  /// message cost for op queueing purposes
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PullOp)
ostream& operator<<(ostream& out, const PullOp &op);
4712
// One chunk of object data pushed to a peer during recovery: payload
// (data/omap/attrs) plus before/after progress markers.
struct PushOp {
  hobject_t soid;
  eversion_t version;
  bufferlist data;
  interval_set<uint64_t> data_included;  // object ranges covered by `data`
  bufferlist omap_header;
  map<string, bufferlist> omap_entries;
  map<string, bufferlist> attrset;

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;  // progress before applying this push
  ObjectRecoveryProgress after_progress;   // progress after applying this push

  static void generate_test_instances(list<PushOp*>& o);
  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::iterator &bl);
  ostream &print(ostream &out) const;
  void dump(Formatter *f) const;

  /// message cost for op queueing purposes
  uint64_t cost(CephContext *cct) const;
};
WRITE_CLASS_ENCODER_FEATURES(PushOp)
ostream& operator<<(ostream& out, const PushOp &op);
4736
4737
/*
 * summarize pg contents for purposes of a scrub
 */
struct ScrubMap {
  // Per-object scrub observation on one replica/shard: attrs, size,
  // digests, and any read/stat/EC errors encountered.
  struct object {
    map<string,bufferptr> attrs;
    uint64_t size;
    __u32 omap_digest;  ///< omap crc32c
    __u32 digest;       ///< data crc32c
    bool negative:1;
    bool digest_present:1;
    bool omap_digest_present:1;
    bool read_error:1;
    bool stat_error:1;
    bool ec_hash_mismatch:1;
    bool ec_size_mismatch:1;

    object() :
      // Init invalid size so it won't match if we get a stat EIO error
      size(-1), omap_digest(0), digest(0),
      negative(false), digest_present(false), omap_digest_present(false),
      read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}

    void encode(bufferlist& bl) const;
    void decode(bufferlist::iterator& bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<object*>& o);
  };
  WRITE_CLASS_ENCODER(object)

  map<hobject_t,object> objects;
  eversion_t valid_through;
  eversion_t incr_since;

  void merge_incr(const ScrubMap &l);
  // Bulk-add r's entries.  Note map::insert semantics: entries whose key
  // already exists here are left unchanged (not overwritten).
  void insert(const ScrubMap &r) {
    objects.insert(r.objects.begin(), r.objects.end());
  }
  void swap(ScrubMap &r) {
    using std::swap;  // enable ADL swap for members
    swap(objects, r.objects);
    swap(valid_through, r.valid_through);
    swap(incr_since, r.incr_since);
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl, int64_t pool=-1);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ScrubMap*>& o);
};
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)
4790
4791
// One sub-operation of an OSD request: the raw ceph_osd_op header plus
// its input/output payloads and result code.
struct OSDOp {
  ceph_osd_op op;
  sobject_t soid;

  bufferlist indata, outdata;
  errorcode32_t rval;  // per-op result code

  OSDOp() : rval(0) {
    // ceph_osd_op is a plain C struct; zero it explicitly.
    memset(&op, 0, sizeof(ceph_osd_op));
  }

  /**
   * split a bufferlist into constituent indata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge indata members of a vector of OSDOp into a single bufferlist
   *
   * Notably this also encodes certain other OSDOp data into the data
   * buffer, including the sobject_t soid.
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * split a bufferlist into constituent outdata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);

  /**
   * merge outdata members of a vector of OSDOps into a single bufferlist
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);

  /**
   * Clear data as much as possible, leave minimal data for historical op dump
   *
   * @param ops [in] vector of OSDOps
   */
  static void clear_data(vector<OSDOp>& ops);
};

ostream& operator<<(ostream& out, const OSDOp& op);
4847
// Wire form of a single watch entry returned by list-watchers.
struct watch_item_t {
  entity_name_t name;        // watching client
  uint64_t cookie;
  uint32_t timeout_seconds;
  entity_addr_t addr;        // watcher address (added in struct_v 2)

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
     const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
    addr(addr) { }

  void encode(bufferlist &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    ::encode(name, bl);
    ::encode(cookie, bl);
    ::encode(timeout_seconds, bl);
    ::encode(addr, bl, features);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(2, bl);
    ::decode(name, bl);
    ::decode(cookie, bl);
    ::decode(timeout_seconds, bl);
    if (struct_v >= 2) {
      // v1 encodings predate the addr field; leave it default-initialized
      ::decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER_FEATURES(watch_item_t)

// A watch entry together with the object it is registered on.
struct obj_watch_item_t {
  hobject_t obj;
  watch_item_t wi;
};
4885
4886/**
4887 * obj list watch response format
4888 *
4889 */
4890struct obj_list_watch_response_t {
4891 list<watch_item_t> entries;
4892
4893 void encode(bufferlist& bl, uint64_t features) const {
4894 ENCODE_START(1, 1, bl);
4895 ::encode(entries, bl, features);
4896 ENCODE_FINISH(bl);
4897 }
4898 void decode(bufferlist::iterator& bl) {
4899 DECODE_START(1, bl);
4900 ::decode(entries, bl);
4901 DECODE_FINISH(bl);
4902 }
4903 void dump(Formatter *f) const {
4904 f->open_array_section("entries");
4905 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4906 f->open_object_section("watch");
4907 f->dump_stream("watcher") << p->name;
4908 f->dump_int("cookie", p->cookie);
4909 f->dump_int("timeout", p->timeout_seconds);
4910 f->open_object_section("addr");
4911 p->addr.dump(f);
4912 f->close_section();
4913 f->close_section();
4914 }
4915 f->close_section();
4916 }
4917 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
4918 entity_addr_t ea;
4919 o.push_back(new obj_list_watch_response_t);
4920 o.push_back(new obj_list_watch_response_t);
4921 ea.set_type(entity_addr_t::TYPE_LEGACY);
4922 ea.set_nonce(1000);
4923 ea.set_family(AF_INET);
4924 ea.set_in4_quad(0, 127);
4925 ea.set_in4_quad(1, 0);
4926 ea.set_in4_quad(2, 0);
4927 ea.set_in4_quad(3, 1);
4928 ea.set_port(1024);
4929 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
4930 ea.set_nonce(1001);
4931 ea.set_in4_quad(3, 2);
4932 ea.set_port(1025);
4933 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
4934 }
4935};
4936WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
4937
4938struct clone_info {
4939 snapid_t cloneid;
4940 vector<snapid_t> snaps; // ascending
4941 vector< pair<uint64_t,uint64_t> > overlap;
4942 uint64_t size;
4943
4944 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
4945
4946 void encode(bufferlist& bl) const {
4947 ENCODE_START(1, 1, bl);
4948 ::encode(cloneid, bl);
4949 ::encode(snaps, bl);
4950 ::encode(overlap, bl);
4951 ::encode(size, bl);
4952 ENCODE_FINISH(bl);
4953 }
4954 void decode(bufferlist::iterator& bl) {
4955 DECODE_START(1, bl);
4956 ::decode(cloneid, bl);
4957 ::decode(snaps, bl);
4958 ::decode(overlap, bl);
4959 ::decode(size, bl);
4960 DECODE_FINISH(bl);
4961 }
4962 void dump(Formatter *f) const {
4963 if (cloneid == CEPH_NOSNAP)
4964 f->dump_string("cloneid", "HEAD");
4965 else
4966 f->dump_unsigned("cloneid", cloneid.val);
4967 f->open_array_section("snapshots");
4968 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
4969 f->open_object_section("snap");
4970 f->dump_unsigned("id", p->val);
4971 f->close_section();
4972 }
4973 f->close_section();
4974 f->open_array_section("overlaps");
4975 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
4976 q != overlap.end(); ++q) {
4977 f->open_object_section("overlap");
4978 f->dump_unsigned("offset", q->first);
4979 f->dump_unsigned("length", q->second);
4980 f->close_section();
4981 }
4982 f->close_section();
4983 f->dump_unsigned("size", size);
4984 }
4985 static void generate_test_instances(list<clone_info*>& o) {
4986 o.push_back(new clone_info);
4987 o.push_back(new clone_info);
4988 o.back()->cloneid = 1;
4989 o.back()->snaps.push_back(1);
4990 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
4991 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
4992 o.back()->size = 16384;
4993 o.push_back(new clone_info);
4994 o.back()->cloneid = CEPH_NOSNAP;
4995 o.back()->size = 32768;
4996 }
4997};
4998WRITE_CLASS_ENCODER(clone_info)
4999
5000/**
5001 * obj list snaps response format
5002 *
5003 */
5004struct obj_list_snap_response_t {
5005 vector<clone_info> clones; // ascending
5006 snapid_t seq;
5007
5008 void encode(bufferlist& bl) const {
5009 ENCODE_START(2, 1, bl);
5010 ::encode(clones, bl);
5011 ::encode(seq, bl);
5012 ENCODE_FINISH(bl);
5013 }
5014 void decode(bufferlist::iterator& bl) {
5015 DECODE_START(2, bl);
5016 ::decode(clones, bl);
5017 if (struct_v >= 2)
5018 ::decode(seq, bl);
5019 else
5020 seq = CEPH_NOSNAP;
5021 DECODE_FINISH(bl);
5022 }
5023 void dump(Formatter *f) const {
5024 f->open_array_section("clones");
5025 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5026 f->open_object_section("clone");
5027 p->dump(f);
5028 f->close_section();
5029 }
5030 f->dump_unsigned("seq", seq);
5031 f->close_section();
5032 }
5033 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5034 o.push_back(new obj_list_snap_response_t);
5035 o.push_back(new obj_list_snap_response_t);
5036 clone_info cl;
5037 cl.cloneid = 1;
5038 cl.snaps.push_back(1);
5039 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5040 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5041 cl.size = 16384;
5042 o.back()->clones.push_back(cl);
5043 cl.cloneid = CEPH_NOSNAP;
5044 cl.snaps.clear();
5045 cl.overlap.clear();
5046 cl.size = 32768;
5047 o.back()->clones.push_back(cl);
5048 o.back()->seq = 123;
5049 }
5050};
5051
5052WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5053
5054// PromoteCounter
5055
// Running tally of cache-tier promotion activity.
struct PromoteCounter {
  std::atomic_ullong attempts{0};
  std::atomic_ullong objects{0};
  std::atomic_ullong bytes{0};

  /// Record one promotion attempt.
  void attempt() {
    attempts.fetch_add(1);
  }

  /// Record a completed promotion of `size` bytes.
  void finish(uint64_t size) {
    objects.fetch_add(1);
    bytes.fetch_add(size);
  }

  /// Report the current totals into *a/*o/*b and halve the stored
  /// counters (exponential decay of the running sample).
  /// NOTE(review): each load/store pair is atomic individually but the
  /// sequence is not; concurrent increments between the load and the
  /// store can be partially lost -- presumably acceptable for stats.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    const uint64_t cur_attempts = attempts.load();
    const uint64_t cur_objects  = objects.load();
    const uint64_t cur_bytes    = bytes.load();
    *a = cur_attempts;
    *o = cur_objects;
    *b = cur_bytes;
    attempts.store(cur_attempts / 2);
    objects.store(cur_objects / 2);
    bytes.store(cur_bytes / 2);
  }
};
5079
/** store_statfs_t
 * ObjectStore full statfs information
 *
 * Capacity fields are unsigned; the usage/compression fields are
 * signed (NOTE(review): presumably so deltas can be applied without
 * underflow -- confirm with the accounting code that updates them).
 */
struct store_statfs_t
{
  uint64_t total = 0;                  // Total bytes
  uint64_t available = 0;              // Free bytes available

  int64_t allocated = 0;               // Bytes allocated by the store
  int64_t stored = 0;                  // Bytes actually stored by the user
  int64_t compressed = 0;              // Bytes stored after compression
  int64_t compressed_allocated = 0;    // Bytes allocated for compressed data
  int64_t compressed_original = 0;     // Bytes that were successfully compressed

  // Reset every counter to its default (zero).
  void reset() {
    *this = store_statfs_t();
  }
  // Field-wise equality; defined out of line (declaration only here).
  bool operator ==(const store_statfs_t& other) const;
  // Formatter dump; defined out of line (declaration only here).
  void dump(Formatter *f) const;
};
5100ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
5101
5102#endif