// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OSD_TYPES_H
#define CEPH_OSD_TYPES_H

#include <sstream>
#include <stdio.h>
#include <memory>
#include <boost/scoped_ptr.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/variant.hpp>

#include "include/rados/rados_types.hpp"
#include "include/mempool.h"

#include "msg/msg_types.h"
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
#include "common/histogram.h"
#include "include/interval_set.h"
#include "include/inline_memory.h"
#include "common/Formatter.h"
#include "common/bloom_filter.hpp"
#include "common/hobject.h"
#include "common/snap_types.h"
#include "HitSet.h"
#include "Watch.h"
#include "include/cmp.h"
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
#include <atomic>

#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")


/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 253

/// backfill priority for MBackfillReserve, when forced manually
#define OSD_BACKFILL_PRIORITY_FORCED 254

/// recovery priority for MRecoveryReserve, when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255


typedef hobject_t collection_list_handle_t;

/// convert a single CEPH_OSD_FLAG_* to a string
const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a string
const char *ceph_osd_op_flag_name(unsigned flag);

/// convert CEPH_OSD_FLAG_* op flags to a string
string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a string
string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
string ceph_osd_alloc_hint_flag_string(unsigned flags);


/**
 * osd request identifier
 *
 * caller name + incarnation# + tid to uniquely identify this request.
 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t    tid;
  int32_t       inc;  // incarnation

  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const osd_reqid_t& other)
    : name(other.name), tid(other.tid), inc(other.inc)
  {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)
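// Illustrative sketch (not part of the original header): building and
// printing an osd_reqid_t.  The operator<< defined below renders it as
// "name.inc:tid", so a request from client.4123 at incarnation 2 with
// tid 51234 would print as "client.4123.2:51234" (values hypothetical).
//
//   osd_reqid_t rid(entity_name_t::CLIENT(4123), 2, 51234);
//   std::cout << rid << std::endl;  // -> client.4123.2:51234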


struct pg_shard_t {
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;
  shard_id_t shard;
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  void dump(Formatter *f) const {
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);

class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};

class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};

inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
  return out << r.name << "." << r.inc << ":" << r.tid;
}

inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
}
inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
}
inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && (l.inc < r.inc ||
                          (l.inc == r.inc && l.tid < r.tid)));
}
inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && (l.inc < r.inc ||
                          (l.inc == r.inc && l.tid <= r.tid)));
}
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }

namespace std {
  template<> struct hash<osd_reqid_t> {
    size_t operator()(const osd_reqid_t &r) const {
      static hash<uint64_t> H;
      return H(r.name.num() ^ r.tid ^ r.inc);
    }
  };
} // namespace std


// -----

// a locator constrains the placement of an object.  mainly, which pool
// it goes in.
struct object_locator_t {
  // You specify either the hash or the key -- not both
  int64_t pool;     ///< pool id
  string key;       ///< key string (if non-empty)
  string nspace;    ///< namespace
  int64_t hash;     ///< hash position (if >= 0)

  explicit object_locator_t()
    : pool(-1), hash(-1) {}
  explicit object_locator_t(int64_t po)
    : pool(po), hash(-1) {}
  explicit object_locator_t(int64_t po, int64_t ps)
    : pool(po), hash(ps) {}
  explicit object_locator_t(int64_t po, string ns)
    : pool(po), nspace(ns), hash(-1) {}
  explicit object_locator_t(int64_t po, string ns, int64_t ps)
    : pool(po), nspace(ns), hash(ps) {}
  explicit object_locator_t(int64_t po, string ns, string s)
    : pool(po), key(s), nspace(ns), hash(-1) {}
  explicit object_locator_t(const hobject_t& soid)
    : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}

  int64_t get_pool() const {
    return pool;
  }

  void clear() {
    pool = -1;
    key = "";
    nspace = "";
    hash = -1;
  }

  bool empty() const {
    return pool == -1;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<object_locator_t*>& o);
};
WRITE_CLASS_ENCODER(object_locator_t)
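// Usage sketch: a locator pins an object to a pool and, optionally, to a
// namespace, an explicit key, or a raw hash position.  With the operator<<
// defined below, a locator for pool 3, namespace "ns" and key "k" renders
// as "@3;ns:k" (values hypothetical).
//
//   object_locator_t oloc(3, string("ns"), string("k"));
//   assert(oloc.get_pool() == 3 && !oloc.empty());
//   std::cout << oloc;  // -> @3;ns:k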

inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
  return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
}
inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
  return !(l == r);
}

inline ostream& operator<<(ostream& out, const object_locator_t& loc)
{
  out << "@" << loc.pool;
  if (loc.nspace.length())
    out << ";" << loc.nspace;
  if (loc.key.length())
    out << ":" << loc.key;
  return out;
}

struct request_redirect_t {
private:
  object_locator_t redirect_locator;  ///< this is authoritative
  string redirect_object;  ///< If non-empty, the request goes to this object name
  bufferlist osd_instructions;  ///< a bufferlist for the OSDs, passed but not interpreted by clients

  friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
public:

  request_redirect_t() {}
  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
      redirect_locator(orig) { redirect_locator.pool = rpool; }
  explicit request_redirect_t(const object_locator_t& rloc) :
      redirect_locator(rloc) {}
  explicit request_redirect_t(const object_locator_t& orig,
                              const string& robj) :
      redirect_locator(orig), redirect_object(robj) {}

  void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
  const bufferlist& get_instructions() { return osd_instructions; }

  bool empty() const { return redirect_locator.empty() &&
                              redirect_object.empty(); }

  void combine_with_locator(object_locator_t& orig, string& obj) const {
    orig = redirect_locator;
    if (!redirect_object.empty())
      obj = redirect_object;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<request_redirect_t*>& o);
};
WRITE_CLASS_ENCODER(request_redirect_t)
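// Usage sketch: how a client applies a redirect it received from an OSD.
// combine_with_locator() replaces the original locator outright and, when a
// redirect object name was supplied, the object name too (values below are
// hypothetical).
//
//   object_locator_t oloc(3);
//   string oid = "foo";
//   request_redirect_t redir(object_locator_t(5), string("bar"));
//   redir.combine_with_locator(oloc, oid);  // oloc -> pool 5, oid -> "bar"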

inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
  out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
  return out;
}

// Internal OSD op flags - set by the OSD based on the op types
enum {
  CEPH_OSD_RMW_FLAG_READ        = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE       = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ  = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE     = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED        = (1 << 10),
};


// pg stuff

#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
typedef uint32_t ps_t;

// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  void encode(bufferlist& bl) const {
    ::encode_raw(v, bl);
  }
  void decode(bufferlist::iterator& bl) {
    ::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)

// placement group id
struct pg_t {
  uint64_t m_pool;
  uint32_t m_seed;
  int32_t m_preferred;

  pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
  pg_t(ps_t seed, uint64_t pool, int pref=-1) :
    m_pool(pool), m_seed(seed), m_preferred(pref) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps), m_preferred((__s16)cpg.preferred) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  old_pg_t get_old_pg() const {
    old_pg_t o;
    assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)m_preferred;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  uint64_t pool() const {
    return m_pool;
  }
  int32_t preferred() const {
    return m_preferred;
  }

  static const uint8_t calc_name_buf_size = 36;  // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }
  void set_preferred(int32_t osd) {
    m_preferred = osd;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *pchildren) const;

  /**
   * Returns b such that for all objects o:
   *   (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;

  bool contains(int bits, const ghobject_t& oid) {
    return
      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
      oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) {
    return
      (int64_t)m_pool == oid.get_logical_pool() &&
      oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  void encode(bufferlist& bl) const {
    __u8 v = 1;
    ::encode(v, bl);
    ::encode(m_pool, bl);
    ::encode(m_seed, bl);
    ::encode(m_preferred, bl);
  }
  void decode(bufferlist::iterator& bl) {
    __u8 v;
    ::decode(v, bl);
    ::decode(m_pool, bl);
    ::decode(m_seed, bl);
    ::decode(m_preferred, bl);
  }
  void decode_old(bufferlist::iterator& bl) {
    old_pg_t opg;
    ::decode(opg, bl);
    *this = opg;
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)
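// Worked example (sketch) of the split-bits contract above: contains()
// reports that an object belongs to this PG when it lives in the same pool
// and oid.match(bits, ps()) holds, i.e. the low `bits` bits of the object's
// hash equal this PG's seed.  For a power-of-two pg_num such as 8,
// get_split_bits(8) yields 3 for every seed (values hypothetical):
//
//   pg_t pgid(5 /* seed */, 1 /* pool */);
//   unsigned bits = pgid.get_split_bits(8);  // 3
//   // pgid.contains(bits, oid) iff oid is in pool 1 and
//   // the low 3 bits of its hash equal 5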

inline bool operator<(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
                              (l.preferred() == r.preferred() && (l.ps() < r.ps()))));
}
inline bool operator<=(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.preferred() < r.preferred() ||
                              (l.preferred() == r.preferred() && (l.ps() <= r.ps()))));
}
inline bool operator==(const pg_t& l, const pg_t& r) {
  return l.pool() == r.pool() &&
    l.preferred() == r.preferred() &&
    l.ps() == r.ps();
}
inline bool operator!=(const pg_t& l, const pg_t& r) {
  return l.pool() != r.pool() ||
    l.preferred() != r.preferred() ||
    l.ps() != r.ps();
}
inline bool operator>(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
                              (l.preferred() == r.preferred() && (l.ps() > r.ps()))));
}
inline bool operator>=(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.preferred() > r.preferred() ||
                              (l.preferred() == r.preferred() && (l.ps() >= r.ps()))));
}

ostream& operator<<(ostream& out, const pg_t &pg);

namespace std {
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
    }
  };
} // namespace std

struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  int32_t preferred() const {
    return pgid.preferred();
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }
  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
                set<spg_t> *pchildren) const {
    set<pg_t> _children;
    set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (set<pg_t>::iterator i = _children.begin();
           i != _children.end();
           ++i) {
        pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(pgid, bl);
    ::encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(1, bl);
    ::decode(pgid, bl);
    ::decode(shard, bl);
    DECODE_FINISH(bl);
  }

  ghobject_t make_temp_ghobject(const string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
                pgid.ps(),
                hobject_t::get_temp_pool(pgid.pool()),
                ""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
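// Example sketch: enumerating the children of a PG across a split.  Growing
// a pool from 8 to 16 PGs gives seed 5 the single child seed 13 (5 + 8);
// is_split() reports it with the shard id carried over (values and the
// "1.d" hex rendering are illustrative).
//
//   spg_t pg(pg_t(5, 1), shard_id_t::NO_SHARD);
//   set<spg_t> children;
//   if (pg.is_split(8, 16, &children)) {
//     // children == { 1.d }  (pool 1, seed 13, NO_SHARD)
//   }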

namespace std {
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
    {
      static hash<uint32_t> H;
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std

ostream& operator<<(ostream& out, const spg_t &pg);

// ----------------------

class coll_t {
  enum type_t {
    TYPE_META = 0,
    TYPE_LEGACY_TEMP = 1,  /* no longer used */
    TYPE_PG = 2,
    TYPE_PG_TEMP = 3,
  };
  type_t type;
  spg_t pgid;
  uint64_t removal_seq;  // note: deprecated, not encoded

  char _str_buff[spg_t::calc_name_buf_size];
  char *_str;

  void calc_str();

  coll_t(type_t t, spg_t p, uint64_t r)
    : type(t), pgid(p), removal_seq(r) {
    calc_str();
  }

public:
  coll_t() : type(TYPE_META), removal_seq(0)
  {
    calc_str();
  }

  coll_t(const coll_t& other)
    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
    calc_str();
  }

  explicit coll_t(spg_t pgid)
    : type(TYPE_PG), pgid(pgid), removal_seq(0)
  {
    calc_str();
  }

  coll_t& operator=(const coll_t& rhs)
  {
    this->type = rhs.type;
    this->pgid = rhs.pgid;
    this->removal_seq = rhs.removal_seq;
    this->calc_str();
    return *this;
  }

  // named constructors
  static coll_t meta() {
    return coll_t();
  }
  static coll_t pg(spg_t p) {
    return coll_t(p);
  }

  const std::string to_str() const {
    return string(_str);
  }
  const char *c_str() const {
    return _str;
  }

  bool parse(const std::string& s);

  bool operator<(const coll_t &rhs) const {
    return type < rhs.type ||
      (type == rhs.type && pgid < rhs.pgid);
  }

  bool is_meta() const {
    return type == TYPE_META;
  }
  bool is_pg_prefix(spg_t *pgid_) const {
    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_pg() const {
    return type == TYPE_PG;
  }
  bool is_pg(spg_t *pgid_) const {
    if (type == TYPE_PG) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_temp() const {
    return type == TYPE_PG_TEMP;
  }
  bool is_temp(spg_t *pgid_) const {
    if (type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  size_t encoded_size() const;

  inline bool operator==(const coll_t& rhs) const {
    // meta collections compare equal on type alone; pg collections
    // also compare their pgid
    if (type != rhs.type)
      return false;
    if (type == TYPE_META)
      return true;
    return pgid == rhs.pgid;
  }
  inline bool operator!=(const coll_t& rhs) const {
    return !(*this == rhs);
  }

  // get a TEMP collection that corresponds to the current collection,
  // which we presume is a pg collection.
  coll_t get_temp() const {
    assert(type == TYPE_PG);
    return coll_t(TYPE_PG_TEMP, pgid, 0);
  }

  ghobject_t get_min_hobj() const {
    ghobject_t o;
    switch (type) {
    case TYPE_PG:
      o.hobj.pool = pgid.pool();
      o.set_shard(pgid.shard);
      break;
    case TYPE_META:
      o.hobj.pool = -1;
      break;
    default:
      break;
    }
    return o;
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    if (type == TYPE_PG)
      return pgid.hash_to_shard(num_shards);
    return 0;  // whatever.
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<coll_t*>& o);
};

WRITE_CLASS_ENCODER(coll_t)

inline ostream& operator<<(ostream& out, const coll_t& c) {
  out << c.to_str();
  return out;
}

namespace std {
  template<> struct hash<coll_t> {
    size_t operator()(const coll_t &c) const {
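      // Jenkins one-at-a-time hash over the collection's string form.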
      size_t h = 0;
      string str(c.to_str());
      std::string::const_iterator end(str.end());
      for (std::string::const_iterator s = str.begin(); s != end; ++s) {
        h += *s;
        h += (h << 10);
        h ^= (h >> 6);
      }
      h += (h << 3);
      h ^= (h >> 11);
      h += (h << 15);
      return h;
    }
  };
} // namespace std

inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
{
  out << pg_t(ol.ol_pgid);
  int su = ol.ol_stripe_unit;
  if (su)
    out << ".su=" << su;
  return out;
}



// compound rados version type
/* WARNING: if you add a member to eversion_t, make sure the encode/decode
 * functions still work correctly.  On little-endian machines there must be
 * no padding, on 32-bit and 64-bit machines alike.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(bufferlist& bl) : __pad(0) { decode(bl); }

  static eversion_t max() {
    eversion_t max;
    max.version -= 1;
    max.epoch -= 1;
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  string get_key_name() const;

  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    ::encode(version, bl);
    ::encode(epoch, bl);
#endif
  }
  void decode(bufferlist::iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    ::decode(version, bl);
    ::decode(epoch, bl);
#endif
  }
  void decode(bufferlist& bl) {
    bufferlist::iterator p = bl.begin();
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)
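// Sketch of why the little-endian fast path in encode()/decode() is safe:
// it copies the first sizeof(version_t) + sizeof(epoch_t) bytes of the
// object, which is only correct while `version` is the first member,
// `epoch` immediately follows it, and no padding sits between them.  A
// guard like the following (hypothetical, not in the original header)
// would make that assumption explicit:
//
//   static_assert(offsetof(eversion_t, version) == 0 &&
//                 offsetof(eversion_t, epoch) == sizeof(version_t),
//                 "eversion_t layout changed; fix encode()/decode()");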

inline bool operator==(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) && (l.version == r.version);
}
inline bool operator!=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch != r.epoch) || (l.version != r.version);
}
inline bool operator<(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version < r.version) : (l.epoch < r.epoch);
}
inline bool operator<=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version <= r.version) : (l.epoch <= r.epoch);
}
inline bool operator>(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version > r.version) : (l.epoch > r.epoch);
}
inline bool operator>=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version >= r.version) : (l.epoch >= r.epoch);
}
inline ostream& operator<<(ostream& out, const eversion_t& e) {
  return out << e.epoch << "'" << e.version;
}

/**
 * objectstore_perf_stat_t
 *
 * current perf information about the osd
 */
struct objectstore_perf_stat_t {
  // latencies are stored in ms, since double add/sub are not associative
  uint32_t os_commit_latency;
  uint32_t os_apply_latency;

  objectstore_perf_stat_t() :
    os_commit_latency(0), os_apply_latency(0) {}

  bool operator==(const objectstore_perf_stat_t &r) const {
    return os_commit_latency == r.os_commit_latency &&
      os_apply_latency == r.os_apply_latency;
  }

  void add(const objectstore_perf_stat_t &o) {
    os_commit_latency += o.os_commit_latency;
    os_apply_latency += o.os_apply_latency;
  }
  void sub(const objectstore_perf_stat_t &o) {
    os_commit_latency -= o.os_commit_latency;
    os_apply_latency -= o.os_apply_latency;
  }
  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
};
WRITE_CLASS_ENCODER(objectstore_perf_stat_t)

/** osd_stat
 * aggregate stats for an osd
 */
struct osd_stat_t {
  int64_t kb, kb_used, kb_avail;
  vector<int> hb_peers;
  int32_t snap_trim_queue_len, num_snap_trimming;

  pow2_hist_t op_queue_age_hist;

  objectstore_perf_stat_t os_perf_stat;

  epoch_t up_from = 0;
  uint64_t seq = 0;

  uint32_t num_pgs = 0;

  osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
                 snap_trim_queue_len(0), num_snap_trimming(0) {}

  void add(const osd_stat_t& o) {
    kb += o.kb;
    kb_used += o.kb_used;
    kb_avail += o.kb_avail;
    snap_trim_queue_len += o.snap_trim_queue_len;
    num_snap_trimming += o.num_snap_trimming;
    op_queue_age_hist.add(o.op_queue_age_hist);
    os_perf_stat.add(o.os_perf_stat);
    num_pgs += o.num_pgs;
  }
  void sub(const osd_stat_t& o) {
    kb -= o.kb;
    kb_used -= o.kb_used;
    kb_avail -= o.kb_avail;
    snap_trim_queue_len -= o.snap_trim_queue_len;
    num_snap_trimming -= o.num_snap_trimming;
    op_queue_age_hist.sub(o.op_queue_age_hist);
    os_perf_stat.sub(o.os_perf_stat);
    num_pgs -= o.num_pgs;
  }

  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);
  static void generate_test_instances(std::list<osd_stat_t*>& o);
};
WRITE_CLASS_ENCODER(osd_stat_t)

inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
  return l.kb == r.kb &&
    l.kb_used == r.kb_used &&
    l.kb_avail == r.kb_avail &&
    l.snap_trim_queue_len == r.snap_trim_queue_len &&
    l.num_snap_trimming == r.num_snap_trimming &&
    l.hb_peers == r.hb_peers &&
    l.op_queue_age_hist == r.op_queue_age_hist &&
    l.os_perf_stat == r.os_perf_stat &&
    l.num_pgs == r.num_pgs;
}
inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
  return !(l == r);
}



inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
  return out << "osd_stat(" << byte_u_t(s.kb_used << 10) << " used, "
             << byte_u_t(s.kb_avail << 10) << " avail, "
             << byte_u_t(s.kb << 10) << " total, "
             << "peers " << s.hb_peers
             << " op hist " << s.op_queue_age_hist.h
             << ")";
}


/*
 * pg states
 */
#define PG_STATE_CREATING (1<<0)  // creating
#define PG_STATE_ACTIVE (1<<1)  // i am active.  (primary: replicas too)
#define PG_STATE_CLEAN (1<<2)  // peers are complete, clean of stray replicas.
#define PG_STATE_DOWN (1<<4)  // a needed replica is down, PG offline
#define PG_STATE_RECOVERY_UNFOUND (1<<5)  // recovery stopped due to unfound
#define PG_STATE_BACKFILL_UNFOUND (1<<6)  // backfill stopped due to unfound
//#define PG_STATE_SPLITTING (1<<7)  // i am splitting
#define PG_STATE_SCRUBBING (1<<8)  // scrubbing
//#define PG_STATE_SCRUBQ (1<<9)  // queued for scrub
#define PG_STATE_DEGRADED (1<<10)  // pg contains objects with reduced redundancy
#define PG_STATE_INCONSISTENT (1<<11)  // pg replicas are inconsistent (but shouldn't be)
#define PG_STATE_PEERING (1<<12)  // pg is (re)peering
#define PG_STATE_REPAIR (1<<13)  // pg should repair on next scrub
#define PG_STATE_RECOVERING (1<<14)  // pg is recovering/migrating objects
#define PG_STATE_BACKFILL_WAIT (1<<15)  // [active] reserving backfill
#define PG_STATE_INCOMPLETE (1<<16)  // incomplete content, peering failed.
#define PG_STATE_STALE (1<<17)  // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED (1<<18)  // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB (1<<19)  // deep scrub: check CRC32 on files
#define PG_STATE_BACKFILLING (1<<20)  // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1<<21)  // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT (1<<22)  // waiting for recovery reservations
#define PG_STATE_UNDERSIZED (1<<23)  // pg acting < pool size
#define PG_STATE_ACTIVATING (1<<24)  // pg is peered but not yet active
#define PG_STATE_PEERED (1<<25)  // peered, cannot go active, can recover
#define PG_STATE_SNAPTRIM (1<<26)  // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT (1<<27)  // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1<<28)  // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR (1<<29)  // error stopped trimming snaps
#define PG_STATE_FORCED_RECOVERY (1<<30)  // force recovery of this pg before any other
#define PG_STATE_FORCED_BACKFILL (1<<31)  // force backfill of this pg before any other

std::string pg_state_string(int state);
std::string pg_vector_string(const vector<int32_t> &a);
boost::optional<uint64_t> pg_string_state(const std::string& state);
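// Example (sketch): pg_state_string() renders the bit mask as the familiar
// "+"-joined state list (the exact rendering lives in osd_types.cc):
//
//   std::string s = pg_state_string(PG_STATE_ACTIVE | PG_STATE_CLEAN);
//   // s == "active+clean"; pg_string_state("active") recovers the bit.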

/*
 * pool_snap_info_t
 *
 * attributes for a single pool snapshot.
 */
struct pool_snap_info_t {
  snapid_t snapid;
  utime_t stamp;
  string name;

  void dump(Formatter *f) const;
  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)

inline ostream& operator<<(ostream& out, const pool_snap_info_t& si) {
  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
}


/*
 * pool_opts_t
 *
 * pool options.
 */

class pool_opts_t {
public:
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
  };

  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, Formatter *f) const;

  void dump(Formatter *f) const;
  void encode(bufferlist &bl) const;
  void decode(bufferlist::iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend ostream& operator<<(ostream& out, const pool_opts_t& opts);
};
WRITE_CLASS_ENCODER(pool_opts_t)
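// Usage sketch: values are stored in a boost::variant keyed by key_t, so the
// type passed to set()/get() must match the option's declared type_t (here
// RECOVERY_PRIORITY is an INT option; the value 5 is hypothetical).
//
//   pool_opts_t opts;
//   opts.set(pool_opts_t::RECOVERY_PRIORITY, 5);
//   int prio;
//   if (opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio)) {
//     // prio == 5; boost::get<int> would throw on a type mismatch
//   }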

/*
 * pg_pool
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  enum {
    TYPE_REPLICATED = 1,  // replication
    //TYPE_RAID4 = 2,     // raid4 (never implemented)
    TYPE_ERASURE = 3,     // erasure-coded
  };
  static const char *get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
      //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  const char *get_type_name() const {
    return get_type_name(type);
  }

  enum {
    FLAG_HASHPSPOOL = 1<<0,  // hash pg seed and pool together (instead of adding)
    FLAG_FULL       = 1<<1,  // pool is full
    FLAG_EC_OVERWRITES = 1<<2,  // enables overwrites; once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3,  // may have incomplete clones (because we are/were an overlay)
    FLAG_NODELETE = 1<<4,  // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5,  // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6,  // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7,  // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8,  // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9,  // block periodic deep-scrub
    FLAG_FULL_NO_QUOTA = 1<<10,  // pool is currently running out of quota, will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11,  // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12,  // pool is backfillfull
  };

  static const char *get_flag_name(int f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_NO_QUOTA: return "full_no_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    default: return "???";
    }
  }
  static string get_flags_string(uint64_t f) {
    string s;
    for (unsigned n=0; f && n<64; ++n) {
      if (f & (1ull << n)) {
        if (s.length())
          s += ",";
        s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }
  static uint64_t get_flag_by_name(const string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_no_quota")
      return FLAG_FULL_NO_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    return 0;
  }

  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;

  typedef enum {
    CACHEMODE_NONE = 0,         ///< no caching
    CACHEMODE_WRITEBACK = 1,    ///< write to cache, flush later
    CACHEMODE_FORWARD = 2,      ///< forward if not in cache
    CACHEMODE_READONLY = 3,     ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4,  ///< forward reads, write to cache flush later
    CACHEMODE_READPROXY = 5,    ///< proxy reads, write to cache flush later
    CACHEMODE_PROXY = 6,        ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  static cache_mode_t get_cache_mode_from_str(const string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      assert(0 == "implement me");
    }
  }

  uint64_t flags;       ///< FLAG_*
  __u8 type;            ///< TYPE_*
  __u8 size, min_size;  ///< number of osds in each pg
  __u8 crush_rule;      ///< crush placement rule
  __u8 object_hash;     ///< hash mapping object name to ps
private:
  __u32 pg_num, pgp_num;  ///< number of pgs


public:
  map<string,string> properties;  ///< OBSOLETE
  string erasure_code_profile;    ///< name of the erasure code profile in OSDMap
  epoch_t last_change;            ///< most recent epoch changed, excluding snapshot changes
  epoch_t last_force_op_resend;   ///< last epoch that forced clients to resend
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous;
  snapid_t snap_seq;    ///< seq for per-pool snapshot
  epoch_t snap_epoch;   ///< osdmap epoch of last snap
  uint64_t auid;        ///< who owns the pg
  __u32 crash_replay_interval;  ///< seconds to allow clients to replay ACKed but unCOMMITted requests

  uint64_t quota_max_bytes;    ///< maximum number of bytes for this pool
  uint64_t quota_max_objects;  ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool).  These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used).  Snaps and removed_snaps are to be used exclusive of each
   * other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask, pgp_num_mask;

  set<uint64_t> tiers;  ///< pools that are tiers of us
  int64_t tier_of;      ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier;    ///< pool/tier for objecter to direct reads to
  int64_t write_tier;   ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode;  ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  uint64_t target_max_bytes;    ///< tiering: target max pool size
  uint64_t target_max_objects;  ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro;       ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro;  ///< cache: fraction of target to flush with high speed
  uint32_t cache_target_full_ratio_micro;        ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age;  ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age;  ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params;  ///< The HitSet params to use on this pool
  uint32_t hit_set_period;        ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count;         ///< number of periods to retain
  bool use_gmt_hitset;            ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSets to check before promote on read
  uint32_t min_write_recency_for_promote;  ///< minimum number of HitSets to check before promote on write
  uint32_t hit_set_grade_decay_rate;  ///< the current hit_set has the highest priority for object
                                      ///< temperature counts; each subsequent hit_set's priority
                                      ///< decays by this rate relative to the previous one
  uint32_t hit_set_search_last_n;  ///< accumulate at most N hit_sets for temperature

  uint32_t stripe_width;  ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects;  ///< expected number of objects on this pool, a value of 0 indicates
                                  ///< user does not specify any expected value
  bool fast_read;  ///< whether to turn on fast read on the pool or not

  pool_opts_t opts;  ///< options

  /// application -> key/value metadata
  map<string, std::map<string, string>> application_metadata;

private:
  vector<uint32_t> grade_table;

public:
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }
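  // Worked example (hypothetical numbers): with hit_set_count = 4 and
  // hit_set_grade_decay_rate = 20, the loop above produces
  // grade_table = { 800000, 640000, 512000, 409600 }: each period's grade
  // is 80% of the previous one, decaying from a base of 1000000.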

  pg_pool_t()
    : flags(0), type(0), size(0), min_size(0),
      crush_rule(0), object_hash(0),
      pg_num(0), pgp_num(0),
      last_change(0),
      last_force_op_resend(0),
      last_force_op_resend_preluminous(0),
      snap_seq(0), snap_epoch(0),
      auid(0),
      crash_replay_interval(0),
      quota_max_bytes(0), quota_max_objects(0),
      pg_num_mask(0), pgp_num_mask(0),
      tier_of(-1), read_tier(-1), write_tier(-1),
      cache_mode(CACHEMODE_NONE),
      target_max_bytes(0), target_max_objects(0),
      cache_target_dirty_ratio_micro(0),
      cache_target_dirty_high_ratio_micro(0),
      cache_target_full_ratio_micro(0),
      cache_min_flush_age(0),
      cache_min_evict_age(0),
      hit_set_params(),
      hit_set_period(0),
      hit_set_count(0),
      use_gmt_hitset(true),
      min_read_recency_for_promote(0),
      min_write_recency_for_promote(0),
      hit_set_grade_decay_rate(0),
      hit_set_search_last_n(0),
      stripe_width(0),
      expected_num_objects(0),
      fast_read(false),
      opts()
  { }

  void dump(Formatter *f) const;

  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool ec_pool() const {
    return type == TYPE_ERASURE;
  }
  bool require_rollback() const {
    return ec_pool();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }
  unsigned get_crash_replay_interval() const { return crash_replay_interval; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return !(get_type() == TYPE_ERASURE);
  }

  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      assert(0 == "unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

  // if pg_num is not a power of two, pgs are not equally sized.
  // return, for a given pg, the fraction (denominator) of the total
  // pool size that it represents.
  unsigned get_pg_num_divisor(pg_t pgid) const;

  void set_pg_num(int p) {
    pg_num = p;
    calc_pg_masks();
  }
  void set_pgp_num(int p) {
    pgp_num = p;
    calc_pg_masks();
  }

  void set_quota_max_bytes(uint64_t m) {
    quota_max_bytes = m;
  }
  uint64_t get_quota_max_bytes() {
    return quota_max_bytes;
  }

  void set_quota_max_objects(uint64_t m) {
    quota_max_objects = m;
  }
  uint64_t get_quota_max_objects() {
    return quota_max_objects;
  }

  void set_last_force_op_resend(uint64_t t) {
    last_force_op_resend = t;
    last_force_op_resend_preluminous = t;
  }

  void calc_pg_masks();

  /*
   * we have two snap modes:
   *  - pool global snaps
   *    - snap existence/non-existence defined by snaps[] and snap_seq
   *  - user managed snaps
   *    - removal governed by removed_snaps
   *
   * we know which mode we're using based on whether removed_snaps is empty.
   * If nothing has been created, both functions report false.
   */
  bool is_pool_snaps_mode() const;
  bool is_unmanaged_snaps_mode() const;
  bool is_removed_snap(snapid_t s) const;

  /*
   * build set of known-removed sets from either pool snaps or
   * explicit removed_snaps set.
   */
  void build_removed_snaps(interval_set<snapid_t>& rs) const;
  bool maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const;
  snapid_t snap_exists(const char *s) const;
  void add_snap(const char *n, utime_t stamp);
  void add_unmanaged_snap(uint64_t& snapid);
  void remove_snap(snapid_t s);
  void remove_unmanaged_snap(snapid_t s);

  SnapContext get_snap_context() const;

  /// hash an object name+namespace key to a hash position
  uint32_t hash_key(const string& key, const string& ns) const;

  /// round a hash position down to a pg num
  uint32_t raw_hash_to_pg(uint32_t v) const;

  /*
   * map a raw pg (with full precision ps) into an actual pg, for storage
   */
  pg_t raw_pg_to_pg(pg_t pg) const;

  /*
   * map raw pg (full precision ps) into a placement seed.  include
   * pool id in that value so that different pools don't use the same
   * seeds.
   */
  ps_t raw_pg_to_pps(pg_t pg) const;

  /// choose a random hash position within a pg
  uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

  void encode(bufferlist& bl, uint64_t features) const;
  void decode(bufferlist::iterator& bl);

  static void generate_test_instances(list<pg_pool_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
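// Sketch of the raw-pg reduction pipeline declared above (the actual
// implementations live in osd_types.cc; the names and values below are
// purely illustrative): hash the object key to a full-precision ps, fold
// it down to one of pg_num PGs, and mix the pool id into the placement
// seed so different pools don't share seeds.
//
//   uint32_t raw = pool.hash_key("rbd_data.1", "" /* namespace */);
//   pg_t rawpg(raw, poolid);
//   pg_t actual = pool.raw_pg_to_pg(rawpg);   // ps reduced to < pg_num
//   ps_t seed   = pool.raw_pg_to_pps(rawpg);  // seed also mixes in pool id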
1614
1615 ostream& operator<<(ostream& out, const pg_pool_t& p);
1616
1617
1618 /**
1619 * a summation of object stats
1620 *
1621 * This is just a container for object stats; we don't know what for.
1622 *
1623 * If you add members in object_stat_sum_t, you should make sure there are
1624 * not padding among these members.
1625 * You should also modify the padding_check function.
1626
1627 */
1628 struct object_stat_sum_t {
1629 /**************************************************************************
1630 * WARNING: be sure to update operator==, floor, and split when
1631 * adding/removing fields!
1632 **************************************************************************/
1633 int64_t num_bytes; // in bytes
1634 int64_t num_objects;
1635 int64_t num_object_clones;
1636 int64_t num_object_copies; // num_objects * num_replicas
1637 int64_t num_objects_missing_on_primary;
1638 int64_t num_objects_degraded;
1639 int64_t num_objects_unfound;
1640 int64_t num_rd;
1641 int64_t num_rd_kb;
1642 int64_t num_wr;
1643 int64_t num_wr_kb;
1644 int64_t num_scrub_errors; // total deep and shallow scrub errors
1645 int64_t num_objects_recovered;
1646 int64_t num_bytes_recovered;
1647 int64_t num_keys_recovered;
1648 int64_t num_shallow_scrub_errors;
1649 int64_t num_deep_scrub_errors;
1650 int64_t num_objects_dirty;
1651 int64_t num_whiteouts;
1652 int64_t num_objects_omap;
1653 int64_t num_objects_hit_set_archive;
1654 int64_t num_objects_misplaced;
1655 int64_t num_bytes_hit_set_archive;
1656 int64_t num_flush;
1657 int64_t num_flush_kb;
1658 int64_t num_evict;
1659 int64_t num_evict_kb;
1660 int64_t num_promote;
1661 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1662 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1663 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1664 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1665 int64_t num_objects_pinned;
1666 int64_t num_objects_missing;
1667 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
1668 int64_t num_large_omap_objects = 0;
1669
1670 object_stat_sum_t()
1671 : num_bytes(0),
1672 num_objects(0), num_object_clones(0), num_object_copies(0),
1673 num_objects_missing_on_primary(0), num_objects_degraded(0),
1674 num_objects_unfound(0),
1675 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1676 num_scrub_errors(0),
1677 num_objects_recovered(0),
1678 num_bytes_recovered(0),
1679 num_keys_recovered(0),
1680 num_shallow_scrub_errors(0),
1681 num_deep_scrub_errors(0),
1682 num_objects_dirty(0),
1683 num_whiteouts(0),
1684 num_objects_omap(0),
1685 num_objects_hit_set_archive(0),
1686 num_objects_misplaced(0),
1687 num_bytes_hit_set_archive(0),
1688 num_flush(0),
1689 num_flush_kb(0),
1690 num_evict(0),
1691 num_evict_kb(0),
1692 num_promote(0),
1693 num_flush_mode_high(0), num_flush_mode_low(0),
1694 num_evict_mode_some(0), num_evict_mode_full(0),
1695 num_objects_pinned(0),
1696 num_objects_missing(0),
1697 num_legacy_snapsets(0)
1698 {}
1699
1700 void floor(int64_t f) {
1701 #define FLOOR(x) if (x < f) x = f
1702 FLOOR(num_bytes);
1703 FLOOR(num_objects);
1704 FLOOR(num_object_clones);
1705 FLOOR(num_object_copies);
1706 FLOOR(num_objects_missing_on_primary);
1707 FLOOR(num_objects_missing);
1708 FLOOR(num_objects_degraded);
1709 FLOOR(num_objects_misplaced);
1710 FLOOR(num_objects_unfound);
1711 FLOOR(num_rd);
1712 FLOOR(num_rd_kb);
1713 FLOOR(num_wr);
1714 FLOOR(num_wr_kb);
1715 FLOOR(num_large_omap_objects);
1716 FLOOR(num_shallow_scrub_errors);
1717 FLOOR(num_deep_scrub_errors);
1718 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
1719 FLOOR(num_objects_recovered);
1720 FLOOR(num_bytes_recovered);
1721 FLOOR(num_keys_recovered);
1722 FLOOR(num_objects_dirty);
1723 FLOOR(num_whiteouts);
1724 FLOOR(num_objects_omap);
1725 FLOOR(num_objects_hit_set_archive);
1726 FLOOR(num_bytes_hit_set_archive);
1727 FLOOR(num_flush);
1728 FLOOR(num_flush_kb);
1729 FLOOR(num_evict);
1730 FLOOR(num_evict_kb);
1731 FLOOR(num_promote);
1732 FLOOR(num_flush_mode_high);
1733 FLOOR(num_flush_mode_low);
1734 FLOOR(num_evict_mode_some);
1735 FLOOR(num_evict_mode_full);
1736 FLOOR(num_objects_pinned);
1737 FLOOR(num_legacy_snapsets);
1738 #undef FLOOR
1739 }
1740
1741 void split(vector<object_stat_sum_t> &out) const {
1742 #define SPLIT(PARAM) \
1743 for (unsigned i = 0; i < out.size(); ++i) { \
1744 out[i].PARAM = PARAM / out.size(); \
1745 if (i < (PARAM % out.size())) { \
1746 out[i].PARAM++; \
1747 } \
1748 }
1749 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1750 for (unsigned i = 0; i < out.size(); ++i) { \
1751 if (PARAM) \
1752 out[i].PARAM = 1 + PARAM / out.size(); \
1753 else \
1754 out[i].PARAM = 0; \
1755 }
1756
1757 SPLIT(num_bytes);
1758 SPLIT(num_objects);
1759 SPLIT(num_object_clones);
1760 SPLIT(num_object_copies);
1761 SPLIT(num_objects_missing_on_primary);
1762 SPLIT(num_objects_missing);
1763 SPLIT(num_objects_degraded);
1764 SPLIT(num_objects_misplaced);
1765 SPLIT(num_objects_unfound);
1766 SPLIT(num_rd);
1767 SPLIT(num_rd_kb);
1768 SPLIT(num_wr);
1769 SPLIT(num_wr_kb);
1770 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1771 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1772 for (unsigned i = 0; i < out.size(); ++i) {
1773 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1774 out[i].num_deep_scrub_errors;
1775 }
1776 SPLIT(num_large_omap_objects);
1777 SPLIT(num_objects_recovered);
1778 SPLIT(num_bytes_recovered);
1779 SPLIT(num_keys_recovered);
1780 SPLIT(num_objects_dirty);
1781 SPLIT(num_whiteouts);
1782 SPLIT(num_objects_omap);
1783 SPLIT(num_objects_hit_set_archive);
1784 SPLIT(num_bytes_hit_set_archive);
1785 SPLIT(num_flush);
1786 SPLIT(num_flush_kb);
1787 SPLIT(num_evict);
1788 SPLIT(num_evict_kb);
1789 SPLIT(num_promote);
1790 SPLIT(num_flush_mode_high);
1791 SPLIT(num_flush_mode_low);
1792 SPLIT(num_evict_mode_some);
1793 SPLIT(num_evict_mode_full);
1794 SPLIT(num_objects_pinned);
1795 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1796 #undef SPLIT
1797 #undef SPLIT_PRESERVE_NONZERO
1798 }
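/*
 * Illustrative sketch (editorial addition): split() spreads each counter
 * across the children, handing the remainder to the first entries, while
 * SPLIT_PRESERVE_NONZERO rounds up so a nonzero upper bound never becomes
 * zero in any child:
 *
 *   object_stat_sum_t s;
 *   s.num_objects = 10;
 *   s.num_legacy_snapsets = 1;       // an upper bound; must stay nonzero
 *   vector<object_stat_sum_t> out(3);
 *   s.split(out);
 *   // out[*].num_objects == {4, 3, 3}
 *   // out[*].num_legacy_snapsets == 1 in every child
 */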
1799
1800 void clear() {
1801 memset(this, 0, sizeof(*this));
1802 }
1803
1804 void calc_copies(int nrep) {
1805 num_object_copies = nrep * num_objects;
1806 }
1807
1808 bool is_zero() const {
1809 return mem_is_zero((char*)this, sizeof(*this));
1810 }
1811
1812 void add(const object_stat_sum_t& o);
1813 void sub(const object_stat_sum_t& o);
1814
1815 void dump(Formatter *f) const;
1816 void padding_check() {
1817 static_assert(
1818 sizeof(object_stat_sum_t) ==
1819 sizeof(num_bytes) +
1820 sizeof(num_objects) +
1821 sizeof(num_object_clones) +
1822 sizeof(num_object_copies) +
1823 sizeof(num_objects_missing_on_primary) +
1824 sizeof(num_objects_degraded) +
1825 sizeof(num_objects_unfound) +
1826 sizeof(num_rd) +
1827 sizeof(num_rd_kb) +
1828 sizeof(num_wr) +
1829 sizeof(num_wr_kb) +
1830 sizeof(num_scrub_errors) +
1831 sizeof(num_large_omap_objects) +
1832 sizeof(num_objects_recovered) +
1833 sizeof(num_bytes_recovered) +
1834 sizeof(num_keys_recovered) +
1835 sizeof(num_shallow_scrub_errors) +
1836 sizeof(num_deep_scrub_errors) +
1837 sizeof(num_objects_dirty) +
1838 sizeof(num_whiteouts) +
1839 sizeof(num_objects_omap) +
1840 sizeof(num_objects_hit_set_archive) +
1841 sizeof(num_objects_misplaced) +
1842 sizeof(num_bytes_hit_set_archive) +
1843 sizeof(num_flush) +
1844 sizeof(num_flush_kb) +
1845 sizeof(num_evict) +
1846 sizeof(num_evict_kb) +
1847 sizeof(num_promote) +
1848 sizeof(num_flush_mode_high) +
1849 sizeof(num_flush_mode_low) +
1850 sizeof(num_evict_mode_some) +
1851 sizeof(num_evict_mode_full) +
1852 sizeof(num_objects_pinned) +
1853 sizeof(num_objects_missing) +
1854 sizeof(num_legacy_snapsets)
1855 ,
1856 "object_stat_sum_t have padding");
1857 }
1858 void encode(bufferlist& bl) const;
1859 void decode(bufferlist::iterator& bl);
1860 static void generate_test_instances(list<object_stat_sum_t*>& o);
1861 };
1862 WRITE_CLASS_ENCODER(object_stat_sum_t)
1863
1864 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1865
1866 /**
1867 * a collection of object stat sums
1868 *
1869 * This is a collection of stat sums over different categories.
1870 */
1871 struct object_stat_collection_t {
1872 /**************************************************************************
1873 * WARNING: be sure to update the operator== when adding/removing fields! *
1874 **************************************************************************/
1875 object_stat_sum_t sum;
1876
1877 void calc_copies(int nrep) {
1878 sum.calc_copies(nrep);
1879 }
1880
1881 void dump(Formatter *f) const;
1882 void encode(bufferlist& bl) const;
1883 void decode(bufferlist::iterator& bl);
1884 static void generate_test_instances(list<object_stat_collection_t*>& o);
1885
1886 bool is_zero() const {
1887 return sum.is_zero();
1888 }
1889
1890 void clear() {
1891 sum.clear();
1892 }
1893
1894 void floor(int64_t f) {
1895 sum.floor(f);
1896 }
1897
1898 void add(const object_stat_sum_t& o) {
1899 sum.add(o);
1900 }
1901
1902 void add(const object_stat_collection_t& o) {
1903 sum.add(o.sum);
1904 }
1905 void sub(const object_stat_collection_t& o) {
1906 sum.sub(o.sum);
1907 }
1908 };
1909 WRITE_CLASS_ENCODER(object_stat_collection_t)
1910
1911 inline bool operator==(const object_stat_collection_t& l,
1912 const object_stat_collection_t& r) {
1913 return l.sum == r.sum;
1914 }
1915
1916
1917 /** pg_stat
1918 * aggregate stats for a single PG.
1919 */
1920 struct pg_stat_t {
1921 /**************************************************************************
1922 * WARNING: be sure to update the operator== when adding/removing fields! *
1923 **************************************************************************/
1924 eversion_t version;
1925 version_t reported_seq; // sequence number
1926 epoch_t reported_epoch; // epoch of this report
1927 __u32 state;
1928 utime_t last_fresh; // last reported
1929 utime_t last_change; // new state != previous state
1930 utime_t last_active; // state & PG_STATE_ACTIVE
1931 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1932 utime_t last_clean; // state & PG_STATE_CLEAN
1933 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
1934 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
1935 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
1936
1937 eversion_t log_start; // (log_start,version]
1938 eversion_t ondisk_log_start; // there may be more on disk
1939
1940 epoch_t created;
1941 epoch_t last_epoch_clean;
1942 pg_t parent;
1943 __u32 parent_split_bits;
1944
1945 eversion_t last_scrub;
1946 eversion_t last_deep_scrub;
1947 utime_t last_scrub_stamp;
1948 utime_t last_deep_scrub_stamp;
1949 utime_t last_clean_scrub_stamp;
1950
1951 object_stat_collection_t stats;
1952
1953 int64_t log_size;
1954 int64_t ondisk_log_size; // >= active_log_size
1955
1956 vector<int32_t> up, acting;
1957 epoch_t mapping_epoch;
1958
1959 vector<int32_t> blocked_by; ///< osds on which the pg is blocked
1960
1961 utime_t last_became_active;
1962 utime_t last_became_peered;
1963
1964 /// up, acting primaries
1965 int32_t up_primary;
1966 int32_t acting_primary;
1967
1968 // snaptrimq.size() is 64-bit, but let's be serious - anything over 50k is
1969 // absurd already, so cap it (add() clamps the sum at 2^31) and save 4 bytes at the same time
1970 uint32_t snaptrimq_len;
1971
1972 bool stats_invalid:1;
1973 /// true if num_objects_dirty is not accurate (because it was not
1974 /// maintained starting from pool creation)
1975 bool dirty_stats_invalid:1;
1976 bool omap_stats_invalid:1;
1977 bool hitset_stats_invalid:1;
1978 bool hitset_bytes_stats_invalid:1;
1979 bool pin_stats_invalid:1;
1980
1981 pg_stat_t()
1982 : reported_seq(0),
1983 reported_epoch(0),
1984 state(0),
1985 created(0), last_epoch_clean(0),
1986 parent_split_bits(0),
1987 log_size(0), ondisk_log_size(0),
1988 mapping_epoch(0),
1989 up_primary(-1),
1990 acting_primary(-1),
1991 snaptrimq_len(0),
1992 stats_invalid(false),
1993 dirty_stats_invalid(false),
1994 omap_stats_invalid(false),
1995 hitset_stats_invalid(false),
1996 hitset_bytes_stats_invalid(false),
1997 pin_stats_invalid(false)
1998 { }
1999
2000 epoch_t get_effective_last_epoch_clean() const {
2001 if (state & PG_STATE_CLEAN) {
2002 // we are clean as of this report, and should thus take the
2003 // reported epoch
2004 return reported_epoch;
2005 } else {
2006 return last_epoch_clean;
2007 }
2008 }
2009
2010 pair<epoch_t, version_t> get_version_pair() const {
2011 return make_pair(reported_epoch, reported_seq);
2012 }
2013
2014 void floor(int64_t f) {
2015 stats.floor(f);
2016 if (log_size < f)
2017 log_size = f;
2018 if (ondisk_log_size < f)
2019 ondisk_log_size = f;
2020 if (snaptrimq_len < f)
2021 snaptrimq_len = f;
2022 }
2023
2024 void add(const pg_stat_t& o) {
2025 stats.add(o.stats);
2026 log_size += o.log_size;
2027 ondisk_log_size += o.ondisk_log_size;
2028 if (((uint64_t)snaptrimq_len + (uint64_t)o.snaptrimq_len) > ((uint64_t)1 << 31)) {
2029 snaptrimq_len = 1u << 31; // saturate instead of wrapping the 32-bit field
2030 } else {
2031 snaptrimq_len += o.snaptrimq_len;
2032 }
2033 }
2034 void sub(const pg_stat_t& o) {
2035 stats.sub(o.stats);
2036 log_size -= o.log_size;
2037 ondisk_log_size -= o.ondisk_log_size;
2038 if (o.snaptrimq_len < snaptrimq_len) {
2039 snaptrimq_len -= o.snaptrimq_len;
2040 } else {
2041 snaptrimq_len = 0;
2042 }
2043 }
2044
2045 bool is_acting_osd(int32_t osd, bool primary) const;
2046 void dump(Formatter *f) const;
2047 void dump_brief(Formatter *f) const;
2048 void encode(bufferlist &bl) const;
2049 void decode(bufferlist::iterator &bl);
2050 static void generate_test_instances(list<pg_stat_t*>& o);
2051 };
2052 WRITE_CLASS_ENCODER(pg_stat_t)
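/*
 * Illustrative sketch (editorial addition): pg_stat_t::add() saturates
 * snaptrimq_len at 2^31 instead of wrapping the 32-bit field, and sub()
 * floors it at 0:
 *
 *   pg_stat_t a, b;
 *   a.snaptrimq_len = 0x7fffffffu;
 *   b.snaptrimq_len = 0x7fffffffu;
 *   a.add(b);                        // clamped to 2^31, not wrapped
 *   b.sub(a);                        // floors at 0, does not underflow
 */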
2053
2054 bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2055
2056 /*
2057 * summation over an entire pool
2058 */
2059 struct pool_stat_t {
2060 object_stat_collection_t stats;
2061 int64_t log_size;
2062 int64_t ondisk_log_size; // >= active_log_size
2063 int32_t up; ///< number of up replicas or shards
2064 int32_t acting; ///< number of acting replicas or shards
2065
2066 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
2067 { }
2068
2069 void floor(int64_t f) {
2070 stats.floor(f);
2071 if (log_size < f)
2072 log_size = f;
2073 if (ondisk_log_size < f)
2074 ondisk_log_size = f;
2075 if (up < f)
2076 up = f;
2077 if (acting < f)
2078 acting = f;
2079 }
2080
2081 void add(const pg_stat_t& o) {
2082 stats.add(o.stats);
2083 log_size += o.log_size;
2084 ondisk_log_size += o.ondisk_log_size;
2085 up += o.up.size();
2086 acting += o.acting.size();
2087 }
2088 void sub(const pg_stat_t& o) {
2089 stats.sub(o.stats);
2090 log_size -= o.log_size;
2091 ondisk_log_size -= o.ondisk_log_size;
2092 up -= o.up.size();
2093 acting -= o.acting.size();
2094 }
2095
2096 bool is_zero() const {
2097 return (stats.is_zero() &&
2098 log_size == 0 &&
2099 ondisk_log_size == 0 &&
2100 up == 0 &&
2101 acting == 0);
2102 }
2103
2104 void dump(Formatter *f) const;
2105 void encode(bufferlist &bl, uint64_t features) const;
2106 void decode(bufferlist::iterator &bl);
2107 static void generate_test_instances(list<pool_stat_t*>& o);
2108 };
2109 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
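/*
 * Illustrative sketch (editorial addition): a pool_stat_t is built by
 * summing the stats of every PG in the pool (pg_stats_for_pool is a
 * hypothetical container of pg_stat_t):
 *
 *   pool_stat_t ps;
 *   for (const pg_stat_t &pg : pg_stats_for_pool)
 *     ps.add(pg);
 *   // ps.up and ps.acting now hold total replica/shard counts
 */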
2110
2111
2112 // -----------------------------------------
2113
2114 /**
2115 * pg_hit_set_info_t - information about a single recorded HitSet
2116 *
2117 * Track basic metadata about a HitSet, like the number of insertions
2118 * and the time range it covers.
2119 */
2120 struct pg_hit_set_info_t {
2121 utime_t begin, end; ///< time interval
2122 eversion_t version; ///< version this HitSet object was written
2123 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2124
2125 friend bool operator==(const pg_hit_set_info_t& l,
2126 const pg_hit_set_info_t& r) {
2127 return
2128 l.begin == r.begin &&
2129 l.end == r.end &&
2130 l.version == r.version &&
2131 l.using_gmt == r.using_gmt;
2132 }
2133
2134 explicit pg_hit_set_info_t(bool using_gmt = true)
2135 : using_gmt(using_gmt) {}
2136
2137 void encode(bufferlist &bl) const;
2138 void decode(bufferlist::iterator &bl);
2139 void dump(Formatter *f) const;
2140 static void generate_test_instances(list<pg_hit_set_info_t*>& o);
2141 };
2142 WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2143
2144 /**
2145 * pg_hit_set_history_t - information about a history of hitsets
2146 *
2147 * Include information about the currently accumulating hit set as well
2148 * as archived/historical ones.
2149 */
2150 struct pg_hit_set_history_t {
2151 eversion_t current_last_update; ///< last version inserted into current set
2152 list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2153
2154 friend bool operator==(const pg_hit_set_history_t& l,
2155 const pg_hit_set_history_t& r) {
2156 return
2157 l.current_last_update == r.current_last_update &&
2158 l.history == r.history;
2159 }
2160
2161 void encode(bufferlist &bl) const;
2162 void decode(bufferlist::iterator &bl);
2163 void dump(Formatter *f) const;
2164 static void generate_test_instances(list<pg_hit_set_history_t*>& o);
2165 };
2166 WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2167
2168
2169 // -----------------------------------------
2170
2171 /**
2172 * pg_history_t - information about recent pg peering/mapping history
2173 *
2174 * This is aggressively shared between OSDs to bound the amount of past
2175 * history they need to worry about.
2176 */
2177 struct pg_history_t {
2178 epoch_t epoch_created; // epoch in which *pg* was created (at pool creation or by split)
2179 epoch_t epoch_pool_created; // epoch in which *pool* was created
2180 // (note: may be pg creation epoch for
2181 // pre-luminous clusters)
2182 epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
2183 epoch_t last_interval_started; // first epoch of last_epoch_started interval
2184 epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
2185 epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
2186 epoch_t last_epoch_split; // as parent or child
2187 epoch_t last_epoch_marked_full; // pool or cluster
2188
2189 /**
2190 * In the event of a map discontinuity, same_*_since may reflect the first
2191 * map the osd has seen in the new map sequence rather than the actual start
2192 * of the interval. This is ok since a discontinuity at epoch e means there
2193 * must have been a clean interval between e and now and that we cannot be
2194 * in the active set during the interval containing e.
2195 */
2196 epoch_t same_up_since; // same up set since
2197 epoch_t same_interval_since; // same acting AND up set since
2198 epoch_t same_primary_since; // same primary at least back through this epoch.
2199
2200 eversion_t last_scrub;
2201 eversion_t last_deep_scrub;
2202 utime_t last_scrub_stamp;
2203 utime_t last_deep_scrub_stamp;
2204 utime_t last_clean_scrub_stamp;
2205
2206 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2207 return
2208 l.epoch_created == r.epoch_created &&
2209 l.epoch_pool_created == r.epoch_pool_created &&
2210 l.last_epoch_started == r.last_epoch_started &&
2211 l.last_interval_started == r.last_interval_started &&
2212 l.last_epoch_clean == r.last_epoch_clean &&
2213 l.last_interval_clean == r.last_interval_clean &&
2214 l.last_epoch_split == r.last_epoch_split &&
2215 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2216 l.same_up_since == r.same_up_since &&
2217 l.same_interval_since == r.same_interval_since &&
2218 l.same_primary_since == r.same_primary_since &&
2219 l.last_scrub == r.last_scrub &&
2220 l.last_deep_scrub == r.last_deep_scrub &&
2221 l.last_scrub_stamp == r.last_scrub_stamp &&
2222 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2223 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp;
2224 }
2225
2226 pg_history_t()
2227 : epoch_created(0),
2228 epoch_pool_created(0),
2229 last_epoch_started(0),
2230 last_interval_started(0),
2231 last_epoch_clean(0),
2232 last_interval_clean(0),
2233 last_epoch_split(0),
2234 last_epoch_marked_full(0),
2235 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2236
2237 bool merge(const pg_history_t &other) {
2238 // Here, we only update the fields which cannot be calculated from the OSDmap.
2239 bool modified = false;
2240 if (epoch_created < other.epoch_created) {
2241 epoch_created = other.epoch_created;
2242 modified = true;
2243 }
2244 if (epoch_pool_created < other.epoch_pool_created) {
2245 // FIXME: for jewel compat only; this should either be 0 or always the
2246 // same value across all pg instances.
2247 epoch_pool_created = other.epoch_pool_created;
2248 modified = true;
2249 }
2250 if (last_epoch_started < other.last_epoch_started) {
2251 last_epoch_started = other.last_epoch_started;
2252 modified = true;
2253 }
2254 if (last_interval_started < other.last_interval_started) {
2255 last_interval_started = other.last_interval_started;
2256 modified = true;
2257 }
2258 if (last_epoch_clean < other.last_epoch_clean) {
2259 last_epoch_clean = other.last_epoch_clean;
2260 modified = true;
2261 }
2262 if (last_interval_clean < other.last_interval_clean) {
2263 last_interval_clean = other.last_interval_clean;
2264 modified = true;
2265 }
2266 if (last_epoch_split < other.last_epoch_split) {
2267 last_epoch_split = other.last_epoch_split;
2268 modified = true;
2269 }
2270 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2271 last_epoch_marked_full = other.last_epoch_marked_full;
2272 modified = true;
2273 }
2274 if (other.last_scrub > last_scrub) {
2275 last_scrub = other.last_scrub;
2276 modified = true;
2277 }
2278 if (other.last_scrub_stamp > last_scrub_stamp) {
2279 last_scrub_stamp = other.last_scrub_stamp;
2280 modified = true;
2281 }
2282 if (other.last_deep_scrub > last_deep_scrub) {
2283 last_deep_scrub = other.last_deep_scrub;
2284 modified = true;
2285 }
2286 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2287 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2288 modified = true;
2289 }
2290 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2291 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2292 modified = true;
2293 }
2294 return modified;
2295 }
2296
2297 void encode(bufferlist& bl) const;
2298 void decode(bufferlist::iterator& p);
2299 void dump(Formatter *f) const;
2300 static void generate_test_instances(list<pg_history_t*>& o);
2301 };
2302 WRITE_CLASS_ENCODER(pg_history_t)
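/*
 * Illustrative sketch (editorial addition): merge() takes the field-wise
 * maximum of the epochs and stamps that cannot be recomputed from the
 * OSDMap, and reports whether anything changed:
 *
 *   pg_history_t ours, theirs;
 *   ours.last_epoch_started = 10;
 *   theirs.last_epoch_started = 12;
 *   bool changed = ours.merge(theirs);
 *   // changed == true, ours.last_epoch_started == 12
 */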
2303
2304 inline ostream& operator<<(ostream& out, const pg_history_t& h) {
2305 return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2306 << " lis/c " << h.last_interval_started
2307 << "/" << h.last_interval_clean
2308 << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
2309 << "/" << h.last_epoch_marked_full
2310 << " " << h.same_up_since
2311 << "/" << h.same_interval_since
2312 << "/" << h.same_primary_since;
2313 }
2314
2315
2316 /**
2317 * pg_info_t - summary of PG statistics.
2318 *
2319 * some notes:
2320 * - last_complete implies we have all objects that existed as of that
2321 * stamp, OR a newer object, OR have already applied a later delete.
2322 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2323 * otherwise, we have no idea what the pg is supposed to contain.
2324 */
2325 struct pg_info_t {
2326 spg_t pgid;
2327 eversion_t last_update; ///< last object version applied to store.
2328 eversion_t last_complete; ///< last version pg was complete through.
2329 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2330 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2331
2332 version_t last_user_version; ///< last user object version applied to store
2333
2334 eversion_t log_tail; ///< oldest log entry.
2335
2336 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2337 bool last_backfill_bitwise; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2338
2339 interval_set<snapid_t> purged_snaps;
2340
2341 pg_stat_t stats;
2342
2343 pg_history_t history;
2344 pg_hit_set_history_t hit_set;
2345
2346 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2347 return
2348 l.pgid == r.pgid &&
2349 l.last_update == r.last_update &&
2350 l.last_complete == r.last_complete &&
2351 l.last_epoch_started == r.last_epoch_started &&
2352 l.last_interval_started == r.last_interval_started &&
2353 l.last_user_version == r.last_user_version &&
2354 l.log_tail == r.log_tail &&
2355 l.last_backfill == r.last_backfill &&
2356 l.last_backfill_bitwise == r.last_backfill_bitwise &&
2357 l.purged_snaps == r.purged_snaps &&
2358 l.stats == r.stats &&
2359 l.history == r.history &&
2360 l.hit_set == r.hit_set;
2361 }
2362
2363 pg_info_t()
2364 : last_epoch_started(0),
2365 last_interval_started(0),
2366 last_user_version(0),
2367 last_backfill(hobject_t::get_max()),
2368 last_backfill_bitwise(false)
2369 { }
2370 // cppcheck-suppress noExplicitConstructor
2371 pg_info_t(spg_t p)
2372 : pgid(p),
2373 last_epoch_started(0),
2374 last_interval_started(0),
2375 last_user_version(0),
2376 last_backfill(hobject_t::get_max()),
2377 last_backfill_bitwise(false)
2378 { }
2379
2380 void set_last_backfill(hobject_t pos) {
2381 last_backfill = pos;
2382 last_backfill_bitwise = true;
2383 }
2384
2385 bool is_empty() const { return last_update.version == 0; }
2386 bool dne() const { return history.epoch_created == 0; }
2387
2388 bool is_incomplete() const { return !last_backfill.is_max(); }
2389
2390 void encode(bufferlist& bl) const;
2391 void decode(bufferlist::iterator& p);
2392 void dump(Formatter *f) const;
2393 bool overlaps_with(const pg_info_t &oinfo) const {
2394 return last_update > oinfo.log_tail ?
2395 oinfo.last_update >= log_tail :
2396 last_update >= oinfo.log_tail;
2397 }
2398 static void generate_test_instances(list<pg_info_t*>& o);
2399 };
2400 WRITE_CLASS_ENCODER(pg_info_t)
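/*
 * Illustrative sketch (editorial addition): overlaps_with() tests whether
 * two infos' log ranges (log_tail, last_update] intersect, which is the
 * precondition for log-based recovery instead of backfill:
 *
 *   pg_info_t a, b;
 *   a.log_tail = eversion_t(1, 10); a.last_update = eversion_t(1, 50);
 *   b.log_tail = eversion_t(1, 40); b.last_update = eversion_t(1, 60);
 *   // a.overlaps_with(b) == true: (40,60] intersects (10,50]
 */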
2401
2402 inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
2403 {
2404 out << pgi.pgid << "(";
2405 if (pgi.dne())
2406 out << " DNE";
2407 if (pgi.is_empty())
2408 out << " empty";
2409 else {
2410 out << " v " << pgi.last_update;
2411 if (pgi.last_complete != pgi.last_update)
2412 out << " lc " << pgi.last_complete;
2413 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2414 }
2415 if (pgi.is_incomplete())
2416 out << " lb " << pgi.last_backfill
2417 << (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
2418 //out << " c " << pgi.epoch_created;
2419 out << " local-lis/les=" << pgi.last_interval_started
2420 << "/" << pgi.last_epoch_started;
2421 out << " n=" << pgi.stats.stats.sum.num_objects;
2422 out << " " << pgi.history
2423 << ")";
2424 return out;
2425 }
2426
2427 /**
2428 * pg_fast_info_t - common pg_info_t fields
2429 *
2430 * These are the fields of pg_info_t (and children) that are updated for
2431 * most IO operations.
2432 *
2433 * ** WARNING **
2434 * Because we rely on these fields to be applied to the normal
2435 * info struct, adding a new field here that is not also new in info
2436 * means that we must set an incompat OSD feature bit!
2437 */
2438 struct pg_fast_info_t {
2439 eversion_t last_update;
2440 eversion_t last_complete;
2441 version_t last_user_version;
2442 struct { // pg_stat_t stats
2443 eversion_t version;
2444 version_t reported_seq;
2445 utime_t last_fresh;
2446 utime_t last_active;
2447 utime_t last_peered;
2448 utime_t last_clean;
2449 utime_t last_unstale;
2450 utime_t last_undegraded;
2451 utime_t last_fullsized;
2452 int64_t log_size; // (also ondisk_log_size, which has the same value)
2453 struct { // object_stat_collection_t stats;
2454 struct { // object_stat_sum_t sum
2455 int64_t num_bytes; // in bytes
2456 int64_t num_objects;
2457 int64_t num_object_copies;
2458 int64_t num_rd;
2459 int64_t num_rd_kb;
2460 int64_t num_wr;
2461 int64_t num_wr_kb;
2462 int64_t num_objects_dirty;
2463 } sum;
2464 } stats;
2465 } stats;
2466
2467 void populate_from(const pg_info_t& info) {
2468 last_update = info.last_update;
2469 last_complete = info.last_complete;
2470 last_user_version = info.last_user_version;
2471 stats.version = info.stats.version;
2472 stats.reported_seq = info.stats.reported_seq;
2473 stats.last_fresh = info.stats.last_fresh;
2474 stats.last_active = info.stats.last_active;
2475 stats.last_peered = info.stats.last_peered;
2476 stats.last_clean = info.stats.last_clean;
2477 stats.last_unstale = info.stats.last_unstale;
2478 stats.last_undegraded = info.stats.last_undegraded;
2479 stats.last_fullsized = info.stats.last_fullsized;
2480 stats.log_size = info.stats.log_size;
2481 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2482 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2483 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2484 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2485 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2486 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2487 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2488 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2489 }
2490
2491 bool try_apply_to(pg_info_t* info) {
2492 if (last_update <= info->last_update)
2493 return false;
2494 info->last_update = last_update;
2495 info->last_complete = last_complete;
2496 info->last_user_version = last_user_version;
2497 info->stats.version = stats.version;
2498 info->stats.reported_seq = stats.reported_seq;
2499 info->stats.last_fresh = stats.last_fresh;
2500 info->stats.last_active = stats.last_active;
2501 info->stats.last_peered = stats.last_peered;
2502 info->stats.last_clean = stats.last_clean;
2503 info->stats.last_unstale = stats.last_unstale;
2504 info->stats.last_undegraded = stats.last_undegraded;
2505 info->stats.last_fullsized = stats.last_fullsized;
2506 info->stats.log_size = stats.log_size;
2507 info->stats.ondisk_log_size = stats.log_size;
2508 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2509 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2510 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2511 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2512 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2513 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2514 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2515 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2516 return true;
2517 }
2518
2519 void encode(bufferlist& bl) const {
2520 ENCODE_START(1, 1, bl);
2521 ::encode(last_update, bl);
2522 ::encode(last_complete, bl);
2523 ::encode(last_user_version, bl);
2524 ::encode(stats.version, bl);
2525 ::encode(stats.reported_seq, bl);
2526 ::encode(stats.last_fresh, bl);
2527 ::encode(stats.last_active, bl);
2528 ::encode(stats.last_peered, bl);
2529 ::encode(stats.last_clean, bl);
2530 ::encode(stats.last_unstale, bl);
2531 ::encode(stats.last_undegraded, bl);
2532 ::encode(stats.last_fullsized, bl);
2533 ::encode(stats.log_size, bl);
2534 ::encode(stats.stats.sum.num_bytes, bl);
2535 ::encode(stats.stats.sum.num_objects, bl);
2536 ::encode(stats.stats.sum.num_object_copies, bl);
2537 ::encode(stats.stats.sum.num_rd, bl);
2538 ::encode(stats.stats.sum.num_rd_kb, bl);
2539 ::encode(stats.stats.sum.num_wr, bl);
2540 ::encode(stats.stats.sum.num_wr_kb, bl);
2541 ::encode(stats.stats.sum.num_objects_dirty, bl);
2542 ENCODE_FINISH(bl);
2543 }
2544 void decode(bufferlist::iterator& p) {
2545 DECODE_START(1, p);
2546 ::decode(last_update, p);
2547 ::decode(last_complete, p);
2548 ::decode(last_user_version, p);
2549 ::decode(stats.version, p);
2550 ::decode(stats.reported_seq, p);
2551 ::decode(stats.last_fresh, p);
2552 ::decode(stats.last_active, p);
2553 ::decode(stats.last_peered, p);
2554 ::decode(stats.last_clean, p);
2555 ::decode(stats.last_unstale, p);
2556 ::decode(stats.last_undegraded, p);
2557 ::decode(stats.last_fullsized, p);
2558 ::decode(stats.log_size, p);
2559 ::decode(stats.stats.sum.num_bytes, p);
2560 ::decode(stats.stats.sum.num_objects, p);
2561 ::decode(stats.stats.sum.num_object_copies, p);
2562 ::decode(stats.stats.sum.num_rd, p);
2563 ::decode(stats.stats.sum.num_rd_kb, p);
2564 ::decode(stats.stats.sum.num_wr, p);
2565 ::decode(stats.stats.sum.num_wr_kb, p);
2566 ::decode(stats.stats.sum.num_objects_dirty, p);
2567 DECODE_FINISH(p);
2568 }
2569 };
2570 WRITE_CLASS_ENCODER(pg_fast_info_t)
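/*
 * Illustrative sketch (editorial addition): the fast path snapshots the
 * hot fields with populate_from() and folds them back into the full
 * pg_info_t with try_apply_to(), which is a no-op unless the fast info
 * is strictly newer:
 *
 *   pg_fast_info_t fast;
 *   fast.populate_from(info);        // before persisting the compact attr
 *   // (later, on load)
 *   if (fast.try_apply_to(&info)) {
 *     // info.last_update and friends advanced to the fast-path values
 *   }
 */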
2571
2572
2573 struct pg_notify_t {
2574 epoch_t query_epoch;
2575 epoch_t epoch_sent;
2576 pg_info_t info;
2577 shard_id_t to;
2578 shard_id_t from;
2579 pg_notify_t() :
2580 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
2581 from(shard_id_t::NO_SHARD) {}
2582 pg_notify_t(
2583 shard_id_t to,
2584 shard_id_t from,
2585 epoch_t query_epoch,
2586 epoch_t epoch_sent,
2587 const pg_info_t &info)
2588 : query_epoch(query_epoch),
2589 epoch_sent(epoch_sent),
2590 info(info), to(to), from(from) {
2591 assert(from == info.pgid.shard);
2592 }
2593 void encode(bufferlist &bl) const;
2594 void decode(bufferlist::iterator &p);
2595 void dump(Formatter *f) const;
2596 static void generate_test_instances(list<pg_notify_t*> &o);
2597 };
2598 WRITE_CLASS_ENCODER(pg_notify_t)
2599 ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
2600
2601
2602 class OSDMap;
2603 /**
2604 * PastIntervals -- information needed to determine the PriorSet and
2605 * the might_have_unfound set
2606 */
2607 class PastIntervals {
2608 public:
2609 struct pg_interval_t {
2610 vector<int32_t> up, acting;
2611 epoch_t first, last;
2612 bool maybe_went_rw;
2613 int32_t primary;
2614 int32_t up_primary;
2615
2616 pg_interval_t()
2617 : first(0), last(0),
2618 maybe_went_rw(false),
2619 primary(-1),
2620 up_primary(-1)
2621 {}
2622
2623 pg_interval_t(
2624 vector<int32_t> &&up,
2625 vector<int32_t> &&acting,
2626 epoch_t first,
2627 epoch_t last,
2628 bool maybe_went_rw,
2629 int32_t primary,
2630 int32_t up_primary)
2631 : up(up), acting(acting), first(first), last(last),
2632 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
2633 {}
2634
2635 void encode(bufferlist& bl) const;
2636 void decode(bufferlist::iterator& bl);
2637 void dump(Formatter *f) const;
2638 static void generate_test_instances(list<pg_interval_t*>& o);
2639 };
2640
2641 PastIntervals() = default;
2642 PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
2643 update_type_from_map(ec_pool, osdmap);
2644 }
2645 PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
2646 update_type(ec_pool, compact);
2647 }
2648 PastIntervals(PastIntervals &&rhs) = default;
2649 PastIntervals &operator=(PastIntervals &&rhs) = default;
2650
2651 PastIntervals(const PastIntervals &rhs);
2652 PastIntervals &operator=(const PastIntervals &rhs);
2653
2654 class interval_rep {
2655 public:
2656 virtual size_t size() const = 0;
2657 virtual bool empty() const = 0;
2658 virtual void clear() = 0;
2659 virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
2660 virtual set<pg_shard_t> get_all_participants(
2661 bool ec_pool) const = 0;
2662 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
2663 virtual unique_ptr<interval_rep> clone() const = 0;
2664 virtual ostream &print(ostream &out) const = 0;
2665 virtual void encode(bufferlist &bl) const = 0;
2666 virtual void decode(bufferlist::iterator &bl) = 0;
2667 virtual void dump(Formatter *f) const = 0;
2668 virtual bool is_classic() const = 0;
2669 virtual void iterate_mayberw_back_to(
2670 bool ec_pool,
2671 epoch_t les,
2672 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
2673
2674 virtual bool has_full_intervals() const { return false; }
2675 virtual void iterate_all_intervals(
2676 std::function<void(const pg_interval_t &)> &&f) const {
2677 assert(!has_full_intervals());
2678 assert(0 == "not valid for this implementation");
2679 }
2680
2681 virtual ~interval_rep() {}
2682 };
2683 friend class pi_simple_rep;
2684 friend class pi_compact_rep;
2685 private:
2686
2687 unique_ptr<interval_rep> past_intervals;
2688
2689 PastIntervals(interval_rep *rep) : past_intervals(rep) {}
2690
2691 public:
2692 void add_interval(bool ec_pool, const pg_interval_t &interval) {
2693 assert(past_intervals);
2694 return past_intervals->add_interval(ec_pool, interval);
2695 }
2696
2697 bool is_classic() const {
2698 assert(past_intervals);
2699 return past_intervals->is_classic();
2700 }
2701
2702 void encode(bufferlist &bl) const {
2703 ENCODE_START(1, 1, bl);
2704 if (past_intervals) {
2705 __u8 type = is_classic() ? 1 : 2;
2706 ::encode(type, bl);
2707 past_intervals->encode(bl);
2708 } else {
2709 ::encode((__u8)0, bl);
2710 }
2711 ENCODE_FINISH(bl);
2712 }
2713 void encode_classic(bufferlist &bl) const {
2714 if (past_intervals) {
2715 assert(past_intervals->is_classic());
2716 past_intervals->encode(bl);
2717 } else {
2718 // it's a map<>
2719 ::encode((uint32_t)0, bl);
2720 }
2721 }
2722
2723 void decode(bufferlist::iterator &bl);
2724 void decode_classic(bufferlist::iterator &bl);
2725
2726 void dump(Formatter *f) const {
2727 assert(past_intervals);
2728 past_intervals->dump(f);
2729 }
2730 static void generate_test_instances(list<PastIntervals *> & o);
2731
2732 /**
2733 * Determines whether there is an interval change
2734 */
2735 static bool is_new_interval(
2736 int old_acting_primary,
2737 int new_acting_primary,
2738 const vector<int> &old_acting,
2739 const vector<int> &new_acting,
2740 int old_up_primary,
2741 int new_up_primary,
2742 const vector<int> &old_up,
2743 const vector<int> &new_up,
2744 int old_size,
2745 int new_size,
2746 int old_min_size,
2747 int new_min_size,
2748 unsigned old_pg_num,
2749 unsigned new_pg_num,
2750 bool old_sort_bitwise,
2751 bool new_sort_bitwise,
2752 bool old_recovery_deletes,
2753 bool new_recovery_deletes,
2754 pg_t pgid
2755 );
2756
2757 /**
2758 * Determines whether there is an interval change
2759 */
2760 static bool is_new_interval(
2761 int old_acting_primary, ///< [in] primary as of lastmap
2762 int new_acting_primary, ///< [in] primary as of osdmap
2763 const vector<int> &old_acting, ///< [in] acting as of lastmap
2764 const vector<int> &new_acting, ///< [in] acting as of osdmap
2765 int old_up_primary, ///< [in] up primary of lastmap
2766 int new_up_primary, ///< [in] up primary of osdmap
2767 const vector<int> &old_up, ///< [in] up as of lastmap
2768 const vector<int> &new_up, ///< [in] up as of osdmap
2769 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2770 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2771 pg_t pgid ///< [in] pgid for pg
2772 );
2773
2774 /**
2775 * Integrates a new map into *past_intervals, returns true
2776 * if an interval was closed out.
2777 */
2778 static bool check_new_interval(
2779 int old_acting_primary, ///< [in] primary as of lastmap
2780 int new_acting_primary, ///< [in] primary as of osdmap
2781 const vector<int> &old_acting, ///< [in] acting as of lastmap
2782 const vector<int> &new_acting, ///< [in] acting as of osdmap
2783 int old_up_primary, ///< [in] up primary of lastmap
2784 int new_up_primary, ///< [in] up primary of osdmap
2785 const vector<int> &old_up, ///< [in] up as of lastmap
2786 const vector<int> &new_up, ///< [in] up as of osdmap
2787 epoch_t same_interval_since, ///< [in] as of osdmap
2788 epoch_t last_epoch_clean, ///< [in] current
2789 ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
2790 ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
2791 pg_t pgid, ///< [in] pgid for pg
2792 IsPGRecoverablePredicate *could_have_gone_active, ///< [in] predicate for whether the pg can go active
2793 PastIntervals *past_intervals, ///< [out] intervals
2794 ostream *out = 0 ///< [out] debug ostream
2795 );
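/*
 * Illustrative sketch (editorial addition): any change to the up/acting
 * sets or their primaries, or to the pool's size, min_size, pg_num,
 * sort-bitwise, or recovery-deletes settings, starts a new interval.
 * A caller loops over map epochs roughly like this (all lower-case
 * names are hypothetical caller-side values):
 *
 *   if (PastIntervals::check_new_interval(
 *         old_acting_primary, new_acting_primary,
 *         old_acting, new_acting,
 *         old_up_primary, new_up_primary,
 *         old_up, new_up,
 *         same_interval_since, last_epoch_clean,
 *         osdmap, lastmap, pgid,
 *         recoverable.get(), &past_intervals, &debug_stream)) {
 *     // the just-ended interval was appended to past_intervals
 *   }
 */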
2796
2797 friend ostream& operator<<(ostream& out, const PastIntervals &i);
2798
2799 template <typename F>
2800 void iterate_mayberw_back_to(
2801 bool ec_pool,
2802 epoch_t les,
2803 F &&f) const {
2804 assert(past_intervals);
2805 past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
2806 }
2807 void clear() {
2808 assert(past_intervals);
2809 past_intervals->clear();
2810 }
2811
2812 /**
2813 * Should return a value which gives an indication of the amount
2814 * of state contained
2815 */
2816 size_t size() const {
2817 assert(past_intervals);
2818 return past_intervals->size();
2819 }
2820
2821 bool empty() const {
2822 assert(past_intervals);
2823 return past_intervals->empty();
2824 }
2825
2826 void swap(PastIntervals &other) {
2827 using std::swap;
2828 swap(other.past_intervals, past_intervals);
2829 }
2830
2831 /**
2832 * Return all shards which have been in the acting set back to the
2833 * latest epoch to which we have trimmed except for pg_whoami
2834 */
2835 set<pg_shard_t> get_might_have_unfound(
2836 pg_shard_t pg_whoami,
2837 bool ec_pool) const {
2838 assert(past_intervals);
2839 auto ret = past_intervals->get_all_participants(ec_pool);
2840 ret.erase(pg_whoami);
2841 return ret;
2842 }
2843
2844 /**
2845 * Return all shards which we might want to talk to for peering
2846 */
2847 set<pg_shard_t> get_all_probe(
2848 bool ec_pool) const {
2849 assert(past_intervals);
2850 return past_intervals->get_all_participants(ec_pool);
2851 }
2852
2853 /* Return the set of epochs [start, end) represented by the
2854 * past_interval set.
2855 */
2856 pair<epoch_t, epoch_t> get_bounds() const {
2857 assert(past_intervals);
2858 return past_intervals->get_bounds();
2859 }
2860
2861 enum osd_state_t {
2862 UP,
2863 DOWN,
2864 DNE,
2865 LOST
2866 };
2867 struct PriorSet {
2868 bool ec_pool = false;
2869 set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
2870 set<int> down; ///< down OSDs that would normally be in @a probe and might be interesting.
2871 map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in @a probe for which (re)marking them lost would change the prior set
2872
2873 bool pg_down = false; ///< some down OSDs are included in @a probe; the DOWN pg state bit should be set.
2874 unique_ptr<IsPGRecoverablePredicate> pcontdec;
2875
2876 PriorSet() = default;
2877 PriorSet(PriorSet &&) = default;
2878 PriorSet &operator=(PriorSet &&) = default;
2879
2880 PriorSet &operator=(const PriorSet &) = delete;
2881 PriorSet(const PriorSet &) = delete;
2882
2883 bool operator==(const PriorSet &rhs) const {
2884 return (ec_pool == rhs.ec_pool) &&
2885 (probe == rhs.probe) &&
2886 (down == rhs.down) &&
2887 (blocked_by == rhs.blocked_by) &&
2888 (pg_down == rhs.pg_down);
2889 }
2890
2891 bool affected_by_map(
2892 const OSDMap &osdmap,
2893 const DoutPrefixProvider *dpp) const;
2894
2895 // For verifying tests
2896 PriorSet(
2897 bool ec_pool,
2898 set<pg_shard_t> probe,
2899 set<int> down,
2900 map<int, epoch_t> blocked_by,
2901 bool pg_down,
2902 IsPGRecoverablePredicate *pcontdec)
2903 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
2904 pg_down(pg_down), pcontdec(pcontdec) {}
2905
2906 private:
2907 template <typename F>
2908 PriorSet(
2909 const PastIntervals &past_intervals,
2910 bool ec_pool,
2911 epoch_t last_epoch_started,
2912 IsPGRecoverablePredicate *c,
2913 F f,
2914 const vector<int> &up,
2915 const vector<int> &acting,
2916 const DoutPrefixProvider *dpp);
2917
2918 friend class PastIntervals;
2919 };
2920
2921 void update_type(bool ec_pool, bool compact);
2922 void update_type_from_map(bool ec_pool, const OSDMap &osdmap);
2923
2924 template <typename... Args>
2925 PriorSet get_prior_set(Args&&... args) const {
2926 return PriorSet(*this, std::forward<Args>(args)...);
2927 }
2928 };
2929 WRITE_CLASS_ENCODER(PastIntervals)
2930
2931 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
2932 ostream& operator<<(ostream& out, const PastIntervals &i);
2933 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
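/*
 * Illustrative sketch (editorial addition): peering code mostly consumes
 * PastIntervals through the participant queries (pg_whoami and pool are
 * hypothetical caller-side names):
 *
 *   set<pg_shard_t> targets =
 *     past_intervals.get_might_have_unfound(pg_whoami, pool.is_erasure());
 *   pair<epoch_t, epoch_t> bounds = past_intervals.get_bounds();
 *   // bounds is [oldest recorded epoch, one past the newest)
 */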
2934
2935 template <typename F>
2936 PastIntervals::PriorSet::PriorSet(
2937 const PastIntervals &past_intervals,
2938 bool ec_pool,
2939 epoch_t last_epoch_started,
2940 IsPGRecoverablePredicate *c,
2941 F f,
2942 const vector<int> &up,
2943 const vector<int> &acting,
2944 const DoutPrefixProvider *dpp)
2945 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
2946 {
2947 /*
2948 * We have to be careful to gracefully deal with situations like
2949 * so. Say we have a power outage or something that takes out both
2950 * OSDs, but the monitor doesn't mark them down in the same epoch.
2951 * The history may look like
2952 *
2953 * 1: A B
2954 * 2: B
2955 * 3: let's say B dies for good, too (say, from the power spike)
2956 * 4: A
2957 *
2958 * which makes it look like B may have applied updates to the PG
2959 * that we need in order to proceed. This sucks...
2960 *
2961 * To minimize the risk of this happening, we CANNOT go active if
2962 * _any_ OSDs in the prior set are down until we send an MOSDAlive
2963 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
2964 * Then, we have something like
2965 *
2966 * 1: A B
2967 * 2: B up_thru[B]=0
2968 * 3:
2969 * 4: A
2970 *
2971 * -> we can ignore B, bc it couldn't have gone active (up_thru
2972 * still 0).
2973 *
2974 * or,
2975 *
2976 * 1: A B
2977 * 2: B up_thru[B]=0
2978 * 3: B up_thru[B]=2
2979 * 4:
2980 * 5: A
2981 *
2982 * -> we must wait for B, bc it was alive through 2, and could have
2983 * written to the pg.
2984 *
2985 * If B is really dead, then an administrator will need to manually
2986 * intervene by marking the OSD as "lost."
2987 */
2988
2989 // Include current acting and up nodes... not because they may
2990 // contain old data (this interval hasn't gone active, obviously),
2991 // but because we want their pg_info to inform choose_acting(), and
2992 // so that we know what they do/do not have explicitly before
2993 // sending them any new info/logs/whatever.
2994 for (unsigned i = 0; i < acting.size(); i++) {
2995 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2996 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
2997 }
2998 // It may be possible to exclude the up nodes, but let's keep them in
2999 // there for now.
3000 for (unsigned i = 0; i < up.size(); i++) {
3001 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3002 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3003 }
3004
3005 set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3006 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3007 for (auto &&i: all_probe) {
3008 switch (f(0, i.osd, nullptr)) {
3009 case UP: {
3010 probe.insert(i);
3011 break;
3012 }
3013 case DNE:
3014 case LOST:
3015 case DOWN: {
3016 down.insert(i.osd);
3017 break;
3018 }
3019 }
3020 }
3021
3022 past_intervals.iterate_mayberw_back_to(
3023 ec_pool,
3024 last_epoch_started,
3025 [&](epoch_t start, const set<pg_shard_t> &acting) {
3026 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3027 << ", acting: " << acting << dendl;
3028
3029 // look at candidate osds during this interval. each falls into
3030 // one of three categories: up, down (but potentially
3031 // interesting), or lost (down, but we won't wait for it).
3032 set<pg_shard_t> up_now;
3033 map<int, epoch_t> candidate_blocked_by;
3034 // any candidates down now (that might have useful data)
3035 bool any_down_now = false;
3036
3037 // consider ACTING osds
3038 for (auto &&so: acting) {
3039 epoch_t lost_at = 0;
3040 switch (f(start, so.osd, &lost_at)) {
3041 case UP: {
3042 // include past acting osds if they are up.
3043 up_now.insert(so);
3044 break;
3045 }
3046 case DNE: {
3047 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3048 << " no longer exists" << dendl;
3049 break;
3050 }
3051 case LOST: {
3052 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3053 << " is down, but lost_at " << lost_at << dendl;
3054 up_now.insert(so);
3055 break;
3056 }
3057 case DOWN: {
3058 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3059 << " is down" << dendl;
3060 candidate_blocked_by[so.osd] = lost_at;
3061 any_down_now = true;
3062 break;
3063 }
3064 }
3065 }
3066
3067 // if not enough osds survived this interval, and we may have gone rw,
3068 // then we need to wait for one of those osds to recover to
3069 // ensure that we haven't lost any information.
3070 if (!(*pcontdec)(up_now) && any_down_now) {
3071 // fixme: how do we identify a "clean" shutdown anyway?
3072 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3073 << " insufficient up; including down osds" << dendl;
3074 assert(!candidate_blocked_by.empty());
3075 pg_down = true;
3076 blocked_by.insert(
3077 candidate_blocked_by.begin(),
3078 candidate_blocked_by.end());
3079 }
3080 });
3081
3082 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3083 << " down " << down
3084 << " blocked_by " << blocked_by
3085 << (pg_down ? " pg_down":"")
3086 << dendl;
3087 }
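/*
 * Illustrative sketch (editorial addition): the classifier F given to the
 * constructor above maps (interval start, osd, &lost_at) to an
 * osd_state_t; a plausible shape, assuming the usual OSDMap accessors:
 *
 *   auto f = [&osdmap](epoch_t start, int osd, epoch_t *lost_at) {
 *     if (!osdmap.exists(osd))
 *       return PastIntervals::DNE;
 *     if (osdmap.is_up(osd))
 *       return PastIntervals::UP;
 *     if (lost_at)
 *       *lost_at = osdmap.get_info(osd).lost_at;
 *     return osdmap.get_info(osd).lost_at > start ?
 *       PastIntervals::LOST : PastIntervals::DOWN;
 *   };
 */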
3088
3089 /**
3090 * pg_query_t - used to ask a peer for information about a pg.
3091 *
3092 * note: if version=0, type=LOG, then we just provide our full log.
3093 */
3094 struct pg_query_t {
3095 enum {
3096 INFO = 0,
3097 LOG = 1,
3098 MISSING = 4,
3099 FULLLOG = 5,
3100 };
3101 const char *get_type_name() const {
3102 switch (type) {
3103 case INFO: return "info";
3104 case LOG: return "log";
3105 case MISSING: return "missing";
3106 case FULLLOG: return "fulllog";
3107 default: return "???";
3108 }
3109 }
3110
3111 __s32 type;
3112 eversion_t since;
3113 pg_history_t history;
3114 epoch_t epoch_sent;
3115 shard_id_t to;
3116 shard_id_t from;
3117
3118 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3119 from(shard_id_t::NO_SHARD) {}
3120 pg_query_t(
3121 int t,
3122 shard_id_t to,
3123 shard_id_t from,
3124 const pg_history_t& h,
3125 epoch_t epoch_sent)
3126 : type(t),
3127 history(h),
3128 epoch_sent(epoch_sent),
3129 to(to), from(from) {
3130 assert(t != LOG);
3131 }
3132 pg_query_t(
3133 int t,
3134 shard_id_t to,
3135 shard_id_t from,
3136 eversion_t s,
3137 const pg_history_t& h,
3138 epoch_t epoch_sent)
3139 : type(t), since(s), history(h),
3140 epoch_sent(epoch_sent), to(to), from(from) {
3141 assert(t == LOG);
3142 }
3143
3144 void encode(bufferlist &bl, uint64_t features) const;
3145 void decode(bufferlist::iterator &bl);
3146
3147 void dump(Formatter *f) const;
3148 static void generate_test_instances(list<pg_query_t*>& o);
3149 };
3150 WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3151
3152 inline ostream& operator<<(ostream& out, const pg_query_t& q) {
3153 out << "query(" << q.get_type_name() << " " << q.since;
3154 if (q.type == pg_query_t::LOG)
3155 out << " " << q.history;
3156 out << ")";
3157 return out;
3158 }
3159
3160 class PGBackend;
3161 class ObjectModDesc {
3162 bool can_local_rollback;
3163 bool rollback_info_completed;
3164
3165 // version required to decode, reflected in encode/decode version
3166 __u8 max_required_version = 1;
3167 public:
3168 class Visitor {
3169 public:
3170 virtual void append(uint64_t old_offset) {}
3171 virtual void setattrs(map<string, boost::optional<bufferlist> > &attrs) {}
3172 virtual void rmobject(version_t old_version) {}
3173 /**
3174 * Used to support the unfound_lost_delete log event: if the stashed
3175 * version exists, we unstash it, otherwise, we do nothing. This way
3176 * each replica rolls back to whatever state it had prior to the attempt
3177 * at mark unfound lost delete
3178 */
3179 virtual void try_rmobject(version_t old_version) {
3180 rmobject(old_version);
3181 }
3182 virtual void create() {}
3183 virtual void update_snaps(const set<snapid_t> &old_snaps) {}
3184 virtual void rollback_extents(
3185 version_t gen,
3186 const vector<pair<uint64_t, uint64_t> > &extents) {}
3187 virtual ~Visitor() {}
3188 };
3189 void visit(Visitor *visitor) const;
3190 mutable bufferlist bl;
3191 enum ModID {
3192 APPEND = 1,
3193 SETATTRS = 2,
3194 DELETE = 3,
3195 CREATE = 4,
3196 UPDATE_SNAPS = 5,
3197 TRY_DELETE = 6,
3198 ROLLBACK_EXTENTS = 7
3199 };
3200 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3201 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3202 }
3203 void claim(ObjectModDesc &other) {
3204 bl.clear();
3205 bl.claim(other.bl);
3206 can_local_rollback = other.can_local_rollback;
3207 rollback_info_completed = other.rollback_info_completed;
3208 }
3209 void claim_append(ObjectModDesc &other) {
3210 if (!can_local_rollback || rollback_info_completed)
3211 return;
3212 if (!other.can_local_rollback) {
3213 mark_unrollbackable();
3214 return;
3215 }
3216 bl.claim_append(other.bl);
3217 rollback_info_completed = other.rollback_info_completed;
3218 }
3219 void swap(ObjectModDesc &other) {
3220 bl.swap(other.bl);
3221
3222 using std::swap;
3223 swap(other.can_local_rollback, can_local_rollback);
3224 swap(other.rollback_info_completed, rollback_info_completed);
3225 swap(other.max_required_version, max_required_version);
3226 }
3227 void append_id(ModID id) {
3228 uint8_t _id(id);
3229 ::encode(_id, bl);
3230 }
3231 void append(uint64_t old_size) {
3232 if (!can_local_rollback || rollback_info_completed)
3233 return;
3234 ENCODE_START(1, 1, bl);
3235 append_id(APPEND);
3236 ::encode(old_size, bl);
3237 ENCODE_FINISH(bl);
3238 }
3239 void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
3240 if (!can_local_rollback || rollback_info_completed)
3241 return;
3242 ENCODE_START(1, 1, bl);
3243 append_id(SETATTRS);
3244 ::encode(old_attrs, bl);
3245 ENCODE_FINISH(bl);
3246 }
3247 bool rmobject(version_t deletion_version) {
3248 if (!can_local_rollback || rollback_info_completed)
3249 return false;
3250 ENCODE_START(1, 1, bl);
3251 append_id(DELETE);
3252 ::encode(deletion_version, bl);
3253 ENCODE_FINISH(bl);
3254 rollback_info_completed = true;
3255 return true;
3256 }
3257 bool try_rmobject(version_t deletion_version) {
3258 if (!can_local_rollback || rollback_info_completed)
3259 return false;
3260 ENCODE_START(1, 1, bl);
3261 append_id(TRY_DELETE);
3262 ::encode(deletion_version, bl);
3263 ENCODE_FINISH(bl);
3264 rollback_info_completed = true;
3265 return true;
3266 }
3267 void create() {
3268 if (!can_local_rollback || rollback_info_completed)
3269 return;
3270 rollback_info_completed = true;
3271 ENCODE_START(1, 1, bl);
3272 append_id(CREATE);
3273 ENCODE_FINISH(bl);
3274 }
3275 void update_snaps(const set<snapid_t> &old_snaps) {
3276 if (!can_local_rollback || rollback_info_completed)
3277 return;
3278 ENCODE_START(1, 1, bl);
3279 append_id(UPDATE_SNAPS);
3280 ::encode(old_snaps, bl);
3281 ENCODE_FINISH(bl);
3282 }
3283 void rollback_extents(
3284 version_t gen, const vector<pair<uint64_t, uint64_t> > &extents) {
3285 assert(can_local_rollback);
3286 assert(!rollback_info_completed);
3287 if (max_required_version < 2)
3288 max_required_version = 2;
3289 ENCODE_START(2, 2, bl);
3290 append_id(ROLLBACK_EXTENTS);
3291 ::encode(gen, bl);
3292 ::encode(extents, bl);
3293 ENCODE_FINISH(bl);
3294 }
3295
3296 // cannot be rolled back
3297 void mark_unrollbackable() {
3298 can_local_rollback = false;
3299 bl.clear();
3300 }
3301 bool can_rollback() const {
3302 return can_local_rollback;
3303 }
3304 bool empty() const {
3305 return can_local_rollback && (bl.length() == 0);
3306 }
3307
3308 bool requires_kraken() const {
3309 return max_required_version >= 2;
3310 }
3311
3312 /**
3313 * Create fresh copy of bl bytes to avoid keeping large buffers around
3314 * in the case that bl contains ptrs which point into a much larger
3315 * message buffer
3316 */
3317 void trim_bl() const {
3318 if (bl.length() > 0)
3319 bl.rebuild();
3320 }
3321 void encode(bufferlist &bl) const;
3322 void decode(bufferlist::iterator &bl);
3323 void dump(Formatter *f) const;
3324 static void generate_test_instances(list<ObjectModDesc*>& o);
3325 };
3326 WRITE_CLASS_ENCODER(ObjectModDesc)
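/*
 * Illustrative sketch (editorial addition): a write path records the
 * inverse of each mutation so a replica can roll the object back locally
 * (old_size, old_attrs, old_version are hypothetical saved values):
 *
 *   ObjectModDesc desc;
 *   desc.append(old_size);           // an append rolls back via truncate
 *   desc.setattrs(old_attrs);        // stash prior attr values (or none)
 *   desc.rmobject(old_version);      // stash-on-delete; completes the info
 *
 * A Visitor subclass later replays these records via visit() to perform
 * the actual rollback.
 */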
3327
3328
3329 /**
3330 * pg_log_entry_t - single entry/event in pg log
3331 *
3332 */
3333 struct pg_log_entry_t {
3334 enum {
3335 MODIFY = 1, // some unspecified modification (but not *all* modifications)
3336 CLONE = 2, // cloned object from head
3337 DELETE = 3, // deleted object
3338 BACKLOG = 4, // event invented by generate_backlog [deprecated]
3339 LOST_REVERT = 5, // lost new version, revert to an older version.
3340 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
3341 LOST_MARK = 7, // lost new version, now EIO
3342 PROMOTE = 8, // promoted object from another tier
3343 CLEAN = 9, // mark an object clean
3344 ERROR = 10, // write that returned an error
3345 };
3346 static const char *get_op_name(int op) {
3347 switch (op) {
3348 case MODIFY:
3349 return "modify";
3350 case PROMOTE:
3351 return "promote";
3352 case CLONE:
3353 return "clone";
3354 case DELETE:
3355 return "delete";
3356 case BACKLOG:
3357 return "backlog";
3358 case LOST_REVERT:
3359 return "l_revert";
3360 case LOST_DELETE:
3361 return "l_delete";
3362 case LOST_MARK:
3363 return "l_mark";
3364 case CLEAN:
3365 return "clean";
3366 case ERROR:
3367 return "error";
3368 default:
3369 return "unknown";
3370 }
3371 }
3372 const char *get_op_name() const {
3373 return get_op_name(op);
3374 }
3375
3376 // describes state for a locally-rollbackable entry
3377 ObjectModDesc mod_desc;
3378 bufferlist snaps; // only for clone entries
3379 hobject_t soid;
3380 osd_reqid_t reqid; // caller+tid to uniquely identify request
3381 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
3382 eversion_t version, prior_version, reverting_to;
3383 version_t user_version; // the user version for this entry
3384 utime_t mtime; // this is the _user_ mtime, mind you
3385 int32_t return_code; // only stored for ERRORs for dup detection
3386
3387 __s32 op;
3388 bool invalid_hash; // only when decoding sobject_t based entries
3389 bool invalid_pool; // only when decoding pool-less hobject based entries
3390
3391 pg_log_entry_t()
3392 : user_version(0), return_code(0), op(0),
3393 invalid_hash(false), invalid_pool(false) {
3394 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3395 }
3396 pg_log_entry_t(int _op, const hobject_t& _soid,
3397 const eversion_t& v, const eversion_t& pv,
3398 version_t uv,
3399 const osd_reqid_t& rid, const utime_t& mt,
3400 int return_code)
3401 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
3402 mtime(mt), return_code(return_code), op(_op),
3403 invalid_hash(false), invalid_pool(false) {
3404 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
3405 }
3406
3407 bool is_clone() const { return op == CLONE; }
3408 bool is_modify() const { return op == MODIFY; }
3409 bool is_promote() const { return op == PROMOTE; }
3410 bool is_clean() const { return op == CLEAN; }
3411 bool is_backlog() const { return op == BACKLOG; }
3412 bool is_lost_revert() const { return op == LOST_REVERT; }
3413 bool is_lost_delete() const { return op == LOST_DELETE; }
3414 bool is_lost_mark() const { return op == LOST_MARK; }
3415 bool is_error() const { return op == ERROR; }
3416
3417 bool is_update() const {
3418 return
3419 is_clone() || is_modify() || is_promote() || is_clean() ||
3420 is_backlog() || is_lost_revert() || is_lost_mark();
3421 }
3422 bool is_delete() const {
3423 return op == DELETE || op == LOST_DELETE;
3424 }
3425
3426 bool can_rollback() const {
3427 return mod_desc.can_rollback();
3428 }
3429
3430 void mark_unrollbackable() {
3431 mod_desc.mark_unrollbackable();
3432 }
3433
3434 bool requires_kraken() const {
3435 return mod_desc.requires_kraken();
3436 }
3437
3438 // Errors are only used for dup detection, whereas
3439 // the index by objects is used by recovery, copy_get,
3440 // and other facilities that don't expect or need to
3441 // be aware of error entries.
3442 bool object_is_indexed() const {
3443 return !is_error();
3444 }
3445
3446 bool reqid_is_indexed() const {
3447 return reqid != osd_reqid_t() &&
3448 (op == MODIFY || op == DELETE || op == ERROR);
3449 }
3450
3451 string get_key_name() const;
3452 void encode_with_checksum(bufferlist& bl) const;
3453 void decode_with_checksum(bufferlist::iterator& p);
3454
3455 void encode(bufferlist &bl) const;
3456 void decode(bufferlist::iterator &bl);
3457 void dump(Formatter *f) const;
3458 static void generate_test_instances(list<pg_log_entry_t*>& o);
3459
3460 };
3461 WRITE_CLASS_ENCODER(pg_log_entry_t)
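/*
 * Illustrative sketch (editorial addition): a typical MODIFY entry is
 * constructed straight from the op context (all lower-case names are
 * hypothetical caller-side values):
 *
 *   pg_log_entry_t e(
 *     pg_log_entry_t::MODIFY, soid,
 *     next_version, prior_version, user_version,
 *     reqid, mtime, 0);              // return code 0: not an ERROR entry
 *   // e.is_update() == true; e.object_is_indexed() == true
 */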
3462
3463 ostream& operator<<(ostream& out, const pg_log_entry_t& e);
3464
3465 struct pg_log_dup_t {
3466 osd_reqid_t reqid; // caller+tid to uniquely identify request
3467 eversion_t version;
3468 version_t user_version; // the user version for this entry
3469 int32_t return_code; // only stored for ERRORs for dup detection
3470
3471 pg_log_dup_t()
3472 : user_version(0), return_code(0)
3473 {}
3474 explicit pg_log_dup_t(const pg_log_entry_t& entry)
3475 : reqid(entry.reqid), version(entry.version),
3476 user_version(entry.user_version), return_code(entry.return_code)
3477 {}
3478 pg_log_dup_t(const eversion_t& v, version_t uv,
3479 const osd_reqid_t& rid, int return_code)
3480 : reqid(rid), version(v), user_version(uv),
3481 return_code(return_code)
3482 {}
3483
3484 string get_key_name() const;
3485 void encode(bufferlist &bl) const;
3486 void decode(bufferlist::iterator &bl);
3487 void dump(Formatter *f) const;
3488 static void generate_test_instances(list<pg_log_dup_t*>& o);
3489
3490 bool operator==(const pg_log_dup_t &rhs) const {
3491 return reqid == rhs.reqid &&
3492 version == rhs.version &&
3493 user_version == rhs.user_version &&
3494 return_code == rhs.return_code;
3495 }
3496 bool operator!=(const pg_log_dup_t &rhs) const {
3497 return !(*this == rhs);
3498 }
3499
3500 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3501 };
3502 WRITE_CLASS_ENCODER(pg_log_dup_t)
3503
3504 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
3505
3506 /**
3507 * pg_log_t - incremental log of recent pg changes.
3508 *
3509 * serves as a recovery queue for recent changes.
3510 */
3511 struct pg_log_t {
3512 /*
3513 * head - newest entry (update|delete)
3514 * tail - entry previous to oldest (update|delete) for which we have
3515 * complete negative information.
3516 * i.e. we can infer pg contents for any store whose last_update >= tail.
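*
* e.g. tail=100'8, head=100'12: the log holds the entries in (100'8,100'12],
* and the contents of any store whose last_update >= 100'8 can be inferred
* by replaying just those entries.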
3517 */
3518 eversion_t head; // newest entry
3519 eversion_t tail; // version prior to oldest
3520
3521 protected:
3522 // We can roll back rollback-able entries with version > can_rollback_to
3523 eversion_t can_rollback_to;
3524
3525 // always <= can_rollback_to, indicates how far back stashed rollback
3526 // data can still be found
3527 eversion_t rollback_info_trimmed_to;
3528
3529 public:
3530 // the actual log
3531 mempool::osd_pglog::list<pg_log_entry_t> log;
3532
3533 // entries just for dup op detection, ordered oldest to newest
3534 mempool::osd_pglog::list<pg_log_dup_t> dups;
3535
3536 pg_log_t() = default;
3537 pg_log_t(const eversion_t &last_update,
3538 const eversion_t &log_tail,
3539 const eversion_t &can_rollback_to,
3540 const eversion_t &rollback_info_trimmed_to,
3541 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
3542 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
3543 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3544 rollback_info_trimmed_to(rollback_info_trimmed_to),
3545 log(std::move(entries)), dups(std::move(dup_entries)) {}
3546 pg_log_t(const eversion_t &last_update,
3547 const eversion_t &log_tail,
3548 const eversion_t &can_rollback_to,
3549 const eversion_t &rollback_info_trimmed_to,
3550 const std::list<pg_log_entry_t> &entries,
3551 const std::list<pg_log_dup_t> &dup_entries)
3552 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
3553 rollback_info_trimmed_to(rollback_info_trimmed_to) {
3554 for (auto &&entry: entries) {
3555 log.push_back(entry);
3556 }
3557 for (auto &&entry: dup_entries) {
3558 dups.push_back(entry);
3559 }
3560 }
3561
3562 void clear() {
3563 eversion_t z;
3564 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
3565 log.clear();
3566 dups.clear();
3567 }
3568
3569 eversion_t get_rollback_info_trimmed_to() const {
3570 return rollback_info_trimmed_to;
3571 }
3572 eversion_t get_can_rollback_to() const {
3573 return can_rollback_to;
3574 }
3575
3576
3577 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
3578 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
3579 oldlog.swap(log);
3580
3581 eversion_t old_tail;
3582 unsigned mask = ~((~0u)<<split_bits);
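// e.g. split_bits=3 -> mask=0x7: an entry moves to the child log iff the
// low 3 bits of its object hash equal the child's seed.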
3583 for (auto i = oldlog.begin();
3584 i != oldlog.end();
3585 ) {
3586 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
3587 childlog.push_back(*i);
3588 } else {
3589 log.push_back(*i);
3590 }
3591 oldlog.erase(i++);
3592 }
3593
3594 // osd_reqid is unique, so it doesn't matter if there are extra
3595 // dup entries in each pg. To avoid having to store the oid with
3596 // each dup entry, just copy the whole list.
3597 auto childdups(dups);
3598
3599 return pg_log_t(
3600 head,
3601 tail,
3602 can_rollback_to,
3603 rollback_info_trimmed_to,
3604 std::move(childlog),
3605 std::move(childdups));
3606 }
3607
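// Illustrative behavior: rewinding a log with head=122'14 to newhead=100'10
// returns the entries in (100'10,122'14] as divergent and clamps
// can_rollback_to and rollback_info_trimmed_to down to 100'10.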
3608 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
3609 assert(newhead >= tail);
3610
3611 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
3612 mempool::osd_pglog::list<pg_log_entry_t> divergent;
3613 while (true) {
3614 if (p == log.begin()) {
3615 // yikes, the whole thing is divergent!
3616 using std::swap;
3617 swap(divergent, log);
3618 break;
3619 }
3620 --p;
3621 if (p->version.version <= newhead.version) {
3622 /*
3623 * look at eversion.version here. we want to avoid a situation like:
3624 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3625 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3626 * lower_bound = 100'9
3627 * i.e., same request, different version. If the eversion.version is > the
3628 * lower_bound, it is divergent.
3629 */
3630 ++p;
3631 divergent.splice(divergent.begin(), log, p, log.end());
3632 break;
3633 }
3634 assert(p->version > newhead);
3635 }
3636 head = newhead;
3637
3638 if (can_rollback_to > newhead)
3639 can_rollback_to = newhead;
3640
3641 if (rollback_info_trimmed_to > newhead)
3642 rollback_info_trimmed_to = newhead;
3643
3644 return divergent;
3645 }
3646
3647 bool empty() const {
3648 return log.empty();
3649 }
3650
3651 bool null() const {
3652 return head.version == 0 && head.epoch == 0;
3653 }
3654
3655 size_t approx_size() const {
3656 return head.version - tail.version;
3657 }
3658
3659 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
3660 const string &hit_set_namespace, const pg_log_t &in,
3661 pg_log_t &out, pg_log_t &reject);
3662
3663 /**
3664 * copy entries from the tail of another pg_log_t
3665 *
3666 * @param other pg_log_t to copy from
3667 * @param from copy entries after this version
3668 */
3669 void copy_after(const pg_log_t &other, eversion_t from);
3670
3671 /**
3672 * copy a range of entries from another pg_log_t
3673 *
3674 * @param other pg_log_t to copy from
3675 * @param from copy entries after this version
3676 * @param to up to and including this version
3677 */
3678 void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
3679
3680 /**
3681 * copy up to N entries
3682 *
3683 * @param other source log
3684 * @param max max number of entries to copy
3685 */
3686 void copy_up_to(const pg_log_t &other, int max);
3687
3688 ostream& print(ostream& out) const;
3689
3690 void encode(bufferlist &bl) const;
3691 void decode(bufferlist::iterator &bl, int64_t pool = -1);
3692 void dump(Formatter *f) const;
3693 static void generate_test_instances(list<pg_log_t*>& o);
3694 };
3695 WRITE_CLASS_ENCODER(pg_log_t)
3696
3697 inline ostream& operator<<(ostream& out, const pg_log_t& log)
3698 {
3699 out << "log((" << log.tail << "," << log.head << "], crt="
3700 << log.get_can_rollback_to() << ")";
3701 return out;
3702 }
3703
3704
3705 /**
3706 * pg_missing_t - summary of missing objects.
3707 *
3708 * kept in memory, as a supplement to pg_log_t
3709 * also used to pass missing info in messages.
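*
* e.g. an object whose authoritative version is 5'20 but whose local copy
* is at 5'17 is recorded as need=5'20, have=5'17; have=0'0 means no usable
* prior version exists locally.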
3710 */
3711 struct pg_missing_item {
3712 eversion_t need, have;
3713 enum missing_flags_t {
3714 FLAG_NONE = 0,
3715 FLAG_DELETE = 1,
3716 } flags;
3717 pg_missing_item() : flags(FLAG_NONE) {}
3718 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
3719 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
3720 set_delete(is_delete);
3721 }
3722
3723 void encode(bufferlist& bl, uint64_t features) const {
3724 if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
3725 // encoding a zeroed eversion_t to differentiate between this and
3726 // legacy unversioned encoding - a need value of 0'0 is not
3727 // possible. This can be replaced with the legacy encoding
3728 // macros post-luminous.
3729 eversion_t e;
3730 ::encode(e, bl);
3731 ::encode(need, bl);
3732 ::encode(have, bl);
3733 ::encode(static_cast<uint8_t>(flags), bl);
3734 } else {
3735 // legacy unversioned encoding
3736 ::encode(need, bl);
3737 ::encode(have, bl);
3738 }
3739 }
3740 void decode(bufferlist::iterator& bl) {
3741 eversion_t e;
3742 ::decode(e, bl);
3743 if (e != eversion_t()) {
3744 // legacy encoding, this is the need value
3745 need = e;
3746 ::decode(have, bl);
3747 } else {
3748 ::decode(need, bl);
3749 ::decode(have, bl);
3750 uint8_t f;
3751 ::decode(f, bl);
3752 flags = static_cast<missing_flags_t>(f);
3753 }
3754 }
3755
3756 void set_delete(bool is_delete) {
3757 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
3758 }
3759
3760 bool is_delete() const {
3761 return (flags & FLAG_DELETE) == FLAG_DELETE;
3762 }
3763
3764 string flag_str() const {
3765 if (flags == FLAG_NONE) {
3766 return "none";
3767 } else {
3768 return "delete";
3769 }
3770 }
3771
3772 void dump(Formatter *f) const {
3773 f->dump_stream("need") << need;
3774 f->dump_stream("have") << have;
3775 f->dump_stream("flags") << flag_str();
3776 }
3777 static void generate_test_instances(list<pg_missing_item*>& o) {
3778 o.push_back(new pg_missing_item);
3779 o.push_back(new pg_missing_item);
3780 o.back()->need = eversion_t(1, 2);
3781 o.back()->have = eversion_t(1, 1);
3782 o.push_back(new pg_missing_item);
3783 o.back()->need = eversion_t(3, 5);
3784 o.back()->have = eversion_t(3, 4);
3785 o.back()->flags = FLAG_DELETE;
3786 }
3787 bool operator==(const pg_missing_item &rhs) const {
3788 return need == rhs.need && have == rhs.have && flags == rhs.flags;
3789 }
3790 bool operator!=(const pg_missing_item &rhs) const {
3791 return !(*this == rhs);
3792 }
3793 };
3794 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
3795 ostream& operator<<(ostream& out, const pg_missing_item &item);
3796
3797 class pg_missing_const_i {
3798 public:
3799 virtual const map<hobject_t, pg_missing_item> &
3800 get_items() const = 0;
3801 virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
3802 virtual bool get_may_include_deletes() const = 0;
3803 virtual unsigned int num_missing() const = 0;
3804 virtual bool have_missing() const = 0;
3805 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
3806 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
3807 virtual eversion_t have_old(const hobject_t& oid) const = 0;
3808 virtual ~pg_missing_const_i() {}
3809 };
3810
3811
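// ChangeTracker<false> compiles down to no-ops, while ChangeTracker<true>
// records every changed hobject_t; pg_missing_set<true> uses it to expose
// deltas via get_changed() and to cross-check itself in
// debug_verify_from_init().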
3812 template <bool Track>
3813 class ChangeTracker {
3814 public:
3815 void changed(const hobject_t &obj) {}
3816 template <typename F>
3817 void get_changed(F &&f) const {}
3818 void flush() {}
3819 bool is_clean() const {
3820 return true;
3821 }
3822 };
3823 template <>
3824 class ChangeTracker<true> {
3825 set<hobject_t> _changed;
3826 public:
3827 void changed(const hobject_t &obj) {
3828 _changed.insert(obj);
3829 }
3830 template <typename F>
3831 void get_changed(F &&f) const {
3832 for (auto const &i: _changed) {
3833 f(i);
3834 }
3835 }
3836 void flush() {
3837 _changed.clear();
3838 }
3839 bool is_clean() const {
3840 return _changed.empty();
3841 }
3842 };
3843
3844 template <bool TrackChanges>
3845 class pg_missing_set : public pg_missing_const_i {
3846 using item = pg_missing_item;
3847 map<hobject_t, item> missing; // oid -> (need v, have v)
3848 map<version_t, hobject_t> rmissing; // v -> oid
3849 ChangeTracker<TrackChanges> tracker;
3850
3851 public:
3852 pg_missing_set() = default;
3853
3854 template <typename missing_type>
3855 pg_missing_set(const missing_type &m) {
3856 missing = m.get_items();
3857 rmissing = m.get_rmissing();
3858 may_include_deletes = m.get_may_include_deletes();
3859 for (auto &&i: missing)
3860 tracker.changed(i.first);
3861 }
3862
3863 bool may_include_deletes = false;
3864
3865 const map<hobject_t, item> &get_items() const override {
3866 return missing;
3867 }
3868 const map<version_t, hobject_t> &get_rmissing() const override {
3869 return rmissing;
3870 }
3871 bool get_may_include_deletes() const override {
3872 return may_include_deletes;
3873 }
3874 unsigned int num_missing() const override {
3875 return missing.size();
3876 }
3877 bool have_missing() const override {
3878 return !missing.empty();
3879 }
3880 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
3881 auto iter = missing.find(oid);
3882 if (iter == missing.end())
3883 return false;
3884 if (out)
3885 *out = iter->second;
3886 return true;
3887 }
3888 bool is_missing(const hobject_t& oid, eversion_t v) const override {
3889 map<hobject_t, item>::const_iterator m =
3890 missing.find(oid);
3891 if (m == missing.end())
3892 return false;
3893 const item &item(m->second);
3894 if (item.need > v)
3895 return false;
3896 return true;
3897 }
3898 eversion_t have_old(const hobject_t& oid) const override {
3899 map<hobject_t, item>::const_iterator m =
3900 missing.find(oid);
3901 if (m == missing.end())
3902 return eversion_t();
3903 const item &item(m->second);
3904 return item.have;
3905 }
3906
3907 void claim(pg_missing_set& o) {
3908 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
3909 missing.swap(o.missing);
3910 rmissing.swap(o.rmissing);
3911 }
3912
3913 /*
3914 * this needs to be called in log order as we extend the log. it
3915 * assumes missing is accurate up through the previous log entry.
3916 */
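// e.g. a MODIFY at 5'21 with prior_version 5'20 against an object that is
// already missing with need=5'20, have=5'17 only bumps need to 5'21 and
// leaves .have untouched.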
3917 void add_next_event(const pg_log_entry_t& e) {
3918 map<hobject_t, item>::iterator missing_it;
3919 missing_it = missing.find(e.soid);
3920 bool is_missing_divergent_item = missing_it != missing.end();
3921 if (e.prior_version == eversion_t() || e.is_clone()) {
3922 // new object.
3923 if (is_missing_divergent_item) { // use iterator
3924 rmissing.erase((missing_it->second).need.version);
3925 missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
3926 } else // create new element in missing map
3927 missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
3928 } else if (is_missing_divergent_item) {
3929 // already missing (prior).
3930 rmissing.erase((missing_it->second).need.version);
3931 (missing_it->second).need = e.version; // leave .have unchanged.
3932 missing_it->second.set_delete(e.is_delete());
3933 } else if (e.is_backlog()) {
3934 // May not have prior version
3935 assert(0 == "these don't exist anymore");
3936 } else {
3937 // not missing, we must have prior_version (if any)
3938 assert(!is_missing_divergent_item);
3939 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
3940 }
3941 rmissing[e.version.version] = e.soid;
3942 tracker.changed(e.soid);
3943 }
3944
3945 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
3946 if (missing.count(oid)) {
3947 rmissing.erase(missing[oid].need.version);
3948 missing[oid].need = need; // do not adjust .have
3949 missing[oid].set_delete(is_delete);
3950 } else {
3951 missing[oid] = item(need, eversion_t(), is_delete);
3952 }
3953 rmissing[need.version] = oid;
3954
3955 tracker.changed(oid);
3956 }
3957
3958 void revise_have(hobject_t oid, eversion_t have) {
3959 if (missing.count(oid)) {
3960 tracker.changed(oid);
3961 missing[oid].have = have;
3962 }
3963 }
3964
3965 void add(const hobject_t& oid, eversion_t need, eversion_t have,
3966 bool is_delete) {
3967 missing[oid] = item(need, have, is_delete);
3968 rmissing[need.version] = oid;
3969 tracker.changed(oid);
3970 }
3971
3972 void rm(const hobject_t& oid, eversion_t v) {
3973 std::map<hobject_t, item>::iterator p = missing.find(oid);
3974 if (p != missing.end() && p->second.need <= v)
3975 rm(p);
3976 }
3977
3978 void rm(std::map<hobject_t, item>::const_iterator m) {
3979 tracker.changed(m->first);
3980 rmissing.erase(m->second.need.version);
3981 missing.erase(m);
3982 }
3983
3984 void got(const hobject_t& oid, eversion_t v) {
3985 std::map<hobject_t, item>::iterator p = missing.find(oid);
3986 assert(p != missing.end());
3987 assert(p->second.need <= v || p->second.is_delete());
3988 got(p);
3989 }
3990
3991 void got(std::map<hobject_t, item>::const_iterator m) {
3992 tracker.changed(m->first);
3993 rmissing.erase(m->second.need.version);
3994 missing.erase(m);
3995 }
3996
3997 void split_into(
3998 pg_t child_pgid,
3999 unsigned split_bits,
4000 pg_missing_set *omissing) {
4001 omissing->may_include_deletes = may_include_deletes;
4002 unsigned mask = ~((~0u)<<split_bits);
4003 for (map<hobject_t, item>::iterator i = missing.begin();
4004 i != missing.end();
4005 ) {
4006 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
4007 omissing->add(i->first, i->second.need, i->second.have,
4008 i->second.is_delete());
4009 rm(i++);
4010 } else {
4011 ++i;
4012 }
4013 }
4014 }
4015
4016 void clear() {
4017 for (auto const &i: missing)
4018 tracker.changed(i.first);
4019 missing.clear();
4020 rmissing.clear();
4021 }
4022
4023 void encode(bufferlist &bl) const {
4024 ENCODE_START(4, 2, bl);
4025 ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
4026 ::encode(may_include_deletes, bl);
4027 ENCODE_FINISH(bl);
4028 }
4029 void decode(bufferlist::iterator &bl, int64_t pool = -1) {
4030 for (auto const &i: missing)
4031 tracker.changed(i.first);
4032 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
4033 ::decode(missing, bl);
4034 if (struct_v >= 4) {
4035 ::decode(may_include_deletes, bl);
4036 }
4037 DECODE_FINISH(bl);
4038
4039 if (struct_v < 3) {
4040 // Handle hobject_t upgrade
4041 map<hobject_t, item> tmp;
4042 for (map<hobject_t, item>::iterator i =
4043 missing.begin();
4044 i != missing.end();
4045 ) {
4046 if (!i->first.is_max() && i->first.pool == -1) {
4047 hobject_t to_insert(i->first);
4048 to_insert.pool = pool;
4049 tmp[to_insert] = i->second;
4050 missing.erase(i++);
4051 } else {
4052 ++i;
4053 }
4054 }
4055 missing.insert(tmp.begin(), tmp.end());
4056 }
4057
4058 for (map<hobject_t,item>::iterator it =
4059 missing.begin();
4060 it != missing.end();
4061 ++it)
4062 rmissing[it->second.need.version] = it->first;
4063 for (auto const &i: missing)
4064 tracker.changed(i.first);
4065 }
4066 void dump(Formatter *f) const {
4067 f->open_array_section("missing");
4068 for (map<hobject_t,item>::const_iterator p =
4069 missing.begin(); p != missing.end(); ++p) {
4070 f->open_object_section("item");
4071 f->dump_stream("object") << p->first;
4072 p->second.dump(f);
4073 f->close_section();
4074 }
4075 f->close_section();
4076 f->dump_bool("may_include_deletes", may_include_deletes);
4077 }
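// note: f(oid) == true selects an entry for removal (see rm above), not
// for retention.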
4078 template <typename F>
4079 void filter_objects(F &&f) {
4080 for (auto i = missing.begin(); i != missing.end();) {
4081 if (f(i->first)) {
4082 rm(i++);
4083 } else {
4084 ++i;
4085 }
4086 }
4087 }
4088 static void generate_test_instances(list<pg_missing_set*>& o) {
4089 o.push_back(new pg_missing_set);
4090 o.push_back(new pg_missing_set);
4091 o.back()->add(
4092 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4093 eversion_t(5, 6), eversion_t(5, 1), false);
4094 o.push_back(new pg_missing_set);
4095 o.back()->add(
4096 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4097 eversion_t(5, 6), eversion_t(5, 1), true);
4098 o.back()->may_include_deletes = true;
4099 }
4100 template <typename F>
4101 void get_changed(F &&f) const {
4102 tracker.get_changed(f);
4103 }
4104 void flush() {
4105 tracker.flush();
4106 }
4107 bool is_clean() const {
4108 return tracker.is_clean();
4109 }
4110 template <typename missing_t>
4111 bool debug_verify_from_init(
4112 const missing_t &init_missing,
4113 ostream *oss) const {
4114 if (!TrackChanges)
4115 return true;
4116 auto check_missing(init_missing.get_items());
4117 tracker.get_changed([&](const hobject_t &hoid) {
4118 check_missing.erase(hoid);
4119 if (missing.count(hoid)) {
4120 check_missing.insert(*(missing.find(hoid)));
4121 }
4122 });
4123 bool ok = true;
4124 if (check_missing.size() != missing.size()) {
4125 if (oss) {
4126 *oss << "Size mismatch, check: " << check_missing.size()
4127 << ", actual: " << missing.size() << "\n";
4128 }
4129 ok = false;
4130 }
4131 for (auto &i: missing) {
4132 if (!check_missing.count(i.first)) {
4133 if (oss)
4134 *oss << "check_missing missing " << i.first << "\n";
4135 ok = false;
4136 } else if (check_missing[i.first] != i.second) {
4137 if (oss)
4138 *oss << "check_missing missing item mismatch on " << i.first
4139 << ", check: " << check_missing[i.first]
4140 << ", actual: " << i.second << "\n";
4141 ok = false;
4142 }
4143 }
4144 if (oss && !ok) {
4145 *oss << "check_missing: " << check_missing << "\n";
4146 set<hobject_t> changed;
4147 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4148 *oss << "changed: " << changed << "\n";
4149 }
4150 return ok;
4151 }
4152 };
4153 template <bool TrackChanges>
4154 void encode(
4155 const pg_missing_set<TrackChanges> &c, bufferlist &bl, uint64_t features=0) {
4156 ENCODE_DUMP_PRE();
4157 c.encode(bl);
4158 ENCODE_DUMP_POST(cl);
4159 }
4160 template <bool TrackChanges>
4161 void decode(pg_missing_set<TrackChanges> &c, bufferlist::iterator &p) {
4162 c.decode(p);
4163 }
4164 template <bool TrackChanges>
4165 ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
4166 {
4167 out << "missing(" << missing.num_missing()
4168 << " may_include_deletes = " << missing.may_include_deletes;
4169 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4170 out << ")";
4171 return out;
4172 }
4173
4174 using pg_missing_t = pg_missing_set<false>;
4175 using pg_missing_tracker_t = pg_missing_set<true>;
4176
4177
4178 /**
4179 * pg list objects response format
4180 *
4181 */
4182 struct pg_nls_response_t {
4183 collection_list_handle_t handle;
4184 list<librados::ListObjectImpl> entries;
4185
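// wire format: handle, then a u32 entry count followed by one
// (nspace, oid, locator) triple per entry, encoded by hand below rather
// than via the generic list encoder.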
4186 void encode(bufferlist& bl) const {
4187 ENCODE_START(1, 1, bl);
4188 ::encode(handle, bl);
4189 __u32 n = (__u32)entries.size();
4190 ::encode(n, bl);
4191 for (list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
4192 ::encode(i->nspace, bl);
4193 ::encode(i->oid, bl);
4194 ::encode(i->locator, bl);
4195 }
4196 ENCODE_FINISH(bl);
4197 }
4198 void decode(bufferlist::iterator& bl) {
4199 DECODE_START(1, bl);
4200 ::decode(handle, bl);
4201 __u32 n;
4202 ::decode(n, bl);
4203 entries.clear();
4204 while (n--) {
4205 librados::ListObjectImpl i;
4206 ::decode(i.nspace, bl);
4207 ::decode(i.oid, bl);
4208 ::decode(i.locator, bl);
4209 entries.push_back(i);
4210 }
4211 DECODE_FINISH(bl);
4212 }
4213 void dump(Formatter *f) const {
4214 f->dump_stream("handle") << handle;
4215 f->open_array_section("entries");
4216 for (list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4217 f->open_object_section("object");
4218 f->dump_string("namespace", p->nspace);
4219 f->dump_string("object", p->oid);
4220 f->dump_string("key", p->locator);
4221 f->close_section();
4222 }
4223 f->close_section();
4224 }
4225 static void generate_test_instances(list<pg_nls_response_t*>& o) {
4226 o.push_back(new pg_nls_response_t);
4227 o.push_back(new pg_nls_response_t);
4228 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4229 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4230 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4231 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4232 o.push_back(new pg_nls_response_t);
4233 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4234 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4235 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4236 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4237 o.push_back(new pg_nls_response_t);
4238 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4239 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
4240 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
4241 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
4242 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4243 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4244 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4245 }
4246 };
4247
4248 WRITE_CLASS_ENCODER(pg_nls_response_t)
4249
4250 // For backwards compatibility with older OSD requests
4251 struct pg_ls_response_t {
4252 collection_list_handle_t handle;
4253 list<pair<object_t, string> > entries;
4254
4255 void encode(bufferlist& bl) const {
4256 __u8 v = 1;
4257 ::encode(v, bl);
4258 ::encode(handle, bl);
4259 ::encode(entries, bl);
4260 }
4261 void decode(bufferlist::iterator& bl) {
4262 __u8 v;
4263 ::decode(v, bl);
4264 assert(v == 1);
4265 ::decode(handle, bl);
4266 ::decode(entries, bl);
4267 }
4268 void dump(Formatter *f) const {
4269 f->dump_stream("handle") << handle;
4270 f->open_array_section("entries");
4271 for (list<pair<object_t, string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4272 f->open_object_section("object");
4273 f->dump_stream("object") << p->first;
4274 f->dump_string("key", p->second);
4275 f->close_section();
4276 }
4277 f->close_section();
4278 }
4279 static void generate_test_instances(list<pg_ls_response_t*>& o) {
4280 o.push_back(new pg_ls_response_t);
4281 o.push_back(new pg_ls_response_t);
4282 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4283 o.back()->entries.push_back(make_pair(object_t("one"), string()));
4284 o.back()->entries.push_back(make_pair(object_t("two"), string("twokey")));
4285 }
4286 };
4287
4288 WRITE_CLASS_ENCODER(pg_ls_response_t)
4289
4290 /**
4291 * object_copy_cursor_t
4292 */
4293 struct object_copy_cursor_t {
4294 uint64_t data_offset;
4295 string omap_offset;
4296 bool attr_complete;
4297 bool data_complete;
4298 bool omap_complete;
4299
4300 object_copy_cursor_t()
4301 : data_offset(0),
4302 attr_complete(false),
4303 data_complete(false),
4304 omap_complete(false)
4305 {}
4306
4307 bool is_initial() const {
4308 return !attr_complete && data_offset == 0 && omap_offset.empty();
4309 }
4310 bool is_complete() const {
4311 return attr_complete && data_complete && omap_complete;
4312 }
4313
4314 static void generate_test_instances(list<object_copy_cursor_t*>& o);
4315 void encode(bufferlist& bl) const;
4316 void decode(bufferlist::iterator &bl);
4317 void dump(Formatter *f) const;
4318 };
4319 WRITE_CLASS_ENCODER(object_copy_cursor_t)
4320
4321 /**
4322 * object_copy_data_t
4323 *
4324 * Return data from a copy request. The semantics are a little strange
4325 * as a result of the encoding's heritage.
4326 *
4327 * In particular, the sender unconditionally fills in the cursor (from what
4328 * it receives and sends), the size, and the mtime, but is responsible for
4329 * figuring out whether it should put any data in the attrs, data, or
4330 * omap members (corresponding to xattrs, object data, and the omap entries)
4331 * based on external data (the client includes a max amount to return with
4332 * the copy request). The client then looks into the attrs, data, and/or omap
4333 * based on the contents of the cursor.
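*
* Illustrative client-side loop (editor's sketch, not upstream code; the
* copy_get below stands in for a hypothetical RPC wrapper):
*
*   object_copy_cursor_t cursor;                 // is_initial() == true
*   while (!cursor.is_complete()) {
*     object_copy_data_t reply = copy_get(oid, cursor, max_return_bytes);
*     // consume reply.attrs / reply.data / reply.omap_data as the cursor
*     // directs, then advance
*     cursor = reply.cursor;
*   }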
4334 */
4335 struct object_copy_data_t {
4336 enum {
4337 FLAG_DATA_DIGEST = 1<<0,
4338 FLAG_OMAP_DIGEST = 1<<1,
4339 };
4340 object_copy_cursor_t cursor;
4341 uint64_t size;
4342 utime_t mtime;
4343 uint32_t data_digest, omap_digest;
4344 uint32_t flags;
4345 map<string, bufferlist> attrs;
4346 bufferlist data;
4347 bufferlist omap_header;
4348 bufferlist omap_data;
4349
4350 /// which snaps we are defined for (if a snap and not the head)
4351 vector<snapid_t> snaps;
4352 /// latest snap seq for the object (if head)
4353 snapid_t snap_seq;
4354
4355 /// recent reqids on this object
4356 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
4357
4358 uint64_t truncate_seq;
4359 uint64_t truncate_size;
4360
4361 public:
4362 object_copy_data_t() :
4363 size((uint64_t)-1), data_digest(-1),
4364 omap_digest(-1), flags(0),
4365 truncate_seq(0),
4366 truncate_size(0) {}
4367
4368 static void generate_test_instances(list<object_copy_data_t*>& o);
4369 void encode(bufferlist& bl, uint64_t features) const;
4370 void decode(bufferlist::iterator& bl);
4371 void dump(Formatter *f) const;
4372 };
4373 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
4374
4375 /**
4376 * pg creation info
4377 */
4378 struct pg_create_t {
4379 epoch_t created; // epoch pg created
4380 pg_t parent; // split from parent (if != pg_t())
4381 __s32 split_bits;
4382
4383 pg_create_t()
4384 : created(0), split_bits(0) {}
4385 pg_create_t(unsigned c, pg_t p, int s)
4386 : created(c), parent(p), split_bits(s) {}
4387
4388 void encode(bufferlist &bl) const;
4389 void decode(bufferlist::iterator &bl);
4390 void dump(Formatter *f) const;
4391 static void generate_test_instances(list<pg_create_t*>& o);
4392 };
4393 WRITE_CLASS_ENCODER(pg_create_t)
4394
4395 // -----------------------------------------
4396
4397 struct osd_peer_stat_t {
4398 utime_t stamp;
4399
4400 osd_peer_stat_t() { }
4401
4402 void encode(bufferlist &bl) const;
4403 void decode(bufferlist::iterator &bl);
4404 void dump(Formatter *f) const;
4405 static void generate_test_instances(list<osd_peer_stat_t*>& o);
4406 };
4407 WRITE_CLASS_ENCODER(osd_peer_stat_t)
4408
4409 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat);
4410
4411
4412 // -----------------------------------------
4413
4414 class ObjectExtent {
4415 /**
4416 * ObjectExtents are used for specifying IO behavior against RADOS
4417 * objects when one is using the ObjectCacher.
4418 *
4419 * To use this in a real system, *every member* must be filled
4420 * out correctly. In particular, make sure to initialize the
4421 * oloc correctly, as its default values are deliberate poison
4422 * and will cause internal ObjectCacher asserts.
4423 *
4424 * Similarly, your buffer_extents vector *must* specify a total
4425 * size equal to your length. If the buffer_extents inadvertently
4426 * contain less space than the length member specifies, you
4427 * will get unintelligible asserts deep in the ObjectCacher.
4428 *
4429 * If you are trying to do testing and don't care about actual
4430 * RADOS function, the simplest thing to do is to initialize
4431 * the ObjectExtent (truncate_size can be 0), create a single entry
4432 * in buffer_extents matching the length, and set oloc.pool to 0.
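*
* Editor's sketch of that minimal test setup (illustrative only):
*
*   ObjectExtent ex(object_t("test"), 0, 0, len, 0);
*   ex.buffer_extents.push_back(std::make_pair(0, len)); // must sum to length
*   ex.oloc.pool = 0;  // the defaults are poison; always set oloc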
4433 */
4434 public:
4435 object_t oid; // object id
4436 uint64_t objectno;
4437 uint64_t offset; // in object
4438 uint64_t length; // in object
4439 uint64_t truncate_size; // in object
4440
4441 object_locator_t oloc; // object locator (pool etc)
4442
4443 vector<pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4444
4445 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4446 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
4447 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
4448 };
4449
4450 inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
4451 {
4452 return out << "extent("
4453 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
4454 << " " << ex.offset << "~" << ex.length
4455 << " -> " << ex.buffer_extents
4456 << ")";
4457 }
4458
4459
4460 // ---------------------------------------
4461
4462 class OSDSuperblock {
4463 public:
4464 uuid_d cluster_fsid, osd_fsid;
4465 int32_t whoami; // my role in this fs.
4466 epoch_t current_epoch; // most recent epoch
4467 epoch_t oldest_map, newest_map; // oldest/newest maps we have.
4468 double weight;
4469
4470 CompatSet compat_features;
4471
4472 // last interval over which i mounted and was then active
4473 epoch_t mounted; // last epoch i mounted
4474 epoch_t clean_thru; // epoch i was active and clean thru
4475
4476 OSDSuperblock() :
4477 whoami(-1),
4478 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4479 mounted(0), clean_thru(0) {
4480 }
4481
4482 void encode(bufferlist &bl) const;
4483 void decode(bufferlist::iterator &bl);
4484 void dump(Formatter *f) const;
4485 static void generate_test_instances(list<OSDSuperblock*>& o);
4486 };
4487 WRITE_CLASS_ENCODER(OSDSuperblock)
4488
4489 inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
4490 {
4491 return out << "sb(" << sb.cluster_fsid
4492 << " osd." << sb.whoami
4493 << " " << sb.osd_fsid
4494 << " e" << sb.current_epoch
4495 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
4496 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
4497 << ")";
4498 }
4499
4500
4501 // -------
4502
4503
4504
4505
4506
4507
4508 /*
4509 * attached to object head. describes most recent snap context, and
4510 * set of existing clones.
4511 */
4512 struct SnapSet {
4513 snapid_t seq;
4514 bool head_exists;
4515 vector<snapid_t> snaps; // descending
4516 vector<snapid_t> clones; // ascending
4517 map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
4518 map<snapid_t, uint64_t> clone_size;
4519 map<snapid_t, vector<snapid_t>> clone_snaps; // descending
4520
4521 SnapSet() : seq(0), head_exists(false) {}
4522 explicit SnapSet(bufferlist& bl) {
4523 bufferlist::iterator p = bl.begin();
4524 decode(p);
4525 }
4526
4527 bool is_legacy() const {
4528 return clone_snaps.size() < clones.size() || !head_exists;
4529 }
4530
4531 /// populate SnapSet from a librados::snap_set_t
4532 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
4533
4534 /// get space accounted to clone
4535 uint64_t get_clone_bytes(snapid_t clone) const;
4536
4537 void encode(bufferlist& bl) const;
4538 void decode(bufferlist::iterator& bl);
4539 void dump(Formatter *f) const;
4540 static void generate_test_instances(list<SnapSet*>& o);
4541
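// e.g. snaps={8,5,2} (descending), as_of=6 -> SnapContext{seq=6, snaps={5,2}}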
4542 SnapContext get_ssc_as_of(snapid_t as_of) const {
4543 SnapContext out;
4544 out.seq = as_of;
4545 for (vector<snapid_t>::const_iterator i = snaps.begin();
4546 i != snaps.end();
4547 ++i) {
4548 if (*i <= as_of)
4549 out.snaps.push_back(*i);
4550 }
4551 return out;
4552 }
4553
4554 // return min element of snaps > after, return max if no such element
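// e.g. with snaps={8,5,2} (descending): after=4 -> 5, after=8 -> max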
4555 snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
4556 for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
4557 i != snaps.rend();
4558 ++i) {
4559 if (*i > after)
4560 return *i;
4561 }
4562 return max;
4563 }
4564
4565 SnapSet get_filtered(const pg_pool_t &pinfo) const;
4566 void filter(const pg_pool_t &pinfo);
4567 };
4568 WRITE_CLASS_ENCODER(SnapSet)
4569
4570 ostream& operator<<(ostream& out, const SnapSet& cs);
4571
4572
4573
4574 #define OI_ATTR "_"
4575 #define SS_ATTR "snapset"
4576
4577 struct watch_info_t {
4578 uint64_t cookie;
4579 uint32_t timeout_seconds;
4580 entity_addr_t addr;
4581
4582 watch_info_t() : cookie(0), timeout_seconds(0) { }
4583 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
4584
4585 void encode(bufferlist& bl, uint64_t features) const;
4586 void decode(bufferlist::iterator& bl);
4587 void dump(Formatter *f) const;
4588 static void generate_test_instances(list<watch_info_t*>& o);
4589 };
4590 WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
4591
4592 static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
4593 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
4594 && l.addr == r.addr;
4595 }
4596
4597 static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
4598 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
4599 << " " << w.addr << ")";
4600 }
4601
4602 struct notify_info_t {
4603 uint64_t cookie;
4604 uint64_t notify_id;
4605 uint32_t timeout;
4606 bufferlist bl;
4607 };
4608
4609 static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
4610 return out << "notify(cookie " << n.cookie
4611 << " notify" << n.notify_id
4612 << " " << n.timeout << "s)";
4613 }
4614
4615 struct object_info_t;
4616 struct object_manifest_t {
4617 enum {
4618 TYPE_NONE = 0,
4619 TYPE_REDIRECT = 1, // start with this
4620 TYPE_CHUNKED = 2, // do this later
4621 };
4622 uint8_t type; // redirect, chunked, ...
4623 hobject_t redirect_target;
4624
4625 object_manifest_t() : type(0) { }
4626 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
4627 : type(type), redirect_target(redirect_target) { }
4628
4629 bool is_empty() const {
4630 return type == TYPE_NONE;
4631 }
4632 bool is_redirect() const {
4633 return type == TYPE_REDIRECT;
4634 }
4635 bool is_chunked() const {
4636 return type == TYPE_CHUNKED;
4637 }
4638 static const char *get_type_name(uint8_t m) {
4639 switch (m) {
4640 case TYPE_NONE: return "none";
4641 case TYPE_REDIRECT: return "redirect";
4642 case TYPE_CHUNKED: return "chunked";
4643 default: return "unknown";
4644 }
4645 }
4646 const char *get_type_name() const {
4647 return get_type_name(type);
4648 }
4649 static void generate_test_instances(list<object_manifest_t*>& o);
4650 void encode(bufferlist &bl) const;
4651 void decode(bufferlist::iterator &bl);
4652 void dump(Formatter *f) const;
4653 friend ostream& operator<<(ostream& out, const object_manifest_t& om);
4654 };
4655 WRITE_CLASS_ENCODER(object_manifest_t)
4656 ostream& operator<<(ostream& out, const object_manifest_t& oi);
4657
4658 struct object_info_t {
4659 hobject_t soid;
4660 eversion_t version, prior_version;
4661 version_t user_version;
4662 osd_reqid_t last_reqid;
4663
4664 uint64_t size;
4665 utime_t mtime;
4666 utime_t local_mtime; // local mtime
4667
4668 // note: these are currently encoded into a total 16 bits; see
4669 // encode()/decode() for the weirdness.
4670 typedef enum {
4671 FLAG_LOST = 1<<0,
4672 FLAG_WHITEOUT = 1<<1, // object logically does not exist
4673 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
4674 FLAG_OMAP = 1 << 3, // has (or may have) some/any omap data
4675 FLAG_DATA_DIGEST = 1 << 4, // has data crc
4676 FLAG_OMAP_DIGEST = 1 << 5, // has omap crc
4677 FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier
4678 FLAG_MANIFEST = 1 << 7, // has manifest
4679 // ...
4680 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
4681 } flag_t;
4682
4683 flag_t flags;
4684
4685 static string get_flag_string(flag_t flags) {
4686 string s;
4687 vector<string> sv = get_flag_vector(flags);
4688 for (auto ss : sv) {
4689 s += string("|") + ss;
4690 }
4691 if (s.length())
4692 return s.substr(1);
4693 return s;
4694 }
4695 static vector<string> get_flag_vector(flag_t flags) {
4696 vector<string> sv;
4697 if (flags & FLAG_LOST)
4698 sv.insert(sv.end(), "lost");
4699 if (flags & FLAG_WHITEOUT)
4700 sv.insert(sv.end(), "whiteout");
4701 if (flags & FLAG_DIRTY)
4702 sv.insert(sv.end(), "dirty");
4703 if (flags & FLAG_USES_TMAP)
4704 sv.insert(sv.end(), "uses_tmap");
4705 if (flags & FLAG_OMAP)
4706 sv.insert(sv.end(), "omap");
4707 if (flags & FLAG_DATA_DIGEST)
4708 sv.insert(sv.end(), "data_digest");
4709 if (flags & FLAG_OMAP_DIGEST)
4710 sv.insert(sv.end(), "omap_digest");
4711 if (flags & FLAG_CACHE_PIN)
4712 sv.insert(sv.end(), "cache_pin");
4713 if (flags & FLAG_MANIFEST)
4714 sv.insert(sv.end(), "manifest");
4715 return sv;
4716 }
4717 string get_flag_string() const {
4718 return get_flag_string(flags);
4719 }
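// e.g. get_flag_string(flag_t(FLAG_DIRTY|FLAG_OMAP)) yields "dirty|omap"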
4720
4721 /// [clone] descending. pre-luminous; moved to SnapSet
4722 vector<snapid_t> legacy_snaps;
4723
4724 uint64_t truncate_seq, truncate_size;
4725
4726 map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
4727
4728 // opportunistic checksums; may or may not be present
4729 __u32 data_digest; ///< data crc32c
4730 __u32 omap_digest; ///< omap crc32c
4731
4732 // alloc hint attribute
4733 uint64_t expected_object_size, expected_write_size;
4734 uint32_t alloc_hint_flags;
4735
4736 struct object_manifest_t manifest;
4737
4738 void copy_user_bits(const object_info_t& other);
4739
4740 static ps_t legacy_object_locator_to_ps(const object_t &oid,
4741 const object_locator_t &loc);
4742
4743 bool test_flag(flag_t f) const {
4744 return (flags & f) == f;
4745 }
4746 void set_flag(flag_t f) {
4747 flags = (flag_t)(flags | f);
4748 }
4749 void clear_flag(flag_t f) {
4750 flags = (flag_t)(flags & ~f);
4751 }
4752 bool is_lost() const {
4753 return test_flag(FLAG_LOST);
4754 }
4755 bool is_whiteout() const {
4756 return test_flag(FLAG_WHITEOUT);
4757 }
4758 bool is_dirty() const {
4759 return test_flag(FLAG_DIRTY);
4760 }
4761 bool is_omap() const {
4762 return test_flag(FLAG_OMAP);
4763 }
4764 bool is_data_digest() const {
4765 return test_flag(FLAG_DATA_DIGEST);
4766 }
4767 bool is_omap_digest() const {
4768 return test_flag(FLAG_OMAP_DIGEST);
4769 }
4770 bool is_cache_pinned() const {
4771 return test_flag(FLAG_CACHE_PIN);
4772 }
4773 bool has_manifest() const {
4774 return test_flag(FLAG_MANIFEST);
4775 }
4776
4777 void set_data_digest(__u32 d) {
4778 set_flag(FLAG_DATA_DIGEST);
4779 data_digest = d;
4780 }
4781 void set_omap_digest(__u32 d) {
4782 set_flag(FLAG_OMAP_DIGEST);
4783 omap_digest = d;
4784 }
4785 void clear_data_digest() {
4786 clear_flag(FLAG_DATA_DIGEST);
4787 data_digest = -1;
4788 }
4789 void clear_omap_digest() {
4790 clear_flag(FLAG_OMAP_DIGEST);
4791 omap_digest = -1;
4792 }
4793 void new_object() {
4794 clear_data_digest();
4795 clear_omap_digest();
4796 }
4797
4798 void encode(bufferlist& bl, uint64_t features) const;
4799 void decode(bufferlist::iterator& bl);
4800 void decode(bufferlist& bl) {
4801 bufferlist::iterator p = bl.begin();
4802 decode(p);
4803 }
4804 void dump(Formatter *f) const;
4805 static void generate_test_instances(list<object_info_t*>& o);
4806
4807 explicit object_info_t()
4808 : user_version(0), size(0), flags((flag_t)0),
4809 truncate_seq(0), truncate_size(0),
4810 data_digest(-1), omap_digest(-1),
4811 expected_object_size(0), expected_write_size(0),
4812 alloc_hint_flags(0)
4813 {}
4814
4815 explicit object_info_t(const hobject_t& s)
4816 : soid(s),
4817 user_version(0), size(0), flags((flag_t)0),
4818 truncate_seq(0), truncate_size(0),
4819 data_digest(-1), omap_digest(-1),
4820 expected_object_size(0), expected_write_size(0),
4821 alloc_hint_flags(0)
4822 {}
4823
4824 explicit object_info_t(bufferlist& bl) {
4825 decode(bl);
4826 }
4827 };
4828 WRITE_CLASS_ENCODER_FEATURES(object_info_t)
4829
4830 ostream& operator<<(ostream& out, const object_info_t& oi);
4831
4832
4833
4834 // Object recovery
4835 struct ObjectRecoveryInfo {
4836 hobject_t soid;
4837 eversion_t version;
4838 uint64_t size;
4839 object_info_t oi;
4840 SnapSet ss; // only populated if soid is_snap()
4841 interval_set<uint64_t> copy_subset;
4842 map<hobject_t, interval_set<uint64_t>> clone_subset;
4843
4844 ObjectRecoveryInfo() : size(0) { }
4845
4846 static void generate_test_instances(list<ObjectRecoveryInfo*>& o);
4847 void encode(bufferlist &bl, uint64_t features) const;
4848 void decode(bufferlist::iterator &bl, int64_t pool = -1);
4849 ostream &print(ostream &out) const;
4850 void dump(Formatter *f) const;
4851 };
4852 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
4853 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf);
4854
4855 struct ObjectRecoveryProgress {
4856 uint64_t data_recovered_to;
4857 string omap_recovered_to;
4858 bool first;
4859 bool data_complete;
4860 bool omap_complete;
4861 bool error = false;
4862
4863 ObjectRecoveryProgress()
4864 : data_recovered_to(0),
4865 first(true),
4866 data_complete(false), omap_complete(false) { }
4867
4868 bool is_complete(const ObjectRecoveryInfo& info) const {
4869 return (data_recovered_to >= (
4870 info.copy_subset.empty() ?
4871 0 : info.copy_subset.range_end())) &&
4872 omap_complete;
4873 }
4874
4875 static void generate_test_instances(list<ObjectRecoveryProgress*>& o);
4876 void encode(bufferlist &bl) const;
4877 void decode(bufferlist::iterator &bl);
4878 ostream &print(ostream &out) const;
4879 void dump(Formatter *f) const;
4880 };
4881 WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
4882 ostream& operator<<(ostream& out, const ObjectRecoveryProgress &prog);
4883
4884 struct PushReplyOp {
4885 hobject_t soid;
4886
4887 static void generate_test_instances(list<PushReplyOp*>& o);
4888 void encode(bufferlist &bl) const;
4889 void decode(bufferlist::iterator &bl);
4890 ostream &print(ostream &out) const;
4891 void dump(Formatter *f) const;
4892
4893 uint64_t cost(CephContext *cct) const;
4894 };
4895 WRITE_CLASS_ENCODER(PushReplyOp)
4896 ostream& operator<<(ostream& out, const PushReplyOp &op);
4897
4898 struct PullOp {
4899 hobject_t soid;
4900
4901 ObjectRecoveryInfo recovery_info;
4902 ObjectRecoveryProgress recovery_progress;
4903
4904 static void generate_test_instances(list<PullOp*>& o);
4905 void encode(bufferlist &bl, uint64_t features) const;
4906 void decode(bufferlist::iterator &bl);
4907 ostream &print(ostream &out) const;
4908 void dump(Formatter *f) const;
4909
4910 uint64_t cost(CephContext *cct) const;
4911 };
4912 WRITE_CLASS_ENCODER_FEATURES(PullOp)
4913 ostream& operator<<(ostream& out, const PullOp &op);
4914
4915 struct PushOp {
4916 hobject_t soid;
4917 eversion_t version;
4918 bufferlist data;
4919 interval_set<uint64_t> data_included;
4920 bufferlist omap_header;
4921 map<string, bufferlist> omap_entries;
4922 map<string, bufferlist> attrset;
4923
4924 ObjectRecoveryInfo recovery_info;
4925 ObjectRecoveryProgress before_progress;
4926 ObjectRecoveryProgress after_progress;
4927
4928 static void generate_test_instances(list<PushOp*>& o);
4929 void encode(bufferlist &bl, uint64_t features) const;
4930 void decode(bufferlist::iterator &bl);
4931 ostream &print(ostream &out) const;
4932 void dump(Formatter *f) const;
4933
4934 uint64_t cost(CephContext *cct) const;
4935 };
4936 WRITE_CLASS_ENCODER_FEATURES(PushOp)
4937 ostream& operator<<(ostream& out, const PushOp &op);
4938
4939
4940 /*
4941 * summarize pg contents for purposes of a scrub
4942 */
4943 struct ScrubMap {
4944 struct object {
4945 map<string,bufferptr> attrs;
4946 uint64_t size;
4947 __u32 omap_digest; ///< omap crc32c
4948 __u32 digest; ///< data crc32c
4949 bool negative:1;
4950 bool digest_present:1;
4951 bool omap_digest_present:1;
4952 bool read_error:1;
4953 bool stat_error:1;
4954 bool ec_hash_mismatch:1;
4955 bool ec_size_mismatch:1;
4956 bool large_omap_object_found:1;
4957 uint64_t large_omap_object_key_count = 0;
4958 uint64_t large_omap_object_value_size = 0;
4959
4960 object() :
4961 // Init invalid size so it won't match if we get a stat EIO error
4962 size(-1), omap_digest(0), digest(0),
4963 negative(false), digest_present(false), omap_digest_present(false),
4964 read_error(false), stat_error(false), ec_hash_mismatch(false),
4965 ec_size_mismatch(false), large_omap_object_found(false) {}
4966
4967 void encode(bufferlist& bl) const;
4968 void decode(bufferlist::iterator& bl);
4969 void dump(Formatter *f) const;
4970 static void generate_test_instances(list<object*>& o);
4971 };
4972 WRITE_CLASS_ENCODER(object)
4973
4974 map<hobject_t,object> objects;
4975 eversion_t valid_through;
4976 eversion_t incr_since;
4977 bool has_large_omap_object_errors:1;
4978 boost::optional<bool> has_builtin_csum;
4979
4980 void merge_incr(const ScrubMap &l);
4981 void clear_from(const hobject_t& start) {
4982 objects.erase(objects.lower_bound(start), objects.end());
4983 }
4984 void insert(const ScrubMap &r) {
4985 objects.insert(r.objects.begin(), r.objects.end());
4986 }
4987 void swap(ScrubMap &r) {
4988 using std::swap;
4989 swap(objects, r.objects);
4990 swap(valid_through, r.valid_through);
4991 swap(incr_since, r.incr_since);
4992 }
4993
4994 void encode(bufferlist& bl) const;
4995 void decode(bufferlist::iterator& bl, int64_t pool=-1);
4996 void dump(Formatter *f) const;
4997 static void generate_test_instances(list<ScrubMap*>& o);
4998 };
4999 WRITE_CLASS_ENCODER(ScrubMap::object)
5000 WRITE_CLASS_ENCODER(ScrubMap)
5001
5002 struct ScrubMapBuilder {
5003 bool deep = false;
5004 vector<hobject_t> ls;
5005 size_t pos = 0;
5006 int64_t data_pos = 0;
5007 string omap_pos;
5008 int ret = 0;
5009 bufferhash data_hash, omap_hash; ///< accumulating hash values
5010 uint64_t omap_keys = 0;
5011 uint64_t omap_bytes = 0;
5012
5013 bool empty() {
5014 return ls.empty();
5015 }
5016 bool done() {
5017 return pos >= ls.size();
5018 }
5019 void reset() {
5020 *this = ScrubMapBuilder();
5021 }
5022
5023 bool data_done() {
5024 return data_pos < 0;
5025 }
5026
5027 void next_object() {
5028 ++pos;
5029 data_pos = 0;
5030 omap_pos.clear();
5031 omap_keys = 0;
5032 omap_bytes = 0;
5033 }
5034
5035 friend ostream& operator<<(ostream& out, const ScrubMapBuilder& pos) {
5036 out << "(" << pos.pos << "/" << pos.ls.size();
5037 if (pos.pos < pos.ls.size()) {
5038 out << " " << pos.ls[pos.pos];
5039 }
5040 if (pos.data_pos < 0) {
5041 out << " byte " << pos.data_pos;
5042 }
5043 if (!pos.omap_pos.empty()) {
5044 out << " key " << pos.omap_pos;
5045 }
5046 if (pos.deep) {
5047 out << " deep";
5048 }
5049 if (pos.ret) {
5050 out << " ret " << pos.ret;
5051 }
5052 return out << ")";
5053 }
5054 };
5055
5056 struct OSDOp {
5057 ceph_osd_op op;
5058 sobject_t soid;
5059
5060 bufferlist indata, outdata;
5061 errorcode32_t rval;
5062
5063 OSDOp() : rval(0) {
5064 memset(&op, 0, sizeof(ceph_osd_op));
5065 }
5066
5067 /**
5068 * split a bufferlist into constituent indata members of a vector of OSDOps
5069 *
5070 * @param ops [out] vector of OSDOps
5071 * @param in [in] combined data buffer
5072 */
5073 static void split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in);
5074
5075 /**
5076 * merge indata members of a vector of OSDOp into a single bufferlist
5077 *
5078 * Notably this also encodes certain other OSDOp data into the data
5079 * buffer, including the sobject_t soid.
5080 *
5081 * @param ops [in] vector of OSDOps
5082 * @param out [out] combined data buffer
5083 */
5084 static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
5085
5086 /**
5087 * split a bufferlist into constituent outdata members of a vector of OSDOps
5088 *
5089 * @param ops [out] vector of OSDOps
5090 * @param in [in] combined data buffer
5091 */
5092 static void split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in);
5093
5094 /**
5095 * merge outdata members of a vector of OSDOps into a single bufferlist
5096 *
5097 * @param ops [in] vector of OSDOps
5098 * @param out [out] combined data buffer
5099 */
5100 static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
5101
5102 /**
5103 * Clear data as much as possible, leave minimal data for historical op dump
5104 *
5105 * @param ops [in] vector of OSDOps
5106 */
5107 static void clear_data(vector<OSDOp>& ops);
5108 };
5109
5110 ostream& operator<<(ostream& out, const OSDOp& op);
5111
5112 struct watch_item_t {
5113 entity_name_t name;
5114 uint64_t cookie;
5115 uint32_t timeout_seconds;
5116 entity_addr_t addr;
5117
5118 watch_item_t() : cookie(0), timeout_seconds(0) { }
5119 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
5120 const entity_addr_t& addr)
5121 : name(name), cookie(cookie), timeout_seconds(timeout),
5122 addr(addr) { }
5123
5124 void encode(bufferlist &bl, uint64_t features) const {
5125 ENCODE_START(2, 1, bl);
5126 ::encode(name, bl);
5127 ::encode(cookie, bl);
5128 ::encode(timeout_seconds, bl);
5129 ::encode(addr, bl, features);
5130 ENCODE_FINISH(bl);
5131 }
5132 void decode(bufferlist::iterator &bl) {
5133 DECODE_START(2, bl);
5134 ::decode(name, bl);
5135 ::decode(cookie, bl);
5136 ::decode(timeout_seconds, bl);
5137 if (struct_v >= 2) {
5138 ::decode(addr, bl);
5139 }
5140 DECODE_FINISH(bl);
5141 }
5142 };
5143 WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5144
5145 struct obj_watch_item_t {
5146 hobject_t obj;
5147 watch_item_t wi;
5148 };
5149
5150 /**
5151 * obj list watch response format
5152 *
5153 */
5154 struct obj_list_watch_response_t {
5155 list<watch_item_t> entries;
5156
5157 void encode(bufferlist& bl, uint64_t features) const {
5158 ENCODE_START(1, 1, bl);
5159 ::encode(entries, bl, features);
5160 ENCODE_FINISH(bl);
5161 }
5162 void decode(bufferlist::iterator& bl) {
5163 DECODE_START(1, bl);
5164 ::decode(entries, bl);
5165 DECODE_FINISH(bl);
5166 }
5167 void dump(Formatter *f) const {
5168 f->open_array_section("entries");
5169 for (list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5170 f->open_object_section("watch");
5171 f->dump_stream("watcher") << p->name;
5172 f->dump_int("cookie", p->cookie);
5173 f->dump_int("timeout", p->timeout_seconds);
5174 f->open_object_section("addr");
5175 p->addr.dump(f);
5176 f->close_section();
5177 f->close_section();
5178 }
5179 f->close_section();
5180 }
5181 static void generate_test_instances(list<obj_list_watch_response_t*>& o) {
5182 entity_addr_t ea;
5183 o.push_back(new obj_list_watch_response_t);
5184 o.push_back(new obj_list_watch_response_t);
5185 ea.set_type(entity_addr_t::TYPE_LEGACY);
5186 ea.set_nonce(1000);
5187 ea.set_family(AF_INET);
5188 ea.set_in4_quad(0, 127);
5189 ea.set_in4_quad(1, 0);
5190 ea.set_in4_quad(2, 0);
5191 ea.set_in4_quad(3, 1);
5192 ea.set_port(1024);
5193 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5194 ea.set_nonce(1001);
5195 ea.set_in4_quad(3, 2);
5196 ea.set_port(1025);
5197 o.back()->entries.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5198 }
5199 };
5200 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5201
5202 struct clone_info {
5203 snapid_t cloneid;
5204 vector<snapid_t> snaps; // ascending
5205 vector< pair<uint64_t,uint64_t> > overlap;
5206 uint64_t size;
5207
5208 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5209
5210 void encode(bufferlist& bl) const {
5211 ENCODE_START(1, 1, bl);
5212 ::encode(cloneid, bl);
5213 ::encode(snaps, bl);
5214 ::encode(overlap, bl);
5215 ::encode(size, bl);
5216 ENCODE_FINISH(bl);
5217 }
5218 void decode(bufferlist::iterator& bl) {
5219 DECODE_START(1, bl);
5220 ::decode(cloneid, bl);
5221 ::decode(snaps, bl);
5222 ::decode(overlap, bl);
5223 ::decode(size, bl);
5224 DECODE_FINISH(bl);
5225 }
5226 void dump(Formatter *f) const {
5227 if (cloneid == CEPH_NOSNAP)
5228 f->dump_string("cloneid", "HEAD");
5229 else
5230 f->dump_unsigned("cloneid", cloneid.val);
5231 f->open_array_section("snapshots");
5232 for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
5233 f->open_object_section("snap");
5234 f->dump_unsigned("id", p->val);
5235 f->close_section();
5236 }
5237 f->close_section();
5238 f->open_array_section("overlaps");
5239 for (vector< pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
5240 q != overlap.end(); ++q) {
5241 f->open_object_section("overlap");
5242 f->dump_unsigned("offset", q->first);
5243 f->dump_unsigned("length", q->second);
5244 f->close_section();
5245 }
5246 f->close_section();
5247 f->dump_unsigned("size", size);
5248 }
5249 static void generate_test_instances(list<clone_info*>& o) {
5250 o.push_back(new clone_info);
5251 o.push_back(new clone_info);
5252 o.back()->cloneid = 1;
5253 o.back()->snaps.push_back(1);
5254 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5255 o.back()->overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5256 o.back()->size = 16384;
5257 o.push_back(new clone_info);
5258 o.back()->cloneid = CEPH_NOSNAP;
5259 o.back()->size = 32768;
5260 }
5261 };
5262 WRITE_CLASS_ENCODER(clone_info)
5263
5264 /**
5265 * obj list snaps response format
5266 *
5267 */
5268 struct obj_list_snap_response_t {
5269 vector<clone_info> clones; // ascending
5270 snapid_t seq;
5271
5272 void encode(bufferlist& bl) const {
5273 ENCODE_START(2, 1, bl);
5274 ::encode(clones, bl);
5275 ::encode(seq, bl);
5276 ENCODE_FINISH(bl);
5277 }
5278 void decode(bufferlist::iterator& bl) {
5279 DECODE_START(2, bl);
5280 ::decode(clones, bl);
5281 if (struct_v >= 2)
5282 ::decode(seq, bl);
5283 else
5284 seq = CEPH_NOSNAP;
5285 DECODE_FINISH(bl);
5286 }
5287 void dump(Formatter *f) const {
5288 f->open_array_section("clones");
5289 for (vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5290 f->open_object_section("clone");
5291 p->dump(f);
5292 f->close_section();
5293 }
5294 f->dump_unsigned("seq", seq);
5295 f->close_section();
5296 }
5297 static void generate_test_instances(list<obj_list_snap_response_t*>& o) {
5298 o.push_back(new obj_list_snap_response_t);
5299 o.push_back(new obj_list_snap_response_t);
5300 clone_info cl;
5301 cl.cloneid = 1;
5302 cl.snaps.push_back(1);
5303 cl.overlap.push_back(pair<uint64_t,uint64_t>(0,4096));
5304 cl.overlap.push_back(pair<uint64_t,uint64_t>(8192,4096));
5305 cl.size = 16384;
5306 o.back()->clones.push_back(cl);
5307 cl.cloneid = CEPH_NOSNAP;
5308 cl.snaps.clear();
5309 cl.overlap.clear();
5310 cl.size = 32768;
5311 o.back()->clones.push_back(cl);
5312 o.back()->seq = 123;
5313 }
5314 };
5315
5316 WRITE_CLASS_ENCODER(obj_list_snap_response_t)
5317
5318 // PromoteCounter
5319
5320 struct PromoteCounter {
5321 std::atomic_ullong attempts{0};
5322 std::atomic_ullong objects{0};
5323 std::atomic_ullong bytes{0};
5324
5325 void attempt() {
5326 attempts++;
5327 }
5328
5329 void finish(uint64_t size) {
5330 objects++;
5331 bytes += size;
5332 }
5333
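// Sampling halves each counter, so reported totals decay exponentially
// across sampling intervals; e.g. attempts=10 reports 10 and retains 5.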
5334 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
5335 *a = attempts;
5336 *o = objects;
5337 *b = bytes;
5338 attempts = *a / 2;
5339 objects = *o / 2;
5340 bytes = *b / 2;
5341 }
5342 };
5343
5344 /** store_statfs_t
5345 * ObjectStore full statfs information
5346 */
5347 struct store_statfs_t
5348 {
5349 uint64_t total = 0; // Total bytes
5350 uint64_t available = 0; // Free bytes available
5351
5352 int64_t allocated = 0; // Bytes allocated by the store
5353 int64_t stored = 0; // Bytes actually stored by the user
5354 int64_t compressed = 0; // Bytes stored after compression
5355 int64_t compressed_allocated = 0; // Bytes allocated for compressed data
5356 int64_t compressed_original = 0; // Bytes that were successfully compressed
5357
5358 void reset() {
5359 *this = store_statfs_t();
5360 }
5361 bool operator ==(const store_statfs_t& other) const;
5362 void dump(Formatter *f) const;
5363 };
5364 ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
5365
5366 #endif