// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H

#include <ostream>
#include <type_traits>
#include <vector>
#include <array>
#include "include/mempool.h"
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
#include "include/ceph_hash.h"

namespace ceph {
  class Formatter;
}

/// label for block device
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;          ///< osd uuid
  uint64_t size = 0;        ///< device size
  utime_t btime;            ///< birth time
  std::string description;  ///< device description

  std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
};
WRITE_CLASS_ENCODER(bluestore_bdev_label_t)

std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);

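// Illustrative usage sketch (not part of the original header): the label is
// (de)serialized with the encode/decode free functions that
// WRITE_CLASS_ENCODER(bluestore_bdev_label_t) generates, e.g. when reading or
// writing the label at the start of a block device. The values below are made up.
//
//   bluestore_bdev_label_t label;
//   label.osd_uuid.generate_random();
//   label.size = 1ull << 40;             // hypothetical 1 TiB device
//   label.description = "main";
//
//   ceph::buffer::list bl;
//   encode(label, bl);                   // generated by WRITE_CLASS_ENCODER
//
//   bluestore_bdev_label_t decoded;
//   auto p = bl.cbegin();
//   decode(decoded, p);
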
/// collection metadata
struct bluestore_cnode_t {
  uint32_t bits;   ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_cnode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);

template <typename OFFS_TYPE, typename LEN_TYPE>
struct bluestore_interval_t
{
  static const uint64_t INVALID_OFFSET = ~0ull;

  OFFS_TYPE offset = 0;
  LEN_TYPE length = 0;

  bluestore_interval_t(){}
  bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}

  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }
  uint64_t end() const {
    return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
  }

  bool operator==(const bluestore_interval_t& other) const {
    return offset == other.offset && length == other.length;
  }

};

/// pextent: physical extent
struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
{
  bluestore_pextent_t() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
  bluestore_pextent_t(const bluestore_interval_t &ext) :
    bluestore_interval_t(ext.offset, ext.length) {}

  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
};
WRITE_CLASS_DENC(bluestore_pextent_t)

std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);

typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;

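// Illustrative sketch (not part of the original header): a blob's physical
// extents are kept in a PExtentVector; an entry whose offset is
// INVALID_OFFSET stands for a hole (space that is not allocated on disk).
// The offsets and lengths below are made up.
//
//   PExtentVector pv;
//   pv.emplace_back(0x10000, 0x1000);                  // 4 KiB at LBA 0x10000
//   pv.emplace_back(bluestore_pextent_t::INVALID_OFFSET,
//                   0x1000);                           // 4 KiB unallocated hole
//   for (const auto& pe : pv) {
//     if (pe.is_valid()) {
//       // pe.offset .. pe.end() is backed by the device
//     }
//   }
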
template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static constexpr bool need_contiguous = true;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
                     ceph::buffer::list::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
    unsigned num;
    denc_varint(num, p);
    v.clear();
    v.resize(num);
    for (unsigned i=0; i<num; ++i) {
      denc(v[i], p);
    }
  }
};

/// extent_map: a map of reference-counted extents
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;
    uint32_t refs;
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
  map_t ref_map;

  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  void get(uint64_t offset, uint32_t len);
  void put(uint64_t offset, uint32_t len, PExtentVector *release,
           bool *maybe_unshared);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    const uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (++i != ref_map.end()) {
        denc_varint_lowz((int64_t)i->first - pos, p);
        i->second.encode(p);
        pos = i->first;
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
        int64_t delta;
        denc_varint_lowz(delta, p);
        pos += delta;
        ref_map[pos].decode(p);
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
};
WRITE_CLASS_DENC(bluestore_extent_ref_map_t)


std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
                              const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
static inline bool operator==(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}

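// Illustrative sketch (not part of the original header): how the shared-blob
// reference map is typically manipulated. The offsets and lengths below are
// made up; put() appends the de-referenced, now-unused ranges to `release`
// so the caller can free them.
//
//   bluestore_extent_ref_map_t rm;
//   rm.get(0x0000, 0x2000);          // two clones reference [0x0000, 0x2000)
//   rm.get(0x0000, 0x2000);
//   rm.get(0x2000, 0x1000);          // one reference to [0x2000, 0x3000)
//
//   PExtentVector release;
//   bool maybe_unshared = false;
//   rm.put(0x2000, 0x1000, &release, &maybe_unshared);
//   // release now contains the extent at offset 0x2000, length 0x1000,
//   // which is no longer referenced and may be reclaimed.
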
/// blob_use_tracker: a set of per-alloc-unit ref buckets to track blob usage
struct bluestore_blob_use_tracker_t {
  // N.B.: there is no need to minimize the size of au_size/num_au
  // (e.g. by using just a single byte for au_size) since:
  // 1) the struct isn't packed, hence it's padded; and even if it were packed, see 2)
  // 2) the memory manager has its own granularity, most probably >= 8 bytes
  //
  uint32_t au_size;  // allocation (= tracking) unit size,
                     // == 0 if uninitialized
  uint32_t num_au;   // number of allocation units tracked,
                     // == 0 if a single unit or the whole blob is tracked
  uint32_t alloc_au; // number of allocation units allocated

  union {
    uint32_t* bytes_per_au;
    uint32_t total_bytes;
  };

  bluestore_blob_use_tracker_t()
    : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) {
  }
  bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
  bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
  ~bluestore_blob_use_tracker_t() {
    clear();
  }

  void clear() {
    release(alloc_au, bytes_per_au);
    num_au = 0;
    alloc_au = 0;
    bytes_per_au = 0;
    au_size = 0;
  }

  uint32_t get_referenced_bytes() const {
    uint32_t total = 0;
    if (!num_au) {
      total = total_bytes;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        total += bytes_per_au[i];
      }
    }
    return total;
  }
  bool is_not_empty() const {
    if (!num_au) {
      return total_bytes != 0;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        if (bytes_per_au[i]) {
          return true;
        }
      }
    }
    return false;
  }
  bool is_empty() const {
    return !is_not_empty();
  }
  void prune_tail(uint32_t new_len) {
    if (num_au) {
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au <= num_au);
      if (_num_au) {
        num_au = _num_au; // bytes_per_au array is left unmodified
      } else {
        clear();
      }
    }
  }
  void add_tail(uint32_t new_len, uint32_t _au_size) {
    auto full_size = au_size * (num_au ? num_au : 1);
    ceph_assert(new_len >= full_size);
    if (new_len == full_size) {
      return;
    }
    if (!num_au) {
      uint32_t old_total = total_bytes;
      total_bytes = 0;
      init(new_len, _au_size);
      ceph_assert(num_au);
      bytes_per_au[0] = old_total;
    } else {
      ceph_assert(_au_size == au_size);
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au >= num_au);
      if (_num_au > num_au) {
        auto old_bytes = bytes_per_au;
        auto old_num_au = num_au;
        auto old_alloc_au = alloc_au;
        alloc_au = num_au = 0; // to bypass an assertion in allocate()
        bytes_per_au = nullptr;
        allocate(_num_au);
        for (size_t i = 0; i < old_num_au; i++) {
          bytes_per_au[i] = old_bytes[i];
        }
        for (size_t i = old_num_au; i < num_au; i++) {
          bytes_per_au[i] = 0;
        }
        release(old_alloc_au, old_bytes);
      }
    }
  }

  void init(
    uint32_t full_length,
    uint32_t _au_size);

  void get(
    uint32_t offset,
    uint32_t len);

  /// put: returns true if the blob has no more references after the call;
  /// in that case release_units is not filled, for the sake of performance.
  /// Returns false if some references to the blob remain; in this case
  /// release_units contains pextents (identified by their offsets relative
  /// to the blob start) that are no longer used and can be safely
  /// deallocated.
  bool put(
    uint32_t offset,
    uint32_t len,
    PExtentVector *release);

  bool can_split() const;
  bool can_split_at(uint32_t blob_offset) const;
  void split(
    uint32_t blob_offset,
    bluestore_blob_use_tracker_t* r);

  bool equal(
    const bluestore_blob_use_tracker_t& other) const;

  void bound_encode(size_t& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        p += elem_size * num_au;
      }
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    clear();
    denc_varint(au_size, p);
    if (au_size) {
      uint32_t _num_au;
      denc_varint(_num_au, p);
      if (!_num_au) {
        num_au = 0;
        denc_varint(total_bytes, p);
      } else {
        allocate(_num_au);
        for (size_t i = 0; i < _num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
private:
  void allocate(uint32_t _num_au);
  void release(uint32_t _num_au, uint32_t* ptr);
};
WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);

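// Illustrative sketch (not part of the original header): the tracker counts
// referenced bytes per allocation unit so partially overwritten blobs can be
// trimmed. The sizes below are made up (0x10000 = 64 KiB blob, 0x1000 = 4 KiB AU).
//
//   bluestore_blob_use_tracker_t t;
//   t.init(0x10000, 0x1000);     // 16 tracking units of 4 KiB each
//   t.get(0x0000, 0x8000);       // reference the first 32 KiB
//
//   PExtentVector release;
//   bool empty = t.put(0x0000, 0x8000, &release);
//   // empty == true: nothing is referenced any more; release is not filled
//   // in that case (see the put() comment above).
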
/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;           ///< raw data position on device
  uint32_t logical_length = 0;     ///< original length of data stored in the blob
  uint32_t compressed_length = 0;  ///< compressed length if any

public:
  enum {
    LEGACY_FLAG_MUTABLE = 1,  ///< [legacy] blob can be overwritten or split
    FLAG_COMPRESSED = 2,      ///< blob is compressed
    FLAG_CSUM = 4,            ///< blob has checksums
    FLAG_HAS_UNUSED = 8,      ///< blob has an 'unused' bitmap
    FLAG_SHARED = 16,         ///< blob is shared; see external SharedBlob
  };
  static std::string get_flags_string(unsigned flags);

  uint32_t flags = 0;  ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0; ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE;  ///< CSUM_*
  uint8_t csum_chunk_order = 0;  ///< csum block size is 1<<csum_chunk_order bytes

  ceph::buffer::ptr csum_data;   ///< opaque array of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}

  const PExtentVector& get_extents() const {
    return extents;
  }
  PExtentVector& dirty_extents() {
    return extents;
  }

  DENC_HELPERS;
  void bound_encode(size_t& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
             csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      int len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }

  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return !is_compressed() && !is_shared();
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }

  // validate whether or not the status of pextents within the given range
  // meets the requirement (allocated or unallocated).
  bool _validate_range(uint64_t b_off, uint64_t b_len,
                       bool require_allocated) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      if (++p == extents.end())
        return false;
    }
    b_len += b_off;
    while (b_len) {
      if (require_allocated != p->is_valid()) {
        return false;
      }
      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      if (++p == extents.end())
        return false;
    }
    ceph_abort_msg("we should not get here");
    return false;
  }

  /// return true if the entire range is allocated
  /// (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, true);
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, false);
  }

  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      ceph_assert(!is_compressed());
      uint64_t blob_len = get_logical_length();
      ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
      ceph_assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      uint64_t start = offset / chunk_size;
      uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        unused &= ~(1u << i);
      }
      if (unused == 0) {
        clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }

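  // Worked example (illustrative, not part of the original header) of the
  // 'unused' bitmap arithmetic above, assuming a 64 KiB blob
  // (get_logical_length() == 0x10000) and the 16-bit unused_t:
  //   chunk_size = 0x10000 / 16 = 0x1000 (4 KiB per bit)
  //   add_unused(0x2000, 0x4000)  -> sets bits 2..5
  //   is_unused(0x3000, 0x1000)   -> true, bit 3 is set
  //   mark_used(0x3000, 0x1000)   -> clears bit 3
  // Note that add_unused() rounds the range inward (it only marks chunks that
  // are fully covered), while mark_used() rounds outward (it clears any chunk
  // the range touches), so the bitmap never overstates what is unused.
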
  // map_f_invoke templates intended to mask parameters which are not expected
  // by the provided callback
  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t,
    uint64_t>>::type * = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(lo, o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    const bluestore_pextent_t&,
    uint64_t,
    uint64_t>>::type * = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(p, o, l);
  }

  template<class F>
  int map(uint64_t x_off, uint64_t x_len, F&& f) const {
    auto x_off0 = x_off;
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    while (x_len > 0 && p != extents.end()) {
      uint64_t l = std::min(p->length - x_off, x_len);
      int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
      if (r < 0)
        return r;
      x_off = 0;
      x_len -= l;
      x_off0 += l;
      ++p;
    }
    return 0;
  }

  template<class F>
  void map_bl(uint64_t x_off,
              ceph::buffer::list& bl,
              F&& f) const {
    static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);

    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    ceph::buffer::list::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      ceph_assert(p != extents.end());
      uint64_t l = std::min(p->length - x_off, x_len);
      ceph::buffer::list t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }

  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      ceph_abort_msg("no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const ceph_le16*>(p)[i];
    case 4:
      return reinterpret_cast<const ceph_le32*>(p)[i];
    case 8:
      return reinterpret_cast<const ceph_le64*>(p)[i];
    default:
      ceph_abort_msg("unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }

  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);

  /// verify csum: return -EOPNOTSUPP for an unsupported checksum type;
  /// return -1 and a valid (non-negative) b_bad_off for a checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
                  uint64_t *bad_csum) const;

  bool can_prune_tail() const {
    return
      extents.size() > 1 &&  // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::ptr(t.c_str(),
        get_logical_length() / get_csum_chunk_size() *
          get_csum_value_size());
    }
  }
  void add_tail(uint32_t new_len) {
    ceph_assert(is_mutable());
    ceph_assert(!has_unused());
    ceph_assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::create(
        get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }

  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// updates the blob's pextent container and returns unused pextents
  /// eligible for release.
  /// all - indicates that the whole blob is to be released.
  /// logical - specifies the set of logical extents within the blob
  ///           to be released.
  /// Returns true if the blob has no more valid pextents.
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);


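// Illustrative sketch (not part of the original header): reading a logical
// range through a blob. map() walks the physical extents that back the range
// and hands each (device offset, length) piece to the callback; the checksum
// helpers describe how the blob's data is chunked for verification. The blob
// setup and sizes below are made up.
//
//   bluestore_blob_t b;
//   b.allocated_test(bluestore_pextent_t(0x100000, 0x8000));  // 32 KiB on disk
//   b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x8000);        // 4 KiB csum chunks
//
//   b.map(0x1000, 0x3000, [&](uint64_t dev_off, uint64_t len) {
//     // issue a read of `len` bytes at device offset `dev_off`
//     return 0;            // a negative return aborts the walk
//   });
//
//   // 0x8000 / 0x1000 csum chunks, each carrying a 4-byte crc32c value:
//   // b.get_csum_count() == 8, b.get_csum_chunk_size() == 0x1000
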
/// shared blob state
struct bluestore_shared_blob_t {
  MEMPOOL_CLASS_HELPERS();
  uint64_t sbid;                       ///< shared blob id
  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
  bluestore_shared_blob_t(uint64_t _sbid,
                          bluestore_extent_ref_map_t&& _ref_map)
    : sbid(_sbid), ref_map(std::move(_ref_map)) {}

  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }


  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);

  bool empty() const {
    return ref_map.empty();
  }
};
WRITE_CLASS_DENC(bluestore_shared_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);

/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;   ///< numeric id (locally unique)
  uint64_t size = 0;  ///< object size
  // the mempool is to be assigned to the buffer::ptr values manually
  std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;

  struct shard_info {
    uint32_t offset = 0;  ///< logical offset for start of shard
    uint32_t bytes = 0;   ///< encoded bytes
    DENC(shard_info, v, p) {
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(ceph::Formatter *f) const;
  };
  std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)

  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0;

  std::map<uint32_t, uint64_t> zone_offset_refs;  ///< (zone, offset) refs to this onode

  enum {
    FLAG_OMAP = 1,          ///< object may have omap data
    FLAG_PGMETA_OMAP = 2,   ///< omap data is in meta omap prefix
    FLAG_PERPOOL_OMAP = 4,  ///< omap data is in per-pool prefix; per-pool keys
    FLAG_PERPG_OMAP = 8,    ///< omap data is in per-pg prefix; per-pg keys
  };

  std::string get_flags_string() const {
    std::string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    if (flags & FLAG_PGMETA_OMAP) {
      s += "+pgmeta_omap";
    }
    if (flags & FLAG_PERPOOL_OMAP) {
      s += "+per_pool_omap";
    }
    if (flags & FLAG_PERPG_OMAP) {
      s += "+per_pg_omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }

  void set_flag(unsigned f) {
    flags |= f;
  }

  void clear_flag(unsigned f) {
    flags &= ~f;
  }

  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }

  static bool is_pgmeta_omap(uint8_t flags) {
    return flags & FLAG_PGMETA_OMAP;
  }
  static bool is_perpool_omap(uint8_t flags) {
    return flags & FLAG_PERPOOL_OMAP;
  }
  static bool is_perpg_omap(uint8_t flags) {
    return flags & FLAG_PERPG_OMAP;
  }
  bool is_pgmeta_omap() const {
    return has_flag(FLAG_PGMETA_OMAP);
  }
  bool is_perpool_omap() const {
    return has_flag(FLAG_PERPOOL_OMAP);
  }
  bool is_perpg_omap() const {
    return has_flag(FLAG_PERPG_OMAP);
  }

  void set_omap_flags(bool legacy) {
    set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
  }
  void set_omap_flags_pgmeta() {
    set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP |
               FLAG_PGMETA_OMAP |
               FLAG_PERPOOL_OMAP |
               FLAG_PERPG_OMAP);
  }

  DENC(bluestore_onode_t, v, p) {
    DENC_START(2, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    if (struct_v >= 2) {
      denc(v.zone_offset_refs, p);
    }
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
WRITE_CLASS_DENC(bluestore_onode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);

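// Illustrative sketch (not part of the original header): how the omap flag
// helpers above combine. A newly created (non-legacy) onode gets both the
// per-pool and per-pg omap key layouts:
//
//   bluestore_onode_t o;
//   o.set_omap_flags(false);       // non-legacy
//   // o.has_omap() == true
//   // o.is_perpool_omap() == true, o.is_perpg_omap() == true
//   // o.get_flags_string() == "omap+per_pool_omap+per_pg_omap"
//
//   o.clear_omap_flag();           // drops all omap-related flags at once
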
/// write-ahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0;

  PExtentVector extents;
  ceph::buffer::list data;

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_op_t)


/// write-ahead-logged transaction
struct bluestore_deferred_transaction_t {
  uint64_t seq = 0;
  std::list<bluestore_deferred_op_t> ops;
  interval_set<uint64_t> released;  ///< allocations to release after tx

  bluestore_deferred_transaction_t() : seq(0) {}

  DENC(bluestore_deferred_transaction_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seq, p);
    denc(v.ops, p);
    denc(v.released, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)

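// Illustrative sketch (not part of the original header): assembling a
// deferred (write-ahead-logged) transaction. The offsets below are made up;
// in BlueStore proper this is built by the deferred-write path and persisted
// via the generated encode/decode helpers.
//
//   bluestore_deferred_transaction_t txn;
//   txn.seq = 42;
//
//   bluestore_deferred_op_t op;
//   op.op = bluestore_deferred_op_t::OP_WRITE;
//   op.extents.emplace_back(0x200000, 0x1000);   // where the data lands
//   op.data.append(std::string(0x1000, 'x'));    // payload to replay
//   txn.ops.push_back(op);
//
//   txn.released.insert(0x300000, 0x1000);       // space to free after commit
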
struct bluestore_compression_header_t {
  uint8_t type = Compressor::COMP_ALG_NONE;
  uint32_t length = 0;
  boost::optional<int32_t> compressor_message;

  bluestore_compression_header_t() {}
  bluestore_compression_header_t(uint8_t _type)
    : type(_type) {}

  DENC(bluestore_compression_header_t, v, p) {
    DENC_START(2, 1, p);
    denc(v.type, p);
    denc(v.length, p);
    if (struct_v >= 2) {
      denc(v.compressor_message, p);
    }
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
};
WRITE_CLASS_DENC(bluestore_compression_header_t)

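// Illustrative sketch (not part of the original header): this small header is
// stored in front of a compressed blob payload, identifying the algorithm
// used; the actual producer/consumer lives in the BlueStore compression path,
// and the assembly below is a made-up approximation of it.
//
//   ceph::buffer::list compressed_payload;        // hypothetical compressor output
//   bluestore_compression_header_t chdr(Compressor::COMP_ALG_ZLIB);
//   chdr.length = compressed_payload.length();
//   chdr.compressor_message = 0;                  // optional, algorithm-specific
//
//   ceph::buffer::list out;
//   encode(chdr, out);
//   out.claim_append(compressed_payload);         // header precedes the data
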
template <template <typename> typename V, class COUNTER_TYPE = int32_t>
class ref_counter_2hash_tracker_t {
  size_t num_non_zero = 0;
  size_t num_buckets = 0;
  V<COUNTER_TYPE> buckets1;
  V<COUNTER_TYPE> buckets2;

public:
  ref_counter_2hash_tracker_t(uint64_t mem_cap) {
    num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
    ceph_assert(num_buckets);
    buckets1.resize(num_buckets);
    buckets2.resize(num_buckets);
    reset();
  }

  size_t get_num_buckets() const {
    return num_buckets;
  }

  void inc(const char* hash_val, size_t hash_val_len, int n) {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
      num_buckets;
    if (buckets1[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets1[h] == -n) {
      --num_non_zero;
    }
    buckets1[h] += n;
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
    if (buckets2[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets2[h] == -n) {
      --num_non_zero;
    }
    buckets2[h] += n;
  }

  bool test_hash_conflict(
    const char* hash_val1,
    const char* hash_val2,
    size_t hash_val_len) const {

    auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
    auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
    auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
    auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
    return ((h1 % num_buckets) == (h2 % num_buckets)) &&
           ((h3 % num_buckets) == (h4 % num_buckets));
  }

  bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
    if (buckets1[h % num_buckets] != 0) {
      return false;
    }
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
    return buckets2[h % num_buckets] == 0;
  }

  // returns number of mismatching buckets
  size_t count_non_zero() const {
    return num_non_zero;
  }
  void reset() {
    for (size_t i = 0; i < num_buckets; i++) {
      buckets1[i] = 0;
      buckets2[i] = 0;
    }
    num_non_zero = 0;
  }
};

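// Illustrative sketch (not part of the original header): the tracker above is
// a memory-capped, probabilistic ref-counter used during fsck. Each key is
// hashed into two independent bucket arrays; matching +n/-n updates cancel
// out, so count_non_zero() == 0 strongly suggests every reference was
// balanced, while a non-zero count indicates some key's references do not add
// up (hash collisions can hide a mismatch, not invent one). Values below are
// made up.
//
//   ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> t(1024 * 1024);
//   uint64_t key = 0xdeadbeef;
//   t.inc(reinterpret_cast<const char*>(&key), sizeof(key), 1);   // expected ref
//   t.inc(reinterpret_cast<const char*>(&key), sizeof(key), -1);  // actual ref
//   // t.count_non_zero() == 0  -> no detectable mismatch
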
class shared_blob_2hash_tracker_t
  : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {

  static const size_t hash_input_len = 3;

  typedef std::array<uint64_t, hash_input_len> hash_input_t;

  static size_t get_hash_input_size() {
    return hash_input_len * sizeof(hash_input_t::value_type);
  }

  inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;

  size_t au_void_bits = 0;


public:
  shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
    : ref_counter_2hash_tracker_t(mem_cap) {
    ceph_assert(alloc_unit);
    ceph_assert(isp2(alloc_unit));
    au_void_bits = ctz(alloc_unit);
  }
  void inc(uint64_t sbid, uint64_t offset, int n);
  void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);

  bool test_hash_conflict(
    uint64_t sbid,
    uint64_t offset,
    uint64_t sbid2,
    uint64_t offset2) const;
  bool test_all_zero(
    uint64_t sbid,
    uint64_t offset) const;
  bool test_all_zero_range(
    uint64_t sbid,
    uint64_t offset,
    uint32_t len) const;
};

class sb_info_t {
  // a negative value indicates a (potentially) stray blob,
  // i.e. a blob that has no real references from onodes
  int64_t sbid = 0;

public:
  enum {
    INVALID_POOL_ID = INT64_MIN
  };

  int64_t pool_id = INVALID_POOL_ID;
  // a negative value indicates compressed_allocated as well
  int32_t allocated_chunks = 0;

  sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
  {
  }
  bool operator< (const sb_info_t& other) const {
    return std::abs(sbid) < std::abs(other.sbid);
  }
  bool operator< (const uint64_t& other_sbid) const {
    return uint64_t(std::abs(sbid)) < other_sbid;
  }
  bool is_stray() const {
    return sbid < 0;
  }
  uint64_t get_sbid() const {
    return uint64_t(std::abs(sbid));
  }
  void adopt() {
    sbid = std::abs(sbid);
  }
} __attribute__((packed));

// Space-efficient container to keep a set of sb_info structures, given that
// the majority of entries are appended in proper id-sorted order. Hence one
// can keep them in a regular vector and apply binary search whenever a
// specific entry is to be found.
// For the rare occasions when an out-of-order append takes place, a small
// auxiliary sorted vector is used.
struct sb_info_space_efficient_map_t {
  // large array, kept sorted by the user
  mempool::bluestore_fsck::vector<sb_info_t> items;
  // small additional set of items whose sorting we maintain ourselves;
  // it never keeps an entry with an id > items.back()'s id
  mempool::bluestore_fsck::vector<sb_info_t> aux_items;

  sb_info_t& add_maybe_stray(uint64_t sbid) {
    return _add(-int64_t(sbid));
  }
  sb_info_t& add_or_adopt(uint64_t sbid) {
    auto& r = _add(sbid);
    r.adopt();
    return r;
  }
  auto find(uint64_t id) {
    if (items.size() != 0) {
      auto it = std::lower_bound(
        items.begin(),
        items.end() - 1,
        id,
        [](const sb_info_t& a, const uint64_t& b) {
          return a < b;
        });
      if (it->get_sbid() == id) {
        return it;
      }
      if (aux_items.size() != 0) {
        auto it = std::lower_bound(
          aux_items.begin(),
          aux_items.end(),
          id,
          [](const sb_info_t& a, const uint64_t& b) {
            return a < b;
          });
        if (it->get_sbid() == id) {
          return it;
        }
      }
    }
    return items.end();
  }
  // enumerates strays, order isn't guaranteed.
  void foreach_stray(std::function<void(const sb_info_t&)> cb) {
    for (auto& sbi : items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
    for (auto& sbi : aux_items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
  }
  auto end() {
    return items.end();
  }

  void shrink() {
    items.shrink_to_fit();
    aux_items.shrink_to_fit();
  }
  void clear() {
    items.clear();
    aux_items.clear();
    shrink();
  }
private:
  sb_info_t& _add(int64_t id) {
    uint64_t n_id = uint64_t(std::abs(id));
    if (items.size() == 0 || n_id > items.back().get_sbid()) {
      return items.emplace_back(id);
    }
    auto it = find(n_id);
    if (it != items.end()) {
      return *it;
    }
    if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
      return aux_items.emplace_back(id);
    }
    // do sorted insertion, may be expensive!
    it = std::upper_bound(
      aux_items.begin(),
      aux_items.end(),
      n_id,
      [](const uint64_t& a, const sb_info_t& b) {
        return a < b.get_sbid();
      });
    return *aux_items.emplace(it, id);
  }
};

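// Illustrative sketch (not part of the original header): typical fsck-style
// usage of the container above. The ids below are made up; they are mostly
// added in ascending order, which keeps everything in the fast `items` vector.
//
//   sb_info_space_efficient_map_t sbm;
//   sbm.add_maybe_stray(10);            // seen on disk, no onode ref yet
//   sbm.add_or_adopt(12).pool_id = 3;   // referenced by an onode
//   sbm.add_or_adopt(10);               // a later reference "adopts" id 10
//
//   if (auto it = sbm.find(12); it != sbm.end()) {
//     // it->get_sbid() == 12
//   }
//   sbm.foreach_stray([](const sb_info_t& sbi) {
//     // report shared blobs that never got a reference
//   });
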
#endif