1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
16 #define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
17
18 #include <ostream>
19 #include <type_traits>
20 #include <vector>
21 #include <array>
22 #include "include/mempool.h"
23 #include "include/types.h"
24 #include "include/interval_set.h"
25 #include "include/utime.h"
26 #include "common/hobject.h"
27 #include "compressor/Compressor.h"
28 #include "common/Checksummer.h"
29 #include "include/mempool.h"
30 #include "include/ceph_hash.h"
31
32 namespace ceph {
33 class Formatter;
34 }
35
36 /// label for block device
37 struct bluestore_bdev_label_t {
38 uuid_d osd_uuid; ///< osd uuid
39 uint64_t size = 0; ///< device size
40 utime_t btime; ///< birth time
41 std::string description; ///< device description
42
43 std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore
44
45 void encode(ceph::buffer::list& bl) const;
46 void decode(ceph::buffer::list::const_iterator& p);
47 void dump(ceph::Formatter *f) const;
48 static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
49 };
50 WRITE_CLASS_ENCODER(bluestore_bdev_label_t)
51
52 std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
53
54 /// collection metadata
55 struct bluestore_cnode_t {
56 uint32_t bits; ///< how many bits of coll pgid are significant
57
58 explicit bluestore_cnode_t(int b=0) : bits(b) {}
59
60 DENC(bluestore_cnode_t, v, p) {
61 DENC_START(1, 1, p);
62 denc(v.bits, p);
63 DENC_FINISH(p);
64 }
65 void dump(ceph::Formatter *f) const;
66 static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
67 };
68 WRITE_CLASS_DENC(bluestore_cnode_t)
69
70 std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
71
72 template <typename OFFS_TYPE, typename LEN_TYPE>
73 struct bluestore_interval_t
74 {
75 static const uint64_t INVALID_OFFSET = ~0ull;
76
77 OFFS_TYPE offset = 0;
78 LEN_TYPE length = 0;
79
80 bluestore_interval_t(){}
81 bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
82
83 bool is_valid() const {
84 return offset != INVALID_OFFSET;
85 }
86 uint64_t end() const {
87 return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
88 }
89
90 bool operator==(const bluestore_interval_t& other) const {
91 return offset == other.offset && length == other.length;
92 }
93
94 };
95
96 /// pextent: physical extent
97 struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
98 {
99 bluestore_pextent_t() {}
100 bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
101 bluestore_pextent_t(const bluestore_interval_t &ext) :
102 bluestore_interval_t(ext.offset, ext.length) {}
103
104 DENC(bluestore_pextent_t, v, p) {
105 denc_lba(v.offset, p);
106 denc_varint_lowz(v.length, p);
107 }
108
109 void dump(ceph::Formatter *f) const;
110 static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
111 };
112 WRITE_CLASS_DENC(bluestore_pextent_t)
113
114 std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);
115
116 typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
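// Illustrative sketch (added for exposition, not part of the original header):
// a PExtentVector may mix allocated extents with "holes" marked by
// INVALID_OFFSET; is_valid() distinguishes the two. Offsets/lengths below are
// arbitrary.
//
//   PExtentVector ev;
//   ev.emplace_back(0x100000, 0x8000);                             // allocated
//   ev.emplace_back(bluestore_pextent_t::INVALID_OFFSET, 0x8000);  // hole
//   // ev[0].is_valid() == true, ev[1].is_valid() == false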
117
118 template<>
119 struct denc_traits<PExtentVector> {
120 static constexpr bool supported = true;
121 static constexpr bool bounded = false;
122 static constexpr bool featured = false;
123 static constexpr bool need_contiguous = true;
124 static void bound_encode(const PExtentVector& v, size_t& p) {
125 p += sizeof(uint32_t);
126 const auto size = v.size();
127 if (size) {
128 size_t per = 0;
129 denc(v.front(), per);
130 p += per * size;
131 }
132 }
133 static void encode(const PExtentVector& v,
134 ceph::buffer::list::contiguous_appender& p) {
135 denc_varint(v.size(), p);
136 for (auto& i : v) {
137 denc(i, p);
138 }
139 }
140 static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
141 unsigned num;
142 denc_varint(num, p);
143 v.clear();
144 v.resize(num);
145 for (unsigned i=0; i<num; ++i) {
146 denc(v[i], p);
147 }
148 }
149 };
150
151 /// extent_map: a map of reference-counted extents
152 struct bluestore_extent_ref_map_t {
153 struct record_t {
154 uint32_t length;
155 uint32_t refs;
156 record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
157 DENC(bluestore_extent_ref_map_t::record_t, v, p) {
158 denc_varint_lowz(v.length, p);
159 denc_varint(v.refs, p);
160 }
161 };
162
163 typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
164 map_t ref_map;
165
166 void _check() const;
167 void _maybe_merge_left(map_t::iterator& p);
168
169 void clear() {
170 ref_map.clear();
171 }
172 bool empty() const {
173 return ref_map.empty();
174 }
175
176 void get(uint64_t offset, uint32_t len);
177 void put(uint64_t offset, uint32_t len, PExtentVector *release,
178 bool *maybe_unshared);
179
180 bool contains(uint64_t offset, uint32_t len) const;
181 bool intersects(uint64_t offset, uint32_t len) const;
182
183 void bound_encode(size_t& p) const {
184 denc_varint((uint32_t)0, p);
185 if (!ref_map.empty()) {
186 size_t elem_size = 0;
187 denc_varint_lowz((uint64_t)0, elem_size);
188 ref_map.begin()->second.bound_encode(elem_size);
189 p += elem_size * ref_map.size();
190 }
191 }
192 void encode(ceph::buffer::list::contiguous_appender& p) const {
193 const uint32_t n = ref_map.size();
194 denc_varint(n, p);
195 if (n) {
196 auto i = ref_map.begin();
197 denc_varint_lowz(i->first, p);
198 i->second.encode(p);
199 int64_t pos = i->first;
200 while (++i != ref_map.end()) {
201 denc_varint_lowz((int64_t)i->first - pos, p);
202 i->second.encode(p);
203 pos = i->first;
204 }
205 }
206 }
207 void decode(ceph::buffer::ptr::const_iterator& p) {
208 uint32_t n;
209 denc_varint(n, p);
210 if (n) {
211 int64_t pos;
212 denc_varint_lowz(pos, p);
213 ref_map[pos].decode(p);
214 while (--n) {
215 int64_t delta;
216 denc_varint_lowz(delta, p);
217 pos += delta;
218 ref_map[pos].decode(p);
219 }
220 }
221 }
222
223 void dump(ceph::Formatter *f) const;
224 static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
225 };
226 WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
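// Illustrative sketch (added for exposition, not part of the original header):
// get()/put() maintain per-range reference counts; put() reports the
// sub-ranges whose count dropped to zero so the caller can release the backing
// space. The ranges below are arbitrary and the expected result assumes the
// usual refcounting semantics implemented in bluestore_types.cc.
//
//   bluestore_extent_ref_map_t m;
//   m.get(0x0000, 0x8000);                 // whole range at refcount 1
//   m.get(0x4000, 0x4000);                 // second half now at refcount 2
//   PExtentVector release;
//   bool maybe_unshared = false;
//   m.put(0x0000, 0x8000, &release, &maybe_unshared);
//   // expectation: release describes [0x0000, 0x4000), which dropped to 0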
227
228
229 std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
230 static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
231 const bluestore_extent_ref_map_t::record_t& r) {
232 return l.length == r.length && l.refs == r.refs;
233 }
234 static inline bool operator==(const bluestore_extent_ref_map_t& l,
235 const bluestore_extent_ref_map_t& r) {
236 return l.ref_map == r.ref_map;
237 }
238 static inline bool operator!=(const bluestore_extent_ref_map_t& l,
239 const bluestore_extent_ref_map_t& r) {
240 return !(l == r);
241 }
242
243 /// blob_use_tracker: a set of per-allocation-unit reference buckets to track blob usage
244 struct bluestore_blob_use_tracker_t {
245 // N.B.: there is no need to shrink au_size/num_au
246 // as much as possible (e.g. down to a single byte for au_size) because:
247 // 1) the struct isn't packed, hence it is padded anyway (and even if it were packed, see 2)
248 // 2) the memory manager has its own allocation granularity, most probably >= 8 bytes
249 //
250 uint32_t au_size; // Allocation (= tracking) unit size,
251 // == 0 if uninitialized
252 uint32_t num_au; // Number of allocation units tracked
253 // == 0 if a single unit or the whole blob is tracked
254
255 union {
256 uint32_t* bytes_per_au;
257 uint32_t total_bytes;
258 };
259
260 bluestore_blob_use_tracker_t()
261 : au_size(0), num_au(0), bytes_per_au(nullptr) {
262 }
263 bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
264 bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
265 ~bluestore_blob_use_tracker_t() {
266 clear();
267 }
268
269 void clear() {
270 if (num_au != 0) {
271 delete[] bytes_per_au;
272 mempool::get_pool(
273 mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
274 adjust_count(-1, -sizeof(uint32_t) * num_au);
275 }
276 bytes_per_au = 0;
277 au_size = 0;
278 num_au = 0;
279 }
280
281 uint32_t get_referenced_bytes() const {
282 uint32_t total = 0;
283 if (!num_au) {
284 total = total_bytes;
285 } else {
286 for (size_t i = 0; i < num_au; ++i) {
287 total += bytes_per_au[i];
288 }
289 }
290 return total;
291 }
292 bool is_not_empty() const {
293 if (!num_au) {
294 return total_bytes != 0;
295 } else {
296 for (size_t i = 0; i < num_au; ++i) {
297 if (bytes_per_au[i]) {
298 return true;
299 }
300 }
301 }
302 return false;
303 }
304 bool is_empty() const {
305 return !is_not_empty();
306 }
307 void prune_tail(uint32_t new_len) {
308 if (num_au) {
309 new_len = round_up_to(new_len, au_size);
310 uint32_t _num_au = new_len / au_size;
311 ceph_assert(_num_au <= num_au);
312 if (_num_au) {
313 num_au = _num_au; // bytes_per_au array is left unmodified
314
315 } else {
316 clear();
317 }
318 }
319 }
320 void add_tail(uint32_t new_len, uint32_t _au_size) {
321 auto full_size = au_size * (num_au ? num_au : 1);
322 ceph_assert(new_len >= full_size);
323 if (new_len == full_size) {
324 return;
325 }
326 if (!num_au) {
327 uint32_t old_total = total_bytes;
328 total_bytes = 0;
329 init(new_len, _au_size);
330 ceph_assert(num_au);
331 bytes_per_au[0] = old_total;
332 } else {
333 ceph_assert(_au_size == au_size);
334 new_len = round_up_to(new_len, au_size);
335 uint32_t _num_au = new_len / au_size;
336 ceph_assert(_num_au >= num_au);
337 if (_num_au > num_au) {
338 auto old_bytes = bytes_per_au;
339 auto old_num_au = num_au;
340 num_au = _num_au;
341 allocate();
342 for (size_t i = 0; i < old_num_au; i++) {
343 bytes_per_au[i] = old_bytes[i];
344 }
345 for (size_t i = old_num_au; i < num_au; i++) {
346 bytes_per_au[i] = 0;
347 }
348 delete[] old_bytes;
349 }
350 }
351 }
352
353 void init(
354 uint32_t full_length,
355 uint32_t _au_size);
356
357 void get(
358 uint32_t offset,
359 uint32_t len);
360
361 /// put: returns true if the blob has no references left after the call;
362 /// in that case *release is not filled, for the sake of performance.
363 /// Returns false if some references to the blob remain;
364 /// in this case *release contains pextents
365 /// (identified by their offsets relative to the blob start)
366 /// that are no longer used and can be safely deallocated.
367 bool put(
368 uint32_t offset,
369 uint32_t len,
370 PExtentVector *release);
371
372 bool can_split() const;
373 bool can_split_at(uint32_t blob_offset) const;
374 void split(
375 uint32_t blob_offset,
376 bluestore_blob_use_tracker_t* r);
377
378 bool equal(
379 const bluestore_blob_use_tracker_t& other) const;
380
381 void bound_encode(size_t& p) const {
382 denc_varint(au_size, p);
383 if (au_size) {
384 denc_varint(num_au, p);
385 if (!num_au) {
386 denc_varint(total_bytes, p);
387 } else {
388 size_t elem_size = 0;
389 denc_varint((uint32_t)0, elem_size);
390 p += elem_size * num_au;
391 }
392 }
393 }
394 void encode(ceph::buffer::list::contiguous_appender& p) const {
395 denc_varint(au_size, p);
396 if (au_size) {
397 denc_varint(num_au, p);
398 if (!num_au) {
399 denc_varint(total_bytes, p);
400 } else {
401 size_t elem_size = 0;
402 denc_varint((uint32_t)0, elem_size);
403 for (size_t i = 0; i < num_au; ++i) {
404 denc_varint(bytes_per_au[i], p);
405 }
406 }
407 }
408 }
409 void decode(ceph::buffer::ptr::const_iterator& p) {
410 clear();
411 denc_varint(au_size, p);
412 if (au_size) {
413 denc_varint(num_au, p);
414 if (!num_au) {
415 denc_varint(total_bytes, p);
416 } else {
417 allocate();
418 for (size_t i = 0; i < num_au; ++i) {
419 denc_varint(bytes_per_au[i], p);
420 }
421 }
422 }
423 }
424
425 void dump(ceph::Formatter *f) const;
426 static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
427 private:
428 void allocate();
429 };
430 WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
431 std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
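// Illustrative sketch (added for exposition, not part of the original header):
// au_size/num_au semantics. With num_au == 0 the union member total_bytes
// tracks the whole blob; after init() with more than one allocation unit,
// bytes_per_au[] is used instead. Sizes below are arbitrary (4 KiB AU,
// 64 KiB blob).
//
//   bluestore_blob_use_tracker_t t;
//   t.init(0x10000, 0x1000);      // 16 allocation units of 4 KiB each
//   t.get(0, 0x2000);             // reference the first two units
//   PExtentVector release;
//   bool no_refs_left = t.put(0, 0x2000, &release);
//   // no_refs_left == true only if nothing else in the blob is referenced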
432
433 /// blob: a piece of data on disk
434 struct bluestore_blob_t {
435 private:
436 PExtentVector extents; ///< raw data position on device
437 uint32_t logical_length = 0; ///< original length of data stored in the blob
438 uint32_t compressed_length = 0; ///< compressed length if any
439
440 public:
441 enum {
442 LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split
443 FLAG_COMPRESSED = 2, ///< blob is compressed
444 FLAG_CSUM = 4, ///< blob has checksums
445 FLAG_HAS_UNUSED = 8, ///< blob has an 'unused' bitmap
446 FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob
447 };
448 static std::string get_flags_string(unsigned flags);
449
450 uint32_t flags = 0; ///< FLAG_*
451
452 typedef uint16_t unused_t;
453 unused_t unused = 0; ///< portion that has never been written to (bitmap)
454
455 uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
456 uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes
457
458 ceph::buffer::ptr csum_data; ///< opaque vector of csum data
459
460 bluestore_blob_t(uint32_t f = 0) : flags(f) {}
461
462 const PExtentVector& get_extents() const {
463 return extents;
464 }
465 PExtentVector& dirty_extents() {
466 return extents;
467 }
468
469 DENC_HELPERS;
470 void bound_encode(size_t& p, uint64_t struct_v) const {
471 ceph_assert(struct_v == 1 || struct_v == 2);
472 denc(extents, p);
473 denc_varint(flags, p);
474 denc_varint_lowz(logical_length, p);
475 denc_varint_lowz(compressed_length, p);
476 denc(csum_type, p);
477 denc(csum_chunk_order, p);
478 denc_varint(csum_data.length(), p);
479 p += csum_data.length();
480 p += sizeof(unused_t);
481 }
482
483 void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
484 ceph_assert(struct_v == 1 || struct_v == 2);
485 denc(extents, p);
486 denc_varint(flags, p);
487 if (is_compressed()) {
488 denc_varint_lowz(logical_length, p);
489 denc_varint_lowz(compressed_length, p);
490 }
491 if (has_csum()) {
492 denc(csum_type, p);
493 denc(csum_chunk_order, p);
494 denc_varint(csum_data.length(), p);
495 memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
496 csum_data.length());
497 }
498 if (has_unused()) {
499 denc(unused, p);
500 }
501 }
502
503 void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
504 ceph_assert(struct_v == 1 || struct_v == 2);
505 denc(extents, p);
506 denc_varint(flags, p);
507 if (is_compressed()) {
508 denc_varint_lowz(logical_length, p);
509 denc_varint_lowz(compressed_length, p);
510 } else {
511 logical_length = get_ondisk_length();
512 }
513 if (has_csum()) {
514 denc(csum_type, p);
515 denc(csum_chunk_order, p);
516 int len;
517 denc_varint(len, p);
518 csum_data = p.get_ptr(len);
519 csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
520 }
521 if (has_unused()) {
522 denc(unused, p);
523 }
524 }
525
526 bool can_split() const {
527 return
528 !has_flag(FLAG_SHARED) &&
529 !has_flag(FLAG_COMPRESSED) &&
530 !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
531 }
532 bool can_split_at(uint32_t blob_offset) const {
533 return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
534 }
535
536 void dump(ceph::Formatter *f) const;
537 static void generate_test_instances(std::list<bluestore_blob_t*>& ls);
538
539 bool has_flag(unsigned f) const {
540 return flags & f;
541 }
542 void set_flag(unsigned f) {
543 flags |= f;
544 }
545 void clear_flag(unsigned f) {
546 flags &= ~f;
547 }
548 std::string get_flags_string() const {
549 return get_flags_string(flags);
550 }
551
552 void set_compressed(uint64_t clen_orig, uint64_t clen) {
553 set_flag(FLAG_COMPRESSED);
554 logical_length = clen_orig;
555 compressed_length = clen;
556 }
557 bool is_mutable() const {
558 return !is_compressed() && !is_shared();
559 }
560 bool is_compressed() const {
561 return has_flag(FLAG_COMPRESSED);
562 }
563 bool has_csum() const {
564 return has_flag(FLAG_CSUM);
565 }
566 bool has_unused() const {
567 return has_flag(FLAG_HAS_UNUSED);
568 }
569 bool is_shared() const {
570 return has_flag(FLAG_SHARED);
571 }
572
573 /// return chunk (i.e. min readable block) size for the blob
574 uint64_t get_chunk_size(uint64_t dev_block_size) const {
575 return has_csum() ?
576 std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
577 }
578 uint32_t get_csum_chunk_size() const {
579 return 1 << csum_chunk_order;
580 }
581 uint32_t get_compressed_payload_length() const {
582 return is_compressed() ? compressed_length : 0;
583 }
584 uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
585 auto p = extents.begin();
586 ceph_assert(p != extents.end());
587 while (x_off >= p->length) {
588 x_off -= p->length;
589 ++p;
590 ceph_assert(p != extents.end());
591 }
592 if (plen)
593 *plen = p->length - x_off;
594 return p->offset + x_off;
595 }
596
597 // validate whether the status of pextents within the given range
598 // meets the requirement (allocated or unallocated).
599 bool _validate_range(uint64_t b_off, uint64_t b_len,
600 bool require_allocated) const {
601 auto p = extents.begin();
602 ceph_assert(p != extents.end());
603 while (b_off >= p->length) {
604 b_off -= p->length;
605 if (++p == extents.end())
606 return false;
607 }
608 b_len += b_off;
609 while (b_len) {
610 if (require_allocated != p->is_valid()) {
611 return false;
612 }
613 if (p->length >= b_len) {
614 return true;
615 }
616 b_len -= p->length;
617 if (++p == extents.end())
618 return false;
619 }
620 ceph_abort_msg("we should not get here");
621 return false;
622 }
623
624 /// return true if the entire range is allocated
625 /// (mapped to extents on disk)
626 bool is_allocated(uint64_t b_off, uint64_t b_len) const {
627 return _validate_range(b_off, b_len, true);
628 }
629
630 /// return true if the entire range is unallocated
631 /// (not mapped to extents on disk)
632 bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
633 return _validate_range(b_off, b_len, false);
634 }
635
636 /// return true if the logical range has never been used
637 bool is_unused(uint64_t offset, uint64_t length) const {
638 if (!has_unused()) {
639 return false;
640 }
641 ceph_assert(!is_compressed());
642 uint64_t blob_len = get_logical_length();
643 ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
644 ceph_assert(offset + length <= blob_len);
645 uint64_t chunk_size = blob_len / (sizeof(unused)*8);
646 uint64_t start = offset / chunk_size;
647 uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
648 auto i = start;
649 while (i < end && (unused & (1u << i))) {
650 i++;
651 }
652 return i >= end;
653 }
654
655 /// mark a range that has never been used
656 void add_unused(uint64_t offset, uint64_t length) {
657 ceph_assert(!is_compressed());
658 uint64_t blob_len = get_logical_length();
659 ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
660 ceph_assert(offset + length <= blob_len);
661 uint64_t chunk_size = blob_len / (sizeof(unused)*8);
662 uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
663 uint64_t end = (offset + length) / chunk_size;
664 for (auto i = start; i < end; ++i) {
665 unused |= (1u << i);
666 }
667 if (start != end) {
668 set_flag(FLAG_HAS_UNUSED);
669 }
670 }
671
672 /// indicate that a range has (now) been used.
673 void mark_used(uint64_t offset, uint64_t length) {
674 if (has_unused()) {
675 ceph_assert(!is_compressed());
676 uint64_t blob_len = get_logical_length();
677 ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
678 ceph_assert(offset + length <= blob_len);
679 uint64_t chunk_size = blob_len / (sizeof(unused)*8);
680 uint64_t start = offset / chunk_size;
681 uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
682 for (auto i = start; i < end; ++i) {
683 unused &= ~(1u << i);
684 }
685 if (unused == 0) {
686 clear_flag(FLAG_HAS_UNUSED);
687 }
688 }
689 }
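// Illustrative sketch (added for exposition, not part of the original header):
// the 'unused' bitmap splits the logical blob length into sizeof(unused_t)*8
// == 16 equal chunks. Assuming a fresh mutable blob grown to 64 KiB, so each
// bit covers 4 KiB:
//
//   bluestore_blob_t b;
//   b.add_tail(0x10000);              // logical_length = 64 KiB, unallocated
//   b.add_unused(0x2000, 0x4000);     // sets bits 2..5 (rounds inward)
//   b.mark_used(0x3000, 0x1000);      // clears bit 3 (rounds outward)
//   bool u = b.is_unused(0x2000, 0x1000);   // true: bit 2 is still set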
690
691 // map_f_invoke templates intended to mask parameters which are not expected
692 // by the provided callback
693 template<class F, typename std::enable_if<std::is_invocable_r_v<
694 int,
695 F,
696 uint64_t,
697 uint64_t>>::type* = nullptr>
698 int map_f_invoke(uint64_t lo,
699 const bluestore_pextent_t& p,
700 uint64_t o,
701 uint64_t l, F&& f) const{
702 return f(o, l);
703 }
704
705 template<class F, typename std::enable_if<std::is_invocable_r_v<
706 int,
707 F,
708 uint64_t,
709 uint64_t,
710 uint64_t>>::type * = nullptr>
711 int map_f_invoke(uint64_t lo,
712 const bluestore_pextent_t& p,
713 uint64_t o,
714 uint64_t l, F&& f) const {
715 return f(lo, o, l);
716 }
717
718 template<class F, typename std::enable_if<std::is_invocable_r_v<
719 int,
720 F,
721 const bluestore_pextent_t&,
722 uint64_t,
723 uint64_t>>::type * = nullptr>
724 int map_f_invoke(uint64_t lo,
725 const bluestore_pextent_t& p,
726 uint64_t o,
727 uint64_t l, F&& f) const {
728 return f(p, o, l);
729 }
730
731 template<class F>
732 int map(uint64_t x_off, uint64_t x_len, F&& f) const {
733 auto x_off0 = x_off;
734 auto p = extents.begin();
735 ceph_assert(p != extents.end());
736 while (x_off >= p->length) {
737 x_off -= p->length;
738 ++p;
739 ceph_assert(p != extents.end());
740 }
741 while (x_len > 0 && p != extents.end()) {
742 uint64_t l = std::min(p->length - x_off, x_len);
743 int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
744 if (r < 0)
745 return r;
746 x_off = 0;
747 x_len -= l;
748 x_off0 += l;
749 ++p;
750 }
751 return 0;
752 }
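// Illustrative sketch (added for exposition, not part of the original header):
// map() walks the physical extents overlapping a logical range and calls the
// callback with whichever of the supported signatures it matches (see the
// map_f_invoke overloads above). 'blob' is a hypothetical bluestore_blob_t
// whose extents cover the requested range.
//
//   uint64_t total = 0;
//   blob.map(0x0, 0x4000,
//            [&](uint64_t p_off, uint64_t p_len) {
//              total += p_len;          // physical offset/length pair
//              return 0;                // non-negative: keep iterating
//            });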
753
754 template<class F>
755 void map_bl(uint64_t x_off,
756 ceph::buffer::list& bl,
757 F&& f) const {
758 static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);
759
760 auto p = extents.begin();
761 ceph_assert(p != extents.end());
762 while (x_off >= p->length) {
763 x_off -= p->length;
764 ++p;
765 ceph_assert(p != extents.end());
766 }
767 ceph::buffer::list::iterator it = bl.begin();
768 uint64_t x_len = bl.length();
769 while (x_len > 0) {
770 ceph_assert(p != extents.end());
771 uint64_t l = std::min(p->length - x_off, x_len);
772 ceph::buffer::list t;
773 it.copy(l, t);
774 f(p->offset + x_off, t);
775 x_off = 0;
776 x_len -= l;
777 ++p;
778 }
779 }
780
781 uint32_t get_ondisk_length() const {
782 uint32_t len = 0;
783 for (auto &p : extents) {
784 len += p.length;
785 }
786 return len;
787 }
788
789 uint32_t get_logical_length() const {
790 return logical_length;
791 }
792 size_t get_csum_value_size() const;
793
794 size_t get_csum_count() const {
795 size_t vs = get_csum_value_size();
796 if (!vs)
797 return 0;
798 return csum_data.length() / vs;
799 }
800 uint64_t get_csum_item(unsigned i) const {
801 size_t cs = get_csum_value_size();
802 const char *p = csum_data.c_str();
803 switch (cs) {
804 case 0:
805 ceph_abort_msg("no csum data, bad index");
806 case 1:
807 return reinterpret_cast<const uint8_t*>(p)[i];
808 case 2:
809 return reinterpret_cast<const ceph_le16*>(p)[i];
810 case 4:
811 return reinterpret_cast<const ceph_le32*>(p)[i];
812 case 8:
813 return reinterpret_cast<const ceph_le64*>(p)[i];
814 default:
815 ceph_abort_msg("unrecognized csum word size");
816 }
817 }
818 const char *get_csum_item_ptr(unsigned i) const {
819 size_t cs = get_csum_value_size();
820 return csum_data.c_str() + (cs * i);
821 }
822 char *get_csum_item_ptr(unsigned i) {
823 size_t cs = get_csum_value_size();
824 return csum_data.c_str() + (cs * i);
825 }
826
827 void init_csum(unsigned type, unsigned order, unsigned len) {
828 flags |= FLAG_CSUM;
829 csum_type = type;
830 csum_chunk_order = order;
831 csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
832 csum_data.zero();
833 csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
834 }
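// Illustrative sketch (added for exposition, not part of the original header):
// checksum bookkeeping for a hypothetical 64 KiB blob using crc32c (4-byte
// values) with 4 KiB checksum chunks.
//
//   bluestore_blob_t b;
//   b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x10000);
//   // get_csum_chunk_size() == 1 << 12 == 4096
//   // csum_data holds 4 * 0x10000 / 4096 == 64 bytes,
//   // i.e. get_csum_count() == 16 values of 4 bytes each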
835
836 /// calculate csum for the buffer at the given b_off
837 void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);
838
839 /// verify csum: return -EOPNOTSUPP for an unsupported checksum type;
840 /// return -1 and a valid (non-negative) b_bad_off on checksum error;
841 /// return 0 if all is well.
842 int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
843 uint64_t *bad_csum) const;
844
845 bool can_prune_tail() const {
846 return
847 extents.size() > 1 && // if it's all invalid it's not pruning.
848 !extents.back().is_valid() &&
849 !has_unused();
850 }
851 void prune_tail() {
852 const auto &p = extents.back();
853 logical_length -= p.length;
854 extents.pop_back();
855 if (has_csum()) {
856 ceph::buffer::ptr t;
857 t.swap(csum_data);
858 csum_data = ceph::buffer::ptr(t.c_str(),
859 get_logical_length() / get_csum_chunk_size() *
860 get_csum_value_size());
861 }
862 }
863 void add_tail(uint32_t new_len) {
864 ceph_assert(is_mutable());
865 ceph_assert(!has_unused());
866 ceph_assert(new_len > logical_length);
867 extents.emplace_back(
868 bluestore_pextent_t(
869 bluestore_pextent_t::INVALID_OFFSET,
870 new_len - logical_length));
871 logical_length = new_len;
872 if (has_csum()) {
873 ceph::buffer::ptr t;
874 t.swap(csum_data);
875 csum_data = ceph::buffer::create(
876 get_csum_value_size() * logical_length / get_csum_chunk_size());
877 csum_data.copy_in(0, t.length(), t.c_str());
878 csum_data.zero(t.length(), csum_data.length() - t.length());
879 }
880 }
881 uint32_t get_release_size(uint32_t min_alloc_size) const {
882 if (is_compressed()) {
883 return get_logical_length();
884 }
885 uint32_t res = get_csum_chunk_size();
886 if (!has_csum() || res < min_alloc_size) {
887 res = min_alloc_size;
888 }
889 return res;
890 }
891
892 void split(uint32_t blob_offset, bluestore_blob_t& rb);
893 void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
894 void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
895
896 /// updates the blob's pextents container and returns unused pextents eligible
897 /// for release.
898 /// all - indicates that the whole blob is to be released.
899 /// logical - specifies the set of logical extents within the blob
900 /// to be released
901 /// Returns true if the blob has no more valid pextents
902 bool release_extents(
903 bool all,
904 const PExtentVector& logical,
905 PExtentVector* r);
906 };
907 WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
908
909 std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
910
911
912 /// shared blob state
913 struct bluestore_shared_blob_t {
914 MEMPOOL_CLASS_HELPERS();
915 uint64_t sbid; ///< shared blob id
916 bluestore_extent_ref_map_t ref_map; ///< shared blob extents
917
918 bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
919 bluestore_shared_blob_t(uint64_t _sbid,
920 bluestore_extent_ref_map_t&& _ref_map )
921 : sbid(_sbid), ref_map(std::move(_ref_map)) {}
922
923 DENC(bluestore_shared_blob_t, v, p) {
924 DENC_START(1, 1, p);
925 denc(v.ref_map, p);
926 DENC_FINISH(p);
927 }
928
929
930 void dump(ceph::Formatter *f) const;
931 static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);
932
933 bool empty() const {
934 return ref_map.empty();
935 }
936 };
937 WRITE_CLASS_DENC(bluestore_shared_blob_t)
938
939 std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
940
941 /// onode: per-object metadata
942 struct bluestore_onode_t {
943 uint64_t nid = 0; ///< numeric id (locally unique)
944 uint64_t size = 0; ///< object size
945 // mempool to be assigned to buffer::ptr manually
946 std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;
947
948 struct shard_info {
949 uint32_t offset = 0; ///< logical offset for start of shard
950 uint32_t bytes = 0; ///< encoded bytes
951 DENC(shard_info, v, p) {
952 denc_varint(v.offset, p);
953 denc_varint(v.bytes, p);
954 }
955 void dump(ceph::Formatter *f) const;
956 };
957 std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)
958
959 uint32_t expected_object_size = 0;
960 uint32_t expected_write_size = 0;
961 uint32_t alloc_hint_flags = 0;
962
963 uint8_t flags = 0;
964
965 std::map<uint32_t, uint64_t> zone_offset_refs; ///< (zone, offset) refs to this onode
966
967 enum {
968 FLAG_OMAP = 1, ///< object may have omap data
969 FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix
970 FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys
971 FLAG_PERPG_OMAP = 8, ///< omap data is in per-pg prefix; per-pg keys
972 };
973
974 std::string get_flags_string() const {
975 std::string s;
976 if (flags & FLAG_OMAP) {
977 s = "omap";
978 }
979 if (flags & FLAG_PGMETA_OMAP) {
980 s += "+pgmeta_omap";
981 }
982 if (flags & FLAG_PERPOOL_OMAP) {
983 s += "+per_pool_omap";
984 }
985 if (flags & FLAG_PERPG_OMAP) {
986 s += "+per_pg_omap";
987 }
988 return s;
989 }
990
991 bool has_flag(unsigned f) const {
992 return flags & f;
993 }
994
995 void set_flag(unsigned f) {
996 flags |= f;
997 }
998
999 void clear_flag(unsigned f) {
1000 flags &= ~f;
1001 }
1002
1003 bool has_omap() const {
1004 return has_flag(FLAG_OMAP);
1005 }
1006
1007 static bool is_pgmeta_omap(uint8_t flags) {
1008 return flags & FLAG_PGMETA_OMAP;
1009 }
1010 static bool is_perpool_omap(uint8_t flags) {
1011 return flags & FLAG_PERPOOL_OMAP;
1012 }
1013 static bool is_perpg_omap(uint8_t flags) {
1014 return flags & FLAG_PERPG_OMAP;
1015 }
1016 bool is_pgmeta_omap() const {
1017 return has_flag(FLAG_PGMETA_OMAP);
1018 }
1019 bool is_perpool_omap() const {
1020 return has_flag(FLAG_PERPOOL_OMAP);
1021 }
1022 bool is_perpg_omap() const {
1023 return has_flag(FLAG_PERPG_OMAP);
1024 }
1025
1026 void set_omap_flags(bool legacy) {
1027 set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
1028 }
1029 void set_omap_flags_pgmeta() {
1030 set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
1031 }
1032
1033 void clear_omap_flag() {
1034 clear_flag(FLAG_OMAP |
1035 FLAG_PGMETA_OMAP |
1036 FLAG_PERPOOL_OMAP |
1037 FLAG_PERPG_OMAP);
1038 }
1039
1040 DENC(bluestore_onode_t, v, p) {
1041 DENC_START(2, 1, p);
1042 denc_varint(v.nid, p);
1043 denc_varint(v.size, p);
1044 denc(v.attrs, p);
1045 denc(v.flags, p);
1046 denc(v.extent_map_shards, p);
1047 denc_varint(v.expected_object_size, p);
1048 denc_varint(v.expected_write_size, p);
1049 denc_varint(v.alloc_hint_flags, p);
1050 if (struct_v >= 2) {
1051 denc(v.zone_offset_refs, p);
1052 }
1053 DENC_FINISH(p);
1054 }
1055 void dump(ceph::Formatter *f) const;
1056 static void generate_test_instances(std::list<bluestore_onode_t*>& o);
1057 };
1058 WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
1059 WRITE_CLASS_DENC(bluestore_onode_t)
1060
1061 std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
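// Illustrative sketch (added for exposition, not part of the original header):
// how the omap flag helpers compose. With legacy == false both the per-pool
// and per-pg layouts are requested, and get_flags_string() reflects that.
//
//   bluestore_onode_t o;
//   o.set_omap_flags(false);
//   // o.has_omap() == true, o.is_perpool_omap() == true, o.is_perpg_omap() == true
//   // o.get_flags_string() == "omap+per_pool_omap+per_pg_omap"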
1062
1063 /// write-ahead-logged op
1064 struct bluestore_deferred_op_t {
1065 typedef enum {
1066 OP_WRITE = 1,
1067 } type_t;
1068 __u8 op = 0;
1069
1070 PExtentVector extents;
1071 ceph::buffer::list data;
1072
1073 DENC(bluestore_deferred_op_t, v, p) {
1074 DENC_START(1, 1, p);
1075 denc(v.op, p);
1076 denc(v.extents, p);
1077 denc(v.data, p);
1078 DENC_FINISH(p);
1079 }
1080 void dump(ceph::Formatter *f) const;
1081 static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
1082 };
1083 WRITE_CLASS_DENC(bluestore_deferred_op_t)
1084
1085
1086 /// write-ahead-logged transaction
1087 struct bluestore_deferred_transaction_t {
1088 uint64_t seq = 0;
1089 std::list<bluestore_deferred_op_t> ops;
1090 interval_set<uint64_t> released; ///< allocations to release after tx
1091
1092 bluestore_deferred_transaction_t() : seq(0) {}
1093
1094 DENC(bluestore_deferred_transaction_t, v, p) {
1095 DENC_START(1, 1, p);
1096 denc(v.seq, p);
1097 denc(v.ops, p);
1098 denc(v.released, p);
1099 DENC_FINISH(p);
1100 }
1101 void dump(ceph::Formatter *f) const;
1102 static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
1103 };
1104 WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
1105
1106 struct bluestore_compression_header_t {
1107 uint8_t type = Compressor::COMP_ALG_NONE;
1108 uint32_t length = 0;
1109 boost::optional<int32_t> compressor_message;
1110
1111 bluestore_compression_header_t() {}
1112 bluestore_compression_header_t(uint8_t _type)
1113 : type(_type) {}
1114
1115 DENC(bluestore_compression_header_t, v, p) {
1116 DENC_START(2, 1, p);
1117 denc(v.type, p);
1118 denc(v.length, p);
1119 if (struct_v >= 2) {
1120 denc(v.compressor_message, p);
1121 }
1122 DENC_FINISH(p);
1123 }
1124 void dump(ceph::Formatter *f) const;
1125 static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
1126 };
1127 WRITE_CLASS_DENC(bluestore_compression_header_t)
1128
1129 template <template <typename> typename V, class COUNTER_TYPE = int32_t>
1130 class ref_counter_2hash_tracker_t {
1131 size_t num_non_zero = 0;
1132 size_t num_buckets = 0;
1133 V<COUNTER_TYPE> buckets1;
1134 V<COUNTER_TYPE> buckets2;
1135
1136 public:
1137 ref_counter_2hash_tracker_t(uint64_t mem_cap) {
1138 num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
1139 ceph_assert(num_buckets);
1140 buckets1.resize(num_buckets);
1141 buckets2.resize(num_buckets);
1142 reset();
1143 }
1144
1145 size_t get_num_buckets() const {
1146 return num_buckets;
1147 }
1148
1149 void inc(const char* hash_val, size_t hash_val_len, int n) {
1150 auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
1151 num_buckets;
1152 if (buckets1[h] == 0 && n) {
1153 ++num_non_zero;
1154 } else if (buckets1[h] == -n) {
1155 --num_non_zero;
1156 }
1157 buckets1[h] += n;
1158 h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
1159 if (buckets2[h] == 0 && n) {
1160 ++num_non_zero;
1161 } else if (buckets2[h] == -n) {
1162 --num_non_zero;
1163 }
1164 buckets2[h] += n;
1165 }
1166
1167 bool test_hash_conflict(
1168 const char* hash_val1,
1169 const char* hash_val2,
1170 size_t hash_val_len) const {
1171
1172 auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
1173 auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
1174 auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
1175 auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
1176 return ((h1 % num_buckets) == (h2 % num_buckets)) &&
1177 ((h3 % num_buckets) == (h4 % num_buckets));
1178 }
1179
1180 bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
1181 auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
1182 if (buckets1[h % num_buckets] != 0) {
1183 return false;
1184 }
1185 h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
1186 return buckets2[h % num_buckets] == 0;
1187 }
1188
1189 // returns number of mismatching buckets
1190 size_t count_non_zero() const {
1191 return num_non_zero;
1192 }
1193 void reset() {
1194 for (size_t i = 0; i < num_buckets; i++) {
1195 buckets1[i] = 0;
1196 buckets2[i] = 0;
1197 }
1198 num_non_zero = 0;
1199 }
1200 };
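// Illustrative sketch (added for exposition, not part of the original header):
// the tracker keeps two independent hash-bucketed counters; a reference that
// is added and later removed cancels out in both tables, so
// count_non_zero() == 0 means "balanced" (modulo hash collisions). The memory
// cap and key below are arbitrary.
//
//   ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> t(1048576);
//   uint64_t key = 42;
//   t.inc(reinterpret_cast<const char*>(&key), sizeof(key), 1);
//   t.inc(reinterpret_cast<const char*>(&key), sizeof(key), -1);
//   // t.count_non_zero() == 0 again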
1201
1202 class shared_blob_2hash_tracker_t
1203 : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
1204
1205 static const size_t hash_input_len = 3;
1206
1207 typedef std::array<uint64_t, hash_input_len> hash_input_t;
1208
1209 static size_t get_hash_input_size() {
1210 return hash_input_len * sizeof(hash_input_t::value_type);
1211 }
1212
1213 inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
1214
1215 size_t au_void_bits = 0;
1216
1217
1218 public:
1219 shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
1220 : ref_counter_2hash_tracker_t(mem_cap) {
1221 ceph_assert(alloc_unit);
1222 ceph_assert(isp2(alloc_unit));
1223 au_void_bits = ctz(alloc_unit);
1224 }
1225 void inc(uint64_t sbid, uint64_t offset, int n);
1226 void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);
1227
1228 bool test_hash_conflict(
1229 uint64_t sbid,
1230 uint64_t offset,
1231 uint64_t sbid2,
1232 uint64_t offset2) const;
1233 bool test_all_zero(
1234 uint64_t sbid,
1235 uint64_t offset) const;
1236 bool test_all_zero_range(
1237 uint64_t sbid,
1238 uint64_t offset,
1239 uint32_t len) const;
1240 };
1241
1242 class sb_info_t {
1243 // a negative value indicates a (potentially) stray blob,
1244 // i.e. a blob that has no real references from onodes
1245 int64_t sbid = 0;
1246
1247 public:
1248 enum {
1249 INVALID_POOL_ID = INT64_MIN
1250 };
1251
1252 int64_t pool_id = INVALID_POOL_ID;
1253 // a negative value indicates compressed_allocated as well
1254 int32_t allocated_chunks = 0;
1255
1256 sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
1257 {
1258 }
1259 bool operator< (const sb_info_t& other) const {
1260 return std::abs(sbid) < std::abs(other.sbid);
1261 }
1262 bool operator< (const uint64_t& other_sbid) const {
1263 return uint64_t(std::abs(sbid)) < other_sbid;
1264 }
1265 bool is_stray() const {
1266 return sbid < 0;
1267 }
1268 uint64_t get_sbid() const {
1269 return uint64_t(std::abs(sbid));
1270 }
1271 void adopt() {
1272 sbid = std::abs(sbid);
1273 }
1274 } __attribute__((packed));
1275
1276 // Space-efficient container to keep a set of sb_info structures,
1277 // given that the majority of entries are appended in proper id-sorted
1278 // order. Hence they can be kept in a regular vector and located via binary
1279 // search whenever a specific entry needs to be found.
1280 // For the rare occasions when an out-of-order append takes place, an
1281 // auxiliary sorted container (aux_items) is used.
1282 struct sb_info_space_efficient_map_t {
1283 // large array sorted by the user
1284 mempool::bluestore_fsck::vector<sb_info_t> items;
1285 // small additional set of items which is kept sorted internally;
1286 // it never holds an entry with an id > items.back().get_sbid()
1287 mempool::bluestore_fsck::vector<sb_info_t> aux_items;
1288
1289 sb_info_t& add_maybe_stray(uint64_t sbid) {
1290 return _add(-int64_t(sbid));
1291 }
1292 sb_info_t& add_or_adopt(uint64_t sbid) {
1293 auto& r = _add(sbid);
1294 r.adopt();
1295 return r;
1296 }
1297 auto find(uint64_t id) {
1298 if (items.size() != 0) {
1299 auto it = std::lower_bound(
1300 items.begin(),
1301 items.end() - 1,
1302 id,
1303 [](const sb_info_t& a, const uint64_t& b) {
1304 return a < b;
1305 });
1306 if (it->get_sbid() == id) {
1307 return it;
1308 }
1309 if (aux_items.size() != 0) {
1310 auto it = std::lower_bound(
1311 aux_items.begin(),
1312 aux_items.end(),
1313 id,
1314 [](const sb_info_t& a, const uint64_t& b) {
1315 return a < b;
1316 });
1317 if (it->get_sbid() == id) {
1318 return it;
1319 }
1320 }
1321 }
1322 return items.end();
1323 }
1324 // enumerates strays, order isn't guaranteed.
1325 void foreach_stray(std::function<void(const sb_info_t&)> cb) {
1326 for (auto& sbi : items) {
1327 if (sbi.is_stray()) {
1328 cb(sbi);
1329 }
1330 }
1331 for (auto& sbi : aux_items) {
1332 if (sbi.is_stray()) {
1333 cb(sbi);
1334 }
1335 }
1336 }
1337 auto end() {
1338 return items.end();
1339 }
1340
1341 void shrink() {
1342 items.shrink_to_fit();
1343 aux_items.shrink_to_fit();
1344 }
1345 void clear() {
1346 items.clear();
1347 aux_items.clear();
1348 shrink();
1349 }
1350 private:
1351 sb_info_t& _add(int64_t id) {
1352 uint64_t n_id = uint64_t(std::abs(id));
1353 if (items.size() == 0 || n_id > items.back().get_sbid()) {
1354 return items.emplace_back(id);
1355 }
1356 auto it = find(n_id);
1357 if (it != items.end()) {
1358 return *it;
1359 }
1360 if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
1361 return aux_items.emplace_back(id);
1362 }
1363 // do sorted insertion, may be expensive!
1364 it = std::upper_bound(
1365 aux_items.begin(),
1366 aux_items.end(),
1367 n_id,
1368 [](const uint64_t& a, const sb_info_t& b) {
1369 return a < b.get_sbid();
1370 });
1371 return *aux_items.emplace(it, id);
1372 }
1373 };
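// Illustrative sketch (added for exposition, not part of the original header):
// typical fsck-style usage. Ids found only in shared-blob records are added as
// potentially stray; ids referenced from onodes are added (or adopted) as
// real. The ids below are arbitrary.
//
//   sb_info_space_efficient_map_t sbm;
//   sbm.add_maybe_stray(1);            // seen in a SharedBlob record only
//   sbm.add_or_adopt(2);               // referenced by an onode
//   auto it = sbm.find(2);
//   // it != sbm.end() && !it->is_stray()
//   sbm.foreach_stray([](const sb_info_t& sbi) { /* sbid 1 reported here */ });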
1374
1375 #endif