// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H

#include <ostream>
#include <bitset>
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
#include "include/mempool.h"

namespace ceph {
  class Formatter;
}

/// label for block device
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;     ///< osd uuid
  uint64_t size;       ///< device size
  utime_t btime;       ///< birth time
  string description;  ///< device description

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_bdev_label_t*>& o);
};
WRITE_CLASS_ENCODER(bluestore_bdev_label_t)

ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l);
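
// Illustrative sketch (not part of the original header): round-tripping a
// device label through the encode/decode methods declared above; ::encode
// and ::decode are the free functions provided via WRITE_CLASS_ENCODER.
// All field values are arbitrary examples.
//
//   bluestore_bdev_label_t label;
//   label.osd_uuid.generate_random();
//   label.size = 1ull << 40;          // example: 1 TiB device
//   label.description = "example";
//
//   bufferlist bl;
//   ::encode(label, bl);
//
//   bluestore_bdev_label_t out;
//   bufferlist::iterator it = bl.begin();
//   ::decode(out, it);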

/// collection metadata
struct bluestore_cnode_t {
  uint32_t bits;   ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_cnode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_cnode_t)

class AllocExtent;
typedef mempool::bluestore_alloc::vector<AllocExtent> AllocExtentVector;
class AllocExtent {
public:
  uint64_t offset;
  uint32_t length;

  AllocExtent() {
    offset = 0;
    length = 0;
  }

  AllocExtent(int64_t off, int32_t len) : offset(off), length(len) { }
  uint64_t end() const {
    return offset + length;
  }
  bool operator==(const AllocExtent& other) const {
    return offset == other.offset && length == other.length;
  }
};

inline static ostream& operator<<(ostream& out, const AllocExtent& e) {
  return out << "0x" << std::hex << e.offset << "~" << e.length << std::dec;
}

class ExtentList {
  AllocExtentVector *m_extents;
  int64_t m_block_size;
  int64_t m_max_blocks;

public:
  void init(AllocExtentVector *extents, int64_t block_size,
            uint64_t max_alloc_size) {
    m_extents = extents;
    m_block_size = block_size;
    m_max_blocks = max_alloc_size / block_size;
    assert(m_extents->empty());
  }

  ExtentList(AllocExtentVector *extents, int64_t block_size) {
    init(extents, block_size, 0);
  }

  ExtentList(AllocExtentVector *extents, int64_t block_size,
             uint64_t max_alloc_size) {
    init(extents, block_size, max_alloc_size);
  }

  void reset() {
    m_extents->clear();
  }

  void add_extents(int64_t start, int64_t count);

  AllocExtentVector *get_extents() {
    return m_extents;
  }

  std::pair<int64_t, int64_t> get_nth_extent(int index) {
    return std::make_pair
      ((*m_extents)[index].offset / m_block_size,
       (*m_extents)[index].length / m_block_size);
  }

  int64_t get_extent_count() {
    return m_extents->size();
  }
};
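
// Illustrative sketch (not part of the original header): collecting allocator
// results with ExtentList. add_extents() takes block-granularity start/count,
// while the stored AllocExtents are byte-granularity, so get_nth_extent()
// converts back to blocks. Numbers are arbitrary examples.
//
//   AllocExtentVector extents;
//   ExtentList el(&extents, 4096 /* block_size */);
//   el.add_extents(16, 8);    // blocks [16, 24)
//   el.add_extents(24, 4);    // contiguous; may be merged with the previous run
//   for (int64_t i = 0; i < el.get_extent_count(); ++i) {
//     auto e = el.get_nth_extent(i);  // {starting block, length in blocks}
//   }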


/// pextent: physical extent
struct bluestore_pextent_t : public AllocExtent {
  const static uint64_t INVALID_OFFSET = ~0ull;

  bluestore_pextent_t() : AllocExtent() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : AllocExtent(o, l) {}
  bluestore_pextent_t(const AllocExtent &ext) :
    AllocExtent(ext.offset, ext.length) { }

  bluestore_pextent_t& operator=(const AllocExtent &ext) {
    offset = ext.offset;
    length = ext.length;
    return *this;
  }
  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }

  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_pextent_t*>& ls);
};
WRITE_CLASS_DENC(bluestore_pextent_t)

ostream& operator<<(ostream& out, const bluestore_pextent_t& o);

typedef mempool::bluestore_meta_other::vector<bluestore_pextent_t> PExtentVector;

template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
                     bufferlist::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, bufferptr::iterator& p) {
    unsigned num;
    denc_varint(num, p);
    v.clear();
    v.resize(num);
    for (unsigned i = 0; i < num; ++i) {
      denc(v[i], p);
    }
  }
};
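
// Illustrative note (not part of the original header): a PExtentVector encodes
// as a varint element count followed by each bluestore_pextent_t (lba-encoded
// offset, low-zero varint length). A hole in a blob is represented by an
// extent whose offset equals INVALID_OFFSET; is_valid() above is what
// distinguishes real device extents from such holes. Values are arbitrary
// examples.
//
//   PExtentVector v;
//   v.emplace_back(bluestore_pextent_t(0x10000, 0x1000));   // real device extent
//   v.emplace_back(bluestore_pextent_t(
//     bluestore_pextent_t::INVALID_OFFSET, 0x1000));         // unallocated hole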


/// extent_map: a map of reference counted extents
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;
    uint32_t refs;
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_meta_other::map<uint64_t,record_t> map_t;
  map_t ref_map;

  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  void get(uint64_t offset, uint32_t len);
  void put(uint64_t offset, uint32_t len, PExtentVector *release);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(bufferlist::contiguous_appender& p) const {
    uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (--n) {
        ++i;
        denc_varint_lowz((int64_t)i->first - pos, p);
        i->second.encode(p);
        pos = i->first;
      }
    }
  }
  void decode(bufferptr::iterator& p) {
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
        int64_t delta;
        denc_varint_lowz(delta, p);
        pos += delta;
        ref_map[pos].decode(p);
      }
    }
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o);
};
WRITE_CLASS_DENC(bluestore_extent_ref_map_t)


ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm);
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
                              const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
static inline bool operator==(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}
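
// Illustrative sketch (not part of the original header): reference counting a
// shared range. get() bumps the refcount over [offset, offset+len); put()
// drops it and appends to *release the sub-ranges whose refcount dropped to
// zero so the caller can free them. Offsets and lengths are arbitrary
// examples.
//
//   bluestore_extent_ref_map_t refs;
//   refs.get(0x0000, 0x2000);   // e.g. two clones reference the first 8 KiB
//   refs.get(0x0000, 0x2000);
//   PExtentVector to_release;
//   refs.put(0x0000, 0x1000, &to_release);  // still referenced: nothing released
//   refs.put(0x0000, 0x1000, &to_release);  // last ref on [0, 0x1000): released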

/// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage
struct bluestore_blob_use_tracker_t {
  // N.B.: there is no need to minimize au_size/num_au as much as possible
  // (e.g. to use just a single byte for au_size) since:
  // 1) the struct isn't packed, hence it's padded; and even if it were
  //    packed, see 2)
  // 2) the memory manager has its own granularity, most probably >= 8 bytes
  //
  uint32_t au_size; // allocation (= tracking) unit size;
                    // == 0 if uninitialized
  uint32_t num_au;  // number of allocation units tracked;
                    // == 0 if a single unit or the whole blob is tracked

  union {
    uint32_t* bytes_per_au;
    uint32_t total_bytes;
  };

  bluestore_blob_use_tracker_t()
    : au_size(0), num_au(0), bytes_per_au(nullptr) {
  }
  ~bluestore_blob_use_tracker_t() {
    clear();
  }

  void clear() {
    if (num_au != 0) {
      delete[] bytes_per_au;
    }
    bytes_per_au = 0;
    au_size = 0;
    num_au = 0;
  }

  uint32_t get_referenced_bytes() const {
    uint32_t total = 0;
    if (!num_au) {
      total = total_bytes;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        total += bytes_per_au[i];
      }
    }
    return total;
  }
  bool is_not_empty() const {
    if (!num_au) {
      return total_bytes != 0;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        if (bytes_per_au[i]) {
          return true;
        }
      }
    }
    return false;
  }
  bool is_empty() const {
    return !is_not_empty();
  }
  void prune_tail(uint32_t new_len) {
    if (num_au) {
      new_len = ROUND_UP_TO(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      assert(_num_au <= num_au);
      if (_num_au) {
        num_au = _num_au; // bytes_per_au array is left unmodified
      } else {
        clear();
      }
    }
  }
  void add_tail(uint32_t new_len, uint32_t _au_size) {
    auto full_size = au_size * (num_au ? num_au : 1);
    assert(new_len >= full_size);
    if (new_len == full_size) {
      return;
    }
    if (!num_au) {
      uint32_t old_total = total_bytes;
      total_bytes = 0;
      init(new_len, _au_size);
      assert(num_au);
      bytes_per_au[0] = old_total;
    } else {
      assert(_au_size == au_size);
      new_len = ROUND_UP_TO(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      assert(_num_au >= num_au);
      if (_num_au > num_au) {
        auto old_bytes = bytes_per_au;
        auto old_num_au = num_au;
        num_au = _num_au;
        allocate();
        for (size_t i = 0; i < old_num_au; i++) {
          bytes_per_au[i] = old_bytes[i];
        }
        for (size_t i = old_num_au; i < num_au; i++) {
          bytes_per_au[i] = 0;
        }
        delete[] old_bytes;
      }
    }
  }

  void init(
    uint32_t full_length,
    uint32_t _au_size);

  void get(
    uint32_t offset,
    uint32_t len);

  /// put: return true if the blob has no references any more after the call;
  /// in that case *release is not populated, for the sake of performance.
  /// return false if there are still references to the blob; in that case
  /// *release contains the pextents (identified by their offsets relative to
  /// the blob start) that are no longer used and can be safely deallocated.
  bool put(
    uint32_t offset,
    uint32_t len,
    PExtentVector *release);

  bool can_split() const;
  bool can_split_at(uint32_t blob_offset) const;
  void split(
    uint32_t blob_offset,
    bluestore_blob_use_tracker_t* r);

  bool equal(
    const bluestore_blob_use_tracker_t& other) const;

  void bound_encode(size_t& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        p += elem_size * num_au;
      }
    }
  }
  void encode(bufferlist::contiguous_appender& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }
  void decode(bufferptr::iterator& p) {
    clear();
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        allocate();
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o);
private:
  void allocate();
  void fall_back_to_per_au(uint32_t _num_au, uint32_t _au_size);
};
WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm);
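
// Illustrative sketch (not part of the original header): tracking used bytes
// per allocation unit of a blob. init() decides whether a single counter or a
// per-AU array is needed; get()/put() then adjust the referenced byte counts.
// The sizes below are arbitrary examples.
//
//   bluestore_blob_use_tracker_t tracker;
//   tracker.init(0x10000 /* blob length */, 0x10000 /* au_size */);
//   tracker.get(0, 0x8000);                  // a write references 32 KiB
//   PExtentVector release;
//   bool empty = tracker.put(0, 0x8000, &release);  // drop it again
//   // empty == true: nothing in the blob is referenced any more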

/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;          ///< raw data position on device
  uint32_t logical_length = 0;    ///< original length of data stored in the blob
  uint32_t compressed_length = 0; ///< compressed length if any

public:
  enum {
    FLAG_MUTABLE = 1,      ///< blob can be overwritten or split
    FLAG_COMPRESSED = 2,   ///< blob is compressed
    FLAG_CSUM = 4,         ///< blob has checksums
    FLAG_HAS_UNUSED = 8,   ///< blob has unused map
    FLAG_SHARED = 16,      ///< blob is shared; see external SharedBlob
  };
  static string get_flags_string(unsigned flags);

  uint32_t flags = 0;      ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0;     ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
  uint8_t csum_chunk_order = 0;  ///< csum block size is 1<<csum_chunk_order bytes

  bufferptr csum_data;     ///< opaque vector of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}

  const PExtentVector& get_extents() const {
    return extents;
  }

  DENC_HELPERS;
  void bound_encode(size_t& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
             csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(bufferptr::iterator& p, uint64_t struct_v) {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      int len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }

  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return has_flag(FLAG_MUTABLE);
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      MAX(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }

  /// return true if the entire range is allocated (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    b_len += b_off;
    while (b_len) {
      assert(p != extents.end());
      if (!p->is_valid()) {
        return false;
      }
      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      ++p;
    }
    assert(0 == "we should not get here");
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    b_len += b_off;
    while (b_len) {
      assert(p != extents.end());
      if (p->is_valid()) {
        return false;
      }
      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      ++p;
    }
    assert(0 == "we should not get here");
  }

  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = ROUND_UP_TO(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      uint64_t blob_len = get_logical_length();
      assert((blob_len % (sizeof(unused)*8)) == 0);
      assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      uint64_t start = offset / chunk_size;
      uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        unused &= ~(1u << i);
      }
      if (unused == 0) {
        clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }
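
  // Illustrative note (not part of the original header): the unused bitmap has
  // sizeof(unused_t)*8 = 16 bits, so each bit covers blob_len/16 bytes. For
  // example (values are assumptions), with a 0x10000-byte blob each bit covers
  // 0x1000 bytes; add_unused(0x2000, 0x3000) sets only the bits for chunks
  // that are fully contained, i.e. bits 2..4, and mark_used(0x3000, 0x800)
  // clears every bit the range touches, i.e. bit 3.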

  int map(uint64_t x_off, uint64_t x_len,
          std::function<int(uint64_t,uint64_t)> f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      int r = f(p->offset + x_off, l);
      if (r < 0)
        return r;
      x_off = 0;
      x_len -= l;
      ++p;
    }
    return 0;
  }
  void map_bl(uint64_t x_off,
              bufferlist& bl,
              std::function<void(uint64_t,bufferlist&)> f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    bufferlist::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      bufferlist t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }
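
  // Illustrative sketch (not part of the original header): map() translates a
  // blob-relative range into device-relative extents and invokes the callback
  // once per physical piece; a negative return aborts the walk. "blob" below
  // stands for some bluestore_blob_t instance and the offsets are arbitrary
  // examples.
  //
  //   int r = blob.map(0x0, 0x2000, [&](uint64_t dev_off, uint64_t len) {
  //     // e.g. queue a read of [dev_off, dev_off+len) from the block device
  //     return 0;
  //   });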

  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      assert(0 == "no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const __le16*>(p)[i];
    case 4:
      return reinterpret_cast<const __le32*>(p)[i];
    case 8:
      return reinterpret_cast<const __le64*>(p)[i];
    default:
      assert(0 == "unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
  }
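
  // Illustrative note (not part of the original header): with csum_chunk_order
  // = 12 each checksum covers a 4096-byte chunk, so a 64 KiB blob using a
  // 4-byte checksum value (e.g. crc32c) needs 16 checksum slots and
  // init_csum() creates a 64-byte csum_data buffer; get_csum_item(3) then
  // returns the checksum of the chunk covering bytes [0x3000, 0x4000). The
  // values are assumptions chosen for the example.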

  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const bufferlist& bl);

  /// verify csum: return -EOPNOTSUPP for an unsupported checksum type;
  /// return -1 and a valid (non-negative) *b_bad_off on a checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off,
                  uint64_t *bad_csum) const;

  bool can_prune_tail() const {
    return
      extents.size() > 1 &&  // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = bufferptr(t.c_str(),
                            get_logical_length() / get_csum_chunk_size() *
                            get_csum_value_size());
    }
  }
  void add_tail(uint32_t new_len) {
    assert(is_mutable());
    assert(!has_unused());
    assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = buffer::create(
        get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }

  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// Updates the blob's pextent container and returns (via *r) the unused
  /// pextents eligible for release.
  /// all - indicates that the whole blob is to be released.
  /// logical - specifies the set of logical extents within the blob to be
  /// released.
  /// Returns true if the blob has no more valid pextents.
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

ostream& operator<<(ostream& out, const bluestore_blob_t& o);


/// shared blob state
struct bluestore_shared_blob_t {
  uint64_t sbid;                       ///< shared blob id
  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}

  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_shared_blob_t*>& ls);

  bool empty() const {
    return ref_map.empty();
  }
};
WRITE_CLASS_DENC(bluestore_shared_blob_t)

ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o);

/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;   ///< numeric id (locally unique)
  uint64_t size = 0;  ///< object size
  map<mempool::bluestore_meta_other::string, bufferptr> attrs;  ///< attrs

  struct shard_info {
    uint32_t offset = 0;  ///< logical offset for start of shard
    uint32_t bytes = 0;   ///< encoded bytes
    DENC(shard_info, v, p) {
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(Formatter *f) const;
  };
  vector<shard_info> extent_map_shards;  ///< extent map shards (if any)

  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0;

  enum {
    FLAG_OMAP = 1,
  };

  string get_flags_string() const {
    string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }

  void set_flag(unsigned f) {
    flags |= f;
  }

  void clear_flag(unsigned f) {
    flags &= ~f;
  }

  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }

  void set_omap_flag() {
    set_flag(FLAG_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP);
  }

  DENC(bluestore_onode_t, v, p) {
    DENC_START(1, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
WRITE_CLASS_DENC(bluestore_onode_t)

ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si);
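
// Illustrative sketch (not part of the original header): a freshly created
// object starts with a zeroed onode; the OMAP flag is set once the object
// gains omap data, and extent_map_shards is populated only when the extent
// map grows large enough to be sharded. Values below are arbitrary examples.
//
//   bluestore_onode_t onode;
//   onode.nid = 123;
//   onode.size = 4 * 1024 * 1024;
//   onode.set_omap_flag();
//   assert(onode.has_omap());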

/// write-ahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0;

  PExtentVector extents;
  bufferlist data;

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_deferred_op_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_op_t)


/// write-ahead-logged transaction
struct bluestore_deferred_transaction_t {
  uint64_t seq = 0;
  list<bluestore_deferred_op_t> ops;
  interval_set<uint64_t> released;  ///< allocations to release after tx

  bluestore_deferred_transaction_t() : seq(0) {}

  DENC(bluestore_deferred_transaction_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seq, p);
    denc(v.ops, p);
    denc(v.released, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
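
// Illustrative sketch (not part of the original header): a deferred
// transaction bundles small writes that are first committed via the
// write-ahead log and later replayed to their final device locations;
// `released` records the allocations to free once the transaction is
// durable. Values are arbitrary examples.
//
//   bluestore_deferred_transaction_t txn;
//   txn.seq = 42;
//   bluestore_deferred_op_t op;
//   op.op = bluestore_deferred_op_t::OP_WRITE;
//   op.extents.emplace_back(bluestore_pextent_t(0x10000, 0x1000));
//   op.data.append(some_4k_buffer);  // hypothetical bufferlist with the payload
//   txn.ops.push_back(op);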

struct bluestore_compression_header_t {
  uint8_t type = Compressor::COMP_ALG_NONE;
  uint32_t length = 0;

  bluestore_compression_header_t() {}
  bluestore_compression_header_t(uint8_t _type)
    : type(_type) {}

  DENC(bluestore_compression_header_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.type, p);
    denc(v.length, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_compression_header_t*>& o);
};
WRITE_CLASS_DENC(bluestore_compression_header_t)


#endif