]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
16 | #define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
17 | ||
18 | #include <ostream> | |
19 | #include <bitset> | |
94b18763 | 20 | #include <type_traits> |
7c673cae FG |
21 | #include "include/types.h" |
22 | #include "include/interval_set.h" | |
23 | #include "include/utime.h" | |
24 | #include "common/hobject.h" | |
25 | #include "compressor/Compressor.h" | |
26 | #include "common/Checksummer.h" | |
27 | #include "include/mempool.h" | |
28 | ||
29 | namespace ceph { | |
30 | class Formatter; | |
31 | } | |
32 | ||
33 | /// label for block device | |
/// Label written at a fixed location of each BlueStore block device so the
/// device can be identified and matched to its OSD at startup.
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;     ///< osd uuid
  uint64_t size;       ///< device size
  utime_t btime;       ///< birth time
  string description;  ///< device description

  map<string,string> meta; ///< {read,write}_meta() content from ObjectStore

  // classic (bufferlist-based) encode/decode; bodies live in the .cc file
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_bdev_label_t*>& o);
};
47 | WRITE_CLASS_ENCODER(bluestore_bdev_label_t) | |
48 | ||
49 | ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l); | |
50 | ||
51 | /// collection metadata | |
/// Per-collection metadata persisted for each BlueStore collection.
struct bluestore_cnode_t {
  uint32_t bits;   ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  // versioned denc codec: v1, compat 1
  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_cnode_t*>& o);
};
65 | WRITE_CLASS_DENC(bluestore_cnode_t) | |
66 | ||
28e407b8 AA |
67 | ostream& operator<<(ostream& out, const bluestore_cnode_t& l); |
68 | ||
7c673cae FG |
69 | class AllocExtent; |
70 | typedef mempool::bluestore_alloc::vector<AllocExtent> AllocExtentVector; | |
/// A single contiguous allocation: a byte offset on the device plus a length.
class AllocExtent {
public:
  uint64_t offset;  ///< starting byte offset on the device
  uint32_t length;  ///< extent length in bytes

  /// default: zero-length extent at offset 0
  AllocExtent() : offset(0), length(0) {}

  AllocExtent(int64_t off, int32_t len) : offset(off), length(len) {}

  /// one past the last byte covered by this extent
  uint64_t end() const {
    return offset + length;
  }

  bool operator==(const AllocExtent& other) const {
    return other.offset == offset && other.length == length;
  }
};
89 | ||
90 | inline static ostream& operator<<(ostream& out, const AllocExtent& e) { | |
91 | return out << "0x" << std::hex << e.offset << "~" << e.length << std::dec; | |
92 | } | |
93 | ||
94 | class ExtentList { | |
95 | AllocExtentVector *m_extents; | |
96 | int64_t m_block_size; | |
97 | int64_t m_max_blocks; | |
98 | ||
99 | public: | |
100 | void init(AllocExtentVector *extents, int64_t block_size, | |
101 | uint64_t max_alloc_size) { | |
102 | m_extents = extents; | |
103 | m_block_size = block_size; | |
104 | m_max_blocks = max_alloc_size / block_size; | |
105 | assert(m_extents->empty()); | |
106 | } | |
107 | ||
108 | ExtentList(AllocExtentVector *extents, int64_t block_size) { | |
109 | init(extents, block_size, 0); | |
110 | } | |
111 | ||
112 | ExtentList(AllocExtentVector *extents, int64_t block_size, | |
113 | uint64_t max_alloc_size) { | |
114 | init(extents, block_size, max_alloc_size); | |
115 | } | |
116 | ||
117 | void reset() { | |
118 | m_extents->clear(); | |
119 | } | |
120 | ||
121 | void add_extents(int64_t start, int64_t count); | |
122 | ||
123 | AllocExtentVector *get_extents() { | |
124 | return m_extents; | |
125 | } | |
126 | ||
127 | std::pair<int64_t, int64_t> get_nth_extent(int index) { | |
128 | return std::make_pair | |
129 | ((*m_extents)[index].offset / m_block_size, | |
130 | (*m_extents)[index].length / m_block_size); | |
131 | } | |
132 | ||
133 | int64_t get_extent_count() { | |
134 | return m_extents->size(); | |
135 | } | |
136 | }; | |
137 | ||
138 | ||
139 | /// pextent: physical extent | |
struct bluestore_pextent_t : public AllocExtent {
  /// sentinel offset: extent is a hole, not backed by any device space
  const static uint64_t INVALID_OFFSET = ~0ull;

  bluestore_pextent_t() : AllocExtent() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : AllocExtent(o, l) {}
  bluestore_pextent_t(const AllocExtent &ext) :
    AllocExtent(ext.offset, ext.length) { }

  bluestore_pextent_t& operator=(const AllocExtent &ext) {
    offset = ext.offset;
    length = ext.length;
    return *this;
  }
  /// true when this extent is actually mapped to device space
  /// (offset != INVALID_OFFSET)
  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }

  // unversioned codec: lba-optimized varint for the offset,
  // low-zero-optimized varint for the length
  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_pextent_t*>& ls);
};
165 | WRITE_CLASS_DENC(bluestore_pextent_t) | |
166 | ||
167 | ostream& operator<<(ostream& out, const bluestore_pextent_t& o); | |
168 | ||
31f18b77 | 169 | typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector; |
7c673cae FG |
170 | |
// Custom denc specialization for PExtentVector: encodes a varint element
// count followed by each pextent, avoiding the generic container codec.
template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static constexpr bool need_contiguous = true;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    // NOTE(review): the count is encoded below with denc_varint, whose
    // worst case for a large size can exceed sizeof(uint32_t) bytes —
    // confirm the bound is safe for very large vectors.
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      // all pextents bound-encode to the same max size, so measure one
      // element and multiply
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
		     bufferlist::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, bufferptr::iterator& p) {
    unsigned num;
    denc_varint(num, p);
    // reset then size up front; elements are decoded in place
    v.clear();
    v.resize(num);
    for (unsigned i=0; i<num; ++i) {
      denc(v[i], p);
    }
  }
};
203 | ||
204 | ||
205 | /// extent_map: a map of reference counted extents | |
/// extent_map: a map of reference counted extents, keyed by start offset.
/// Used to track sharing of physical extents between blobs/clones.
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;  ///< extent length in bytes
    uint32_t refs;    ///< reference count for this extent
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
  map_t ref_map;  ///< start offset -> (length, refs)

  // internal invariant checker / left-neighbor merge helper (see .cc)
  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  /// take a reference on [offset, offset+len)
  void get(uint64_t offset, uint32_t len);
  /// drop a reference on [offset, offset+len); see .cc for the semantics
  /// of the release and maybe_unshared out-parameters
  void put(uint64_t offset, uint32_t len, PExtentVector *release,
	   bool *maybe_unshared);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      // every entry bounds to the same max size: one lowz varint key
      // plus one record_t
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(bufferlist::contiguous_appender& p) const {
    // delta encoding: first key is absolute, each subsequent key is a
    // signed delta from the previous one (keys are sorted, so deltas
    // are small)
    uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (--n) {
	++i;
	denc_varint_lowz((int64_t)i->first - pos, p);
	i->second.encode(p);
	pos = i->first;
      }
    }
  }
  void decode(bufferptr::iterator& p) {
    // mirror of encode(): absolute first key, then signed deltas
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
	int64_t delta;
	denc_varint_lowz(delta, p);
	pos += delta;
	ref_map[pos].decode(p);
      }
    }
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o);
};
281 | WRITE_CLASS_DENC(bluestore_extent_ref_map_t) | |
282 | ||
283 | ||
284 | ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm); | |
// record_t equality: both tracked fields must match.
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
			      const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
// whole-map equality: delegates to std::map comparison of the entries
static inline bool operator==(const bluestore_extent_ref_map_t& l,
			      const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
			      const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}
297 | ||
298 | /// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage | |
299 | struct bluestore_blob_use_tracker_t { | |
300 | // N.B.: There is no need to minimize au_size/num_au | |
301 | // as much as possible (e.g. have just a single byte for au_size) since: | |
302 | // 1) Struct isn't packed hence it's padded. And even if it's packed see 2) | |
303 | // 2) Mem manager has its own granularity, most probably >= 8 bytes | |
304 | // | |
305 | uint32_t au_size; // Allocation (=tracking) unit size, | |
306 | // == 0 if uninitialized | |
307 | uint32_t num_au; // Amount of allocation units tracked | |
308 | // == 0 if single unit or the whole blob is tracked | |
309 | ||
310 | union { | |
311 | uint32_t* bytes_per_au; | |
312 | uint32_t total_bytes; | |
313 | }; | |
314 | ||
315 | bluestore_blob_use_tracker_t() | |
316 | : au_size(0), num_au(0), bytes_per_au(nullptr) { | |
317 | } | |
318 | ~bluestore_blob_use_tracker_t() { | |
319 | clear(); | |
320 | } | |
321 | ||
322 | void clear() { | |
323 | if (num_au != 0) { | |
324 | delete[] bytes_per_au; | |
325 | } | |
326 | bytes_per_au = 0; | |
327 | au_size = 0; | |
328 | num_au = 0; | |
329 | } | |
330 | ||
331 | uint32_t get_referenced_bytes() const { | |
332 | uint32_t total = 0; | |
333 | if (!num_au) { | |
334 | total = total_bytes; | |
335 | } else { | |
336 | for (size_t i = 0; i < num_au; ++i) { | |
337 | total += bytes_per_au[i]; | |
338 | } | |
339 | } | |
340 | return total; | |
341 | } | |
342 | bool is_not_empty() const { | |
343 | if (!num_au) { | |
344 | return total_bytes != 0; | |
345 | } else { | |
346 | for (size_t i = 0; i < num_au; ++i) { | |
347 | if (bytes_per_au[i]) { | |
348 | return true; | |
349 | } | |
350 | } | |
351 | } | |
352 | return false; | |
353 | } | |
354 | bool is_empty() const { | |
355 | return !is_not_empty(); | |
356 | } | |
357 | void prune_tail(uint32_t new_len) { | |
358 | if (num_au) { | |
359 | new_len = ROUND_UP_TO(new_len, au_size); | |
360 | uint32_t _num_au = new_len / au_size; | |
361 | assert(_num_au <= num_au); | |
362 | if (_num_au) { | |
363 | num_au = _num_au; // bytes_per_au array is left unmodified | |
364 | ||
365 | } else { | |
366 | clear(); | |
367 | } | |
368 | } | |
369 | } | |
370 | void add_tail(uint32_t new_len, uint32_t _au_size) { | |
371 | auto full_size = au_size * (num_au ? num_au : 1); | |
372 | assert(new_len >= full_size); | |
373 | if (new_len == full_size) { | |
374 | return; | |
375 | } | |
376 | if (!num_au) { | |
377 | uint32_t old_total = total_bytes; | |
378 | total_bytes = 0; | |
379 | init(new_len, _au_size); | |
380 | assert(num_au); | |
381 | bytes_per_au[0] = old_total; | |
382 | } else { | |
383 | assert(_au_size == au_size); | |
384 | new_len = ROUND_UP_TO(new_len, au_size); | |
385 | uint32_t _num_au = new_len / au_size; | |
386 | assert(_num_au >= num_au); | |
387 | if (_num_au > num_au) { | |
388 | auto old_bytes = bytes_per_au; | |
389 | auto old_num_au = num_au; | |
390 | num_au = _num_au; | |
391 | allocate(); | |
392 | for (size_t i = 0; i < old_num_au; i++) { | |
393 | bytes_per_au[i] = old_bytes[i]; | |
394 | } | |
395 | for (size_t i = old_num_au; i < num_au; i++) { | |
396 | bytes_per_au[i] = 0; | |
397 | } | |
398 | delete[] old_bytes; | |
399 | } | |
400 | } | |
401 | } | |
402 | ||
403 | void init( | |
404 | uint32_t full_length, | |
405 | uint32_t _au_size); | |
406 | ||
407 | void get( | |
408 | uint32_t offset, | |
409 | uint32_t len); | |
410 | ||
411 | /// put: return true if the blob has no references any more after the call, | |
412 | /// no release_units is filled for the sake of performance. | |
413 | /// return false if there are some references to the blob, | |
414 | /// in this case release_units contains pextents | |
415 | /// (identified by their offsets relative to the blob start) | |
31f18b77 | 416 | /// that are not used any more and can be safely deallocated. |
7c673cae FG |
417 | bool put( |
418 | uint32_t offset, | |
419 | uint32_t len, | |
420 | PExtentVector *release); | |
421 | ||
422 | bool can_split() const; | |
423 | bool can_split_at(uint32_t blob_offset) const; | |
424 | void split( | |
425 | uint32_t blob_offset, | |
426 | bluestore_blob_use_tracker_t* r); | |
427 | ||
428 | bool equal( | |
429 | const bluestore_blob_use_tracker_t& other) const; | |
430 | ||
431 | void bound_encode(size_t& p) const { | |
432 | denc_varint(au_size, p); | |
433 | if (au_size) { | |
434 | denc_varint(num_au, p); | |
435 | if (!num_au) { | |
436 | denc_varint(total_bytes, p); | |
437 | } else { | |
438 | size_t elem_size = 0; | |
439 | denc_varint((uint32_t)0, elem_size); | |
440 | p += elem_size * num_au; | |
441 | } | |
442 | } | |
443 | } | |
444 | void encode(bufferlist::contiguous_appender& p) const { | |
445 | denc_varint(au_size, p); | |
446 | if (au_size) { | |
447 | denc_varint(num_au, p); | |
448 | if (!num_au) { | |
449 | denc_varint(total_bytes, p); | |
450 | } else { | |
451 | size_t elem_size = 0; | |
452 | denc_varint((uint32_t)0, elem_size); | |
453 | for (size_t i = 0; i < num_au; ++i) { | |
454 | denc_varint(bytes_per_au[i], p); | |
455 | } | |
456 | } | |
457 | } | |
458 | } | |
459 | void decode(bufferptr::iterator& p) { | |
460 | clear(); | |
461 | denc_varint(au_size, p); | |
462 | if (au_size) { | |
463 | denc_varint(num_au, p); | |
464 | if (!num_au) { | |
465 | denc_varint(total_bytes, p); | |
466 | } else { | |
467 | allocate(); | |
468 | for (size_t i = 0; i < num_au; ++i) { | |
469 | denc_varint(bytes_per_au[i], p); | |
470 | } | |
471 | } | |
472 | } | |
473 | } | |
474 | ||
475 | void dump(Formatter *f) const; | |
476 | static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o); | |
477 | private: | |
478 | void allocate(); | |
7c673cae FG |
479 | }; |
480 | WRITE_CLASS_DENC(bluestore_blob_use_tracker_t) | |
481 | ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm); | |
482 | ||
483 | /// blob: a piece of data on disk | |
/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;              ///< raw data position on device
  uint32_t logical_length = 0;        ///< original length of data stored in the blob
  uint32_t compressed_length = 0;     ///< compressed length if any

public:
  enum {
    LEGACY_FLAG_MUTABLE = 1,  ///< [legacy] blob can be overwritten or split
    FLAG_COMPRESSED = 2,      ///< blob is compressed
    FLAG_CSUM = 4,            ///< blob has checksums
    FLAG_HAS_UNUSED = 8,      ///< blob has unused map
    FLAG_SHARED = 16,         ///< blob is shared; see external SharedBlob
  };
  static string get_flags_string(unsigned flags);

  uint32_t flags = 0;                 ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0;     ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE;      ///< CSUM_*
  uint8_t csum_chunk_order = 0;       ///< csum block size is 1<<block_order bytes

  bufferptr csum_data;                ///< opaque vector of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}

  const PExtentVector& get_extents() const {
    return extents;
  }

  DENC_HELPERS;
  // Upper bound assumes every optional section is present.
  void bound_encode(size_t& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  // Sections after the flags are conditional on the corresponding flag
  // bits, so decode() must test the same flags in the same order.
  void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
	     csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(bufferptr::iterator& p, uint64_t struct_v) {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      // uncompressed blobs don't persist logical_length; it equals the
      // sum of the pextent lengths
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      // NOTE(review): length is decoded into a signed int here — confirm
      // csum payloads can never exceed INT_MAX
      int len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED);     // splitting unused set is complex
  }
  // splits must land on a csum chunk boundary when checksums are present
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }

  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return !is_compressed() && !is_shared();
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      MAX(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
  /// translate a blob-relative offset into a device offset; *plen (if
  /// given) receives the contiguous length available at that position
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }

  // validate whether or not the status of pextents within the given range
  // meets the requirement(allocated or unallocated).
  bool _validate_range(uint64_t b_off, uint64_t b_len,
		       bool require_allocated) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    b_len += b_off;
    while (b_len) {
      assert(p != extents.end());
      if (require_allocated != p->is_valid()) {
	return false;
      }

      if (p->length >= b_len) {
	return true;
      }
      b_len -= p->length;
      ++p;
    }
    assert(0 == "we should not get here");
  }

  /// return true if the entire range is allocated
  /// (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, true);
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, false);
  }

  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    // the 16-bit `unused` bitmap divides the blob into sizeof(unused)*8
    // equal chunks; a set bit means "never written"
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    // only whole chunks fully inside [offset, offset+length) are marked
    uint64_t start = ROUND_UP_TO(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      uint64_t blob_len = get_logical_length();
      assert((blob_len % (sizeof(unused)*8)) == 0);
      assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      // any chunk touched by the range is cleared (conservative)
      uint64_t start = offset / chunk_size;
      uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
	unused &= ~(1u << i);
      }
      if (unused == 0) {
	clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }

  /// invoke f(device_offset, length) for each device segment backing the
  /// blob-relative range [x_off, x_off+x_len); stops and returns f's
  /// value on the first negative return
  template<class F>
  int map(uint64_t x_off, uint64_t x_len, F&& f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      int r = f(p->offset + x_off, l);
      if (r < 0)
	return r;
      x_off = 0;
      x_len -= l;
      ++p;
    }
    return 0;
  }
  /// like map(), but carves bl into per-segment bufferlists and calls
  /// f(device_offset, segment_bl) for each
  template<class F>
  void map_bl(uint64_t x_off,
	      bufferlist& bl,
	      F&& f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    bufferlist::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      bufferlist t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }

  /// total bytes occupied on disk: sum of all pextent lengths
  /// (including invalid/hole extents)
  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  /// read the i-th checksum value, widening from its stored size
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      assert(0 == "no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const __le16*>(p)[i];
    case 4:
      return reinterpret_cast<const __le32*>(p)[i];
    case 8:
      return reinterpret_cast<const __le64*>(p)[i];
    default:
      assert(0 == "unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  /// enable checksums: allocate and zero one csum slot per csum chunk
  /// covering len bytes
  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }

  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const bufferlist& bl);

  /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
  /// return -1 and valid(nonnegative) b_bad_off for checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off,
		  uint64_t *bad_csum) const;

  bool can_prune_tail() const {
    return
      extents.size() > 1 &&  // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  /// drop the trailing (invalid) pextent and shrink csum data to match
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = bufferptr(t.c_str(),
			    get_logical_length() / get_csum_chunk_size() *
			    get_csum_value_size());
    }
  }
  /// extend the blob with an invalid (unallocated) tail up to new_len,
  /// zero-filling any new csum slots
  void add_tail(uint32_t new_len) {
    assert(is_mutable());
    assert(!has_unused());
    assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = buffer::create(
	get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  /// smallest releasable granularity for this blob
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }

  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// updates blob's pextents container and return unused pextents eligible
  /// for release.
  /// all - indicates that the whole blob to be released.
  /// logical - specifies set of logical extents within blob's
  /// to be released
  /// Returns true if blob has no more valid pextents
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
907 | WRITE_CLASS_DENC_FEATURED(bluestore_blob_t) | |
908 | ||
909 | ostream& operator<<(ostream& out, const bluestore_blob_t& o); | |
910 | ||
911 | ||
912 | /// shared blob state | |
struct bluestore_shared_blob_t {
  uint64_t sbid;                       ///< shared blob id
  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}

  // Only ref_map is encoded; sbid is not part of the payload (it is
  // supplied at construction time — presumably it doubles as the lookup
  // key; confirm against the caller).
  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }


  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_shared_blob_t*>& ls);

  /// true when no extents are referenced any more
  bool empty() const {
    return ref_map.empty();
  }
};
933 | WRITE_CLASS_DENC(bluestore_shared_blob_t) | |
934 | ||
935 | ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o); | |
936 | ||
/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;  ///< numeric id (locally unique)
  uint64_t size = 0; ///< object size
  map<mempool::bluestore_cache_other::string, bufferptr> attrs; ///< attrs

  /// location/size bookkeeping for one shard of the sharded extent map
  struct shard_info {
    uint32_t offset = 0; ///< logical offset for start of shard
    uint32_t bytes = 0;  ///< encoded bytes
    DENC(shard_info, v, p) {
      // varint: shard offsets/sizes are usually small
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(Formatter *f) const;
  };
  vector<shard_info> extent_map_shards; ///< extent map shards (if any)

  // allocation hints passed down from the client (see alloc_hint_flags)
  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0; ///< bitwise-or of the FLAG_* values below

  enum {
    FLAG_OMAP = 1, ///< object has omap data
  };

  /// human-readable rendering of flags (for dump/debug output)
  string get_flags_string() const {
    string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }

  void set_flag(unsigned f) {
    flags |= f;
  }

  void clear_flag(unsigned f) {
    flags &= ~f;
  }

  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }

  void set_omap_flag() {
    set_flag(FLAG_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP);
  }

  // NOTE: field order below is the on-disk encoding order; do not reorder.
  DENC(bluestore_onode_t, v, p) {
    DENC_START(1, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_onode_t*>& o);
};
1011 | WRITE_CLASS_DENC(bluestore_onode_t::shard_info) | |
1012 | WRITE_CLASS_DENC(bluestore_onode_t) | |
1013 | ||
1014 | ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si); | |
1015 | ||
/// writeahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0; ///< a type_t value (currently only OP_WRITE)

  PExtentVector extents; ///< physical extents the op applies to
  bufferlist data;       ///< data payload for the op

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_deferred_op_t*>& o);
};
1036 | WRITE_CLASS_DENC(bluestore_deferred_op_t) | |
1037 | ||
1038 | ||
1039 | /// writeahead-logged transaction | |
1040 | struct bluestore_deferred_transaction_t { | |
1041 | uint64_t seq = 0; | |
1042 | list<bluestore_deferred_op_t> ops; | |
1043 | interval_set<uint64_t> released; ///< allocations to release after tx | |
1044 | ||
1045 | bluestore_deferred_transaction_t() : seq(0) {} | |
1046 | ||
1047 | DENC(bluestore_deferred_transaction_t, v, p) { | |
1048 | DENC_START(1, 1, p); | |
1049 | denc(v.seq, p); | |
1050 | denc(v.ops, p); | |
1051 | denc(v.released, p); | |
1052 | DENC_FINISH(p); | |
1053 | } | |
1054 | void dump(Formatter *f) const; | |
1055 | static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o); | |
1056 | }; | |
1057 | WRITE_CLASS_DENC(bluestore_deferred_transaction_t) | |
1058 | ||
1059 | struct bluestore_compression_header_t { | |
1060 | uint8_t type = Compressor::COMP_ALG_NONE; | |
1061 | uint32_t length = 0; | |
1062 | ||
1063 | bluestore_compression_header_t() {} | |
1064 | bluestore_compression_header_t(uint8_t _type) | |
1065 | : type(_type) {} | |
1066 | ||
1067 | DENC(bluestore_compression_header_t, v, p) { | |
1068 | DENC_START(1, 1, p); | |
1069 | denc(v.type, p); | |
1070 | denc(v.length, p); | |
1071 | DENC_FINISH(p); | |
1072 | } | |
1073 | void dump(Formatter *f) const; | |
1074 | static void generate_test_instances(list<bluestore_compression_header_t*>& o); | |
1075 | }; | |
1076 | WRITE_CLASS_DENC(bluestore_compression_header_t) | |
1077 | ||
1078 | ||
1079 | #endif |