// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H

#include <ostream>
#include <bitset>
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
#include "include/mempool.h"

namespace ceph {
  class Formatter;
}
/// label for block device
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;     ///< osd uuid
  uint64_t size;       ///< device size
  utime_t btime;       ///< birth time
  string description;  ///< device description

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_bdev_label_t*>& o);
};
WRITE_CLASS_ENCODER(bluestore_bdev_label_t)

ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l);
/// collection metadata
struct bluestore_cnode_t {
  uint32_t bits;  ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_cnode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_cnode_t)
class AllocExtent;
typedef mempool::bluestore_alloc::vector<AllocExtent> AllocExtentVector;
class AllocExtent {
public:
  uint64_t offset;
  uint32_t length;

  AllocExtent() {
    offset = 0;
    length = 0;
  }

  AllocExtent(int64_t off, int32_t len) : offset(off), length(len) { }
  uint64_t end() const {
    return offset + length;
  }
  bool operator==(const AllocExtent& other) const {
    return offset == other.offset && length == other.length;
  }
};

inline static ostream& operator<<(ostream& out, const AllocExtent& e) {
  return out << "0x" << std::hex << e.offset << "~" << e.length << std::dec;
}
class ExtentList {
  AllocExtentVector *m_extents;
  int64_t m_block_size;
  int64_t m_max_blocks;

public:
  void init(AllocExtentVector *extents, int64_t block_size,
            uint64_t max_alloc_size) {
    m_extents = extents;
    m_block_size = block_size;
    m_max_blocks = max_alloc_size / block_size;
    assert(m_extents->empty());
  }

  ExtentList(AllocExtentVector *extents, int64_t block_size) {
    init(extents, block_size, 0);
  }

  ExtentList(AllocExtentVector *extents, int64_t block_size,
             uint64_t max_alloc_size) {
    init(extents, block_size, max_alloc_size);
  }

  void reset() {
    m_extents->clear();
  }

  void add_extents(int64_t start, int64_t count);

  AllocExtentVector *get_extents() {
    return m_extents;
  }

  std::pair<int64_t, int64_t> get_nth_extent(int index) {
    return std::make_pair
      ((*m_extents)[index].offset / m_block_size,
       (*m_extents)[index].length / m_block_size);
  }

  int64_t get_extent_count() {
    return m_extents->size();
  }
};
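
// Illustrative sketch (not part of the interface; block size and block
// numbers are hypothetical): how an allocator might accumulate contiguous
// blocks into an ExtentList and read them back in block units.
//
//   AllocExtentVector extents;
//   ExtentList el(&extents, 4096);                    // 4 KiB blocks
//   el.add_extents(16, 4);                            // blocks [16, 20)
//   std::pair<int64_t, int64_t> e = el.get_nth_extent(0);
//   // e.first == 16 (starting block), e.second == 4 (block count)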

/// pextent: physical extent
struct bluestore_pextent_t : public AllocExtent {
  const static uint64_t INVALID_OFFSET = ~0ull;

  bluestore_pextent_t() : AllocExtent() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : AllocExtent(o, l) {}
  bluestore_pextent_t(const AllocExtent &ext) :
    AllocExtent(ext.offset, ext.length) { }

  bluestore_pextent_t& operator=(const AllocExtent &ext) {
    offset = ext.offset;
    length = ext.length;
    return *this;
  }
  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }

  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_pextent_t*>& ls);
};
WRITE_CLASS_DENC(bluestore_pextent_t)

ostream& operator<<(ostream& out, const bluestore_pextent_t& o);
typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;

template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static constexpr bool need_contiguous = true;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
                     bufferlist::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, bufferptr::iterator& p) {
    unsigned num;
    denc_varint(num, p);
    v.clear();
    v.resize(num);
    for (unsigned i=0; i<num; ++i) {
      denc(v[i], p);
    }
  }
};

/// extent_map: a map of reference counted extents
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;
    uint32_t refs;
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
  map_t ref_map;

  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  void get(uint64_t offset, uint32_t len);
  void put(uint64_t offset, uint32_t len, PExtentVector *release,
           bool *maybe_unshared);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(bufferlist::contiguous_appender& p) const {
    uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (--n) {
        ++i;
        denc_varint_lowz((int64_t)i->first - pos, p);
        i->second.encode(p);
        pos = i->first;
      }
    }
  }
  void decode(bufferptr::iterator& p) {
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
        int64_t delta;
        denc_varint_lowz(delta, p);
        pos += delta;
        ref_map[pos].decode(p);
      }
    }
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o);
};
WRITE_CLASS_DENC(bluestore_extent_ref_map_t)

ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm);
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
                              const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
static inline bool operator==(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}
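
// Illustrative sketch (hypothetical offsets/lengths): taking and dropping
// references on a shared range.  How adjacent records are merged or split
// is an implementation detail of get()/put() in the .cc file.
//
//   bluestore_extent_ref_map_t m;
//   m.get(0x0000, 0x1000);                 // 1 reference on [0x0000, 0x1000)
//   m.get(0x0000, 0x1000);                 // now 2 references on that range
//   PExtentVector to_release;
//   bool maybe_unshared = false;
//   m.put(0x0000, 0x1000, &to_release, &maybe_unshared);
//   // one reference remains, so to_release stays empty; a second put()
//   // would drop the last reference and report the range in to_release.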

/// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage
struct bluestore_blob_use_tracker_t {
  // N.B.: There is no need to minimize au_size/num_au
  //  as much as possible (e.g. have just a single byte for au_size) since:
  //  1) Struct isn't packed hence it's padded. And even if it's packed see 2)
  //  2) Mem manager has its own granularity, most probably >= 8 bytes
  //
  uint32_t au_size;  // Allocation (=tracking) unit size,
                     //  == 0 if uninitialized
  uint32_t num_au;   // Amount of allocation units tracked
                     //  == 0 if single unit or the whole blob is tracked

  union {
    uint32_t* bytes_per_au;
    uint32_t total_bytes;
  };

  bluestore_blob_use_tracker_t()
    : au_size(0), num_au(0), bytes_per_au(nullptr) {
  }
  ~bluestore_blob_use_tracker_t() {
    clear();
  }

  void clear() {
    if (num_au != 0) {
      delete[] bytes_per_au;
    }
    bytes_per_au = 0;
    au_size = 0;
    num_au = 0;
  }

  uint32_t get_referenced_bytes() const {
    uint32_t total = 0;
    if (!num_au) {
      total = total_bytes;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        total += bytes_per_au[i];
      }
    }
    return total;
  }
  bool is_not_empty() const {
    if (!num_au) {
      return total_bytes != 0;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        if (bytes_per_au[i]) {
          return true;
        }
      }
    }
    return false;
  }
  bool is_empty() const {
    return !is_not_empty();
  }
  void prune_tail(uint32_t new_len) {
    if (num_au) {
      new_len = ROUND_UP_TO(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      assert(_num_au <= num_au);
      if (_num_au) {
        num_au = _num_au; // bytes_per_au array is left unmodified
      } else {
        clear();
      }
    }
  }
  void add_tail(uint32_t new_len, uint32_t _au_size) {
    auto full_size = au_size * (num_au ? num_au : 1);
    assert(new_len >= full_size);
    if (new_len == full_size) {
      return;
    }
    if (!num_au) {
      uint32_t old_total = total_bytes;
      total_bytes = 0;
      init(new_len, _au_size);
      assert(num_au);
      bytes_per_au[0] = old_total;
    } else {
      assert(_au_size == au_size);
      new_len = ROUND_UP_TO(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      assert(_num_au >= num_au);
      if (_num_au > num_au) {
        auto old_bytes = bytes_per_au;
        auto old_num_au = num_au;
        num_au = _num_au;
        allocate();
        for (size_t i = 0; i < old_num_au; i++) {
          bytes_per_au[i] = old_bytes[i];
        }
        for (size_t i = old_num_au; i < num_au; i++) {
          bytes_per_au[i] = 0;
        }
        delete[] old_bytes;
      }
    }
  }

  void init(
    uint32_t full_length,
    uint32_t _au_size);

  void get(
    uint32_t offset,
    uint32_t len);

  /// put: returns true if the blob has no more references after the call;
  /// in that case 'release' is not filled, for the sake of performance.
  /// Returns false if some references to the blob remain;
  /// in that case 'release' contains pextents
  /// (identified by their offsets relative to the blob start)
  /// that are no longer used and can be safely deallocated.
  bool put(
    uint32_t offset,
    uint32_t len,
    PExtentVector *release);

  bool can_split() const;
  bool can_split_at(uint32_t blob_offset) const;
  void split(
    uint32_t blob_offset,
    bluestore_blob_use_tracker_t* r);

  bool equal(
    const bluestore_blob_use_tracker_t& other) const;

  void bound_encode(size_t& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        p += elem_size * num_au;
      }
    }
  }
  void encode(bufferlist::contiguous_appender& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }
  void decode(bufferptr::iterator& p) {
    clear();
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        allocate();
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o);
private:
  void allocate();
};
WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm);
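
// Illustrative sketch (hypothetical sizes): tracking per-AU usage of a
// 0x20000-byte blob with a 0x10000-byte tracking unit, then dropping the
// referenced bytes again.  Offsets/lengths are blob-relative.
//
//   bluestore_blob_use_tracker_t t;
//   t.init(0x20000, 0x10000);   // two allocation units
//   t.get(0x0000, 0x8000);      // reference bytes in the first AU
//   PExtentVector released;
//   bool empty_now = t.put(0x0000, 0x8000, &released);
//   // empty_now == true: no referenced bytes remain anywhere in the blob,
//   // so 'released' is left unfilled (see the put() comment above).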

/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;           ///< raw data position on device
  uint32_t logical_length = 0;     ///< original length of data stored in the blob
  uint32_t compressed_length = 0;  ///< compressed length if any

public:
  enum {
    LEGACY_FLAG_MUTABLE = 1,  ///< [legacy] blob can be overwritten or split
    FLAG_COMPRESSED = 2,      ///< blob is compressed
    FLAG_CSUM = 4,            ///< blob has checksums
    FLAG_HAS_UNUSED = 8,      ///< blob has unused map
    FLAG_SHARED = 16,         ///< blob is shared; see external SharedBlob
  };
  static string get_flags_string(unsigned flags);

  uint32_t flags = 0;  ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0;  ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE;  ///< CSUM_*
  uint8_t csum_chunk_order = 0;  ///< csum block size is 1<<block_order bytes

  bufferptr csum_data;  ///< opaque vector of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}

  const PExtentVector& get_extents() const {
    return extents;
  }

  DENC_HELPERS;
  void bound_encode(size_t& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
             csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(bufferptr::iterator& p, uint64_t struct_v) {
    assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      int len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED);  // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  string get_flags_string() const {
    return get_flags_string(flags);
  }

  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return !is_compressed() && !is_shared();
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      MAX(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }
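
  // Illustrative sketch (hypothetical extents): calc_offset() walks the
  // pextent list to translate a blob-relative offset into a device offset.
  //
  //   // extents = { {0x10000, 0x8000}, {0x30000, 0x8000} }
  //   // calc_offset(0x9000, &plen) skips the first 0x8000-byte extent and
  //   // returns 0x30000 + 0x1000 = 0x31000, with *plen = 0x7000.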

  // validate whether or not the status of pextents within the given range
  // meets the requirement (allocated or unallocated).
  bool _validate_range(uint64_t b_off, uint64_t b_len,
                       bool require_allocated) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    b_len += b_off;
    while (b_len) {
      assert(p != extents.end());
      if (require_allocated != p->is_valid()) {
        return false;
      }

      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      ++p;
    }
    assert(0 == "we should not get here");
  }

  /// return true if the entire range is allocated
  /// (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, true);
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, false);
  }

  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    uint64_t blob_len = get_logical_length();
    assert((blob_len % (sizeof(unused)*8)) == 0);
    assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = ROUND_UP_TO(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      uint64_t blob_len = get_logical_length();
      assert((blob_len % (sizeof(unused)*8)) == 0);
      assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      uint64_t start = offset / chunk_size;
      uint64_t end = ROUND_UP_TO(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        unused &= ~(1u << i);
      }
      if (unused == 0) {
        clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }
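
  // Illustrative sketch (hypothetical sizes): with a 0x10000-byte blob the
  // 16-bit 'unused' bitmap tracks 0x1000-byte chunks (0x10000 / 16).
  //
  //   // add_unused(0x4000, 0x4000) sets bits 4..7, so
  //   // is_unused(0x5000, 0x1000) == true; then
  //   // mark_used(0x5000, 0x1000) clears bit 5, after which
  //   // is_unused(0x4000, 0x4000) == false.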

  int map(uint64_t x_off, uint64_t x_len,
          std::function<int(uint64_t,uint64_t)> f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      int r = f(p->offset + x_off, l);
      if (r < 0)
        return r;
      x_off = 0;
      x_len -= l;
      ++p;
    }
    return 0;
  }
  void map_bl(uint64_t x_off,
              bufferlist& bl,
              std::function<void(uint64_t,bufferlist&)> f) const {
    auto p = extents.begin();
    assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      assert(p != extents.end());
    }
    bufferlist::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      assert(p != extents.end());
      uint64_t l = MIN(p->length - x_off, x_len);
      bufferlist t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }
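
  // Illustrative sketch (hypothetical extents): map() invokes the callback
  // once per physical fragment covering a blob-relative range.
  //
  //   // extents = { {0x10000, 0x4000}, {0x20000, 0x4000} }
  //   // map(0x3000, 0x2000, f) calls f(0x13000, 0x1000) then f(0x20000, 0x1000)
  //   // and returns 0 (or the first negative value returned by f).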

  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      assert(0 == "no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const __le16*>(p)[i];
    case 4:
      return reinterpret_cast<const __le32*>(p)[i];
    case 8:
      return reinterpret_cast<const __le64*>(p)[i];
    default:
      assert(0 == "unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
  }
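
  // Illustrative sketch (hypothetical parameters): a 0x10000-byte blob with
  // crc32c checksums over 4 KiB chunks keeps one 32-bit value per chunk.
  //
  //   // init_csum(Checksummer::CSUM_CRC32C, 12, 0x10000) gives
  //   // get_csum_chunk_size() == 4096 and allocates
  //   // csum_data of 4 * (0x10000 / 4096) = 64 bytes, zeroed.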

  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const bufferlist& bl);

  /// verify csum: return -EOPNOTSUPP for an unsupported checksum type;
  /// return -1 and a valid (nonnegative) b_bad_off for a checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off,
                  uint64_t *bad_csum) const;

  bool can_prune_tail() const {
    return
      extents.size() > 1 &&  // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = bufferptr(t.c_str(),
                            get_logical_length() / get_csum_chunk_size() *
                            get_csum_value_size());
    }
  }
  void add_tail(uint32_t new_len) {
    assert(is_mutable());
    assert(!has_unused());
    assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      bufferptr t;
      t.swap(csum_data);
      csum_data = buffer::create(
        get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }

  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// updates the blob's pextents container and returns unused pextents
  /// eligible for release.
  /// all - indicates that the whole blob is to be released.
  /// logical - specifies the set of logical extents within the blob
  ///           to be released.
  /// Returns true if the blob has no more valid pextents.
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

ostream& operator<<(ostream& out, const bluestore_blob_t& o);


/// shared blob state
struct bluestore_shared_blob_t {
  uint64_t sbid;                       ///< shared blob id
  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}

  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }

  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_shared_blob_t*>& ls);

  bool empty() const {
    return ref_map.empty();
  }
};
WRITE_CLASS_DENC(bluestore_shared_blob_t)

ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o);

/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;   ///< numeric id (locally unique)
  uint64_t size = 0;  ///< object size
  map<mempool::bluestore_cache_other::string, bufferptr> attrs;  ///< attrs

  struct shard_info {
    uint32_t offset = 0;  ///< logical offset for start of shard
    uint32_t bytes = 0;   ///< encoded bytes
    DENC(shard_info, v, p) {
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(Formatter *f) const;
  };
  vector<shard_info> extent_map_shards;  ///< extent map shards (if any)

  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0;

  enum {
    FLAG_OMAP = 1,
  };

  string get_flags_string() const {
    string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }

  void set_flag(unsigned f) {
    flags |= f;
  }

  void clear_flag(unsigned f) {
    flags &= ~f;
  }

  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }

  void set_omap_flag() {
    set_flag(FLAG_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP);
  }

  DENC(bluestore_onode_t, v, p) {
    DENC_START(1, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
WRITE_CLASS_DENC(bluestore_onode_t)

ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si);

/// writeahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0;

  PExtentVector extents;
  bufferlist data;

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_deferred_op_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_op_t)


/// writeahead-logged transaction
struct bluestore_deferred_transaction_t {
  uint64_t seq = 0;
  list<bluestore_deferred_op_t> ops;
  interval_set<uint64_t> released;  ///< allocations to release after tx

  bluestore_deferred_transaction_t() : seq(0) {}

  DENC(bluestore_deferred_transaction_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seq, p);
    denc(v.ops, p);
    denc(v.released, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)

struct bluestore_compression_header_t {
  uint8_t type = Compressor::COMP_ALG_NONE;
  uint32_t length = 0;

  bluestore_compression_header_t() {}
  bluestore_compression_header_t(uint8_t _type)
    : type(_type) {}

  DENC(bluestore_compression_header_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.type, p);
    denc(v.length, p);
    DENC_FINISH(p);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(list<bluestore_compression_header_t*>& o);
};
WRITE_CLASS_DENC(bluestore_compression_header_t)


#endif