]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
16 | #define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
17 | ||
18 | #include <ostream> | |
19 | #include <bitset> | |
94b18763 | 20 | #include <type_traits> |
7c673cae FG |
21 | #include "include/types.h" |
22 | #include "include/interval_set.h" | |
23 | #include "include/utime.h" | |
24 | #include "common/hobject.h" | |
25 | #include "compressor/Compressor.h" | |
26 | #include "common/Checksummer.h" | |
27 | #include "include/mempool.h" | |
28 | ||
29 | namespace ceph { | |
30 | class Formatter; | |
31 | } | |
32 | ||
33 | /// label for block device | |
34 | struct bluestore_bdev_label_t { | |
35 | uuid_d osd_uuid; ///< osd uuid | |
36 | uint64_t size; ///< device size | |
37 | utime_t btime; ///< birth time | |
38 | string description; ///< device description | |
39 | ||
3efd9988 FG |
40 | map<string,string> meta; ///< {read,write}_meta() content from ObjectStore |
41 | ||
7c673cae | 42 | void encode(bufferlist& bl) const; |
11fdf7f2 | 43 | void decode(bufferlist::const_iterator& p); |
7c673cae FG |
44 | void dump(Formatter *f) const; |
45 | static void generate_test_instances(list<bluestore_bdev_label_t*>& o); | |
46 | }; | |
47 | WRITE_CLASS_ENCODER(bluestore_bdev_label_t) | |
48 | ||
49 | ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l); | |
50 | ||
51 | /// collection metadata | |
52 | struct bluestore_cnode_t { | |
53 | uint32_t bits; ///< how many bits of coll pgid are significant | |
54 | ||
55 | explicit bluestore_cnode_t(int b=0) : bits(b) {} | |
56 | ||
57 | DENC(bluestore_cnode_t, v, p) { | |
58 | DENC_START(1, 1, p); | |
59 | denc(v.bits, p); | |
60 | DENC_FINISH(p); | |
61 | } | |
62 | void dump(Formatter *f) const; | |
63 | static void generate_test_instances(list<bluestore_cnode_t*>& o); | |
64 | }; | |
65 | WRITE_CLASS_DENC(bluestore_cnode_t) | |
66 | ||
28e407b8 AA |
67 | ostream& operator<<(ostream& out, const bluestore_cnode_t& l); |
68 | ||
a8e16298 TL |
/// Generic (offset, length) pair describing a contiguous range.
///
/// @tparam OFFS_TYPE type used to store the start offset
/// @tparam LEN_TYPE  type used to store the length
///
/// An offset equal to INVALID_OFFSET marks the interval as invalid.
template <typename OFFS_TYPE, typename LEN_TYPE>
struct bluestore_interval_t
{
  /// Sentinel offset (all bits set) that marks an invalid interval.
  ///
  /// Declared constexpr, and therefore implicitly inline in C++17, so that
  /// odr-uses of the constant (for instance binding it to a const
  /// reference) do not need a separate out-of-class definition.
  static constexpr uint64_t INVALID_OFFSET = ~0ull;

  OFFS_TYPE offset = 0;  ///< start of the interval
  LEN_TYPE length = 0;   ///< number of units covered

  /// Empty interval: offset 0, length 0 (from the member initialisers).
  bluestore_interval_t() = default;

  /// Interval starting at @p o and spanning @p l units. Both arguments are
  /// converted to the member types, so values that do not fit are truncated.
  bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}

  /// @return true unless the offset is the INVALID_OFFSET sentinel.
  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }

  /// @return offset + length for a valid interval, INVALID_OFFSET otherwise.
  uint64_t end() const {
    return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
  }

  /// Two intervals are equal when both offset and length match.
  bool operator==(const bluestore_interval_t& other) const {
    return offset == other.offset && length == other.length;
  }

};
92 | ||
7c673cae | 93 | /// pextent: physical extent |
a8e16298 TL |
94 | struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t> |
95 | { | |
96 | bluestore_pextent_t() {} | |
97 | bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {} | |
98 | bluestore_pextent_t(const bluestore_interval_t &ext) : | |
99 | bluestore_interval_t(ext.offset, ext.length) {} | |
7c673cae FG |
100 | |
101 | DENC(bluestore_pextent_t, v, p) { | |
102 | denc_lba(v.offset, p); | |
103 | denc_varint_lowz(v.length, p); | |
104 | } | |
105 | ||
106 | void dump(Formatter *f) const; | |
107 | static void generate_test_instances(list<bluestore_pextent_t*>& ls); | |
108 | }; | |
109 | WRITE_CLASS_DENC(bluestore_pextent_t) | |
110 | ||
111 | ostream& operator<<(ostream& out, const bluestore_pextent_t& o); | |
112 | ||
31f18b77 | 113 | typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector; |
7c673cae FG |
114 | |
115 | template<> | |
116 | struct denc_traits<PExtentVector> { | |
117 | static constexpr bool supported = true; | |
118 | static constexpr bool bounded = false; | |
119 | static constexpr bool featured = false; | |
31f18b77 | 120 | static constexpr bool need_contiguous = true; |
7c673cae FG |
121 | static void bound_encode(const PExtentVector& v, size_t& p) { |
122 | p += sizeof(uint32_t); | |
123 | const auto size = v.size(); | |
124 | if (size) { | |
125 | size_t per = 0; | |
126 | denc(v.front(), per); | |
127 | p += per * size; | |
128 | } | |
129 | } | |
130 | static void encode(const PExtentVector& v, | |
131 | bufferlist::contiguous_appender& p) { | |
132 | denc_varint(v.size(), p); | |
133 | for (auto& i : v) { | |
134 | denc(i, p); | |
135 | } | |
136 | } | |
11fdf7f2 | 137 | static void decode(PExtentVector& v, bufferptr::const_iterator& p) { |
7c673cae FG |
138 | unsigned num; |
139 | denc_varint(num, p); | |
140 | v.clear(); | |
141 | v.resize(num); | |
142 | for (unsigned i=0; i<num; ++i) { | |
143 | denc(v[i], p); | |
144 | } | |
145 | } | |
146 | }; | |
147 | ||
7c673cae FG |
148 | /// extent_map: a map of reference counted extents |
149 | struct bluestore_extent_ref_map_t { | |
150 | struct record_t { | |
151 | uint32_t length; | |
152 | uint32_t refs; | |
153 | record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {} | |
154 | DENC(bluestore_extent_ref_map_t::record_t, v, p) { | |
155 | denc_varint_lowz(v.length, p); | |
156 | denc_varint(v.refs, p); | |
157 | } | |
158 | }; | |
159 | ||
31f18b77 | 160 | typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t; |
7c673cae FG |
161 | map_t ref_map; |
162 | ||
163 | void _check() const; | |
164 | void _maybe_merge_left(map_t::iterator& p); | |
165 | ||
166 | void clear() { | |
167 | ref_map.clear(); | |
168 | } | |
169 | bool empty() const { | |
170 | return ref_map.empty(); | |
171 | } | |
172 | ||
173 | void get(uint64_t offset, uint32_t len); | |
31f18b77 FG |
174 | void put(uint64_t offset, uint32_t len, PExtentVector *release, |
175 | bool *maybe_unshared); | |
7c673cae FG |
176 | |
177 | bool contains(uint64_t offset, uint32_t len) const; | |
178 | bool intersects(uint64_t offset, uint32_t len) const; | |
179 | ||
180 | void bound_encode(size_t& p) const { | |
181 | denc_varint((uint32_t)0, p); | |
182 | if (!ref_map.empty()) { | |
183 | size_t elem_size = 0; | |
184 | denc_varint_lowz((uint64_t)0, elem_size); | |
185 | ref_map.begin()->second.bound_encode(elem_size); | |
186 | p += elem_size * ref_map.size(); | |
187 | } | |
188 | } | |
189 | void encode(bufferlist::contiguous_appender& p) const { | |
11fdf7f2 | 190 | const uint32_t n = ref_map.size(); |
7c673cae FG |
191 | denc_varint(n, p); |
192 | if (n) { | |
193 | auto i = ref_map.begin(); | |
194 | denc_varint_lowz(i->first, p); | |
195 | i->second.encode(p); | |
196 | int64_t pos = i->first; | |
11fdf7f2 | 197 | while (++i != ref_map.end()) { |
7c673cae FG |
198 | denc_varint_lowz((int64_t)i->first - pos, p); |
199 | i->second.encode(p); | |
200 | pos = i->first; | |
201 | } | |
202 | } | |
203 | } | |
11fdf7f2 | 204 | void decode(bufferptr::const_iterator& p) { |
7c673cae FG |
205 | uint32_t n; |
206 | denc_varint(n, p); | |
207 | if (n) { | |
208 | int64_t pos; | |
209 | denc_varint_lowz(pos, p); | |
210 | ref_map[pos].decode(p); | |
211 | while (--n) { | |
212 | int64_t delta; | |
213 | denc_varint_lowz(delta, p); | |
214 | pos += delta; | |
215 | ref_map[pos].decode(p); | |
216 | } | |
217 | } | |
218 | } | |
219 | ||
220 | void dump(Formatter *f) const; | |
221 | static void generate_test_instances(list<bluestore_extent_ref_map_t*>& o); | |
222 | }; | |
223 | WRITE_CLASS_DENC(bluestore_extent_ref_map_t) | |
224 | ||
225 | ||
226 | ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& rm); | |
227 | static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l, | |
228 | const bluestore_extent_ref_map_t::record_t& r) { | |
229 | return l.length == r.length && l.refs == r.refs; | |
230 | } | |
231 | static inline bool operator==(const bluestore_extent_ref_map_t& l, | |
232 | const bluestore_extent_ref_map_t& r) { | |
233 | return l.ref_map == r.ref_map; | |
234 | } | |
235 | static inline bool operator!=(const bluestore_extent_ref_map_t& l, | |
236 | const bluestore_extent_ref_map_t& r) { | |
237 | return !(l == r); | |
238 | } | |
239 | ||
240 | /// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage | |
241 | struct bluestore_blob_use_tracker_t { | |
242 | // N.B.: There is no need to minimize au_size/num_au | |
243 | // as much as possible (e.g. have just a single byte for au_size) since: | |
244 | // 1) Struct isn't packed hence it's padded. And even if it's packed see 2) | |
245 | // 2) Mem manager has its own granularity, most probably >= 8 bytes | |
246 | // | |
247 | uint32_t au_size; // Allocation (=tracking) unit size, | |
248 | // == 0 if uninitialized | |
249 | uint32_t num_au; // Amount of allocation units tracked | |
250 | // == 0 if single unit or the whole blob is tracked | |
251 | ||
252 | union { | |
253 | uint32_t* bytes_per_au; | |
254 | uint32_t total_bytes; | |
255 | }; | |
256 | ||
257 | bluestore_blob_use_tracker_t() | |
258 | : au_size(0), num_au(0), bytes_per_au(nullptr) { | |
259 | } | |
260 | ~bluestore_blob_use_tracker_t() { | |
261 | clear(); | |
262 | } | |
263 | ||
264 | void clear() { | |
265 | if (num_au != 0) { | |
266 | delete[] bytes_per_au; | |
267 | } | |
268 | bytes_per_au = 0; | |
269 | au_size = 0; | |
270 | num_au = 0; | |
271 | } | |
272 | ||
273 | uint32_t get_referenced_bytes() const { | |
274 | uint32_t total = 0; | |
275 | if (!num_au) { | |
276 | total = total_bytes; | |
277 | } else { | |
278 | for (size_t i = 0; i < num_au; ++i) { | |
279 | total += bytes_per_au[i]; | |
280 | } | |
281 | } | |
282 | return total; | |
283 | } | |
284 | bool is_not_empty() const { | |
285 | if (!num_au) { | |
286 | return total_bytes != 0; | |
287 | } else { | |
288 | for (size_t i = 0; i < num_au; ++i) { | |
289 | if (bytes_per_au[i]) { | |
290 | return true; | |
291 | } | |
292 | } | |
293 | } | |
294 | return false; | |
295 | } | |
296 | bool is_empty() const { | |
297 | return !is_not_empty(); | |
298 | } | |
299 | void prune_tail(uint32_t new_len) { | |
300 | if (num_au) { | |
11fdf7f2 | 301 | new_len = round_up_to(new_len, au_size); |
7c673cae | 302 | uint32_t _num_au = new_len / au_size; |
11fdf7f2 | 303 | ceph_assert(_num_au <= num_au); |
7c673cae FG |
304 | if (_num_au) { |
305 | num_au = _num_au; // bytes_per_au array is left unmodified | |
306 | ||
307 | } else { | |
308 | clear(); | |
309 | } | |
310 | } | |
311 | } | |
312 | void add_tail(uint32_t new_len, uint32_t _au_size) { | |
313 | auto full_size = au_size * (num_au ? num_au : 1); | |
11fdf7f2 | 314 | ceph_assert(new_len >= full_size); |
7c673cae FG |
315 | if (new_len == full_size) { |
316 | return; | |
317 | } | |
318 | if (!num_au) { | |
319 | uint32_t old_total = total_bytes; | |
320 | total_bytes = 0; | |
321 | init(new_len, _au_size); | |
11fdf7f2 | 322 | ceph_assert(num_au); |
7c673cae FG |
323 | bytes_per_au[0] = old_total; |
324 | } else { | |
11fdf7f2 TL |
325 | ceph_assert(_au_size == au_size); |
326 | new_len = round_up_to(new_len, au_size); | |
7c673cae | 327 | uint32_t _num_au = new_len / au_size; |
11fdf7f2 | 328 | ceph_assert(_num_au >= num_au); |
7c673cae FG |
329 | if (_num_au > num_au) { |
330 | auto old_bytes = bytes_per_au; | |
331 | auto old_num_au = num_au; | |
332 | num_au = _num_au; | |
333 | allocate(); | |
334 | for (size_t i = 0; i < old_num_au; i++) { | |
335 | bytes_per_au[i] = old_bytes[i]; | |
336 | } | |
337 | for (size_t i = old_num_au; i < num_au; i++) { | |
338 | bytes_per_au[i] = 0; | |
339 | } | |
340 | delete[] old_bytes; | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | void init( | |
346 | uint32_t full_length, | |
347 | uint32_t _au_size); | |
348 | ||
349 | void get( | |
350 | uint32_t offset, | |
351 | uint32_t len); | |
352 | ||
353 | /// put: return true if the blob has no references any more after the call, | |
354 | /// no release_units is filled for the sake of performance. | |
355 | /// return false if there are some references to the blob, | |
356 | /// in this case release_units contains pextents | |
357 | /// (identified by their offsets relative to the blob start) | |
31f18b77 | 358 | /// that are not used any more and can be safely deallocated. |
7c673cae FG |
359 | bool put( |
360 | uint32_t offset, | |
361 | uint32_t len, | |
362 | PExtentVector *release); | |
363 | ||
364 | bool can_split() const; | |
365 | bool can_split_at(uint32_t blob_offset) const; | |
366 | void split( | |
367 | uint32_t blob_offset, | |
368 | bluestore_blob_use_tracker_t* r); | |
369 | ||
370 | bool equal( | |
371 | const bluestore_blob_use_tracker_t& other) const; | |
372 | ||
373 | void bound_encode(size_t& p) const { | |
374 | denc_varint(au_size, p); | |
375 | if (au_size) { | |
376 | denc_varint(num_au, p); | |
377 | if (!num_au) { | |
378 | denc_varint(total_bytes, p); | |
379 | } else { | |
380 | size_t elem_size = 0; | |
381 | denc_varint((uint32_t)0, elem_size); | |
382 | p += elem_size * num_au; | |
383 | } | |
384 | } | |
385 | } | |
386 | void encode(bufferlist::contiguous_appender& p) const { | |
387 | denc_varint(au_size, p); | |
388 | if (au_size) { | |
389 | denc_varint(num_au, p); | |
390 | if (!num_au) { | |
391 | denc_varint(total_bytes, p); | |
392 | } else { | |
393 | size_t elem_size = 0; | |
394 | denc_varint((uint32_t)0, elem_size); | |
395 | for (size_t i = 0; i < num_au; ++i) { | |
396 | denc_varint(bytes_per_au[i], p); | |
397 | } | |
398 | } | |
399 | } | |
400 | } | |
11fdf7f2 | 401 | void decode(bufferptr::const_iterator& p) { |
7c673cae FG |
402 | clear(); |
403 | denc_varint(au_size, p); | |
404 | if (au_size) { | |
405 | denc_varint(num_au, p); | |
406 | if (!num_au) { | |
407 | denc_varint(total_bytes, p); | |
408 | } else { | |
409 | allocate(); | |
410 | for (size_t i = 0; i < num_au; ++i) { | |
411 | denc_varint(bytes_per_au[i], p); | |
412 | } | |
413 | } | |
414 | } | |
415 | } | |
416 | ||
417 | void dump(Formatter *f) const; | |
418 | static void generate_test_instances(list<bluestore_blob_use_tracker_t*>& o); | |
419 | private: | |
420 | void allocate(); | |
7c673cae FG |
421 | }; |
422 | WRITE_CLASS_DENC(bluestore_blob_use_tracker_t) | |
423 | ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& rm); | |
424 | ||
425 | /// blob: a piece of data on disk | |
426 | struct bluestore_blob_t { | |
427 | private: | |
428 | PExtentVector extents; ///< raw data position on device | |
31f18b77 | 429 | uint32_t logical_length = 0; ///< original length of data stored in the blob |
7c673cae FG |
430 | uint32_t compressed_length = 0; ///< compressed length if any |
431 | ||
432 | public: | |
433 | enum { | |
31f18b77 | 434 | LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split |
7c673cae FG |
435 | FLAG_COMPRESSED = 2, ///< blob is compressed |
436 | FLAG_CSUM = 4, ///< blob has checksums | |
437 | FLAG_HAS_UNUSED = 8, ///< blob has unused map | |
438 | FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob | |
439 | }; | |
440 | static string get_flags_string(unsigned flags); | |
441 | ||
442 | uint32_t flags = 0; ///< FLAG_* | |
443 | ||
444 | typedef uint16_t unused_t; | |
445 | unused_t unused = 0; ///< portion that has never been written to (bitmap) | |
446 | ||
447 | uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_* | |
448 | uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes | |
449 | ||
450 | bufferptr csum_data; ///< opaque vector of csum data | |
451 | ||
452 | bluestore_blob_t(uint32_t f = 0) : flags(f) {} | |
453 | ||
454 | const PExtentVector& get_extents() const { | |
455 | return extents; | |
456 | } | |
11fdf7f2 TL |
457 | PExtentVector& dirty_extents() { |
458 | return extents; | |
459 | } | |
7c673cae FG |
460 | |
461 | DENC_HELPERS; | |
462 | void bound_encode(size_t& p, uint64_t struct_v) const { | |
11fdf7f2 | 463 | ceph_assert(struct_v == 1 || struct_v == 2); |
7c673cae FG |
464 | denc(extents, p); |
465 | denc_varint(flags, p); | |
466 | denc_varint_lowz(logical_length, p); | |
467 | denc_varint_lowz(compressed_length, p); | |
468 | denc(csum_type, p); | |
469 | denc(csum_chunk_order, p); | |
470 | denc_varint(csum_data.length(), p); | |
471 | p += csum_data.length(); | |
472 | p += sizeof(unused_t); | |
473 | } | |
474 | ||
475 | void encode(bufferlist::contiguous_appender& p, uint64_t struct_v) const { | |
11fdf7f2 | 476 | ceph_assert(struct_v == 1 || struct_v == 2); |
7c673cae FG |
477 | denc(extents, p); |
478 | denc_varint(flags, p); | |
479 | if (is_compressed()) { | |
480 | denc_varint_lowz(logical_length, p); | |
481 | denc_varint_lowz(compressed_length, p); | |
482 | } | |
483 | if (has_csum()) { | |
484 | denc(csum_type, p); | |
485 | denc(csum_chunk_order, p); | |
486 | denc_varint(csum_data.length(), p); | |
487 | memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(), | |
488 | csum_data.length()); | |
489 | } | |
490 | if (has_unused()) { | |
491 | denc(unused, p); | |
492 | } | |
493 | } | |
494 | ||
11fdf7f2 TL |
495 | void decode(bufferptr::const_iterator& p, uint64_t struct_v) { |
496 | ceph_assert(struct_v == 1 || struct_v == 2); | |
7c673cae FG |
497 | denc(extents, p); |
498 | denc_varint(flags, p); | |
499 | if (is_compressed()) { | |
500 | denc_varint_lowz(logical_length, p); | |
501 | denc_varint_lowz(compressed_length, p); | |
502 | } else { | |
503 | logical_length = get_ondisk_length(); | |
504 | } | |
505 | if (has_csum()) { | |
506 | denc(csum_type, p); | |
507 | denc(csum_chunk_order, p); | |
508 | int len; | |
509 | denc_varint(len, p); | |
510 | csum_data = p.get_ptr(len); | |
3efd9988 | 511 | csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); |
7c673cae FG |
512 | } |
513 | if (has_unused()) { | |
514 | denc(unused, p); | |
515 | } | |
516 | } | |
517 | ||
518 | bool can_split() const { | |
519 | return | |
520 | !has_flag(FLAG_SHARED) && | |
521 | !has_flag(FLAG_COMPRESSED) && | |
522 | !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex | |
523 | } | |
524 | bool can_split_at(uint32_t blob_offset) const { | |
525 | return !has_csum() || blob_offset % get_csum_chunk_size() == 0; | |
526 | } | |
527 | ||
528 | void dump(Formatter *f) const; | |
529 | static void generate_test_instances(list<bluestore_blob_t*>& ls); | |
530 | ||
531 | bool has_flag(unsigned f) const { | |
532 | return flags & f; | |
533 | } | |
534 | void set_flag(unsigned f) { | |
535 | flags |= f; | |
536 | } | |
537 | void clear_flag(unsigned f) { | |
538 | flags &= ~f; | |
539 | } | |
540 | string get_flags_string() const { | |
541 | return get_flags_string(flags); | |
542 | } | |
543 | ||
544 | void set_compressed(uint64_t clen_orig, uint64_t clen) { | |
545 | set_flag(FLAG_COMPRESSED); | |
546 | logical_length = clen_orig; | |
547 | compressed_length = clen; | |
548 | } | |
549 | bool is_mutable() const { | |
31f18b77 | 550 | return !is_compressed() && !is_shared(); |
7c673cae FG |
551 | } |
552 | bool is_compressed() const { | |
553 | return has_flag(FLAG_COMPRESSED); | |
554 | } | |
555 | bool has_csum() const { | |
556 | return has_flag(FLAG_CSUM); | |
557 | } | |
558 | bool has_unused() const { | |
559 | return has_flag(FLAG_HAS_UNUSED); | |
560 | } | |
561 | bool is_shared() const { | |
562 | return has_flag(FLAG_SHARED); | |
563 | } | |
564 | ||
565 | /// return chunk (i.e. min readable block) size for the blob | |
566 | uint64_t get_chunk_size(uint64_t dev_block_size) const { | |
567 | return has_csum() ? | |
11fdf7f2 | 568 | std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size; |
7c673cae FG |
569 | } |
570 | uint32_t get_csum_chunk_size() const { | |
571 | return 1 << csum_chunk_order; | |
572 | } | |
573 | uint32_t get_compressed_payload_length() const { | |
574 | return is_compressed() ? compressed_length : 0; | |
575 | } | |
576 | uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const { | |
577 | auto p = extents.begin(); | |
11fdf7f2 | 578 | ceph_assert(p != extents.end()); |
7c673cae FG |
579 | while (x_off >= p->length) { |
580 | x_off -= p->length; | |
581 | ++p; | |
11fdf7f2 | 582 | ceph_assert(p != extents.end()); |
7c673cae FG |
583 | } |
584 | if (plen) | |
585 | *plen = p->length - x_off; | |
586 | return p->offset + x_off; | |
587 | } | |
588 | ||
31f18b77 FG |
589 | // validate whether or not the status of pextents within the given range |
590 | // meets the requirement(allocated or unallocated). | |
591 | bool _validate_range(uint64_t b_off, uint64_t b_len, | |
592 | bool require_allocated) const { | |
7c673cae | 593 | auto p = extents.begin(); |
11fdf7f2 | 594 | ceph_assert(p != extents.end()); |
7c673cae FG |
595 | while (b_off >= p->length) { |
596 | b_off -= p->length; | |
597 | ++p; | |
11fdf7f2 | 598 | ceph_assert(p != extents.end()); |
7c673cae FG |
599 | } |
600 | b_len += b_off; | |
601 | while (b_len) { | |
11fdf7f2 | 602 | ceph_assert(p != extents.end()); |
31f18b77 FG |
603 | if (require_allocated != p->is_valid()) { |
604 | return false; | |
7c673cae | 605 | } |
31f18b77 | 606 | |
7c673cae | 607 | if (p->length >= b_len) { |
31f18b77 | 608 | return true; |
7c673cae FG |
609 | } |
610 | b_len -= p->length; | |
611 | ++p; | |
612 | } | |
11fdf7f2 TL |
613 | ceph_abort_msg("we should not get here"); |
614 | return false; | |
7c673cae FG |
615 | } |
616 | ||
31f18b77 FG |
617 | /// return true if the entire range is allocated |
618 | /// (mapped to extents on disk) | |
619 | bool is_allocated(uint64_t b_off, uint64_t b_len) const { | |
620 | return _validate_range(b_off, b_len, true); | |
621 | } | |
622 | ||
7c673cae | 623 | /// return true if the entire range is unallocated |
31f18b77 | 624 | /// (not mapped to extents on disk) |
7c673cae | 625 | bool is_unallocated(uint64_t b_off, uint64_t b_len) const { |
31f18b77 | 626 | return _validate_range(b_off, b_len, false); |
7c673cae FG |
627 | } |
628 | ||
629 | /// return true if the logical range has never been used | |
630 | bool is_unused(uint64_t offset, uint64_t length) const { | |
631 | if (!has_unused()) { | |
632 | return false; | |
633 | } | |
634 | uint64_t blob_len = get_logical_length(); | |
11fdf7f2 TL |
635 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
636 | ceph_assert(offset + length <= blob_len); | |
7c673cae FG |
637 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
638 | uint64_t start = offset / chunk_size; | |
11fdf7f2 | 639 | uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; |
7c673cae FG |
640 | auto i = start; |
641 | while (i < end && (unused & (1u << i))) { | |
642 | i++; | |
643 | } | |
644 | return i >= end; | |
645 | } | |
646 | ||
647 | /// mark a range that has never been used | |
648 | void add_unused(uint64_t offset, uint64_t length) { | |
649 | uint64_t blob_len = get_logical_length(); | |
11fdf7f2 TL |
650 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
651 | ceph_assert(offset + length <= blob_len); | |
7c673cae | 652 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
11fdf7f2 | 653 | uint64_t start = round_up_to(offset, chunk_size) / chunk_size; |
7c673cae FG |
654 | uint64_t end = (offset + length) / chunk_size; |
655 | for (auto i = start; i < end; ++i) { | |
656 | unused |= (1u << i); | |
657 | } | |
658 | if (start != end) { | |
659 | set_flag(FLAG_HAS_UNUSED); | |
660 | } | |
661 | } | |
662 | ||
663 | /// indicate that a range has (now) been used. | |
664 | void mark_used(uint64_t offset, uint64_t length) { | |
665 | if (has_unused()) { | |
666 | uint64_t blob_len = get_logical_length(); | |
11fdf7f2 TL |
667 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
668 | ceph_assert(offset + length <= blob_len); | |
7c673cae FG |
669 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
670 | uint64_t start = offset / chunk_size; | |
11fdf7f2 | 671 | uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; |
7c673cae FG |
672 | for (auto i = start; i < end; ++i) { |
673 | unused &= ~(1u << i); | |
674 | } | |
675 | if (unused == 0) { | |
676 | clear_flag(FLAG_HAS_UNUSED); | |
677 | } | |
678 | } | |
679 | } | |
680 | ||
94b18763 FG |
681 | template<class F> |
682 | int map(uint64_t x_off, uint64_t x_len, F&& f) const { | |
11fdf7f2 TL |
683 | static_assert(std::is_invocable_r_v<int, F, uint64_t, uint64_t>); |
684 | ||
7c673cae | 685 | auto p = extents.begin(); |
11fdf7f2 | 686 | ceph_assert(p != extents.end()); |
7c673cae FG |
687 | while (x_off >= p->length) { |
688 | x_off -= p->length; | |
689 | ++p; | |
11fdf7f2 | 690 | ceph_assert(p != extents.end()); |
7c673cae FG |
691 | } |
692 | while (x_len > 0) { | |
11fdf7f2 TL |
693 | ceph_assert(p != extents.end()); |
694 | uint64_t l = std::min(p->length - x_off, x_len); | |
7c673cae FG |
695 | int r = f(p->offset + x_off, l); |
696 | if (r < 0) | |
697 | return r; | |
698 | x_off = 0; | |
699 | x_len -= l; | |
700 | ++p; | |
701 | } | |
702 | return 0; | |
703 | } | |
94b18763 | 704 | template<class F> |
7c673cae FG |
705 | void map_bl(uint64_t x_off, |
706 | bufferlist& bl, | |
94b18763 | 707 | F&& f) const { |
11fdf7f2 TL |
708 | static_assert(std::is_invocable_v<F, uint64_t, bufferlist&>); |
709 | ||
7c673cae | 710 | auto p = extents.begin(); |
11fdf7f2 | 711 | ceph_assert(p != extents.end()); |
7c673cae FG |
712 | while (x_off >= p->length) { |
713 | x_off -= p->length; | |
714 | ++p; | |
11fdf7f2 | 715 | ceph_assert(p != extents.end()); |
7c673cae FG |
716 | } |
717 | bufferlist::iterator it = bl.begin(); | |
718 | uint64_t x_len = bl.length(); | |
719 | while (x_len > 0) { | |
11fdf7f2 TL |
720 | ceph_assert(p != extents.end()); |
721 | uint64_t l = std::min(p->length - x_off, x_len); | |
7c673cae FG |
722 | bufferlist t; |
723 | it.copy(l, t); | |
724 | f(p->offset + x_off, t); | |
725 | x_off = 0; | |
726 | x_len -= l; | |
727 | ++p; | |
728 | } | |
729 | } | |
730 | ||
731 | uint32_t get_ondisk_length() const { | |
732 | uint32_t len = 0; | |
733 | for (auto &p : extents) { | |
734 | len += p.length; | |
735 | } | |
736 | return len; | |
737 | } | |
738 | ||
739 | uint32_t get_logical_length() const { | |
740 | return logical_length; | |
741 | } | |
742 | size_t get_csum_value_size() const; | |
743 | ||
744 | size_t get_csum_count() const { | |
745 | size_t vs = get_csum_value_size(); | |
746 | if (!vs) | |
747 | return 0; | |
748 | return csum_data.length() / vs; | |
749 | } | |
750 | uint64_t get_csum_item(unsigned i) const { | |
751 | size_t cs = get_csum_value_size(); | |
752 | const char *p = csum_data.c_str(); | |
753 | switch (cs) { | |
754 | case 0: | |
11fdf7f2 | 755 | ceph_abort_msg("no csum data, bad index"); |
7c673cae FG |
756 | case 1: |
757 | return reinterpret_cast<const uint8_t*>(p)[i]; | |
758 | case 2: | |
759 | return reinterpret_cast<const __le16*>(p)[i]; | |
760 | case 4: | |
761 | return reinterpret_cast<const __le32*>(p)[i]; | |
762 | case 8: | |
763 | return reinterpret_cast<const __le64*>(p)[i]; | |
764 | default: | |
11fdf7f2 | 765 | ceph_abort_msg("unrecognized csum word size"); |
7c673cae FG |
766 | } |
767 | } | |
768 | const char *get_csum_item_ptr(unsigned i) const { | |
769 | size_t cs = get_csum_value_size(); | |
770 | return csum_data.c_str() + (cs * i); | |
771 | } | |
772 | char *get_csum_item_ptr(unsigned i) { | |
773 | size_t cs = get_csum_value_size(); | |
774 | return csum_data.c_str() + (cs * i); | |
775 | } | |
776 | ||
777 | void init_csum(unsigned type, unsigned order, unsigned len) { | |
778 | flags |= FLAG_CSUM; | |
779 | csum_type = type; | |
780 | csum_chunk_order = order; | |
781 | csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size()); | |
782 | csum_data.zero(); | |
3efd9988 | 783 | csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); |
7c673cae FG |
784 | } |
785 | ||
786 | /// calculate csum for the buffer at the given b_off | |
787 | void calc_csum(uint64_t b_off, const bufferlist& bl); | |
788 | ||
789 | /// verify csum: return -EOPNOTSUPP for unsupported checksum type; | |
790 | /// return -1 and valid(nonnegative) b_bad_off for checksum error; | |
791 | /// return 0 if all is well. | |
792 | int verify_csum(uint64_t b_off, const bufferlist& bl, int* b_bad_off, | |
793 | uint64_t *bad_csum) const; | |
794 | ||
795 | bool can_prune_tail() const { | |
796 | return | |
797 | extents.size() > 1 && // if it's all invalid it's not pruning. | |
798 | !extents.back().is_valid() && | |
799 | !has_unused(); | |
800 | } | |
801 | void prune_tail() { | |
802 | const auto &p = extents.back(); | |
803 | logical_length -= p.length; | |
804 | extents.pop_back(); | |
805 | if (has_csum()) { | |
806 | bufferptr t; | |
807 | t.swap(csum_data); | |
808 | csum_data = bufferptr(t.c_str(), | |
809 | get_logical_length() / get_csum_chunk_size() * | |
810 | get_csum_value_size()); | |
811 | } | |
812 | } | |
813 | void add_tail(uint32_t new_len) { | |
11fdf7f2 TL |
814 | ceph_assert(is_mutable()); |
815 | ceph_assert(!has_unused()); | |
816 | ceph_assert(new_len > logical_length); | |
7c673cae FG |
817 | extents.emplace_back( |
818 | bluestore_pextent_t( | |
819 | bluestore_pextent_t::INVALID_OFFSET, | |
820 | new_len - logical_length)); | |
821 | logical_length = new_len; | |
822 | if (has_csum()) { | |
823 | bufferptr t; | |
824 | t.swap(csum_data); | |
825 | csum_data = buffer::create( | |
826 | get_csum_value_size() * logical_length / get_csum_chunk_size()); | |
827 | csum_data.copy_in(0, t.length(), t.c_str()); | |
828 | csum_data.zero(t.length(), csum_data.length() - t.length()); | |
829 | } | |
830 | } | |
831 | uint32_t get_release_size(uint32_t min_alloc_size) const { | |
832 | if (is_compressed()) { | |
833 | return get_logical_length(); | |
834 | } | |
835 | uint32_t res = get_csum_chunk_size(); | |
836 | if (!has_csum() || res < min_alloc_size) { | |
837 | res = min_alloc_size; | |
838 | } | |
839 | return res; | |
840 | } | |
841 | ||
842 | void split(uint32_t blob_offset, bluestore_blob_t& rb); | |
a8e16298 | 843 | void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs); |
7c673cae FG |
844 | void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only |
845 | ||
846 | /// updates blob's pextents container and return unused pextents eligible | |
847 | /// for release. | |
848 | /// all - indicates that the whole blob to be released. | |
849 | /// logical - specifies set of logical extents within blob's | |
850 | /// to be released | |
851 | /// Returns true if blob has no more valid pextents | |
852 | bool release_extents( | |
853 | bool all, | |
854 | const PExtentVector& logical, | |
855 | PExtentVector* r); | |
856 | }; | |
857 | WRITE_CLASS_DENC_FEATURED(bluestore_blob_t) | |
858 | ||
859 | ostream& operator<<(ostream& out, const bluestore_blob_t& o); | |
860 | ||
861 | ||
862 | /// shared blob state | |
863 | struct bluestore_shared_blob_t { | |
864 | uint64_t sbid; ///> shared blob id | |
865 | bluestore_extent_ref_map_t ref_map; ///< shared blob extents | |
866 | ||
867 | bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {} | |
11fdf7f2 TL |
868 | bluestore_shared_blob_t(uint64_t _sbid, |
869 | bluestore_extent_ref_map_t&& _ref_map ) | |
870 | : sbid(_sbid), ref_map(std::move(_ref_map)) {} | |
7c673cae FG |
871 | |
872 | DENC(bluestore_shared_blob_t, v, p) { | |
873 | DENC_START(1, 1, p); | |
874 | denc(v.ref_map, p); | |
875 | DENC_FINISH(p); | |
876 | } | |
877 | ||
878 | ||
879 | void dump(Formatter *f) const; | |
880 | static void generate_test_instances(list<bluestore_shared_blob_t*>& ls); | |
881 | ||
882 | bool empty() const { | |
883 | return ref_map.empty(); | |
884 | } | |
885 | }; | |
886 | WRITE_CLASS_DENC(bluestore_shared_blob_t) | |
887 | ||
888 | ostream& operator<<(ostream& out, const bluestore_shared_blob_t& o); | |
889 | ||
890 | /// onode: per-object metadata | |
891 | struct bluestore_onode_t { | |
892 | uint64_t nid = 0; ///< numeric id (locally unique) | |
893 | uint64_t size = 0; ///< object size | |
31f18b77 | 894 | map<mempool::bluestore_cache_other::string, bufferptr> attrs; ///< attrs |
7c673cae FG |
895 | |
896 | struct shard_info { | |
897 | uint32_t offset = 0; ///< logical offset for start of shard | |
898 | uint32_t bytes = 0; ///< encoded bytes | |
899 | DENC(shard_info, v, p) { | |
900 | denc_varint(v.offset, p); | |
901 | denc_varint(v.bytes, p); | |
902 | } | |
903 | void dump(Formatter *f) const; | |
904 | }; | |
905 | vector<shard_info> extent_map_shards; ///< extent map shards (if any) | |
906 | ||
907 | uint32_t expected_object_size = 0; | |
908 | uint32_t expected_write_size = 0; | |
909 | uint32_t alloc_hint_flags = 0; | |
910 | ||
911 | uint8_t flags = 0; | |
912 | ||
913 | enum { | |
11fdf7f2 TL |
914 | FLAG_OMAP = 1, ///< object may have omap data |
915 | FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix | |
7c673cae FG |
916 | }; |
917 | ||
918 | string get_flags_string() const { | |
919 | string s; | |
920 | if (flags & FLAG_OMAP) { | |
921 | s = "omap"; | |
922 | } | |
923 | return s; | |
924 | } | |
925 | ||
926 | bool has_flag(unsigned f) const { | |
927 | return flags & f; | |
928 | } | |
929 | ||
930 | void set_flag(unsigned f) { | |
931 | flags |= f; | |
932 | } | |
933 | ||
934 | void clear_flag(unsigned f) { | |
935 | flags &= ~f; | |
936 | } | |
937 | ||
938 | bool has_omap() const { | |
939 | return has_flag(FLAG_OMAP); | |
940 | } | |
11fdf7f2 TL |
941 | bool is_pgmeta_omap() const { |
942 | return has_flag(FLAG_PGMETA_OMAP); | |
943 | } | |
7c673cae FG |
944 | |
945 | void set_omap_flag() { | |
946 | set_flag(FLAG_OMAP); | |
947 | } | |
948 | ||
949 | void clear_omap_flag() { | |
950 | clear_flag(FLAG_OMAP); | |
951 | } | |
952 | ||
953 | DENC(bluestore_onode_t, v, p) { | |
954 | DENC_START(1, 1, p); | |
955 | denc_varint(v.nid, p); | |
956 | denc_varint(v.size, p); | |
957 | denc(v.attrs, p); | |
958 | denc(v.flags, p); | |
959 | denc(v.extent_map_shards, p); | |
960 | denc_varint(v.expected_object_size, p); | |
961 | denc_varint(v.expected_write_size, p); | |
962 | denc_varint(v.alloc_hint_flags, p); | |
963 | DENC_FINISH(p); | |
964 | } | |
965 | void dump(Formatter *f) const; | |
966 | static void generate_test_instances(list<bluestore_onode_t*>& o); | |
967 | }; | |
968 | WRITE_CLASS_DENC(bluestore_onode_t::shard_info) | |
969 | WRITE_CLASS_DENC(bluestore_onode_t) | |
970 | ||
971 | ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si); | |
972 | ||
973 | /// writeahead-logged op | |
974 | struct bluestore_deferred_op_t { | |
975 | typedef enum { | |
976 | OP_WRITE = 1, | |
977 | } type_t; | |
978 | __u8 op = 0; | |
979 | ||
980 | PExtentVector extents; | |
981 | bufferlist data; | |
982 | ||
983 | DENC(bluestore_deferred_op_t, v, p) { | |
984 | DENC_START(1, 1, p); | |
985 | denc(v.op, p); | |
986 | denc(v.extents, p); | |
987 | denc(v.data, p); | |
988 | DENC_FINISH(p); | |
989 | } | |
990 | void dump(Formatter *f) const; | |
991 | static void generate_test_instances(list<bluestore_deferred_op_t*>& o); | |
992 | }; | |
993 | WRITE_CLASS_DENC(bluestore_deferred_op_t) | |
994 | ||
995 | ||
996 | /// writeahead-logged transaction | |
997 | struct bluestore_deferred_transaction_t { | |
998 | uint64_t seq = 0; | |
999 | list<bluestore_deferred_op_t> ops; | |
1000 | interval_set<uint64_t> released; ///< allocations to release after tx | |
1001 | ||
1002 | bluestore_deferred_transaction_t() : seq(0) {} | |
1003 | ||
1004 | DENC(bluestore_deferred_transaction_t, v, p) { | |
1005 | DENC_START(1, 1, p); | |
1006 | denc(v.seq, p); | |
1007 | denc(v.ops, p); | |
1008 | denc(v.released, p); | |
1009 | DENC_FINISH(p); | |
1010 | } | |
1011 | void dump(Formatter *f) const; | |
1012 | static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o); | |
1013 | }; | |
1014 | WRITE_CLASS_DENC(bluestore_deferred_transaction_t) | |
1015 | ||
1016 | struct bluestore_compression_header_t { | |
1017 | uint8_t type = Compressor::COMP_ALG_NONE; | |
1018 | uint32_t length = 0; | |
1019 | ||
1020 | bluestore_compression_header_t() {} | |
1021 | bluestore_compression_header_t(uint8_t _type) | |
1022 | : type(_type) {} | |
1023 | ||
1024 | DENC(bluestore_compression_header_t, v, p) { | |
1025 | DENC_START(1, 1, p); | |
1026 | denc(v.type, p); | |
1027 | denc(v.length, p); | |
1028 | DENC_FINISH(p); | |
1029 | } | |
1030 | void dump(Formatter *f) const; | |
1031 | static void generate_test_instances(list<bluestore_compression_header_t*>& o); | |
1032 | }; | |
1033 | WRITE_CLASS_DENC(bluestore_compression_header_t) | |
1034 | ||
1035 | ||
1036 | #endif |