]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
16 | #define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H | |
17 | ||
1e59de90 | 18 | #include <bit> |
7c673cae | 19 | #include <ostream> |
94b18763 | 20 | #include <type_traits> |
20effc67 TL |
21 | #include <vector> |
22 | #include <array> | |
f67539c2 | 23 | #include "include/mempool.h" |
7c673cae FG |
24 | #include "include/types.h" |
25 | #include "include/interval_set.h" | |
26 | #include "include/utime.h" | |
27 | #include "common/hobject.h" | |
28 | #include "compressor/Compressor.h" | |
29 | #include "common/Checksummer.h" | |
20effc67 | 30 | #include "include/ceph_hash.h" |
7c673cae FG |
31 | |
32 | namespace ceph { | |
33 | class Formatter; | |
34 | } | |
35 | ||
36 | /// label for block device | |
37 | struct bluestore_bdev_label_t { | |
38 | uuid_d osd_uuid; ///< osd uuid | |
9f95a23c | 39 | uint64_t size = 0; ///< device size |
7c673cae | 40 | utime_t btime; ///< birth time |
f67539c2 | 41 | std::string description; ///< device description |
7c673cae | 42 | |
f67539c2 | 43 | std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore |
3efd9988 | 44 | |
f67539c2 TL |
45 | void encode(ceph::buffer::list& bl) const; |
46 | void decode(ceph::buffer::list::const_iterator& p); | |
47 | void dump(ceph::Formatter *f) const; | |
48 | static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o); | |
7c673cae FG |
49 | }; |
50 | WRITE_CLASS_ENCODER(bluestore_bdev_label_t) | |
51 | ||
f67539c2 | 52 | std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l); |
7c673cae FG |
53 | |
54 | /// collection metadata | |
55 | struct bluestore_cnode_t { | |
56 | uint32_t bits; ///< how many bits of coll pgid are significant | |
57 | ||
58 | explicit bluestore_cnode_t(int b=0) : bits(b) {} | |
59 | ||
60 | DENC(bluestore_cnode_t, v, p) { | |
61 | DENC_START(1, 1, p); | |
62 | denc(v.bits, p); | |
63 | DENC_FINISH(p); | |
64 | } | |
f67539c2 TL |
65 | void dump(ceph::Formatter *f) const; |
66 | static void generate_test_instances(std::list<bluestore_cnode_t*>& o); | |
7c673cae FG |
67 | }; |
68 | WRITE_CLASS_DENC(bluestore_cnode_t) | |
69 | ||
f67539c2 | 70 | std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l); |
28e407b8 | 71 | |
a8e16298 TL |
72 | template <typename OFFS_TYPE, typename LEN_TYPE> |
73 | struct bluestore_interval_t | |
74 | { | |
75 | static const uint64_t INVALID_OFFSET = ~0ull; | |
7c673cae | 76 | |
a8e16298 TL |
77 | OFFS_TYPE offset = 0; |
78 | LEN_TYPE length = 0; | |
7c673cae | 79 | |
a8e16298 TL |
80 | bluestore_interval_t(){} |
81 | bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} | |
7c673cae | 82 | |
a8e16298 TL |
83 | bool is_valid() const { |
84 | return offset != INVALID_OFFSET; | |
7c673cae | 85 | } |
a8e16298 TL |
86 | uint64_t end() const { |
87 | return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET; | |
7c673cae FG |
88 | } |
89 | ||
a8e16298 TL |
90 | bool operator==(const bluestore_interval_t& other) const { |
91 | return offset == other.offset && length == other.length; | |
7c673cae FG |
92 | } |
93 | ||
7c673cae FG |
94 | }; |
95 | ||
7c673cae | 96 | /// pextent: physical extent |
a8e16298 TL |
97 | struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t> |
98 | { | |
99 | bluestore_pextent_t() {} | |
100 | bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {} | |
101 | bluestore_pextent_t(const bluestore_interval_t &ext) : | |
102 | bluestore_interval_t(ext.offset, ext.length) {} | |
7c673cae FG |
103 | |
104 | DENC(bluestore_pextent_t, v, p) { | |
105 | denc_lba(v.offset, p); | |
106 | denc_varint_lowz(v.length, p); | |
107 | } | |
108 | ||
f67539c2 TL |
109 | void dump(ceph::Formatter *f) const; |
110 | static void generate_test_instances(std::list<bluestore_pextent_t*>& ls); | |
7c673cae FG |
111 | }; |
112 | WRITE_CLASS_DENC(bluestore_pextent_t) | |
113 | ||
f67539c2 | 114 | std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o); |
7c673cae | 115 | |
31f18b77 | 116 | typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector; |
7c673cae FG |
117 | |
118 | template<> | |
119 | struct denc_traits<PExtentVector> { | |
120 | static constexpr bool supported = true; | |
121 | static constexpr bool bounded = false; | |
122 | static constexpr bool featured = false; | |
31f18b77 | 123 | static constexpr bool need_contiguous = true; |
7c673cae FG |
124 | static void bound_encode(const PExtentVector& v, size_t& p) { |
125 | p += sizeof(uint32_t); | |
126 | const auto size = v.size(); | |
127 | if (size) { | |
128 | size_t per = 0; | |
129 | denc(v.front(), per); | |
130 | p += per * size; | |
131 | } | |
132 | } | |
133 | static void encode(const PExtentVector& v, | |
f67539c2 | 134 | ceph::buffer::list::contiguous_appender& p) { |
7c673cae FG |
135 | denc_varint(v.size(), p); |
136 | for (auto& i : v) { | |
137 | denc(i, p); | |
138 | } | |
139 | } | |
f67539c2 | 140 | static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) { |
7c673cae FG |
141 | unsigned num; |
142 | denc_varint(num, p); | |
143 | v.clear(); | |
144 | v.resize(num); | |
145 | for (unsigned i=0; i<num; ++i) { | |
146 | denc(v[i], p); | |
147 | } | |
148 | } | |
149 | }; | |
150 | ||
f67539c2 | 151 | /// extent_map: a std::map of reference counted extents |
7c673cae FG |
152 | struct bluestore_extent_ref_map_t { |
153 | struct record_t { | |
154 | uint32_t length; | |
155 | uint32_t refs; | |
156 | record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {} | |
157 | DENC(bluestore_extent_ref_map_t::record_t, v, p) { | |
158 | denc_varint_lowz(v.length, p); | |
159 | denc_varint(v.refs, p); | |
160 | } | |
161 | }; | |
162 | ||
31f18b77 | 163 | typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t; |
7c673cae FG |
164 | map_t ref_map; |
165 | ||
166 | void _check() const; | |
167 | void _maybe_merge_left(map_t::iterator& p); | |
168 | ||
169 | void clear() { | |
170 | ref_map.clear(); | |
171 | } | |
172 | bool empty() const { | |
173 | return ref_map.empty(); | |
174 | } | |
175 | ||
176 | void get(uint64_t offset, uint32_t len); | |
31f18b77 FG |
177 | void put(uint64_t offset, uint32_t len, PExtentVector *release, |
178 | bool *maybe_unshared); | |
7c673cae FG |
179 | |
180 | bool contains(uint64_t offset, uint32_t len) const; | |
181 | bool intersects(uint64_t offset, uint32_t len) const; | |
182 | ||
183 | void bound_encode(size_t& p) const { | |
184 | denc_varint((uint32_t)0, p); | |
185 | if (!ref_map.empty()) { | |
186 | size_t elem_size = 0; | |
187 | denc_varint_lowz((uint64_t)0, elem_size); | |
188 | ref_map.begin()->second.bound_encode(elem_size); | |
189 | p += elem_size * ref_map.size(); | |
190 | } | |
191 | } | |
f67539c2 | 192 | void encode(ceph::buffer::list::contiguous_appender& p) const { |
11fdf7f2 | 193 | const uint32_t n = ref_map.size(); |
7c673cae FG |
194 | denc_varint(n, p); |
195 | if (n) { | |
196 | auto i = ref_map.begin(); | |
197 | denc_varint_lowz(i->first, p); | |
198 | i->second.encode(p); | |
199 | int64_t pos = i->first; | |
11fdf7f2 | 200 | while (++i != ref_map.end()) { |
7c673cae FG |
201 | denc_varint_lowz((int64_t)i->first - pos, p); |
202 | i->second.encode(p); | |
203 | pos = i->first; | |
204 | } | |
205 | } | |
206 | } | |
f67539c2 | 207 | void decode(ceph::buffer::ptr::const_iterator& p) { |
7c673cae FG |
208 | uint32_t n; |
209 | denc_varint(n, p); | |
210 | if (n) { | |
211 | int64_t pos; | |
212 | denc_varint_lowz(pos, p); | |
213 | ref_map[pos].decode(p); | |
214 | while (--n) { | |
215 | int64_t delta; | |
216 | denc_varint_lowz(delta, p); | |
217 | pos += delta; | |
218 | ref_map[pos].decode(p); | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
f67539c2 TL |
223 | void dump(ceph::Formatter *f) const; |
224 | static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o); | |
7c673cae FG |
225 | }; |
226 | WRITE_CLASS_DENC(bluestore_extent_ref_map_t) | |
227 | ||
228 | ||
f67539c2 | 229 | std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm); |
7c673cae FG |
230 | static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l, |
231 | const bluestore_extent_ref_map_t::record_t& r) { | |
232 | return l.length == r.length && l.refs == r.refs; | |
233 | } | |
234 | static inline bool operator==(const bluestore_extent_ref_map_t& l, | |
235 | const bluestore_extent_ref_map_t& r) { | |
236 | return l.ref_map == r.ref_map; | |
237 | } | |
238 | static inline bool operator!=(const bluestore_extent_ref_map_t& l, | |
239 | const bluestore_extent_ref_map_t& r) { | |
240 | return !(l == r); | |
241 | } | |
242 | ||
20effc67 | 243 | /// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage |
7c673cae FG |
244 | struct bluestore_blob_use_tracker_t { |
245 | // N.B.: There is no need to minimize au_size/num_au | |
246 | // as much as possible (e.g. have just a single byte for au_size) since: | |
247 | // 1) Struct isn't packed hence it's padded. And even if it's packed see 2) | |
248 | // 2) Mem manager has its own granularity, most probably >= 8 bytes | |
249 | // | |
2a845540 TL |
250 | uint32_t au_size; // Allocation (=tracking) unit size, |
251 | // == 0 if uninitialized | |
252 | uint32_t num_au; // Amount of allocation units tracked | |
253 | // == 0 if single unit or the whole blob is tracked | |
254 | uint32_t alloc_au; // Amount of allocation units allocated | |
7c673cae FG |
255 | |
256 | union { | |
257 | uint32_t* bytes_per_au; | |
258 | uint32_t total_bytes; | |
259 | }; | |
260 | ||
261 | bluestore_blob_use_tracker_t() | |
2a845540 | 262 | : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) { |
7c673cae | 263 | } |
9f95a23c TL |
264 | bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker); |
265 | bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs); | |
7c673cae FG |
266 | ~bluestore_blob_use_tracker_t() { |
267 | clear(); | |
268 | } | |
269 | ||
270 | void clear() { | |
2a845540 TL |
271 | release(alloc_au, bytes_per_au); |
272 | num_au = 0; | |
273 | alloc_au = 0; | |
7c673cae FG |
274 | bytes_per_au = 0; |
275 | au_size = 0; | |
7c673cae FG |
276 | } |
277 | ||
278 | uint32_t get_referenced_bytes() const { | |
279 | uint32_t total = 0; | |
280 | if (!num_au) { | |
281 | total = total_bytes; | |
282 | } else { | |
283 | for (size_t i = 0; i < num_au; ++i) { | |
284 | total += bytes_per_au[i]; | |
285 | } | |
286 | } | |
287 | return total; | |
288 | } | |
289 | bool is_not_empty() const { | |
290 | if (!num_au) { | |
291 | return total_bytes != 0; | |
292 | } else { | |
293 | for (size_t i = 0; i < num_au; ++i) { | |
294 | if (bytes_per_au[i]) { | |
295 | return true; | |
296 | } | |
297 | } | |
298 | } | |
299 | return false; | |
300 | } | |
301 | bool is_empty() const { | |
302 | return !is_not_empty(); | |
303 | } | |
304 | void prune_tail(uint32_t new_len) { | |
305 | if (num_au) { | |
11fdf7f2 | 306 | new_len = round_up_to(new_len, au_size); |
7c673cae | 307 | uint32_t _num_au = new_len / au_size; |
11fdf7f2 | 308 | ceph_assert(_num_au <= num_au); |
7c673cae FG |
309 | if (_num_au) { |
310 | num_au = _num_au; // bytes_per_au array is left unmodified | |
7c673cae FG |
311 | } else { |
312 | clear(); | |
313 | } | |
314 | } | |
315 | } | |
316 | void add_tail(uint32_t new_len, uint32_t _au_size) { | |
317 | auto full_size = au_size * (num_au ? num_au : 1); | |
11fdf7f2 | 318 | ceph_assert(new_len >= full_size); |
7c673cae FG |
319 | if (new_len == full_size) { |
320 | return; | |
321 | } | |
322 | if (!num_au) { | |
323 | uint32_t old_total = total_bytes; | |
324 | total_bytes = 0; | |
325 | init(new_len, _au_size); | |
11fdf7f2 | 326 | ceph_assert(num_au); |
7c673cae FG |
327 | bytes_per_au[0] = old_total; |
328 | } else { | |
11fdf7f2 TL |
329 | ceph_assert(_au_size == au_size); |
330 | new_len = round_up_to(new_len, au_size); | |
7c673cae | 331 | uint32_t _num_au = new_len / au_size; |
11fdf7f2 | 332 | ceph_assert(_num_au >= num_au); |
7c673cae FG |
333 | if (_num_au > num_au) { |
334 | auto old_bytes = bytes_per_au; | |
335 | auto old_num_au = num_au; | |
2a845540 TL |
336 | auto old_alloc_au = alloc_au; |
337 | alloc_au = num_au = 0; // to bypass an assertion in allocate() | |
338 | bytes_per_au = nullptr; | |
339 | allocate(_num_au); | |
7c673cae FG |
340 | for (size_t i = 0; i < old_num_au; i++) { |
341 | bytes_per_au[i] = old_bytes[i]; | |
342 | } | |
343 | for (size_t i = old_num_au; i < num_au; i++) { | |
344 | bytes_per_au[i] = 0; | |
345 | } | |
2a845540 | 346 | release(old_alloc_au, old_bytes); |
7c673cae FG |
347 | } |
348 | } | |
349 | } | |
350 | ||
351 | void init( | |
352 | uint32_t full_length, | |
353 | uint32_t _au_size); | |
354 | ||
355 | void get( | |
356 | uint32_t offset, | |
357 | uint32_t len); | |
358 | ||
359 | /// put: return true if the blob has no references any more after the call, | |
360 | /// no release_units is filled for the sake of performance. | |
361 | /// return false if there are some references to the blob, | |
362 | /// in this case release_units contains pextents | |
363 | /// (identified by their offsets relative to the blob start) | |
31f18b77 | 364 | /// that are not used any more and can be safely deallocated. |
7c673cae FG |
365 | bool put( |
366 | uint32_t offset, | |
367 | uint32_t len, | |
368 | PExtentVector *release); | |
369 | ||
370 | bool can_split() const; | |
371 | bool can_split_at(uint32_t blob_offset) const; | |
372 | void split( | |
373 | uint32_t blob_offset, | |
374 | bluestore_blob_use_tracker_t* r); | |
375 | ||
376 | bool equal( | |
377 | const bluestore_blob_use_tracker_t& other) const; | |
378 | ||
379 | void bound_encode(size_t& p) const { | |
380 | denc_varint(au_size, p); | |
381 | if (au_size) { | |
382 | denc_varint(num_au, p); | |
383 | if (!num_au) { | |
384 | denc_varint(total_bytes, p); | |
385 | } else { | |
386 | size_t elem_size = 0; | |
387 | denc_varint((uint32_t)0, elem_size); | |
388 | p += elem_size * num_au; | |
389 | } | |
390 | } | |
391 | } | |
f67539c2 | 392 | void encode(ceph::buffer::list::contiguous_appender& p) const { |
7c673cae FG |
393 | denc_varint(au_size, p); |
394 | if (au_size) { | |
395 | denc_varint(num_au, p); | |
396 | if (!num_au) { | |
397 | denc_varint(total_bytes, p); | |
398 | } else { | |
399 | size_t elem_size = 0; | |
400 | denc_varint((uint32_t)0, elem_size); | |
401 | for (size_t i = 0; i < num_au; ++i) { | |
402 | denc_varint(bytes_per_au[i], p); | |
403 | } | |
404 | } | |
405 | } | |
406 | } | |
f67539c2 | 407 | void decode(ceph::buffer::ptr::const_iterator& p) { |
7c673cae FG |
408 | clear(); |
409 | denc_varint(au_size, p); | |
410 | if (au_size) { | |
2a845540 TL |
411 | uint32_t _num_au; |
412 | denc_varint(_num_au, p); | |
413 | if (!_num_au) { | |
414 | num_au = 0; | |
7c673cae FG |
415 | denc_varint(total_bytes, p); |
416 | } else { | |
2a845540 TL |
417 | allocate(_num_au); |
418 | for (size_t i = 0; i < _num_au; ++i) { | |
7c673cae FG |
419 | denc_varint(bytes_per_au[i], p); |
420 | } | |
421 | } | |
422 | } | |
423 | } | |
424 | ||
f67539c2 TL |
425 | void dump(ceph::Formatter *f) const; |
426 | static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o); | |
7c673cae | 427 | private: |
2a845540 TL |
428 | void allocate(uint32_t _num_au); |
429 | void release(uint32_t _num_au, uint32_t* ptr); | |
7c673cae FG |
430 | }; |
431 | WRITE_CLASS_DENC(bluestore_blob_use_tracker_t) | |
f67539c2 | 432 | std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm); |
7c673cae FG |
433 | |
434 | /// blob: a piece of data on disk | |
435 | struct bluestore_blob_t { | |
436 | private: | |
437 | PExtentVector extents; ///< raw data position on device | |
31f18b77 | 438 | uint32_t logical_length = 0; ///< original length of data stored in the blob |
7c673cae FG |
439 | uint32_t compressed_length = 0; ///< compressed length if any |
440 | ||
441 | public: | |
442 | enum { | |
31f18b77 | 443 | LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split |
7c673cae FG |
444 | FLAG_COMPRESSED = 2, ///< blob is compressed |
445 | FLAG_CSUM = 4, ///< blob has checksums | |
f67539c2 | 446 | FLAG_HAS_UNUSED = 8, ///< blob has unused std::map |
7c673cae FG |
447 | FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob |
448 | }; | |
f67539c2 | 449 | static std::string get_flags_string(unsigned flags); |
7c673cae FG |
450 | |
451 | uint32_t flags = 0; ///< FLAG_* | |
452 | ||
453 | typedef uint16_t unused_t; | |
454 | unused_t unused = 0; ///< portion that has never been written to (bitmap) | |
455 | ||
456 | uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_* | |
457 | uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes | |
458 | ||
f67539c2 | 459 | ceph::buffer::ptr csum_data; ///< opaque std::vector of csum data |
7c673cae FG |
460 | |
461 | bluestore_blob_t(uint32_t f = 0) : flags(f) {} | |
462 | ||
463 | const PExtentVector& get_extents() const { | |
464 | return extents; | |
465 | } | |
11fdf7f2 TL |
466 | PExtentVector& dirty_extents() { |
467 | return extents; | |
468 | } | |
7c673cae FG |
469 | |
470 | DENC_HELPERS; | |
471 | void bound_encode(size_t& p, uint64_t struct_v) const { | |
11fdf7f2 | 472 | ceph_assert(struct_v == 1 || struct_v == 2); |
7c673cae FG |
473 | denc(extents, p); |
474 | denc_varint(flags, p); | |
475 | denc_varint_lowz(logical_length, p); | |
476 | denc_varint_lowz(compressed_length, p); | |
477 | denc(csum_type, p); | |
478 | denc(csum_chunk_order, p); | |
479 | denc_varint(csum_data.length(), p); | |
480 | p += csum_data.length(); | |
481 | p += sizeof(unused_t); | |
482 | } | |
483 | ||
f67539c2 | 484 | void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const { |
11fdf7f2 | 485 | ceph_assert(struct_v == 1 || struct_v == 2); |
7c673cae FG |
486 | denc(extents, p); |
487 | denc_varint(flags, p); | |
488 | if (is_compressed()) { | |
489 | denc_varint_lowz(logical_length, p); | |
490 | denc_varint_lowz(compressed_length, p); | |
491 | } | |
492 | if (has_csum()) { | |
493 | denc(csum_type, p); | |
494 | denc(csum_chunk_order, p); | |
495 | denc_varint(csum_data.length(), p); | |
496 | memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(), | |
497 | csum_data.length()); | |
498 | } | |
499 | if (has_unused()) { | |
500 | denc(unused, p); | |
501 | } | |
502 | } | |
503 | ||
f67539c2 | 504 | void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) { |
11fdf7f2 | 505 | ceph_assert(struct_v == 1 || struct_v == 2); |
7c673cae FG |
506 | denc(extents, p); |
507 | denc_varint(flags, p); | |
508 | if (is_compressed()) { | |
509 | denc_varint_lowz(logical_length, p); | |
510 | denc_varint_lowz(compressed_length, p); | |
511 | } else { | |
512 | logical_length = get_ondisk_length(); | |
513 | } | |
514 | if (has_csum()) { | |
515 | denc(csum_type, p); | |
516 | denc(csum_chunk_order, p); | |
517 | int len; | |
518 | denc_varint(len, p); | |
519 | csum_data = p.get_ptr(len); | |
3efd9988 | 520 | csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); |
7c673cae FG |
521 | } |
522 | if (has_unused()) { | |
523 | denc(unused, p); | |
524 | } | |
525 | } | |
526 | ||
527 | bool can_split() const { | |
528 | return | |
529 | !has_flag(FLAG_SHARED) && | |
530 | !has_flag(FLAG_COMPRESSED) && | |
531 | !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex | |
532 | } | |
533 | bool can_split_at(uint32_t blob_offset) const { | |
534 | return !has_csum() || blob_offset % get_csum_chunk_size() == 0; | |
535 | } | |
536 | ||
f67539c2 TL |
537 | void dump(ceph::Formatter *f) const; |
538 | static void generate_test_instances(std::list<bluestore_blob_t*>& ls); | |
7c673cae FG |
539 | |
540 | bool has_flag(unsigned f) const { | |
541 | return flags & f; | |
542 | } | |
543 | void set_flag(unsigned f) { | |
544 | flags |= f; | |
545 | } | |
546 | void clear_flag(unsigned f) { | |
547 | flags &= ~f; | |
548 | } | |
f67539c2 | 549 | std::string get_flags_string() const { |
7c673cae FG |
550 | return get_flags_string(flags); |
551 | } | |
552 | ||
553 | void set_compressed(uint64_t clen_orig, uint64_t clen) { | |
554 | set_flag(FLAG_COMPRESSED); | |
555 | logical_length = clen_orig; | |
556 | compressed_length = clen; | |
557 | } | |
558 | bool is_mutable() const { | |
31f18b77 | 559 | return !is_compressed() && !is_shared(); |
7c673cae FG |
560 | } |
561 | bool is_compressed() const { | |
562 | return has_flag(FLAG_COMPRESSED); | |
563 | } | |
564 | bool has_csum() const { | |
565 | return has_flag(FLAG_CSUM); | |
566 | } | |
567 | bool has_unused() const { | |
568 | return has_flag(FLAG_HAS_UNUSED); | |
569 | } | |
570 | bool is_shared() const { | |
571 | return has_flag(FLAG_SHARED); | |
572 | } | |
573 | ||
574 | /// return chunk (i.e. min readable block) size for the blob | |
575 | uint64_t get_chunk_size(uint64_t dev_block_size) const { | |
576 | return has_csum() ? | |
11fdf7f2 | 577 | std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size; |
7c673cae FG |
578 | } |
579 | uint32_t get_csum_chunk_size() const { | |
580 | return 1 << csum_chunk_order; | |
581 | } | |
582 | uint32_t get_compressed_payload_length() const { | |
583 | return is_compressed() ? compressed_length : 0; | |
584 | } | |
585 | uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const { | |
586 | auto p = extents.begin(); | |
11fdf7f2 | 587 | ceph_assert(p != extents.end()); |
7c673cae FG |
588 | while (x_off >= p->length) { |
589 | x_off -= p->length; | |
590 | ++p; | |
11fdf7f2 | 591 | ceph_assert(p != extents.end()); |
7c673cae FG |
592 | } |
593 | if (plen) | |
594 | *plen = p->length - x_off; | |
595 | return p->offset + x_off; | |
596 | } | |
597 | ||
31f18b77 FG |
598 | // validate whether or not the status of pextents within the given range |
599 | // meets the requirement(allocated or unallocated). | |
600 | bool _validate_range(uint64_t b_off, uint64_t b_len, | |
601 | bool require_allocated) const { | |
7c673cae | 602 | auto p = extents.begin(); |
11fdf7f2 | 603 | ceph_assert(p != extents.end()); |
7c673cae FG |
604 | while (b_off >= p->length) { |
605 | b_off -= p->length; | |
f67539c2 TL |
606 | if (++p == extents.end()) |
607 | return false; | |
7c673cae FG |
608 | } |
609 | b_len += b_off; | |
610 | while (b_len) { | |
31f18b77 FG |
611 | if (require_allocated != p->is_valid()) { |
612 | return false; | |
7c673cae FG |
613 | } |
614 | if (p->length >= b_len) { | |
31f18b77 | 615 | return true; |
7c673cae FG |
616 | } |
617 | b_len -= p->length; | |
f67539c2 TL |
618 | if (++p == extents.end()) |
619 | return false; | |
7c673cae | 620 | } |
11fdf7f2 TL |
621 | ceph_abort_msg("we should not get here"); |
622 | return false; | |
7c673cae FG |
623 | } |
624 | ||
31f18b77 FG |
625 | /// return true if the entire range is allocated |
626 | /// (mapped to extents on disk) | |
627 | bool is_allocated(uint64_t b_off, uint64_t b_len) const { | |
628 | return _validate_range(b_off, b_len, true); | |
629 | } | |
630 | ||
7c673cae | 631 | /// return true if the entire range is unallocated |
31f18b77 | 632 | /// (not mapped to extents on disk) |
7c673cae | 633 | bool is_unallocated(uint64_t b_off, uint64_t b_len) const { |
31f18b77 | 634 | return _validate_range(b_off, b_len, false); |
7c673cae FG |
635 | } |
636 | ||
637 | /// return true if the logical range has never been used | |
638 | bool is_unused(uint64_t offset, uint64_t length) const { | |
639 | if (!has_unused()) { | |
640 | return false; | |
641 | } | |
1911f103 | 642 | ceph_assert(!is_compressed()); |
7c673cae | 643 | uint64_t blob_len = get_logical_length(); |
11fdf7f2 TL |
644 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
645 | ceph_assert(offset + length <= blob_len); | |
7c673cae FG |
646 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
647 | uint64_t start = offset / chunk_size; | |
11fdf7f2 | 648 | uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; |
7c673cae FG |
649 | auto i = start; |
650 | while (i < end && (unused & (1u << i))) { | |
651 | i++; | |
652 | } | |
653 | return i >= end; | |
654 | } | |
655 | ||
656 | /// mark a range that has never been used | |
657 | void add_unused(uint64_t offset, uint64_t length) { | |
1911f103 | 658 | ceph_assert(!is_compressed()); |
7c673cae | 659 | uint64_t blob_len = get_logical_length(); |
11fdf7f2 TL |
660 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
661 | ceph_assert(offset + length <= blob_len); | |
7c673cae | 662 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
11fdf7f2 | 663 | uint64_t start = round_up_to(offset, chunk_size) / chunk_size; |
7c673cae FG |
664 | uint64_t end = (offset + length) / chunk_size; |
665 | for (auto i = start; i < end; ++i) { | |
666 | unused |= (1u << i); | |
667 | } | |
668 | if (start != end) { | |
669 | set_flag(FLAG_HAS_UNUSED); | |
670 | } | |
671 | } | |
672 | ||
673 | /// indicate that a range has (now) been used. | |
674 | void mark_used(uint64_t offset, uint64_t length) { | |
675 | if (has_unused()) { | |
1911f103 | 676 | ceph_assert(!is_compressed()); |
7c673cae | 677 | uint64_t blob_len = get_logical_length(); |
11fdf7f2 TL |
678 | ceph_assert((blob_len % (sizeof(unused)*8)) == 0); |
679 | ceph_assert(offset + length <= blob_len); | |
7c673cae FG |
680 | uint64_t chunk_size = blob_len / (sizeof(unused)*8); |
681 | uint64_t start = offset / chunk_size; | |
11fdf7f2 | 682 | uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size; |
7c673cae FG |
683 | for (auto i = start; i < end; ++i) { |
684 | unused &= ~(1u << i); | |
685 | } | |
686 | if (unused == 0) { | |
687 | clear_flag(FLAG_HAS_UNUSED); | |
688 | } | |
689 | } | |
690 | } | |
691 | ||
f67539c2 TL |
692 | // map_f_invoke templates intended to mask parameters which are not expected |
693 | // by the provided callback | |
694 | template<class F, typename std::enable_if<std::is_invocable_r_v< | |
695 | int, | |
696 | F, | |
697 | uint64_t, | |
698 | uint64_t>>::type* = nullptr> | |
699 | int map_f_invoke(uint64_t lo, | |
700 | const bluestore_pextent_t& p, | |
701 | uint64_t o, | |
702 | uint64_t l, F&& f) const{ | |
703 | return f(o, l); | |
704 | } | |
705 | ||
706 | template<class F, typename std::enable_if<std::is_invocable_r_v< | |
707 | int, | |
708 | F, | |
709 | uint64_t, | |
710 | uint64_t, | |
711 | uint64_t>>::type * = nullptr> | |
712 | int map_f_invoke(uint64_t lo, | |
713 | const bluestore_pextent_t& p, | |
714 | uint64_t o, | |
715 | uint64_t l, F&& f) const { | |
716 | return f(lo, o, l); | |
717 | } | |
718 | ||
719 | template<class F, typename std::enable_if<std::is_invocable_r_v< | |
720 | int, | |
721 | F, | |
722 | const bluestore_pextent_t&, | |
723 | uint64_t, | |
724 | uint64_t>>::type * = nullptr> | |
725 | int map_f_invoke(uint64_t lo, | |
726 | const bluestore_pextent_t& p, | |
727 | uint64_t o, | |
728 | uint64_t l, F&& f) const { | |
729 | return f(p, o, l); | |
730 | } | |
731 | ||
94b18763 FG |
732 | template<class F> |
733 | int map(uint64_t x_off, uint64_t x_len, F&& f) const { | |
f67539c2 | 734 | auto x_off0 = x_off; |
7c673cae | 735 | auto p = extents.begin(); |
11fdf7f2 | 736 | ceph_assert(p != extents.end()); |
7c673cae FG |
737 | while (x_off >= p->length) { |
738 | x_off -= p->length; | |
739 | ++p; | |
11fdf7f2 | 740 | ceph_assert(p != extents.end()); |
7c673cae | 741 | } |
f67539c2 | 742 | while (x_len > 0 && p != extents.end()) { |
11fdf7f2 | 743 | uint64_t l = std::min(p->length - x_off, x_len); |
f67539c2 | 744 | int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f); |
7c673cae FG |
745 | if (r < 0) |
746 | return r; | |
747 | x_off = 0; | |
748 | x_len -= l; | |
f67539c2 | 749 | x_off0 += l; |
7c673cae FG |
750 | ++p; |
751 | } | |
752 | return 0; | |
753 | } | |
f67539c2 | 754 | |
94b18763 | 755 | template<class F> |
7c673cae | 756 | void map_bl(uint64_t x_off, |
f67539c2 | 757 | ceph::buffer::list& bl, |
94b18763 | 758 | F&& f) const { |
f67539c2 | 759 | static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>); |
11fdf7f2 | 760 | |
7c673cae | 761 | auto p = extents.begin(); |
11fdf7f2 | 762 | ceph_assert(p != extents.end()); |
7c673cae FG |
763 | while (x_off >= p->length) { |
764 | x_off -= p->length; | |
765 | ++p; | |
11fdf7f2 | 766 | ceph_assert(p != extents.end()); |
7c673cae | 767 | } |
f67539c2 | 768 | ceph::buffer::list::iterator it = bl.begin(); |
7c673cae FG |
769 | uint64_t x_len = bl.length(); |
770 | while (x_len > 0) { | |
11fdf7f2 TL |
771 | ceph_assert(p != extents.end()); |
772 | uint64_t l = std::min(p->length - x_off, x_len); | |
f67539c2 | 773 | ceph::buffer::list t; |
7c673cae FG |
774 | it.copy(l, t); |
775 | f(p->offset + x_off, t); | |
776 | x_off = 0; | |
777 | x_len -= l; | |
778 | ++p; | |
779 | } | |
780 | } | |
781 | ||
782 | uint32_t get_ondisk_length() const { | |
783 | uint32_t len = 0; | |
784 | for (auto &p : extents) { | |
785 | len += p.length; | |
786 | } | |
787 | return len; | |
788 | } | |
789 | ||
790 | uint32_t get_logical_length() const { | |
791 | return logical_length; | |
792 | } | |
793 | size_t get_csum_value_size() const; | |
794 | ||
795 | size_t get_csum_count() const { | |
796 | size_t vs = get_csum_value_size(); | |
797 | if (!vs) | |
798 | return 0; | |
799 | return csum_data.length() / vs; | |
800 | } | |
801 | uint64_t get_csum_item(unsigned i) const { | |
802 | size_t cs = get_csum_value_size(); | |
803 | const char *p = csum_data.c_str(); | |
804 | switch (cs) { | |
805 | case 0: | |
11fdf7f2 | 806 | ceph_abort_msg("no csum data, bad index"); |
7c673cae FG |
807 | case 1: |
808 | return reinterpret_cast<const uint8_t*>(p)[i]; | |
809 | case 2: | |
eafe8130 | 810 | return reinterpret_cast<const ceph_le16*>(p)[i]; |
7c673cae | 811 | case 4: |
eafe8130 | 812 | return reinterpret_cast<const ceph_le32*>(p)[i]; |
7c673cae | 813 | case 8: |
eafe8130 | 814 | return reinterpret_cast<const ceph_le64*>(p)[i]; |
7c673cae | 815 | default: |
11fdf7f2 | 816 | ceph_abort_msg("unrecognized csum word size"); |
7c673cae FG |
817 | } |
818 | } | |
819 | const char *get_csum_item_ptr(unsigned i) const { | |
820 | size_t cs = get_csum_value_size(); | |
821 | return csum_data.c_str() + (cs * i); | |
822 | } | |
823 | char *get_csum_item_ptr(unsigned i) { | |
824 | size_t cs = get_csum_value_size(); | |
825 | return csum_data.c_str() + (cs * i); | |
826 | } | |
827 | ||
828 | void init_csum(unsigned type, unsigned order, unsigned len) { | |
829 | flags |= FLAG_CSUM; | |
830 | csum_type = type; | |
831 | csum_chunk_order = order; | |
f67539c2 | 832 | csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size()); |
7c673cae | 833 | csum_data.zero(); |
3efd9988 | 834 | csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other); |
7c673cae FG |
835 | } |
836 | ||
837 | /// calculate csum for the buffer at the given b_off | |
f67539c2 | 838 | void calc_csum(uint64_t b_off, const ceph::buffer::list& bl); |
7c673cae FG |
839 | |
840 | /// verify csum: return -EOPNOTSUPP for unsupported checksum type; | |
841 | /// return -1 and valid(nonnegative) b_bad_off for checksum error; | |
842 | /// return 0 if all is well. | |
f67539c2 | 843 | int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off, |
7c673cae FG |
844 | uint64_t *bad_csum) const; |
845 | ||
846 | bool can_prune_tail() const { | |
847 | return | |
848 | extents.size() > 1 && // if it's all invalid it's not pruning. | |
849 | !extents.back().is_valid() && | |
850 | !has_unused(); | |
851 | } | |
852 | void prune_tail() { | |
853 | const auto &p = extents.back(); | |
854 | logical_length -= p.length; | |
855 | extents.pop_back(); | |
856 | if (has_csum()) { | |
f67539c2 | 857 | ceph::buffer::ptr t; |
7c673cae | 858 | t.swap(csum_data); |
f67539c2 | 859 | csum_data = ceph::buffer::ptr(t.c_str(), |
7c673cae FG |
860 | get_logical_length() / get_csum_chunk_size() * |
861 | get_csum_value_size()); | |
862 | } | |
863 | } | |
864 | void add_tail(uint32_t new_len) { | |
11fdf7f2 TL |
865 | ceph_assert(is_mutable()); |
866 | ceph_assert(!has_unused()); | |
867 | ceph_assert(new_len > logical_length); | |
7c673cae FG |
868 | extents.emplace_back( |
869 | bluestore_pextent_t( | |
870 | bluestore_pextent_t::INVALID_OFFSET, | |
871 | new_len - logical_length)); | |
872 | logical_length = new_len; | |
873 | if (has_csum()) { | |
f67539c2 | 874 | ceph::buffer::ptr t; |
7c673cae | 875 | t.swap(csum_data); |
f67539c2 | 876 | csum_data = ceph::buffer::create( |
7c673cae FG |
877 | get_csum_value_size() * logical_length / get_csum_chunk_size()); |
878 | csum_data.copy_in(0, t.length(), t.c_str()); | |
879 | csum_data.zero(t.length(), csum_data.length() - t.length()); | |
880 | } | |
881 | } | |
882 | uint32_t get_release_size(uint32_t min_alloc_size) const { | |
883 | if (is_compressed()) { | |
884 | return get_logical_length(); | |
885 | } | |
886 | uint32_t res = get_csum_chunk_size(); | |
887 | if (!has_csum() || res < min_alloc_size) { | |
888 | res = min_alloc_size; | |
889 | } | |
890 | return res; | |
891 | } | |
892 | ||
893 | void split(uint32_t blob_offset, bluestore_blob_t& rb); | |
a8e16298 | 894 | void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs); |
7c673cae FG |
895 | void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only |
896 | ||
897 | /// updates blob's pextents container and return unused pextents eligible | |
898 | /// for release. | |
899 | /// all - indicates that the whole blob to be released. | |
900 | /// logical - specifies set of logical extents within blob's | |
901 | /// to be released | |
902 | /// Returns true if blob has no more valid pextents | |
903 | bool release_extents( | |
904 | bool all, | |
905 | const PExtentVector& logical, | |
906 | PExtentVector* r); | |
907 | }; | |
908 | WRITE_CLASS_DENC_FEATURED(bluestore_blob_t) | |
909 | ||
f67539c2 | 910 | std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o); |
7c673cae FG |
911 | |
912 | ||
913 | /// shared blob state | |
914 | struct bluestore_shared_blob_t { | |
f91f0fd5 | 915 | MEMPOOL_CLASS_HELPERS(); |
7c673cae FG |
916 | uint64_t sbid; ///> shared blob id |
917 | bluestore_extent_ref_map_t ref_map; ///< shared blob extents | |
918 | ||
919 | bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {} | |
11fdf7f2 TL |
920 | bluestore_shared_blob_t(uint64_t _sbid, |
921 | bluestore_extent_ref_map_t&& _ref_map ) | |
922 | : sbid(_sbid), ref_map(std::move(_ref_map)) {} | |
7c673cae FG |
923 | |
924 | DENC(bluestore_shared_blob_t, v, p) { | |
925 | DENC_START(1, 1, p); | |
926 | denc(v.ref_map, p); | |
927 | DENC_FINISH(p); | |
928 | } | |
929 | ||
930 | ||
f67539c2 TL |
931 | void dump(ceph::Formatter *f) const; |
932 | static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls); | |
7c673cae FG |
933 | |
934 | bool empty() const { | |
935 | return ref_map.empty(); | |
936 | } | |
937 | }; | |
938 | WRITE_CLASS_DENC(bluestore_shared_blob_t) | |
939 | ||
f67539c2 | 940 | std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o); |
7c673cae FG |
941 | |
942 | /// onode: per-object metadata | |
943 | struct bluestore_onode_t { | |
944 | uint64_t nid = 0; ///< numeric id (locally unique) | |
945 | uint64_t size = 0; ///< object size | |
f91f0fd5 TL |
946 | // mempool to be assigned to buffer::ptr manually |
947 | std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs; | |
7c673cae FG |
948 | |
949 | struct shard_info { | |
950 | uint32_t offset = 0; ///< logical offset for start of shard | |
951 | uint32_t bytes = 0; ///< encoded bytes | |
952 | DENC(shard_info, v, p) { | |
953 | denc_varint(v.offset, p); | |
954 | denc_varint(v.bytes, p); | |
955 | } | |
f67539c2 | 956 | void dump(ceph::Formatter *f) const; |
7c673cae | 957 | }; |
f67539c2 | 958 | std::vector<shard_info> extent_map_shards; ///< extent std::map shards (if any) |
7c673cae FG |
959 | |
960 | uint32_t expected_object_size = 0; | |
961 | uint32_t expected_write_size = 0; | |
962 | uint32_t alloc_hint_flags = 0; | |
963 | ||
964 | uint8_t flags = 0; | |
965 | ||
20effc67 TL |
966 | std::map<uint32_t, uint64_t> zone_offset_refs; ///< (zone, offset) refs to this onode |
967 | ||
7c673cae | 968 | enum { |
f67539c2 | 969 | FLAG_OMAP = 1, ///< object may have omap data |
11fdf7f2 | 970 | FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix |
9f95a23c | 971 | FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys |
f67539c2 | 972 | FLAG_PERPG_OMAP = 8, ///< omap data is in per-pg prefix; per-pg keys |
7c673cae FG |
973 | }; |
974 | ||
f67539c2 TL |
975 | std::string get_flags_string() const { |
976 | std::string s; | |
7c673cae FG |
977 | if (flags & FLAG_OMAP) { |
978 | s = "omap"; | |
979 | } | |
9f95a23c TL |
980 | if (flags & FLAG_PGMETA_OMAP) { |
981 | s += "+pgmeta_omap"; | |
982 | } | |
983 | if (flags & FLAG_PERPOOL_OMAP) { | |
f67539c2 TL |
984 | s += "+per_pool_omap"; |
985 | } | |
986 | if (flags & FLAG_PERPG_OMAP) { | |
987 | s += "+per_pg_omap"; | |
9f95a23c | 988 | } |
7c673cae FG |
989 | return s; |
990 | } | |
991 | ||
992 | bool has_flag(unsigned f) const { | |
993 | return flags & f; | |
994 | } | |
995 | ||
996 | void set_flag(unsigned f) { | |
997 | flags |= f; | |
998 | } | |
999 | ||
1000 | void clear_flag(unsigned f) { | |
1001 | flags &= ~f; | |
1002 | } | |
1003 | ||
1004 | bool has_omap() const { | |
1005 | return has_flag(FLAG_OMAP); | |
1006 | } | |
522d829b TL |
1007 | |
1008 | static bool is_pgmeta_omap(uint8_t flags) { | |
1009 | return flags & FLAG_PGMETA_OMAP; | |
1010 | } | |
1011 | static bool is_perpool_omap(uint8_t flags) { | |
1012 | return flags & FLAG_PERPOOL_OMAP; | |
1013 | } | |
1014 | static bool is_perpg_omap(uint8_t flags) { | |
1015 | return flags & FLAG_PERPG_OMAP; | |
1016 | } | |
11fdf7f2 TL |
1017 | bool is_pgmeta_omap() const { |
1018 | return has_flag(FLAG_PGMETA_OMAP); | |
1019 | } | |
9f95a23c TL |
1020 | bool is_perpool_omap() const { |
1021 | return has_flag(FLAG_PERPOOL_OMAP); | |
1022 | } | |
f67539c2 TL |
1023 | bool is_perpg_omap() const { |
1024 | return has_flag(FLAG_PERPG_OMAP); | |
1025 | } | |
7c673cae | 1026 | |
522d829b TL |
1027 | void set_omap_flags(bool legacy) { |
1028 | set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP))); | |
9f95a23c TL |
1029 | } |
1030 | void set_omap_flags_pgmeta() { | |
1031 | set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP); | |
7c673cae FG |
1032 | } |
1033 | ||
1034 | void clear_omap_flag() { | |
20effc67 TL |
1035 | clear_flag(FLAG_OMAP | |
1036 | FLAG_PGMETA_OMAP | | |
1037 | FLAG_PERPOOL_OMAP | | |
1038 | FLAG_PERPG_OMAP); | |
7c673cae FG |
1039 | } |
1040 | ||
1041 | DENC(bluestore_onode_t, v, p) { | |
20effc67 | 1042 | DENC_START(2, 1, p); |
7c673cae FG |
1043 | denc_varint(v.nid, p); |
1044 | denc_varint(v.size, p); | |
1045 | denc(v.attrs, p); | |
1046 | denc(v.flags, p); | |
1047 | denc(v.extent_map_shards, p); | |
1048 | denc_varint(v.expected_object_size, p); | |
1049 | denc_varint(v.expected_write_size, p); | |
1050 | denc_varint(v.alloc_hint_flags, p); | |
20effc67 TL |
1051 | if (struct_v >= 2) { |
1052 | denc(v.zone_offset_refs, p); | |
1053 | } | |
7c673cae FG |
1054 | DENC_FINISH(p); |
1055 | } | |
f67539c2 TL |
1056 | void dump(ceph::Formatter *f) const; |
1057 | static void generate_test_instances(std::list<bluestore_onode_t*>& o); | |
7c673cae FG |
1058 | }; |
1059 | WRITE_CLASS_DENC(bluestore_onode_t::shard_info) | |
1060 | WRITE_CLASS_DENC(bluestore_onode_t) | |
1061 | ||
f67539c2 | 1062 | std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si); |
7c673cae FG |
1063 | |
1064 | /// writeahead-logged op | |
1065 | struct bluestore_deferred_op_t { | |
1066 | typedef enum { | |
1067 | OP_WRITE = 1, | |
1068 | } type_t; | |
1069 | __u8 op = 0; | |
1070 | ||
1071 | PExtentVector extents; | |
f67539c2 | 1072 | ceph::buffer::list data; |
7c673cae FG |
1073 | |
1074 | DENC(bluestore_deferred_op_t, v, p) { | |
1075 | DENC_START(1, 1, p); | |
1076 | denc(v.op, p); | |
1077 | denc(v.extents, p); | |
1078 | denc(v.data, p); | |
1079 | DENC_FINISH(p); | |
1080 | } | |
f67539c2 TL |
1081 | void dump(ceph::Formatter *f) const; |
1082 | static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o); | |
7c673cae FG |
1083 | }; |
1084 | WRITE_CLASS_DENC(bluestore_deferred_op_t) | |
1085 | ||
1086 | ||
1087 | /// writeahead-logged transaction | |
1088 | struct bluestore_deferred_transaction_t { | |
1089 | uint64_t seq = 0; | |
f67539c2 | 1090 | std::list<bluestore_deferred_op_t> ops; |
7c673cae FG |
1091 | interval_set<uint64_t> released; ///< allocations to release after tx |
1092 | ||
1093 | bluestore_deferred_transaction_t() : seq(0) {} | |
1094 | ||
1095 | DENC(bluestore_deferred_transaction_t, v, p) { | |
1096 | DENC_START(1, 1, p); | |
1097 | denc(v.seq, p); | |
1098 | denc(v.ops, p); | |
1099 | denc(v.released, p); | |
1100 | DENC_FINISH(p); | |
1101 | } | |
f67539c2 TL |
1102 | void dump(ceph::Formatter *f) const; |
1103 | static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o); | |
7c673cae FG |
1104 | }; |
1105 | WRITE_CLASS_DENC(bluestore_deferred_transaction_t) | |
1106 | ||
1107 | struct bluestore_compression_header_t { | |
1108 | uint8_t type = Compressor::COMP_ALG_NONE; | |
1109 | uint32_t length = 0; | |
1e59de90 | 1110 | std::optional<int32_t> compressor_message; |
7c673cae FG |
1111 | |
1112 | bluestore_compression_header_t() {} | |
1113 | bluestore_compression_header_t(uint8_t _type) | |
1114 | : type(_type) {} | |
1115 | ||
1116 | DENC(bluestore_compression_header_t, v, p) { | |
f67539c2 | 1117 | DENC_START(2, 1, p); |
7c673cae FG |
1118 | denc(v.type, p); |
1119 | denc(v.length, p); | |
f67539c2 TL |
1120 | if (struct_v >= 2) { |
1121 | denc(v.compressor_message, p); | |
1122 | } | |
7c673cae FG |
1123 | DENC_FINISH(p); |
1124 | } | |
f67539c2 TL |
1125 | void dump(ceph::Formatter *f) const; |
1126 | static void generate_test_instances(std::list<bluestore_compression_header_t*>& o); | |
7c673cae FG |
1127 | }; |
1128 | WRITE_CLASS_DENC(bluestore_compression_header_t) | |
1129 | ||
20effc67 TL |
1130 | template <template <typename> typename V, class COUNTER_TYPE = int32_t> |
1131 | class ref_counter_2hash_tracker_t { | |
1132 | size_t num_non_zero = 0; | |
1133 | size_t num_buckets = 0; | |
1134 | V<COUNTER_TYPE> buckets1; | |
1135 | V<COUNTER_TYPE> buckets2; | |
1136 | ||
1137 | public: | |
1138 | ref_counter_2hash_tracker_t(uint64_t mem_cap) { | |
1139 | num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2; | |
1140 | ceph_assert(num_buckets); | |
1141 | buckets1.resize(num_buckets); | |
1142 | buckets2.resize(num_buckets); | |
1143 | reset(); | |
1144 | } | |
1145 | ||
1146 | size_t get_num_buckets() const { | |
1147 | return num_buckets; | |
1148 | } | |
1149 | ||
1150 | void inc(const char* hash_val, size_t hash_val_len, int n) { | |
1151 | auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) % | |
1152 | num_buckets; | |
1153 | if (buckets1[h] == 0 && n) { | |
1154 | ++num_non_zero; | |
1155 | } else if (buckets1[h] == -n) { | |
1156 | --num_non_zero; | |
1157 | } | |
1158 | buckets1[h] += n; | |
1159 | h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets; | |
1160 | if (buckets2[h] == 0 && n) { | |
1161 | ++num_non_zero; | |
1162 | } else if (buckets2[h] == -n) { | |
1163 | --num_non_zero; | |
1164 | } | |
1165 | buckets2[h] += n; | |
1166 | } | |
1167 | ||
1168 | bool test_hash_conflict( | |
1169 | const char* hash_val1, | |
1170 | const char* hash_val2, | |
1171 | size_t hash_val_len) const { | |
1172 | ||
1173 | auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len); | |
1174 | auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len); | |
1175 | auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len); | |
1176 | auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len); | |
1177 | return ((h1 % num_buckets) == (h2 % num_buckets)) && | |
1178 | ((h3 % num_buckets) == (h4 % num_buckets)); | |
1179 | } | |
1180 | ||
1181 | bool test_all_zero(const char* hash_val, size_t hash_val_len) const { | |
1182 | auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len); | |
1183 | if (buckets1[h % num_buckets] != 0) { | |
1184 | return false; | |
1185 | } | |
1186 | h = ceph_str_hash_linux((const char*)hash_val, hash_val_len); | |
1187 | return buckets2[h % num_buckets] == 0; | |
1188 | } | |
1189 | ||
1190 | // returns number of mismatching buckets | |
1191 | size_t count_non_zero() const { | |
1192 | return num_non_zero; | |
1193 | } | |
1194 | void reset() { | |
1195 | for (size_t i = 0; i < num_buckets; i++) { | |
1196 | buckets1[i] = 0; | |
1197 | buckets2[i] = 0; | |
1198 | } | |
1199 | num_non_zero = 0; | |
1200 | } | |
1201 | }; | |
1202 | ||
1203 | class shared_blob_2hash_tracker_t | |
1204 | : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> { | |
1205 | ||
1206 | static const size_t hash_input_len = 3; | |
1207 | ||
1208 | typedef std::array<uint64_t, hash_input_len> hash_input_t; | |
1209 | ||
1210 | static size_t get_hash_input_size() { | |
1211 | return hash_input_len * sizeof(hash_input_t::value_type); | |
1212 | } | |
1213 | ||
1214 | inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const; | |
1215 | ||
1216 | size_t au_void_bits = 0; | |
1217 | ||
1218 | ||
1219 | public: | |
1220 | shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit) | |
1221 | : ref_counter_2hash_tracker_t(mem_cap) { | |
1222 | ceph_assert(alloc_unit); | |
1e59de90 TL |
1223 | ceph_assert(std::has_single_bit(alloc_unit)); |
1224 | au_void_bits = std::countr_zero(alloc_unit); | |
20effc67 TL |
1225 | } |
1226 | void inc(uint64_t sbid, uint64_t offset, int n); | |
1227 | void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n); | |
1228 | ||
1229 | bool test_hash_conflict( | |
1230 | uint64_t sbid, | |
1231 | uint64_t offset, | |
1232 | uint64_t sbid2, | |
1233 | uint64_t offset2) const; | |
1234 | bool test_all_zero( | |
1235 | uint64_t sbid, | |
1236 | uint64_t offset) const; | |
1237 | bool test_all_zero_range( | |
1238 | uint64_t sbid, | |
1239 | uint64_t offset, | |
1240 | uint32_t len) const; | |
1241 | }; | |
1242 | ||
1243 | class sb_info_t { | |
1244 | // subzero value indicates (potentially) stray blob, | |
1245 | // i.e. blob that has got no real references from onodes | |
1246 | int64_t sbid = 0; | |
1247 | ||
1248 | public: | |
1249 | enum { | |
1250 | INVALID_POOL_ID = INT64_MIN | |
1251 | }; | |
1252 | ||
1253 | int64_t pool_id = INVALID_POOL_ID; | |
1254 | // subzero value indicates compressed_allocated as well | |
1255 | int32_t allocated_chunks = 0; | |
1256 | ||
1257 | sb_info_t(int64_t _sbid = 0) : sbid(_sbid) | |
1258 | { | |
1259 | } | |
1260 | bool operator< (const sb_info_t& other) const { | |
1261 | return std::abs(sbid) < std::abs(other.sbid); | |
1262 | } | |
1263 | bool operator< (const uint64_t& other_sbid) const { | |
1264 | return uint64_t(std::abs(sbid)) < other_sbid; | |
1265 | } | |
1266 | bool is_stray() const { | |
1267 | return sbid < 0; | |
1268 | } | |
1269 | uint64_t get_sbid() const { | |
1270 | return uint64_t(std::abs(sbid)); | |
1271 | } | |
1272 | void adopt() { | |
1273 | sbid = std::abs(sbid); | |
1274 | } | |
1275 | } __attribute__((packed)); | |
1276 | ||
1277 | // Space-efficient container to keep a set of sb_info structures | |
1278 | // given that the majority of entries are appended in a proper id-sorted | |
1279 | // order. Hence one can keep them in a regular vector and apply binary search | |
1280 | // whenever specific entry to be found. | |
1281 | // For the rare occasions when out-of-order append takes place - an auxilliary | |
1282 | // regular map is used. | |
1283 | struct sb_info_space_efficient_map_t { | |
1284 | // large array sorted by the user | |
1285 | mempool::bluestore_fsck::vector<sb_info_t> items; | |
1286 | // small additional set of items we maintain sorting ourselves | |
1287 | // this would never keep an entry with id > items.back().id | |
1288 | mempool::bluestore_fsck::vector<sb_info_t> aux_items; | |
1289 | ||
1290 | sb_info_t& add_maybe_stray(uint64_t sbid) { | |
1291 | return _add(-int64_t(sbid)); | |
1292 | } | |
1293 | sb_info_t& add_or_adopt(uint64_t sbid) { | |
1294 | auto& r = _add(sbid); | |
1295 | r.adopt(); | |
1296 | return r; | |
1297 | } | |
1298 | auto find(uint64_t id) { | |
1299 | if (items.size() != 0) { | |
1300 | auto it = std::lower_bound( | |
1301 | items.begin(), | |
1302 | items.end() - 1, | |
1303 | id, | |
1304 | [](const sb_info_t& a, const uint64_t& b) { | |
1305 | return a < b; | |
1306 | }); | |
1307 | if (it->get_sbid() == id) { | |
1308 | return it; | |
1309 | } | |
1310 | if (aux_items.size() != 0) { | |
1311 | auto it = std::lower_bound( | |
1312 | aux_items.begin(), | |
1313 | aux_items.end(), | |
1314 | id, | |
1315 | [](const sb_info_t& a, const uint64_t& b) { | |
1316 | return a < b; | |
1317 | }); | |
1318 | if (it->get_sbid() == id) { | |
1319 | return it; | |
1320 | } | |
1321 | } | |
1322 | } | |
1323 | return items.end(); | |
1324 | } | |
1325 | // enumerates strays, order isn't guaranteed. | |
1326 | void foreach_stray(std::function<void(const sb_info_t&)> cb) { | |
1327 | for (auto& sbi : items) { | |
1328 | if (sbi.is_stray()) { | |
1329 | cb(sbi); | |
1330 | } | |
1331 | } | |
1332 | for (auto& sbi : aux_items) { | |
1333 | if (sbi.is_stray()) { | |
1334 | cb(sbi); | |
1335 | } | |
1336 | } | |
1337 | } | |
1338 | auto end() { | |
1339 | return items.end(); | |
1340 | } | |
1341 | ||
1342 | void shrink() { | |
1343 | items.shrink_to_fit(); | |
1344 | aux_items.shrink_to_fit(); | |
1345 | } | |
1346 | void clear() { | |
1347 | items.clear(); | |
1348 | aux_items.clear(); | |
1349 | shrink(); | |
1350 | } | |
1351 | private: | |
1352 | sb_info_t& _add(int64_t id) { | |
1353 | uint64_t n_id = uint64_t(std::abs(id)); | |
1354 | if (items.size() == 0 || n_id > items.back().get_sbid()) { | |
1355 | return items.emplace_back(id); | |
1356 | } | |
1357 | auto it = find(n_id); | |
1358 | if (it != items.end()) { | |
1359 | return *it; | |
1360 | } | |
1361 | if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) { | |
1362 | return aux_items.emplace_back(id); | |
1363 | } | |
1364 | // do sorted insertion, may be expensive! | |
1365 | it = std::upper_bound( | |
1366 | aux_items.begin(), | |
1367 | aux_items.end(), | |
1368 | n_id, | |
1369 | [](const uint64_t& a, const sb_info_t& b) { | |
1370 | return a < b.get_sbid(); | |
1371 | }); | |
1372 | return *aux_items.emplace(it, id); | |
1373 | } | |
1374 | }; | |
7c673cae FG |
1375 | |
1376 | #endif |