ceph/src/os/bluestore/BlueStore.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <unistd.h>
16 #include <stdlib.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20 #include <algorithm>
21
22 #include <boost/container/flat_set.hpp>
23 #include <boost/algorithm/string.hpp>
24
25 #include "include/cpp-btree/btree_set.h"
26
27 #include "BlueStore.h"
28 #include "bluestore_common.h"
29 #include "simple_bitmap.h"
30 #include "os/kv.h"
31 #include "include/compat.h"
32 #include "include/intarith.h"
33 #include "include/stringify.h"
34 #include "include/str_map.h"
35 #include "include/util.h"
36 #include "common/errno.h"
37 #include "common/safe_io.h"
38 #include "common/PriorityCache.h"
39 #include "common/url_escape.h"
40 #include "Allocator.h"
41 #include "FreelistManager.h"
42 #include "BlueFS.h"
43 #include "BlueRocksEnv.h"
44 #include "auth/Crypto.h"
45 #include "common/EventTrace.h"
46 #include "perfglue/heap_profiler.h"
47 #include "common/blkdev.h"
48 #include "common/numa.h"
49 #include "common/pretty_binary.h"
50 #include "kv/KeyValueHistogram.h"
51
52 #ifdef HAVE_LIBZBD
53 #include "ZonedAllocator.h"
54 #include "ZonedFreelistManager.h"
55 #endif
56
57 #if defined(WITH_LTTNG)
58 #define TRACEPOINT_DEFINE
59 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
60 #include "tracing/bluestore.h"
61 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
62 #undef TRACEPOINT_DEFINE
63 #else
64 #define tracepoint(...)
65 #endif
66
67 #define dout_context cct
68 #define dout_subsys ceph_subsys_bluestore
69
70 using bid_t = decltype(BlueStore::Blob::id);
71
72 // bluestore_cache_onode
73 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
74 bluestore_cache_onode);
75
76 // bluestore_cache_other
77 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
78 bluestore_Buffer);
79 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
80 bluestore_Extent);
81 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
82 bluestore_Blob);
83 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
84 bluestore_SharedBlob);
85
86 // bluestore_txc
87 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
88 bluestore_txc);
89 using std::byte;
90 using std::deque;
91 using std::min;
92 using std::make_pair;
93 using std::numeric_limits;
94 using std::pair;
95 using std::less;
96 using std::list;
97 using std::make_unique;
98 using std::map;
99 using std::max;
100 using std::ostream;
101 using std::ostringstream;
102 using std::set;
103 using std::string;
104 using std::stringstream;
105 using std::unique_ptr;
106 using std::vector;
107
108 using ceph::bufferlist;
109 using ceph::bufferptr;
110 using ceph::coarse_mono_clock;
111 using ceph::decode;
112 using ceph::encode;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_timespan;
116 using ceph::mono_clock;
117 using ceph::mono_time;
118 using ceph::timespan_str;
119
120 // kv store prefixes
121 const string PREFIX_SUPER = "S"; // field -> value
122 const string PREFIX_STAT = "T"; // field -> value(int64 array)
123 const string PREFIX_COLL = "C"; // collection name -> cnode_t
124 const string PREFIX_OBJ = "O"; // object name -> onode_t
125 const string PREFIX_OMAP = "M"; // u64 + keyname -> value
126 const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
127 const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
128 const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
129 const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
130 const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
131 const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
132 const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
133
134 #ifdef HAVE_LIBZBD
135 const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
136 const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
137 const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
138 #endif
139
140 const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
141
142 // write a label in the first block. always use this size. note that
143 // bluefs makes a matching assumption about the location of its
144 // superblock (always the second block of the device).
145 #define BDEV_LABEL_BLOCK_SIZE 4096
146
147 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
148 #define SUPER_RESERVED 8192
149
150 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
151
152
153 /*
154 * extent map blob encoding
155 *
156 * we use the low bits of the blobid field to indicate some common scenarios
157 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
158 */
159 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
160 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
161 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
162 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
163 #define BLOBID_SHIFT_BITS 4
164
165 /*
166 * object name key structure
167 *
168 * encoded u8: shard + 2^7 (so that it sorts properly)
169 * encoded u64: poolid + 2^63 (so that it sorts properly)
170 * encoded u32: hash (bit reversed)
171 *
172 * escaped string: namespace
173 *
174 * escaped string: key or object name
175 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
176 * we are done. otherwise, we are followed by the object name.
177 * escaped string: object name (unless '=' above)
178 *
179 * encoded u64: snap
180 * encoded u64: generation
181 * 'o'
182 */
183 #define ONODE_KEY_SUFFIX 'o'
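// As a sketch (hypothetical object: pool 1, empty namespace, no key,
// name "foo"), the key bytes are laid out roughly as
//   <shard_id + 0x80> <pool + 2^63, u64> <bit-reversed hash, u32>
//   '!'                      (empty namespace, '!' terminator)
//   'f' 'o' 'o' '!' '='      (object name; '=' means key == name)
//   <snap, u64> <generation, u64> 'o'
// Integer fields are encoded big-endian (see os/kv.h) so a bytewise key
// comparison matches their numeric order.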
184
185 /*
186 * extent shard key
187 *
188 * object prefix key
189 * u32
190 * 'x'
191 */
192 #define EXTENT_SHARD_KEY_SUFFIX 'x'
193
194 /*
195 * string encoding in the key
196 *
197 * The key string needs to lexicographically sort the same way that
198 * ghobject_t does. We do this by escaping anything <= to '#' with #
199 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
200 * hex digits.
201 *
202 * We use ! as a terminator for strings; this works because it is < #
203 * and will get escaped if it is present in the string.
204 *
205 * NOTE: There is a bug in this implementation: due to implicit
206 * character type conversion in comparison it may produce unexpected
207 * ordering. Unfortunately fixing the bug would mean invalidating the
208 * keys in existing deployments. Instead we do additional sorting
209 * where it is needed.
210 */
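// For example (illustrative input), the string "a#b!" escapes to
// "a#23b#21" plus the terminator, i.e. "a#23b#21!": '#' (0x23) and '!'
// (0x21) are both <= '#' and therefore escaped, so a bare '!' can only
// ever appear as the terminator.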
211 template<typename S>
212 static void append_escaped(const string &in, S *out)
213 {
214 char hexbyte[in.length() * 3 + 1];
215 char* ptr = &hexbyte[0];
216 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
217 if (*i <= '#') { // bug: unexpected result for *i > 0x7f
218 *ptr++ = '#';
219 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
220 *ptr++ = "0123456789abcdef"[*i & 0x0f];
221 } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
222 *ptr++ = '~';
223 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
224 *ptr++ = "0123456789abcdef"[*i & 0x0f];
225 } else {
226 *ptr++ = *i;
227 }
228 }
229 *ptr++ = '!';
230 out->append(hexbyte, ptr - &hexbyte[0]);
231 }
232
233 inline unsigned h2i(char c)
234 {
235 if ((c >= '0') && (c <= '9')) {
236 return c - 0x30;
237 } else if ((c >= 'a') && (c <= 'f')) {
238 return c - 'a' + 10;
239 } else if ((c >= 'A') && (c <= 'F')) {
240 return c - 'A' + 10;
241 } else {
242 return 256; // make it always larger than 255
243 }
244 }
245
246 static int decode_escaped(const char *p, string *out)
247 {
248 char buff[256];
249 char* ptr = &buff[0];
250 char* max = &buff[252];
251 const char *orig_p = p;
252 while (*p && *p != '!') {
253 if (*p == '#' || *p == '~') {
254 unsigned hex = 0;
255 p++;
256 hex = h2i(*p++) << 4;
257 if (hex > 255) {
258 return -EINVAL;
259 }
260 hex |= h2i(*p++);
261 if (hex > 255) {
262 return -EINVAL;
263 }
264 *ptr++ = hex;
265 } else {
266 *ptr++ = *p++;
267 }
268 if (ptr > max) {
269 out->append(buff, ptr-buff);
270 ptr = &buff[0];
271 }
272 }
273 if (ptr != buff) {
274 out->append(buff, ptr-buff);
275 }
276 return p - orig_p;
277 }
278
279 template<typename T>
280 static void _key_encode_shard(shard_id_t shard, T *key)
281 {
282 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
283 }
284
285 static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
286 {
287 pshard->id = (uint8_t)*key - (uint8_t)0x80;
288 return key + 1;
289 }
290
291 static void get_coll_range(const coll_t& cid, int bits,
292 ghobject_t *temp_start, ghobject_t *temp_end,
293 ghobject_t *start, ghobject_t *end, bool legacy)
294 {
295 spg_t pgid;
296 constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
297 // use different nspaces because we use different schemes when encoding
298 // keys for listing objects
299 const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
300 if (cid.is_pg(&pgid)) {
301 start->shard_id = pgid.shard;
302 *temp_start = *start;
303
304 start->hobj.pool = pgid.pool();
305 temp_start->hobj.pool = -2ll - pgid.pool();
306
307 *end = *start;
308 *temp_end = *temp_start;
309
310 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
311 start->hobj.set_bitwise_key_u32(reverse_hash);
312 temp_start->hobj.set_bitwise_key_u32(reverse_hash);
313
314 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
315 if (end_hash > MAX_HASH) {
316 // make sure end hobj is even greater than the maximum possible hobj
317 end->hobj.set_bitwise_key_u32(MAX_HASH);
318 temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
319 end->hobj.nspace = MAX_NSPACE;
320 } else {
321 end->hobj.set_bitwise_key_u32(end_hash);
322 temp_end->hobj.set_bitwise_key_u32(end_hash);
323 }
324 } else {
325 start->shard_id = shard_id_t::NO_SHARD;
326 start->hobj.pool = -1ull;
327
328 *end = *start;
329 start->hobj.set_bitwise_key_u32(0);
330 end->hobj.set_bitwise_key_u32(MAX_HASH);
331 end->hobj.nspace = MAX_NSPACE;
332 // no separate temp section
333 *temp_start = *end;
334 *temp_end = *end;
335 }
336
337 start->generation = 0;
338 end->generation = 0;
339 temp_start->generation = 0;
340 temp_end->generation = 0;
341 }
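// Example (hypothetical pg): for pg 1.4 with bits = 3, ps() == 4 reverses
// to 0x20000000 and end_hash = 0x20000000 + (1ull << 29) = 0x40000000, so
// the listing range covers the 1/8th of the 32-bit hash space owned by
// that pg.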
342
343 static void get_shared_blob_key(uint64_t sbid, string *key)
344 {
345 key->clear();
346 _key_encode_u64(sbid, key);
347 }
348
349 static int get_key_shared_blob(const string& key, uint64_t *sbid)
350 {
351 const char *p = key.c_str();
352 if (key.length() < sizeof(uint64_t))
353 return -1;
354 _key_decode_u64(p, sbid);
355 return 0;
356 }
357
358 template<typename S>
359 static void _key_encode_prefix(const ghobject_t& oid, S *key)
360 {
361 _key_encode_shard(oid.shard_id, key);
362 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
363 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
364 }
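// Note the effect of the 2^63 bias above: pool -1 encodes as
// 0x7fffffffffffffff and pool 0 as 0x8000000000000000, so negative pool
// ids (meta and temp collections) sort ahead of regular pools under a
// plain bytewise comparison.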
365
366 static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
367 {
368 p = _key_decode_shard(p, &oid->shard_id);
369
370 uint64_t pool;
371 p = _key_decode_u64(p, &pool);
372 oid->hobj.pool = pool - 0x8000000000000000ull;
373
374 unsigned hash;
375 p = _key_decode_u32(p, &hash);
376
377 oid->hobj.set_bitwise_key_u32(hash);
378
379 return p;
380 }
381
382 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
383
384 static int _get_key_object(const char *p, ghobject_t *oid)
385 {
386 int r;
387
388 p = _key_decode_prefix(p, oid);
389
390 r = decode_escaped(p, &oid->hobj.nspace);
391 if (r < 0)
392 return -2;
393 p += r + 1;
394
395 string k;
396 r = decode_escaped(p, &k);
397 if (r < 0)
398 return -3;
399 p += r + 1;
400 if (*p == '=') {
401 // no key
402 ++p;
403 oid->hobj.oid.name = k;
404 } else if (*p == '<' || *p == '>') {
405 // key + name
406 ++p;
407 r = decode_escaped(p, &oid->hobj.oid.name);
408 if (r < 0)
409 return -5;
410 p += r + 1;
411 oid->hobj.set_key(k);
412 } else {
413 // malformed
414 return -6;
415 }
416
417 p = _key_decode_u64(p, &oid->hobj.snap.val);
418 p = _key_decode_u64(p, &oid->generation);
419
420 if (*p != ONODE_KEY_SUFFIX) {
421 return -7;
422 }
423 p++;
424 if (*p) {
425 // if we get something other than a null terminator here,
426 // something is wrong.
427 return -8;
428 }
429
430 return 0;
431 }
432
433 template<typename S>
434 static int get_key_object(const S& key, ghobject_t *oid)
435 {
436 if (key.length() < ENCODED_KEY_PREFIX_LEN)
437 return -1;
438 if (key.length() == ENCODED_KEY_PREFIX_LEN)
439 return -2;
440 const char *p = key.c_str();
441 return _get_key_object(p, oid);
442 }
443
444 template<typename S>
445 static void _get_object_key(const ghobject_t& oid, S *key)
446 {
447 size_t max_len = ENCODED_KEY_PREFIX_LEN +
448 (oid.hobj.nspace.length() * 3 + 1) +
449 (oid.hobj.get_key().length() * 3 + 1) +
450 1 + // for '<', '=', or '>'
451 (oid.hobj.oid.name.length() * 3 + 1) +
452 8 + 8 + 1;
453 key->reserve(max_len);
454
455 _key_encode_prefix(oid, key);
456
457 append_escaped(oid.hobj.nspace, key);
458
459 if (oid.hobj.get_key().length()) {
460 // is a key... could be < = or >.
461 append_escaped(oid.hobj.get_key(), key);
462 // (ASCII chars < = and > sort in that order, yay)
463 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
464 if (r) {
465 key->append(r > 0 ? ">" : "<");
466 append_escaped(oid.hobj.oid.name, key);
467 } else {
468 // same as no key
469 key->append("=");
470 }
471 } else {
472 // no key
473 append_escaped(oid.hobj.oid.name, key);
474 key->append("=");
475 }
476
477 _key_encode_u64(oid.hobj.snap, key);
478 _key_encode_u64(oid.generation, key);
479
480 key->push_back(ONODE_KEY_SUFFIX);
481 }
482
483 template<typename S>
484 static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
485 {
486 key->clear();
487 _get_object_key(oid, key);
488
489 // sanity check
490 if (true) {
491 ghobject_t t;
492 int r = get_key_object(*key, &t);
493 if (r || t != oid) {
494 derr << " r " << r << dendl;
495 derr << "key " << pretty_binary_string(*key) << dendl;
496 derr << "oid " << oid << dendl;
497 derr << " t " << t << dendl;
498 ceph_assert(r == 0 && t == oid);
499 }
500 }
501 }
502
503 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
504 // char lets us quickly test whether it is a shard key without decoding any
505 // of the prefix bytes.
506 template<typename S>
507 static void get_extent_shard_key(const S& onode_key, uint32_t offset,
508 string *key)
509 {
510 key->clear();
511 key->reserve(onode_key.length() + 4 + 1);
512 key->append(onode_key.c_str(), onode_key.size());
513 _key_encode_u32(offset, key);
514 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
515 }
516
517 static void rewrite_extent_shard_key(uint32_t offset, string *key)
518 {
519 ceph_assert(key->size() > sizeof(uint32_t) + 1);
520 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
521 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
522 }
523
524 template<typename S>
525 static void generate_extent_shard_key_and_apply(
526 const S& onode_key,
527 uint32_t offset,
528 string *key,
529 std::function<void(const string& final_key)> apply)
530 {
531 if (key->empty()) { // make full key
532 ceph_assert(!onode_key.empty());
533 get_extent_shard_key(onode_key, offset, key);
534 } else {
535 rewrite_extent_shard_key(offset, key);
536 }
537 apply(*key);
538 }
539
540 int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
541 {
542 ceph_assert(key.size() > sizeof(uint32_t) + 1);
543 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
544 int okey_len = key.size() - sizeof(uint32_t) - 1;
545 *onode_key = key.substr(0, okey_len);
546 const char *p = key.data() + okey_len;
547 _key_decode_u32(p, offset);
548 return 0;
549 }
550
551 static bool is_extent_shard_key(const string& key)
552 {
553 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
554 }
555
556 static void get_deferred_key(uint64_t seq, string *out)
557 {
558 _key_encode_u64(seq, out);
559 }
560
561 static void get_pool_stat_key(int64_t pool_id, string *key)
562 {
563 key->clear();
564 _key_encode_u64(pool_id, key);
565 }
566
567 static int get_key_pool_stat(const string& key, uint64_t* pool_id)
568 {
569 const char *p = key.c_str();
570 if (key.length() < sizeof(uint64_t))
571 return -1;
572 _key_decode_u64(p, pool_id);
573 return 0;
574 }
575
576 #ifdef HAVE_LIBZBD
577 static void get_zone_offset_object_key(
578 uint32_t zone,
579 uint64_t offset,
580 ghobject_t oid,
581 std::string *key)
582 {
583 key->clear();
584 _key_encode_u32(zone, key);
585 _key_encode_u64(offset, key);
586 _get_object_key(oid, key);
587 }
588
589 static int get_key_zone_offset_object(
590 const string& key,
591 uint32_t *zone,
592 uint64_t *offset,
593 ghobject_t *oid)
594 {
595 const char *p = key.c_str();
596 if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
597 return -1;
598 p = _key_decode_u32(p, zone);
599 p = _key_decode_u64(p, offset);
600 int r = _get_key_object(p, oid);
601 if (r < 0) {
602 return r;
603 }
604 return 0;
605 }
606 #endif
607
608 template <int LogLevelV>
609 void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
610 {
611 uint64_t pos = 0;
612 for (auto& s : em.shards) {
613 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
614 << (s.loaded ? " (loaded)" : "")
615 << (s.dirty ? " (dirty)" : "")
616 << dendl;
617 }
618 for (auto& e : em.extent_map) {
619 dout(LogLevelV) << __func__ << " " << e << dendl;
620 ceph_assert(e.logical_offset >= pos);
621 pos = e.logical_offset + e.length;
622 const bluestore_blob_t& blob = e.blob->get_blob();
623 if (blob.has_csum()) {
624 vector<uint64_t> v;
625 unsigned n = blob.get_csum_count();
626 for (unsigned i = 0; i < n; ++i)
627 v.push_back(blob.get_csum_item(i));
628 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
629 << dendl;
630 }
631 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
632 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
633 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
634 << "~" << i.second->length << std::dec
635 << " " << *i.second << dendl;
636 }
637 }
638 }
639
640 template <int LogLevelV>
641 void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
642 {
643 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
644 return;
645 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
646 << " nid " << o.onode.nid
647 << " size 0x" << std::hex << o.onode.size
648 << " (" << std::dec << o.onode.size << ")"
649 << " expected_object_size " << o.onode.expected_object_size
650 << " expected_write_size " << o.onode.expected_write_size
651 << " in " << o.onode.extent_map_shards.size() << " shards"
652 << ", " << o.extent_map.spanning_blob_map.size()
653 << " spanning blobs"
654 << dendl;
655 for (auto& [zone, offset] : o.onode.zone_offset_refs) {
656 dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
657 << " offset 0x" << offset << std::dec << dendl;
658 }
659 for (auto p = o.onode.attrs.begin();
660 p != o.onode.attrs.end();
661 ++p) {
662 dout(LogLevelV) << __func__ << " attr " << p->first
663 << " len " << p->second.length() << dendl;
664 }
665 _dump_extent_map<LogLevelV>(cct, o.extent_map);
666 }
667
668 template <int LogLevelV>
669 void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
670 {
671 dout(LogLevelV) << __func__ << " transaction dump:\n";
672 JSONFormatter f(true);
673 f.open_object_section("transaction");
674 t->dump(&f);
675 f.close_section();
676 f.flush(*_dout);
677 *_dout << dendl;
678 }
679
680 // Buffer
681
682 ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
683 {
684 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
685 << b.offset << "~" << b.length << std::dec
686 << " " << BlueStore::Buffer::get_state_name(b.state);
687 if (b.flags)
688 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
689 return out << ")";
690 }
691
692 namespace {
693
694 /*
695 * Due to a bug in key string encoding (see a comment for append_escaped)
696 * the KeyValueDB iterator does not lexicographically sort the same
697 * way that ghobject_t does: objects with the same hash may end up in the wrong order.
698 *
699 * This is the iterator wrapper that fixes the key ordering.
700 */
701
702 class CollectionListIterator {
703 public:
704 CollectionListIterator(const KeyValueDB::Iterator &it)
705 : m_it(it) {
706 }
707 virtual ~CollectionListIterator() {
708 }
709
710 virtual bool valid() const = 0;
711 virtual const ghobject_t &oid() const = 0;
712 virtual void lower_bound(const ghobject_t &oid) = 0;
713 virtual void upper_bound(const ghobject_t &oid) = 0;
714 virtual void next() = 0;
715
716 virtual int cmp(const ghobject_t &oid) const = 0;
717
718 bool is_ge(const ghobject_t &oid) const {
719 return cmp(oid) >= 0;
720 }
721
722 bool is_lt(const ghobject_t &oid) const {
723 return cmp(oid) < 0;
724 }
725
726 protected:
727 KeyValueDB::Iterator m_it;
728 };
729
730 class SimpleCollectionListIterator : public CollectionListIterator {
731 public:
732 SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
733 : CollectionListIterator(it), m_cct(cct) {
734 }
735
736 bool valid() const override {
737 return m_it->valid();
738 }
739
740 const ghobject_t &oid() const override {
741 ceph_assert(valid());
742
743 return m_oid;
744 }
745
746 void lower_bound(const ghobject_t &oid) override {
747 string key;
748 get_object_key(m_cct, oid, &key);
749
750 m_it->lower_bound(key);
751 get_oid();
752 }
753
754 void upper_bound(const ghobject_t &oid) override {
755 string key;
756 get_object_key(m_cct, oid, &key);
757
758 m_it->upper_bound(key);
759 get_oid();
760 }
761
762 void next() override {
763 ceph_assert(valid());
764
765 m_it->next();
766 get_oid();
767 }
768
769 int cmp(const ghobject_t &oid) const override {
770 ceph_assert(valid());
771
772 string key;
773 get_object_key(m_cct, oid, &key);
774
775 return m_it->key().compare(key);
776 }
777
778 private:
779 CephContext *m_cct;
780 ghobject_t m_oid;
781
782 void get_oid() {
783 m_oid = ghobject_t();
784 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
785 m_it->next();
786 }
787 if (!valid()) {
788 return;
789 }
790
791 int r = get_key_object(m_it->key(), &m_oid);
792 ceph_assert(r == 0);
793 }
794 };
795
796 class SortedCollectionListIterator : public CollectionListIterator {
797 public:
798 SortedCollectionListIterator(const KeyValueDB::Iterator &it)
799 : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
800 }
801
802 bool valid() const override {
803 return m_chunk_iter != m_chunk.end();
804 }
805
806 const ghobject_t &oid() const override {
807 ceph_assert(valid());
808
809 return m_chunk_iter->first;
810 }
811
812 void lower_bound(const ghobject_t &oid) override {
813 std::string key;
814 _key_encode_prefix(oid, &key);
815
816 m_it->lower_bound(key);
817 m_chunk_iter = m_chunk.end();
818 if (!get_next_chunk()) {
819 return;
820 }
821
822 if (this->oid().shard_id != oid.shard_id ||
823 this->oid().hobj.pool != oid.hobj.pool ||
824 this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
825 return;
826 }
827
828 m_chunk_iter = m_chunk.lower_bound(oid);
829 if (m_chunk_iter == m_chunk.end()) {
830 get_next_chunk();
831 }
832 }
833
834 void upper_bound(const ghobject_t &oid) override {
835 lower_bound(oid);
836
837 if (valid() && this->oid() == oid) {
838 next();
839 }
840 }
841
842 void next() override {
843 ceph_assert(valid());
844
845 m_chunk_iter++;
846 if (m_chunk_iter == m_chunk.end()) {
847 get_next_chunk();
848 }
849 }
850
851 int cmp(const ghobject_t &oid) const override {
852 ceph_assert(valid());
853
854 if (this->oid() < oid) {
855 return -1;
856 }
857 if (this->oid() > oid) {
858 return 1;
859 }
860 return 0;
861 }
862
863 private:
864 std::map<ghobject_t, std::string> m_chunk;
865 std::map<ghobject_t, std::string>::iterator m_chunk_iter;
866
867 bool get_next_chunk() {
868 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
869 m_it->next();
870 }
871
872 if (!m_it->valid()) {
873 return false;
874 }
875
876 ghobject_t oid;
877 int r = get_key_object(m_it->key(), &oid);
878 ceph_assert(r == 0);
879
880 m_chunk.clear();
881 while (true) {
882 m_chunk.insert({oid, m_it->key()});
883
884 do {
885 m_it->next();
886 } while (m_it->valid() && is_extent_shard_key(m_it->key()));
887
888 if (!m_it->valid()) {
889 break;
890 }
891
892 ghobject_t next;
893 r = get_key_object(m_it->key(), &next);
894 ceph_assert(r == 0);
895 if (next.shard_id != oid.shard_id ||
896 next.hobj.pool != oid.hobj.pool ||
897 next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
898 break;
899 }
900 oid = next;
901 }
902
903 m_chunk_iter = m_chunk.begin();
904 return true;
905 }
906 };
907
908 } // anonymous namespace
909
910 // Garbage Collector
911
912 void BlueStore::GarbageCollector::process_protrusive_extents(
913 const BlueStore::ExtentMap& extent_map,
914 uint64_t start_offset,
915 uint64_t end_offset,
916 uint64_t start_touch_offset,
917 uint64_t end_touch_offset,
918 uint64_t min_alloc_size)
919 {
920 ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
921
922 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
923 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
924
925 dout(30) << __func__ << " (hex): [" << std::hex
926 << lookup_start_offset << ", " << lookup_end_offset
927 << ")" << std::dec << dendl;
928
929 for (auto it = extent_map.seek_lextent(lookup_start_offset);
930 it != extent_map.extent_map.end() &&
931 it->logical_offset < lookup_end_offset;
932 ++it) {
933 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
934 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
935
936 dout(30) << __func__ << " " << *it
937 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
938 << dendl;
939
940 Blob* b = it->blob.get();
941
942 if (it->logical_offset >= start_touch_offset &&
943 it->logical_end() <= end_touch_offset) {
944 // Process extents within the range affected by
945 // the current write request.
946 // Need to take into account if existing extents
947 // can be merged with them (uncompressed case)
948 if (!b->get_blob().is_compressed()) {
949 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
950 --blob_info_counted->expected_allocations; // don't need to allocate
951 // new AU for compressed
952 // data since another
953 // collocated uncompressed
954 // blob already exists
955 dout(30) << __func__ << " --expected:"
956 << alloc_unit_start << dendl;
957 }
958 used_alloc_unit = alloc_unit_end;
959 blob_info_counted = nullptr;
960 }
961 } else if (b->get_blob().is_compressed()) {
962
963 // additionally we take compressed blobs that were not impacted
964 // by the write into account too
965 BlobInfo& bi =
966 affected_blobs.emplace(
967 b, BlobInfo(b->get_referenced_bytes())).first->second;
968
969 int adjust =
970 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
971 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
972 dout(30) << __func__ << " expected_allocations="
973 << bi.expected_allocations << " end_au:"
974 << alloc_unit_end << dendl;
975
976 blob_info_counted = &bi;
977 used_alloc_unit = alloc_unit_end;
978
979 ceph_assert(it->length <= bi.referenced_bytes);
980 bi.referenced_bytes -= it->length;
981 dout(30) << __func__ << " affected_blob:" << *b
982 << " unref 0x" << std::hex << it->length
983 << " referenced = 0x" << bi.referenced_bytes
984 << std::dec << dendl;
985 // NOTE: we can't move a specific blob to the resulting GC list here
986 // when its reference counter reaches 0, since subsequent extents might
987 // decrement its expected_allocations.
988 // Hence we need to enumerate all the extents first.
989 if (!bi.collect_candidate) {
990 bi.first_lextent = it;
991 bi.collect_candidate = true;
992 }
993 bi.last_lextent = it;
994 } else {
995 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
996 // don't need to allocate new AU for compressed data since another
997 // collocated uncompressed blob already exists
998 --blob_info_counted->expected_allocations;
999 dout(30) << __func__ << " --expected_allocations:"
1000 << alloc_unit_start << dendl;
1001 }
1002 used_alloc_unit = alloc_unit_end;
1003 blob_info_counted = nullptr;
1004 }
1005 }
1006
1007 for (auto b_it = affected_blobs.begin();
1008 b_it != affected_blobs.end();
1009 ++b_it) {
1010 Blob* b = b_it->first;
1011 BlobInfo& bi = b_it->second;
1012 if (bi.referenced_bytes == 0) {
1013 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
1014 int64_t blob_expected_for_release =
1015 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
1016
1017 dout(30) << __func__ << " " << *(b_it->first)
1018 << " expected4release=" << blob_expected_for_release
1019 << " expected_allocations=" << bi.expected_allocations
1020 << dendl;
1021 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
1022 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
1023 if (bi.collect_candidate) {
1024 auto it = bi.first_lextent;
1025 bool bExit = false;
1026 do {
1027 if (it->blob.get() == b) {
1028 extents_to_collect.insert(it->logical_offset, it->length);
1029 }
1030 bExit = it == bi.last_lextent;
1031 ++it;
1032 } while (!bExit);
1033 }
1034 expected_for_release += blob_expected_for_release;
1035 expected_allocations += bi.expected_allocations;
1036 }
1037 }
1038 }
1039 }
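// A quick sanity example of the benefit math above (numbers are made up):
// a compressed blob occupying 64K on disk with min_alloc_size 16K gives
// blob_expected_for_release = 64K / 16K = 4 allocation units; if rewriting
// its protrusive extents is expected to cost expected_allocations = 1 new
// unit, the benefit is 3 and the blob is collected once that reaches
// bluestore_gc_enable_blob_threshold.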
1040
1041 int64_t BlueStore::GarbageCollector::estimate(
1042 uint64_t start_offset,
1043 uint64_t length,
1044 const BlueStore::ExtentMap& extent_map,
1045 const BlueStore::old_extent_map_t& old_extents,
1046 uint64_t min_alloc_size)
1047 {
1048
1049 affected_blobs.clear();
1050 extents_to_collect.clear();
1051 used_alloc_unit = boost::optional<uint64_t >();
1052 blob_info_counted = nullptr;
1053
1054 uint64_t gc_start_offset = start_offset;
1055 uint64_t gc_end_offset = start_offset + length;
1056
1057 uint64_t end_offset = start_offset + length;
1058
1059 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
1060 Blob* b = it->e.blob.get();
1061 if (b->get_blob().is_compressed()) {
1062
1063 // update gc_start_offset/gc_end_offset if needed
1064 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
1065 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
1066
1067 auto o = it->e.logical_offset;
1068 auto l = it->e.length;
1069
1070 uint64_t ref_bytes = b->get_referenced_bytes();
1071 // micro optimization to bypass blobs that have no more references
1072 if (ref_bytes != 0) {
1073 dout(30) << __func__ << " affected_blob:" << *b
1074 << " unref 0x" << std::hex << o << "~" << l
1075 << std::dec << dendl;
1076 affected_blobs.emplace(b, BlobInfo(ref_bytes));
1077 }
1078 }
1079 }
1080 dout(30) << __func__ << " gc range(hex): [" << std::hex
1081 << gc_start_offset << ", " << gc_end_offset
1082 << ")" << std::dec << dendl;
1083
1084 // enumerate preceding extents to check if they reference affected blobs
1085 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
1086 process_protrusive_extents(extent_map,
1087 gc_start_offset,
1088 gc_end_offset,
1089 start_offset,
1090 end_offset,
1091 min_alloc_size);
1092 }
1093 return expected_for_release - expected_allocations;
1094 }
1095
1096 // LruOnodeCacheShard
1097 struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
1098 typedef boost::intrusive::list<
1099 BlueStore::Onode,
1100 boost::intrusive::member_hook<
1101 BlueStore::Onode,
1102 boost::intrusive::list_member_hook<>,
1103 &BlueStore::Onode::lru_item> > list_t;
1104
1105 list_t lru;
1106
1107 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
1108
1109 void _add(BlueStore::Onode* o, int level) override
1110 {
1111 if (o->put_cache()) {
1112 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
1113 o->cache_age_bin = age_bins.front();
1114 *(o->cache_age_bin) += 1;
1115 } else {
1116 ++num_pinned;
1117 }
1118 ++num; // we count both pinned and unpinned entries
1119 dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
1120 << num << dendl;
1121 }
1122 void _rm(BlueStore::Onode* o) override
1123 {
1124 if (o->pop_cache()) {
1125 *(o->cache_age_bin) -= 1;
1126 lru.erase(lru.iterator_to(*o));
1127 } else {
1128 ceph_assert(num_pinned);
1129 --num_pinned;
1130 }
1131 ceph_assert(num);
1132 --num;
1133 dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
1134 }
1135 void _pin(BlueStore::Onode* o) override
1136 {
1137 *(o->cache_age_bin) -= 1;
1138 lru.erase(lru.iterator_to(*o));
1139 ++num_pinned;
1140 dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
1141 }
1142 void _unpin(BlueStore::Onode* o) override
1143 {
1144 lru.push_front(*o);
1145 o->cache_age_bin = age_bins.front();
1146 *(o->cache_age_bin) += 1;
1147 ceph_assert(num_pinned);
1148 --num_pinned;
1149 dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
1150 }
1151 void _unpin_and_rm(BlueStore::Onode* o) override
1152 {
1153 o->pop_cache();
1154 ceph_assert(num_pinned);
1155 --num_pinned;
1156 ceph_assert(num);
1157 --num;
1158 }
1159 void _trim_to(uint64_t new_size) override
1160 {
1161 if (new_size >= lru.size()) {
1162 return; // don't even try
1163 }
1164 uint64_t n = lru.size() - new_size;
1165 auto p = lru.end();
1166 ceph_assert(p != lru.begin());
1167 --p;
1168 ceph_assert(num >= n);
1169 num -= n;
1170 while (n-- > 0) {
1171 BlueStore::Onode *o = &*p;
1172 dout(20) << __func__ << " rm " << o->oid << " "
1173 << o->nref << " " << o->cached << " " << o->pinned << dendl;
1174 if (p != lru.begin()) {
1175 lru.erase(p--);
1176 } else {
1177 ceph_assert(n == 0);
1178 lru.erase(p);
1179 }
1180 *(o->cache_age_bin) -= 1;
1181 auto pinned = !o->pop_cache();
1182 ceph_assert(!pinned);
1183 o->c->onode_map._remove(o->oid);
1184 }
1185 }
1186 void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
1187 {
1188 if (to == this) {
1189 return;
1190 }
1191 ceph_assert(o->cached);
1192 ceph_assert(o->pinned);
1193 ceph_assert(num);
1194 ceph_assert(num_pinned);
1195 --num_pinned;
1196 --num;
1197 ++to->num_pinned;
1198 ++to->num;
1199 }
1200 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
1201 {
1202 *onodes += num;
1203 *pinned_onodes += num_pinned;
1204 }
1205 };
1206
1207 // OnodeCacheShard
1208 BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1209 CephContext* cct,
1210 string type,
1211 PerfCounters *logger)
1212 {
1213 BlueStore::OnodeCacheShard *c = nullptr;
1214 // Currently we only implement an LRU cache for onodes
1215 c = new LruOnodeCacheShard(cct);
1216 c->logger = logger;
1217 return c;
1218 }
1219
1220 // LruBufferCacheShard
1221 struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
1222 typedef boost::intrusive::list<
1223 BlueStore::Buffer,
1224 boost::intrusive::member_hook<
1225 BlueStore::Buffer,
1226 boost::intrusive::list_member_hook<>,
1227 &BlueStore::Buffer::lru_item> > list_t;
1228 list_t lru;
1229
1230 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
1231
1232 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
1233 if (near) {
1234 auto q = lru.iterator_to(*near);
1235 lru.insert(q, *b);
1236 } else if (level > 0) {
1237 lru.push_front(*b);
1238 } else {
1239 lru.push_back(*b);
1240 }
1241 buffer_bytes += b->length;
1242 b->cache_age_bin = age_bins.front();
1243 *(b->cache_age_bin) += b->length;
1244 num = lru.size();
1245 }
1246 void _rm(BlueStore::Buffer *b) override {
1247 ceph_assert(buffer_bytes >= b->length);
1248 buffer_bytes -= b->length;
1249 assert(*(b->cache_age_bin) >= b->length);
1250 *(b->cache_age_bin) -= b->length;
1251 auto q = lru.iterator_to(*b);
1252 lru.erase(q);
1253 num = lru.size();
1254 }
1255 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
1256 src->_rm(b);
1257 _add(b, 0, nullptr);
1258 }
1259 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
1260 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1261 buffer_bytes += delta;
1262 assert(*(b->cache_age_bin) + delta >= 0);
1263 *(b->cache_age_bin) += delta;
1264 }
1265 void _touch(BlueStore::Buffer *b) override {
1266 auto p = lru.iterator_to(*b);
1267 lru.erase(p);
1268 lru.push_front(*b);
1269 *(b->cache_age_bin) -= b->length;
1270 b->cache_age_bin = age_bins.front();
1271 *(b->cache_age_bin) += b->length;
1272 num = lru.size();
1273 _audit("_touch_buffer end");
1274 }
1275
1276 void _trim_to(uint64_t max) override
1277 {
1278 while (buffer_bytes > max) {
1279 auto i = lru.rbegin();
1280 if (i == lru.rend()) {
1281 // stop if lru is now empty
1282 break;
1283 }
1284
1285 BlueStore::Buffer *b = &*i;
1286 ceph_assert(b->is_clean());
1287 dout(20) << __func__ << " rm " << *b << dendl;
1288 assert(*(b->cache_age_bin) >= b->length);
1289 *(b->cache_age_bin) -= b->length;
1290 b->space->_rm_buffer(this, b);
1291 }
1292 num = lru.size();
1293 }
1294
1295 void add_stats(uint64_t *extents,
1296 uint64_t *blobs,
1297 uint64_t *buffers,
1298 uint64_t *bytes) override {
1299 *extents += num_extents;
1300 *blobs += num_blobs;
1301 *buffers += num;
1302 *bytes += buffer_bytes;
1303 }
1304 #ifdef DEBUG_CACHE
1305 void _audit(const char *s) override
1306 {
1307 dout(10) << __func__ << " " << when << " start" << dendl;
1308 uint64_t s = 0;
1309 for (auto i = lru.begin(); i != lru.end(); ++i) {
1310 s += i->length;
1311 }
1312 if (s != buffer_bytes) {
1313 derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
1314 << dendl;
1315 for (auto i = lru.begin(); i != lru.end(); ++i) {
1316 derr << __func__ << " " << *i << dendl;
1317 }
1318 ceph_assert(s == buffer_bytes);
1319 }
1320 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1321 << " ok" << dendl;
1322 }
1323 #endif
1324 };
1325
1326 // TwoQBufferCacheShard
1327
1328 struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1329 typedef boost::intrusive::list<
1330 BlueStore::Buffer,
1331 boost::intrusive::member_hook<
1332 BlueStore::Buffer,
1333 boost::intrusive::list_member_hook<>,
1334 &BlueStore::Buffer::lru_item> > list_t;
1335 list_t hot; ///< "Am" hot buffers
1336 list_t warm_in; ///< "A1in" newly warm buffers
1337 list_t warm_out; ///< "A1out" empty buffers we've evicted
1338
1339 enum {
1340 BUFFER_NEW = 0,
1341 BUFFER_WARM_IN, ///< in warm_in
1342 BUFFER_WARM_OUT, ///< in warm_out
1343 BUFFER_HOT, ///< in hot
1344 BUFFER_TYPE_MAX
1345 };
1346
1347 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1348
1349 public:
1350 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
1351
1352 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1353 {
1354 dout(20) << __func__ << " level " << level << " near " << near
1355 << " on " << *b
1356 << " which has cache_private " << b->cache_private << dendl;
1357 if (near) {
1358 b->cache_private = near->cache_private;
1359 switch (b->cache_private) {
1360 case BUFFER_WARM_IN:
1361 warm_in.insert(warm_in.iterator_to(*near), *b);
1362 break;
1363 case BUFFER_WARM_OUT:
1364 ceph_assert(b->is_empty());
1365 warm_out.insert(warm_out.iterator_to(*near), *b);
1366 break;
1367 case BUFFER_HOT:
1368 hot.insert(hot.iterator_to(*near), *b);
1369 break;
1370 default:
1371 ceph_abort_msg("bad cache_private");
1372 }
1373 } else if (b->cache_private == BUFFER_NEW) {
1374 b->cache_private = BUFFER_WARM_IN;
1375 if (level > 0) {
1376 warm_in.push_front(*b);
1377 } else {
1378 // take caller hint to start at the back of the warm queue
1379 warm_in.push_back(*b);
1380 }
1381 } else {
1382 // we got a hint from discard
1383 switch (b->cache_private) {
1384 case BUFFER_WARM_IN:
1385 // stay in warm_in. move to front, even though 2Q doesn't actually
1386 // do this.
1387 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1388 warm_in.push_front(*b);
1389 break;
1390 case BUFFER_WARM_OUT:
1391 b->cache_private = BUFFER_HOT;
1392 // move to hot. fall-thru
1393 case BUFFER_HOT:
1394 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1395 hot.push_front(*b);
1396 break;
1397 default:
1398 ceph_abort_msg("bad cache_private");
1399 }
1400 }
1401 b->cache_age_bin = age_bins.front();
1402 if (!b->is_empty()) {
1403 buffer_bytes += b->length;
1404 list_bytes[b->cache_private] += b->length;
1405 *(b->cache_age_bin) += b->length;
1406 }
1407 num = hot.size() + warm_in.size();
1408 }
1409
1410 void _rm(BlueStore::Buffer *b) override
1411 {
1412 dout(20) << __func__ << " " << *b << dendl;
1413 if (!b->is_empty()) {
1414 ceph_assert(buffer_bytes >= b->length);
1415 buffer_bytes -= b->length;
1416 ceph_assert(list_bytes[b->cache_private] >= b->length);
1417 list_bytes[b->cache_private] -= b->length;
1418 assert(*(b->cache_age_bin) >= b->length);
1419 *(b->cache_age_bin) -= b->length;
1420 }
1421 switch (b->cache_private) {
1422 case BUFFER_WARM_IN:
1423 warm_in.erase(warm_in.iterator_to(*b));
1424 break;
1425 case BUFFER_WARM_OUT:
1426 warm_out.erase(warm_out.iterator_to(*b));
1427 break;
1428 case BUFFER_HOT:
1429 hot.erase(hot.iterator_to(*b));
1430 break;
1431 default:
1432 ceph_abort_msg("bad cache_private");
1433 }
1434 num = hot.size() + warm_in.size();
1435 }
1436
1437 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1438 {
1439 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1440 src->_rm(b);
1441
1442 // preserve which list we're on (even if we can't preserve the order!)
1443 switch (b->cache_private) {
1444 case BUFFER_WARM_IN:
1445 ceph_assert(!b->is_empty());
1446 warm_in.push_back(*b);
1447 break;
1448 case BUFFER_WARM_OUT:
1449 ceph_assert(b->is_empty());
1450 warm_out.push_back(*b);
1451 break;
1452 case BUFFER_HOT:
1453 ceph_assert(!b->is_empty());
1454 hot.push_back(*b);
1455 break;
1456 default:
1457 ceph_abort_msg("bad cache_private");
1458 }
1459 if (!b->is_empty()) {
1460 buffer_bytes += b->length;
1461 list_bytes[b->cache_private] += b->length;
1462 *(b->cache_age_bin) += b->length;
1463 }
1464 num = hot.size() + warm_in.size();
1465 }
1466
1467 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1468 {
1469 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1470 if (!b->is_empty()) {
1471 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1472 buffer_bytes += delta;
1473 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1474 list_bytes[b->cache_private] += delta;
1475 assert(*(b->cache_age_bin) + delta >= 0);
1476 *(b->cache_age_bin) += delta;
1477 }
1478 }
1479
1480 void _touch(BlueStore::Buffer *b) override {
1481 switch (b->cache_private) {
1482 case BUFFER_WARM_IN:
1483 // do nothing (somewhat counter-intuitively!)
1484 break;
1485 case BUFFER_WARM_OUT:
1486 // move from warm_out to hot LRU
1487 ceph_abort_msg("this happens via discard hint");
1488 break;
1489 case BUFFER_HOT:
1490 // move to front of hot LRU
1491 hot.erase(hot.iterator_to(*b));
1492 hot.push_front(*b);
1493 break;
1494 }
1495 *(b->cache_age_bin) -= b->length;
1496 b->cache_age_bin = age_bins.front();
1497 *(b->cache_age_bin) += b->length;
1498 num = hot.size() + warm_in.size();
1499 _audit("_touch_buffer end");
1500 }
1501
1502 void _trim_to(uint64_t max) override
1503 {
1504 if (buffer_bytes > max) {
1505 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1506 uint64_t khot = max - kin;
1507
1508 // pre-calculate kout based on average buffer size too,
1509 // which is typical (the warm_in and hot lists may change later)
1510 uint64_t kout = 0;
1511 uint64_t buffer_num = hot.size() + warm_in.size();
1512 if (buffer_num) {
1513 uint64_t avg_size = buffer_bytes / buffer_num;
1514 ceph_assert(avg_size);
1515 uint64_t calculated_num = max / avg_size;
1516 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1517 }
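// Worked example (hypothetical numbers): with max = 100MB and a
// bluestore_2q_cache_kin_ratio of 0.5, kin = khot = 50MB; if the cache
// currently averages 512K per buffer, calculated_num = 100MB / 512K = 200
// and kout = 200 * bluestore_2q_cache_kout_ratio entries in warm_out.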
1518
1519 if (list_bytes[BUFFER_HOT] < khot) {
1520 // hot is small, give slack to warm_in
1521 kin += khot - list_bytes[BUFFER_HOT];
1522 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1523 // warm_in is small, give slack to hot
1524 khot += kin - list_bytes[BUFFER_WARM_IN];
1525 }
1526
1527 // adjust warm_in list
1528 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1529 uint64_t evicted = 0;
1530
1531 while (to_evict_bytes > 0) {
1532 auto p = warm_in.rbegin();
1533 if (p == warm_in.rend()) {
1534 // stop if warm_in list is now empty
1535 break;
1536 }
1537
1538 BlueStore::Buffer *b = &*p;
1539 ceph_assert(b->is_clean());
1540 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1541 ceph_assert(buffer_bytes >= b->length);
1542 buffer_bytes -= b->length;
1543 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1544 list_bytes[BUFFER_WARM_IN] -= b->length;
1545 assert(*(b->cache_age_bin) >= b->length);
1546 *(b->cache_age_bin) -= b->length;
1547 to_evict_bytes -= b->length;
1548 evicted += b->length;
1549 b->state = BlueStore::Buffer::STATE_EMPTY;
1550 b->data.clear();
1551 warm_in.erase(warm_in.iterator_to(*b));
1552 warm_out.push_front(*b);
1553 b->cache_private = BUFFER_WARM_OUT;
1554 }
1555
1556 if (evicted > 0) {
1557 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1558 << " from warm_in list, done evicting warm_in buffers"
1559 << dendl;
1560 }
1561
1562 // adjust hot list
1563 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1564 evicted = 0;
1565
1566 while (to_evict_bytes > 0) {
1567 auto p = hot.rbegin();
1568 if (p == hot.rend()) {
1569 // stop if hot list is now empty
1570 break;
1571 }
1572
1573 BlueStore::Buffer *b = &*p;
1574 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1575 ceph_assert(b->is_clean());
1576 // adjust evict size before buffer goes invalid
1577 to_evict_bytes -= b->length;
1578 evicted += b->length;
1579 b->space->_rm_buffer(this, b);
1580 }
1581
1582 if (evicted > 0) {
1583 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1584 << " from hot list, done evicting hot buffers"
1585 << dendl;
1586 }
1587
1588 // adjust warm out list too, if necessary
1589 int64_t n = warm_out.size() - kout;
1590 while (n-- > 0) {
1591 BlueStore::Buffer *b = &*warm_out.rbegin();
1592 ceph_assert(b->is_empty());
1593 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1594 b->space->_rm_buffer(this, b);
1595 }
1596 }
1597 num = hot.size() + warm_in.size();
1598 }
1599
1600 void add_stats(uint64_t *extents,
1601 uint64_t *blobs,
1602 uint64_t *buffers,
1603 uint64_t *bytes) override {
1604 *extents += num_extents;
1605 *blobs += num_blobs;
1606 *buffers += num;
1607 *bytes += buffer_bytes;
1608 }
1609
1610 #ifdef DEBUG_CACHE
1611 void _audit(const char *s) override
1612 {
1613 dout(10) << __func__ << " " << when << " start" << dendl;
1614 uint64_t s = 0;
1615 for (auto i = hot.begin(); i != hot.end(); ++i) {
1616 s += i->length;
1617 }
1618
1619 uint64_t hot_bytes = s;
1620 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1621 derr << __func__ << " hot_list_bytes "
1622 << list_bytes[BUFFER_HOT]
1623 << " != actual " << hot_bytes
1624 << dendl;
1625 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
1626 }
1627
1628 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1629 s += i->length;
1630 }
1631
1632 uint64_t warm_in_bytes = s - hot_bytes;
1633 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1634 derr << __func__ << " warm_in_list_bytes "
1635 << list_bytes[BUFFER_WARM_IN]
1636 << " != actual " << warm_in_bytes
1637 << dendl;
1638 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
1639 }
1640
1641 if (s != buffer_bytes) {
1642 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1643 << dendl;
1644 ceph_assert(s == buffer_bytes);
1645 }
1646
1647 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1648 << " ok" << dendl;
1649 }
1650 #endif
1651 };
1652
1653 // BufferCacheShard
1654
1655 BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1656 CephContext* cct,
1657 string type,
1658 PerfCounters *logger)
1659 {
1660 BufferCacheShard *c = nullptr;
1661 if (type == "lru")
1662 c = new LruBufferCacheShard(cct);
1663 else if (type == "2q")
1664 c = new TwoQBufferCacheShard(cct);
1665 else
1666 ceph_abort_msg("unrecognized cache type");
1667 c->logger = logger;
1668 return c;
1669 }
1670
1671 // BufferSpace
1672
1673 #undef dout_prefix
1674 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1675
1676 void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
1677 {
1678 // note: we already hold cache->lock
1679 ldout(cache->cct, 20) << __func__ << dendl;
1680 while (!buffer_map.empty()) {
1681 _rm_buffer(cache, buffer_map.begin());
1682 }
1683 }
1684
1685 int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
1686 {
1687 // note: we already hold cache->lock
1688 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1689 << std::dec << dendl;
1690 int cache_private = 0;
1691 cache->_audit("discard start");
1692 auto i = _data_lower_bound(offset);
1693 uint32_t end = offset + length;
1694 while (i != buffer_map.end()) {
1695 Buffer *b = i->second.get();
1696 if (b->offset >= end) {
1697 break;
1698 }
1699 if (b->cache_private > cache_private) {
1700 cache_private = b->cache_private;
1701 }
1702 if (b->offset < offset) {
1703 int64_t front = offset - b->offset;
1704 if (b->end() > end) {
1705 // drop middle (split)
1706 uint32_t tail = b->end() - end;
1707 if (b->data.length()) {
1708 bufferlist bl;
1709 bl.substr_of(b->data, b->length - tail, tail);
1710 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1711 nb->maybe_rebuild();
1712 _add_buffer(cache, nb, 0, b);
1713 } else {
1714 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
1715 b->flags),
1716 0, b);
1717 }
1718 if (!b->is_writing()) {
1719 cache->_adjust_size(b, front - (int64_t)b->length);
1720 }
1721 b->truncate(front);
1722 b->maybe_rebuild();
1723 cache->_audit("discard end 1");
1724 break;
1725 } else {
1726 // drop tail
1727 if (!b->is_writing()) {
1728 cache->_adjust_size(b, front - (int64_t)b->length);
1729 }
1730 b->truncate(front);
1731 b->maybe_rebuild();
1732 ++i;
1733 continue;
1734 }
1735 }
1736 if (b->end() <= end) {
1737 // drop entire buffer
1738 _rm_buffer(cache, i++);
1739 continue;
1740 }
1741 // drop front
1742 uint32_t keep = b->end() - end;
1743 if (b->data.length()) {
1744 bufferlist bl;
1745 bl.substr_of(b->data, b->length - keep, keep);
1746 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1747 nb->maybe_rebuild();
1748 _add_buffer(cache, nb, 0, b);
1749 } else {
1750 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
1751 b->flags),
1752 0, b);
1753 }
1754 _rm_buffer(cache, i);
1755 cache->_audit("discard end 2");
1756 break;
1757 }
1758 return cache_private;
1759 }
1760
1761 void BlueStore::BufferSpace::read(
1762 BufferCacheShard* cache,
1763 uint32_t offset,
1764 uint32_t length,
1765 BlueStore::ready_regions_t& res,
1766 interval_set<uint32_t>& res_intervals,
1767 int flags)
1768 {
1769 res.clear();
1770 res_intervals.clear();
1771 uint32_t want_bytes = length;
1772 uint32_t end = offset + length;
1773
1774 {
1775 std::lock_guard l(cache->lock);
1776 for (auto i = _data_lower_bound(offset);
1777 i != buffer_map.end() && offset < end && i->first < end;
1778 ++i) {
1779 Buffer *b = i->second.get();
1780 ceph_assert(b->end() > offset);
1781
1782 bool val = false;
1783 if (flags & BYPASS_CLEAN_CACHE)
1784 val = b->is_writing();
1785 else
1786 val = b->is_writing() || b->is_clean();
1787 if (val) {
1788 if (b->offset < offset) {
1789 uint32_t skip = offset - b->offset;
1790 uint32_t l = min(length, b->length - skip);
1791 res[offset].substr_of(b->data, skip, l);
1792 res_intervals.insert(offset, l);
1793 offset += l;
1794 length -= l;
1795 if (!b->is_writing()) {
1796 cache->_touch(b);
1797 }
1798 continue;
1799 }
1800 if (b->offset > offset) {
1801 uint32_t gap = b->offset - offset;
1802 if (length <= gap) {
1803 break;
1804 }
1805 offset += gap;
1806 length -= gap;
1807 }
1808 if (!b->is_writing()) {
1809 cache->_touch(b);
1810 }
1811 if (b->length > length) {
1812 res[offset].substr_of(b->data, 0, length);
1813 res_intervals.insert(offset, length);
1814 break;
1815 } else {
1816 res[offset].append(b->data);
1817 res_intervals.insert(offset, b->length);
1818 if (b->length == length)
1819 break;
1820 offset += b->length;
1821 length -= b->length;
1822 }
1823 }
1824 }
1825 }
1826
1827 uint64_t hit_bytes = res_intervals.size();
1828 ceph_assert(hit_bytes <= want_bytes);
1829 uint64_t miss_bytes = want_bytes - hit_bytes;
1830 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1831 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1832 }
1833
1834 void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
1835 {
1836 auto i = writing.begin();
1837 while (i != writing.end()) {
1838 if (i->seq > seq) {
1839 break;
1840 }
1841 if (i->seq < seq) {
1842 ++i;
1843 continue;
1844 }
1845
1846 Buffer *b = &*i;
1847 ceph_assert(b->is_writing());
1848
1849 if (b->flags & Buffer::FLAG_NOCACHE) {
1850 writing.erase(i++);
1851 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1852 buffer_map.erase(b->offset);
1853 } else {
1854 b->state = Buffer::STATE_CLEAN;
1855 writing.erase(i++);
1856 b->maybe_rebuild();
1857 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1858 cache->_add(b, 1, nullptr);
1859 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1860 }
1861 }
1862 cache->_trim();
1863 cache->_audit("finish_write end");
1864 }
1865
1866 void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
1867 {
1868 std::lock_guard lk(cache->lock);
1869 if (buffer_map.empty())
1870 return;
1871
1872 auto p = --buffer_map.end();
1873 while (true) {
1874 if (p->second->end() <= pos)
1875 break;
1876
1877 if (p->second->offset < pos) {
1878 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1879 size_t left = pos - p->second->offset;
1880 size_t right = p->second->length - left;
1881 if (p->second->data.length()) {
1882 bufferlist bl;
1883 bl.substr_of(p->second->data, left, right);
1884 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1885 0, bl, p->second->flags),
1886 0, p->second.get());
1887 } else {
1888 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1889 0, right, p->second->flags),
1890 0, p->second.get());
1891 }
1892 cache->_adjust_size(p->second.get(), -right);
1893 p->second->truncate(left);
1894 break;
1895 }
1896
1897 ceph_assert(p->second->end() > pos);
1898 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1899 if (p->second->data.length()) {
1900 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1901 p->second->offset - pos, p->second->data, p->second->flags),
1902 0, p->second.get());
1903 } else {
1904 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1905 p->second->offset - pos, p->second->length, p->second->flags),
1906 0, p->second.get());
1907 }
1908 if (p == buffer_map.begin()) {
1909 _rm_buffer(cache, p);
1910 break;
1911 } else {
1912 _rm_buffer(cache, p--);
1913 }
1914 }
1915 ceph_assert(writing.empty());
1916 cache->_trim();
1917 }
1918
1919 // OnodeSpace
1920
1921 #undef dout_prefix
1922 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1923
1924 BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1925 OnodeRef& o)
1926 {
1927 std::lock_guard l(cache->lock);
1928 auto p = onode_map.find(oid);
1929 if (p != onode_map.end()) {
1930 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1931 << " raced, returning existing " << p->second
1932 << dendl;
1933 return p->second;
1934 }
1935 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
1936 onode_map[oid] = o;
1937 cache->_add(o.get(), 1);
1938 cache->_trim();
1939 return o;
1940 }
1941
1942 void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1943 {
1944 ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
1945 onode_map.erase(oid);
1946 }
1947
1948 BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1949 {
1950 ldout(cache->cct, 30) << __func__ << dendl;
1951 OnodeRef o;
1952
1953 {
1954 std::lock_guard l(cache->lock);
1955 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1956 if (p == onode_map.end()) {
1957 cache->logger->inc(l_bluestore_onode_misses);
1958 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1959 } else {
1960 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1961 << " " << p->second->nref
1962 << " " << p->second->cached
1963 << " " << p->second->pinned
1964 << dendl;
1965       // This will pin the onode and implicitly touch the cache when the Onode
1966       // eventually becomes unpinned
1967 o = p->second;
1968 ceph_assert(!o->cached || o->pinned);
1969
1970 cache->logger->inc(l_bluestore_onode_hits);
1971 }
1972 }
1973
1974 return o;
1975 }
1976
1977 void BlueStore::OnodeSpace::clear()
1978 {
1979 std::lock_guard l(cache->lock);
1980   ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
1981 for (auto &p : onode_map) {
1982 cache->_rm(p.second.get());
1983 }
1984 onode_map.clear();
1985 }
1986
1987 bool BlueStore::OnodeSpace::empty()
1988 {
1989 std::lock_guard l(cache->lock);
1990 return onode_map.empty();
1991 }
1992
1993 void BlueStore::OnodeSpace::rename(
1994 OnodeRef& oldo,
1995 const ghobject_t& old_oid,
1996 const ghobject_t& new_oid,
1997 const mempool::bluestore_cache_meta::string& new_okey)
1998 {
1999 std::lock_guard l(cache->lock);
2000 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
2001 << dendl;
2002 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
2003 po = onode_map.find(old_oid);
2004 pn = onode_map.find(new_oid);
2005 ceph_assert(po != pn);
2006
2007 ceph_assert(po != onode_map.end());
2008 if (pn != onode_map.end()) {
2009 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
2010 << dendl;
2011 cache->_rm(pn->second.get());
2012 onode_map.erase(pn);
2013 }
2014 OnodeRef o = po->second;
2015
2016 // install a non-existent onode at old location
2017 oldo.reset(new Onode(o->c, old_oid, o->key));
2018 po->second = oldo;
2019 cache->_add(oldo.get(), 1);
2020 // add at new position and fix oid, key.
2021   // This will pin 'o' and implicitly touch the cache
2022   // when it eventually becomes unpinned
2023 onode_map.insert(make_pair(new_oid, o));
2024 ceph_assert(o->pinned);
2025
2026 o->oid = new_oid;
2027 o->key = new_okey;
2028 cache->_trim();
2029 }
2030
2031 bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
2032 {
2033 std::lock_guard l(cache->lock);
2034 ldout(cache->cct, 20) << __func__ << dendl;
2035 for (auto& i : onode_map) {
2036 if (f(i.second.get())) {
2037 return true;
2038 }
2039 }
2040 return false;
2041 }
2042
2043 template <int LogLevelV = 30>
2044 void BlueStore::OnodeSpace::dump(CephContext *cct)
2045 {
2046 for (auto& i : onode_map) {
2047 ldout(cct, LogLevelV) << i.first << " : " << i.second
2048 << " " << i.second->nref
2049 << " " << i.second->cached
2050 << " " << i.second->pinned
2051 << dendl;
2052 }
2053 }
2054
2055 // SharedBlob
2056
2057 #undef dout_prefix
2058 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2059 #undef dout_context
2060 #define dout_context coll->store->cct
2061
2062 void BlueStore::SharedBlob::dump(Formatter* f) const
2063 {
2064 f->dump_bool("loaded", loaded);
2065 if (loaded) {
2066 persistent->dump(f);
2067 } else {
2068 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2069 }
2070 }
2071
2072 ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2073 {
2074 out << "SharedBlob(" << &sb;
2075
2076 if (sb.loaded) {
2077 out << " loaded " << *sb.persistent;
2078 } else {
2079 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2080 }
2081 return out << ")";
2082 }
2083
2084 BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2085 : coll(_coll), sbid_unloaded(i)
2086 {
2087 ceph_assert(sbid_unloaded > 0);
2088 if (get_cache()) {
2089 get_cache()->add_blob();
2090 }
2091 }
2092
2093 BlueStore::SharedBlob::~SharedBlob()
2094 {
2095 if (loaded && persistent) {
2096 delete persistent;
2097 }
2098 }
2099
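// Drop a reference. On the last ref we take the owning collection's cache
// lock (retrying if the collection changed while we waited), remove ourselves
// from the shared_blob_set unless a concurrent lookup has revived us, clear
// our buffers and delete the object.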
2100 void BlueStore::SharedBlob::put()
2101 {
2102 if (--nref == 0) {
2103 dout(20) << __func__ << " " << this
2104 << " removing self from set " << get_parent()
2105 << dendl;
2106 again:
2107 auto coll_snap = coll;
2108 if (coll_snap) {
2109 std::lock_guard l(coll_snap->cache->lock);
2110 if (coll_snap != coll) {
2111 goto again;
2112 }
2113 if (!coll_snap->shared_blob_set.remove(this, true)) {
2114 // race with lookup
2115 return;
2116 }
2117 bc._clear(coll_snap->cache);
2118 coll_snap->cache->rm_blob();
2119 }
2120 delete this;
2121 }
2122 }
2123
2124 void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2125 {
2126 ceph_assert(persistent);
2127 persistent->ref_map.get(offset, length);
2128 }
2129
2130 void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
2131 PExtentVector *r,
2132 bool *unshare)
2133 {
2134 ceph_assert(persistent);
2135 persistent->ref_map.put(offset, length, r,
2136 unshare && !*unshare ? unshare : nullptr);
2137 }
2138
2139 void BlueStore::SharedBlob::finish_write(uint64_t seq)
2140 {
2141 while (true) {
2142 BufferCacheShard *cache = coll->cache;
2143 std::lock_guard l(cache->lock);
2144 if (coll->cache != cache) {
2145 dout(20) << __func__
2146 << " raced with sb cache update, was " << cache
2147 << ", now " << coll->cache << ", retrying"
2148 << dendl;
2149 continue;
2150 }
2151 bc._finish_write(cache, seq);
2152 break;
2153 }
2154 }
2155
2156 // SharedBlobSet
2157
2158 #undef dout_prefix
2159 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2160
2161 template <int LogLevelV = 30>
2162 void BlueStore::SharedBlobSet::dump(CephContext *cct)
2163 {
2164 std::lock_guard l(lock);
2165 for (auto& i : sb_map) {
2166 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2167 }
2168 }
2169
2170 // Blob
2171
2172 #undef dout_prefix
2173 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2174
2175 void BlueStore::Blob::dump(Formatter* f) const
2176 {
2177 if (is_spanning()) {
2178 f->dump_unsigned("spanning_id ", id);
2179 }
2180 blob.dump(f);
2181 if (shared_blob) {
2182 f->dump_object("shared", *shared_blob);
2183 }
2184 }
2185
2186 ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2187 {
2188 out << "Blob(" << &b;
2189 if (b.is_spanning()) {
2190 out << " spanning " << b.id;
2191 }
2192 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2193 if (b.shared_blob) {
2194 out << " " << *b.shared_blob;
2195 } else {
2196 out << " (shared_blob=NULL)";
2197 }
2198 out << ")";
2199 return out;
2200 }
2201
2202 void BlueStore::Blob::discard_unallocated(Collection *coll)
2203 {
2204 if (get_blob().is_shared()) {
2205 return;
2206 }
2207 if (get_blob().is_compressed()) {
2208 bool discard = false;
2209 bool all_invalid = true;
2210 for (auto e : get_blob().get_extents()) {
2211 if (!e.is_valid()) {
2212 discard = true;
2213 } else {
2214 all_invalid = false;
2215 }
2216 }
2217     ceph_assert(discard == all_invalid); // in a compressed blob either all
2218                                          // pextents are invalid or none are.
2219 if (discard) {
2220 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2221 get_blob().get_logical_length());
2222 }
2223 } else {
2224 size_t pos = 0;
2225 for (auto e : get_blob().get_extents()) {
2226 if (!e.is_valid()) {
2227 dout(20) << __func__ << " 0x" << std::hex << pos
2228 << "~" << e.length
2229 << std::dec << dendl;
2230 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2231 }
2232 pos += e.length;
2233 }
2234 if (get_blob().can_prune_tail()) {
2235 dirty_blob().prune_tail();
2236 used_in_blob.prune_tail(get_blob().get_ondisk_length());
2237 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2238 }
2239 }
2240 }
2241
2242 void BlueStore::Blob::get_ref(
2243 Collection *coll,
2244 uint32_t offset,
2245 uint32_t length)
2246 {
2247   // The caller has to initialize the Blob's logical length prior to incrementing
2248   // references. Otherwise it is impossible to determine the required
2249   // number of counters for per-au tracking, or to obtain min_release_size
2250   // for single-counter mode.
2251 ceph_assert(get_blob().get_logical_length() != 0);
2252 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2253 << std::dec << " " << *this << dendl;
2254
2255 if (used_in_blob.is_empty()) {
2256 uint32_t min_release_size =
2257 get_blob().get_release_size(coll->store->min_alloc_size);
2258 uint64_t l = get_blob().get_logical_length();
2259 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2260 << min_release_size << std::dec << dendl;
2261 used_in_blob.init(l, min_release_size);
2262 }
2263 used_in_blob.get(
2264 offset,
2265 length);
2266 }
2267
2268 bool BlueStore::Blob::put_ref(
2269 Collection *coll,
2270 uint32_t offset,
2271 uint32_t length,
2272 PExtentVector *r)
2273 {
2274 PExtentVector logical;
2275
2276 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2277 << std::dec << " " << *this << dendl;
2278
2279 bool empty = used_in_blob.put(
2280 offset,
2281 length,
2282 &logical);
2283 r->clear();
2284 // nothing to release
2285 if (!empty && logical.empty()) {
2286 return false;
2287 }
2288
2289 bluestore_blob_t& b = dirty_blob();
2290 return b.release_extents(empty, logical, r);
2291 }
2292
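// Check whether a write of *length0 bytes at blob offset b_offset can reuse
// this (mutable) blob, possibly growing its tail up to target_blob_size.
// On success *length0 may be reduced so the blob does not exceed the target;
// the data must be csum-chunk aligned and any overlap with the existing blob
// must still be unallocated.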
2293 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2294 uint32_t target_blob_size,
2295 uint32_t b_offset,
2296 uint32_t *length0) {
2297 ceph_assert(min_alloc_size);
2298 ceph_assert(target_blob_size);
2299 if (!get_blob().is_mutable()) {
2300 return false;
2301 }
2302
2303 uint32_t length = *length0;
2304 uint32_t end = b_offset + length;
2305
2306 // Currently for the sake of simplicity we omit blob reuse if data is
2307 // unaligned with csum chunk. Later we can perform padding if needed.
2308 if (get_blob().has_csum() &&
2309 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2310 (end % get_blob().get_csum_chunk_size()) != 0)) {
2311 return false;
2312 }
2313
2314 auto blen = get_blob().get_logical_length();
2315 uint32_t new_blen = blen;
2316
2317 // make sure target_blob_size isn't less than current blob len
2318 target_blob_size = std::max(blen, target_blob_size);
2319
2320 if (b_offset >= blen) {
2321 // new data totally stands out of the existing blob
2322 new_blen = end;
2323 } else {
2324 // new data overlaps with the existing blob
2325 new_blen = std::max(blen, end);
2326
2327 uint32_t overlap = 0;
2328 if (new_blen > blen) {
2329 overlap = blen - b_offset;
2330 } else {
2331 overlap = length;
2332 }
2333
2334 if (!get_blob().is_unallocated(b_offset, overlap)) {
2335 // abort if any piece of the overlap has already been allocated
2336 return false;
2337 }
2338 }
2339
2340 if (new_blen > blen) {
2341 int64_t overflow = int64_t(new_blen) - target_blob_size;
2342     // Unable to decrease the provided length enough to fit into target_blob_size
2343 if (overflow >= length) {
2344 return false;
2345 }
2346
2347 // FIXME: in some cases we could reduce unused resolution
2348 if (get_blob().has_unused()) {
2349 return false;
2350 }
2351
2352 if (overflow > 0) {
2353 new_blen -= overflow;
2354 length -= overflow;
2355 *length0 = length;
2356 }
2357
2358 if (new_blen > blen) {
2359 dirty_blob().add_tail(new_blen);
2360 used_in_blob.add_tail(new_blen,
2361 get_blob().get_release_size(min_alloc_size));
2362 }
2363 }
2364 return true;
2365 }
2366
2367 void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2368 {
2369 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2370 << " start " << *this << dendl;
2371 ceph_assert(blob.can_split());
2372 ceph_assert(used_in_blob.can_split());
2373 bluestore_blob_t &lb = dirty_blob();
2374 bluestore_blob_t &rb = r->dirty_blob();
2375
2376 used_in_blob.split(
2377 blob_offset,
2378 &(r->used_in_blob));
2379
2380 lb.split(blob_offset, rb);
2381 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2382
2383 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2384 << " finish " << *this << dendl;
2385 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2386 << " and " << *r << dendl;
2387 }
2388
2389 #ifndef CACHE_BLOB_BL
2390 void BlueStore::Blob::decode(
2391 Collection *coll,
2392 bufferptr::const_iterator& p,
2393 uint64_t struct_v,
2394 uint64_t* sbid,
2395 bool include_ref_map)
2396 {
2397 denc(blob, p, struct_v);
2398 if (blob.is_shared()) {
2399 denc(*sbid, p);
2400 }
2401 if (include_ref_map) {
2402 if (struct_v > 1) {
2403 used_in_blob.decode(p);
2404 } else {
2405 used_in_blob.clear();
2406 bluestore_extent_ref_map_t legacy_ref_map;
2407 legacy_ref_map.decode(p);
2408 for (auto r : legacy_ref_map.ref_map) {
2409 get_ref(
2410 coll,
2411 r.first,
2412 r.second.refs * r.second.length);
2413 }
2414 }
2415 }
2416 }
2417 #endif
2418
2419 // Extent
2420
2421 void BlueStore::Extent::dump(Formatter* f) const
2422 {
2423 f->dump_unsigned("logical_offset", logical_offset);
2424 f->dump_unsigned("length", length);
2425 f->dump_unsigned("blob_offset", blob_offset);
2426 f->dump_object("blob", *blob);
2427 }
2428
2429 ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2430 {
2431 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2432 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2433 << " " << *e.blob;
2434 }
2435
2436 // OldExtent
2437 BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2438 uint32_t lo,
2439 uint32_t o,
2440 uint32_t l,
2441 BlobRef& b) {
2442 OldExtent* oe = new OldExtent(lo, o, l, b);
2443 b->put_ref(c.get(), o, l, &(oe->r));
2444 oe->blob_empty = !b->is_referenced();
2445 return oe;
2446 }
2447
2448 // ExtentMap
2449
2450 #undef dout_prefix
2451 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2452 #undef dout_context
2453 #define dout_context onode->c->store->cct
2454
2455 BlueStore::ExtentMap::ExtentMap(Onode *o)
2456 : onode(o),
2457 inline_bl(
2458 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2459 }
2460
2461 void BlueStore::ExtentMap::dump(Formatter* f) const
2462 {
2463 f->open_array_section("extents");
2464
2465 for (auto& e : extent_map) {
2466 f->dump_object("extent", e);
2467 }
2468 f->close_section();
2469 }
2470
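// Clone the lextents of oldo in [srcoff, srcoff+length) into newo at dstoff.
// Each referenced blob is made shared (bumping refs on its pextents) and is
// duplicated at most once via last_encoded_id; the dirtied ranges of both
// onodes are recorded for persistence.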
2471 void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2472 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2473 uint64_t& length, uint64_t& dstoff) {
2474
2475 auto cct = onode->c->store->cct;
2476 bool inject_21040 =
2477 cct->_conf->bluestore_debug_inject_bug21040;
2478 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2479 for (auto& e : oldo->extent_map.extent_map) {
2480 e.blob->last_encoded_id = -1;
2481 }
2482
2483 int n = 0;
2484 uint64_t end = srcoff + length;
2485 uint32_t dirty_range_begin = 0;
2486 uint32_t dirty_range_end = 0;
2487 bool src_dirty = false;
2488 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2489 ep != oldo->extent_map.extent_map.end();
2490 ++ep) {
2491 auto& e = *ep;
2492 if (e.logical_offset >= end) {
2493 break;
2494 }
2495 dout(20) << __func__ << " src " << e << dendl;
2496 BlobRef cb;
2497 bool blob_duped = true;
2498 if (e.blob->last_encoded_id >= 0) {
2499 cb = id_to_blob[e.blob->last_encoded_id];
2500 blob_duped = false;
2501 } else {
2502 // dup the blob
2503 const bluestore_blob_t& blob = e.blob->get_blob();
2504 // make sure it is shared
2505 if (!blob.is_shared()) {
2506 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2507 if (!inject_21040 && !src_dirty) {
2508 src_dirty = true;
2509 dirty_range_begin = e.logical_offset;
2510 } else if (inject_21040 &&
2511 dirty_range_begin == 0 && dirty_range_end == 0) {
2512 dirty_range_begin = e.logical_offset;
2513 }
2514 ceph_assert(e.logical_end() > 0);
2515 // -1 to exclude next potential shard
2516 dirty_range_end = e.logical_end() - 1;
2517 } else {
2518 c->load_shared_blob(e.blob->shared_blob);
2519 }
2520 cb = new Blob();
2521 e.blob->last_encoded_id = n;
2522 id_to_blob[n] = cb;
2523 e.blob->dup(*cb);
2524 // bump the extent refs on the copied blob's extents
2525 for (auto p : blob.get_extents()) {
2526 if (p.is_valid()) {
2527 e.blob->shared_blob->get_ref(p.offset, p.length);
2528 }
2529 }
2530 txc->write_shared_blob(e.blob->shared_blob);
2531 dout(20) << __func__ << " new " << *cb << dendl;
2532 }
2533
2534 int skip_front, skip_back;
2535 if (e.logical_offset < srcoff) {
2536 skip_front = srcoff - e.logical_offset;
2537 } else {
2538 skip_front = 0;
2539 }
2540 if (e.logical_end() > end) {
2541 skip_back = e.logical_end() - end;
2542 } else {
2543 skip_back = 0;
2544 }
2545
2546 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2547 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2548 newo->extent_map.extent_map.insert(*ne);
2549 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2550 // fixme: we may leave parts of new blob unreferenced that could
2551 // be freed (relative to the shared_blob).
2552 txc->statfs_delta.stored() += ne->length;
2553 if (e.blob->get_blob().is_compressed()) {
2554 txc->statfs_delta.compressed_original() += ne->length;
2555 if (blob_duped) {
2556 txc->statfs_delta.compressed() +=
2557 cb->get_blob().get_compressed_payload_length();
2558 }
2559 }
2560 dout(20) << __func__ << " dst " << *ne << dendl;
2561 ++n;
2562 }
2563 if ((!inject_21040 && src_dirty) ||
2564 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2565 oldo->extent_map.dirty_range(dirty_range_begin,
2566 dirty_range_end - dirty_range_begin);
2567 txc->write_onode(oldo);
2568 }
2569 txc->write_onode(newo);
2570
2571 if (dstoff + length > newo->onode.size) {
2572 newo->onode.size = dstoff + length;
2573 }
2574 newo->extent_map.dirty_range(dstoff, length);
2575 }
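// Persist dirty extent map state: for an unsharded onode re-encode the
// inline shard (and request a reshard if it grew too large); for a sharded
// onode encode every dirty shard, request a reshard for shards that became
// too big or too small, and otherwise write each shard under its key.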
2576 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2577 bool force)
2578 {
2579 auto cct = onode->c->store->cct; //used by dout
2580 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2581 if (onode->onode.extent_map_shards.empty()) {
2582 if (inline_bl.length() == 0) {
2583 unsigned n;
2584 // we need to encode inline_bl to measure encoded length
2585 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
2586 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
2587 ceph_assert(!never_happen);
2588 size_t len = inline_bl.length();
2589 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2590 << " extents" << dendl;
2591 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2592 request_reshard(0, OBJECT_MAX_SIZE);
2593 return;
2594 }
2595 }
2596 // will persist in the onode key.
2597 } else {
2598 // pending shard update
2599 struct dirty_shard_t {
2600 Shard *shard;
2601 bufferlist bl;
2602 dirty_shard_t(Shard *s) : shard(s) {}
2603 };
2604 vector<dirty_shard_t> encoded_shards;
2605 // allocate slots for all shards in a single call instead of
2606     // doing multiple allocations - one per dirty shard
2607 encoded_shards.reserve(shards.size());
2608
2609 auto p = shards.begin();
2610 auto prev_p = p;
2611 while (p != shards.end()) {
2612 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2613 auto n = p;
2614 ++n;
2615 if (p->dirty) {
2616 uint32_t endoff;
2617 if (n == shards.end()) {
2618 endoff = OBJECT_MAX_SIZE;
2619 } else {
2620 endoff = n->shard_info->offset;
2621 }
2622 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2623 bufferlist& bl = encoded_shards.back().bl;
2624 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2625 bl, &p->extents)) {
2626 if (force) {
2627 derr << __func__ << " encode_some needs reshard" << dendl;
2628 ceph_assert(!force);
2629 }
2630 }
2631 size_t len = bl.length();
2632
2633 dout(20) << __func__ << " shard 0x" << std::hex
2634 << p->shard_info->offset << std::dec << " is " << len
2635 << " bytes (was " << p->shard_info->bytes << ") from "
2636 << p->extents << " extents" << dendl;
2637
2638 if (!force) {
2639 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2640 // we are big; reshard ourselves
2641 request_reshard(p->shard_info->offset, endoff);
2642 }
2643 // avoid resharding the trailing shard, even if it is small
2644 else if (n != shards.end() &&
2645 len < g_conf()->bluestore_extent_map_shard_min_size) {
2646 ceph_assert(endoff != OBJECT_MAX_SIZE);
2647 if (p == shards.begin()) {
2648 // we are the first shard, combine with next shard
2649 request_reshard(p->shard_info->offset, endoff + 1);
2650 } else {
2651 // combine either with the previous shard or the next,
2652 // whichever is smaller
2653 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2654 request_reshard(p->shard_info->offset, endoff + 1);
2655 } else {
2656 request_reshard(prev_p->shard_info->offset, endoff);
2657 }
2658 }
2659 }
2660 }
2661 }
2662 prev_p = p;
2663 p = n;
2664 }
2665 if (needs_reshard()) {
2666 return;
2667 }
2668
2669 // schedule DB update for dirty shards
2670 string key;
2671 for (auto& it : encoded_shards) {
2672 dout(20) << __func__ << " encoding key for shard 0x" << std::hex
2673 << it.shard->shard_info->offset << std::dec << dendl;
2674 it.shard->dirty = false;
2675 it.shard->shard_info->bytes = it.bl.length();
2676 generate_extent_shard_key_and_apply(
2677 onode->key,
2678 it.shard->shard_info->offset,
2679 &key,
2680 [&](const string& final_key) {
2681 t->set(PREFIX_OBJ, final_key, it.bl);
2682 }
2683 );
2684 }
2685 }
2686 }
2687
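// Pick an id for a new spanning blob: normally max existing id + 1; if that
// overflows the signed id type, probe from a random starting point for an
// unused id and abort only if the id space is exhausted.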
2688 bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2689 {
2690 if (spanning_blob_map.empty())
2691 return 0;
2692 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2693   // if the id did not wrap around, it is valid and available.
2694 if (bid >= 0)
2695 return bid;
2696   // Otherwise, find the next unused bid.
2697 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2698 const auto begin_bid = bid;
2699 do {
2700 if (!spanning_blob_map.count(bid))
2701 return bid;
2702 else {
2703 bid++;
2704 if (bid < 0) bid = 0;
2705 }
2706 } while (bid != begin_bid);
2707 auto cct = onode->c->store->cct; // used by dout
2708 _dump_onode<0>(cct, *onode);
2709 ceph_abort_msg("no available blob id");
2710 }
2711
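// Recompute the shard layout for [needs_reshard_begin, needs_reshard_end):
// fault in and drop the affected shards, choose new boundaries so each shard
// is close to the configured target size (preferring boundaries that do not
// bisect a blob), splice the new shard_infos into the onode, and finally
// split or mark as spanning any blob that still crosses a shard boundary.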
2712 void BlueStore::ExtentMap::reshard(
2713 KeyValueDB *db,
2714 KeyValueDB::Transaction t)
2715 {
2716 auto cct = onode->c->store->cct; // used by dout
2717
2718 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2719 << needs_reshard_end << ")" << std::dec
2720 << " of " << onode->onode.extent_map_shards.size()
2721 << " shards on " << onode->oid << dendl;
2722 for (auto& p : spanning_blob_map) {
2723 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2724 << dendl;
2725 }
2726 // determine shard index range
2727 unsigned si_begin = 0, si_end = 0;
2728 if (!shards.empty()) {
2729 while (si_begin + 1 < shards.size() &&
2730 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2731 ++si_begin;
2732 }
2733 needs_reshard_begin = shards[si_begin].shard_info->offset;
2734 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2735 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2736 needs_reshard_end = shards[si_end].shard_info->offset;
2737 break;
2738 }
2739 }
2740 if (si_end == shards.size()) {
2741 needs_reshard_end = OBJECT_MAX_SIZE;
2742 }
2743 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2744 << " over 0x[" << std::hex << needs_reshard_begin << ","
2745 << needs_reshard_end << ")" << std::dec << dendl;
2746 }
2747
2748 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2749
2750   // we may need to fault in a larger interval later; we must have all
2751   // referring extents for spanning blobs loaded in order to have
2752   // accurate use_tracker values.
2753 uint32_t spanning_scan_begin = needs_reshard_begin;
2754 uint32_t spanning_scan_end = needs_reshard_end;
2755
2756 // remove old keys
2757 string key;
2758 for (unsigned i = si_begin; i < si_end; ++i) {
2759 generate_extent_shard_key_and_apply(
2760 onode->key, shards[i].shard_info->offset, &key,
2761 [&](const string& final_key) {
2762 t->rmkey(PREFIX_OBJ, final_key);
2763 }
2764 );
2765 }
2766
2767 // calculate average extent size
2768 unsigned bytes = 0;
2769 unsigned extents = 0;
2770 if (onode->onode.extent_map_shards.empty()) {
2771 bytes = inline_bl.length();
2772 extents = extent_map.size();
2773 } else {
2774 for (unsigned i = si_begin; i < si_end; ++i) {
2775 bytes += shards[i].shard_info->bytes;
2776 extents += shards[i].extents;
2777 }
2778 }
2779 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2780 unsigned slop = target *
2781 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2782 unsigned extent_avg = bytes / std::max(1u, extents);
2783 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2784 << ", slop " << slop << dendl;
2785
2786 // reshard
2787 unsigned estimate = 0;
2788 unsigned offset = needs_reshard_begin;
2789 vector<bluestore_onode_t::shard_info> new_shard_info;
2790 unsigned max_blob_end = 0;
2791 Extent dummy(needs_reshard_begin);
2792 for (auto e = extent_map.lower_bound(dummy);
2793 e != extent_map.end();
2794 ++e) {
2795 if (e->logical_offset >= needs_reshard_end) {
2796 break;
2797 }
2798 dout(30) << " extent " << *e << dendl;
2799
2800 // disfavor shard boundaries that span a blob
2801 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2802 if (estimate &&
2803 estimate + extent_avg > target + (would_span ? slop : 0)) {
2804 // new shard
2805 if (offset == needs_reshard_begin) {
2806 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2807 new_shard_info.back().offset = offset;
2808 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2809 << std::dec << dendl;
2810 }
2811 offset = e->logical_offset;
2812 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2813 new_shard_info.back().offset = offset;
2814 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2815 << std::dec << dendl;
2816 estimate = 0;
2817 }
2818 estimate += extent_avg;
2819 unsigned bs = e->blob_start();
2820 if (bs < spanning_scan_begin) {
2821 spanning_scan_begin = bs;
2822 }
2823 uint32_t be = e->blob_end();
2824 if (be > max_blob_end) {
2825 max_blob_end = be;
2826 }
2827 if (be > spanning_scan_end) {
2828 spanning_scan_end = be;
2829 }
2830 }
2831 if (new_shard_info.empty() && (si_begin > 0 ||
2832 si_end < shards.size())) {
2833 // we resharded a partial range; we must produce at least one output
2834 // shard
2835 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2836 new_shard_info.back().offset = needs_reshard_begin;
2837 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2838 << std::dec << " (singleton degenerate case)" << dendl;
2839 }
2840
2841 auto& sv = onode->onode.extent_map_shards;
2842 dout(20) << __func__ << " new " << new_shard_info << dendl;
2843 dout(20) << __func__ << " old " << sv << dendl;
2844 if (sv.empty()) {
2845 // no old shards to keep
2846 sv.swap(new_shard_info);
2847 init_shards(true, true);
2848 } else {
2849 // splice in new shards
2850 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2851 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2852 sv.insert(
2853 sv.begin() + si_begin,
2854 new_shard_info.begin(),
2855 new_shard_info.end());
2856 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2857 si_end = si_begin + new_shard_info.size();
2858
2859 ceph_assert(sv.size() == shards.size());
2860
2861 // note that we need to update every shard_info of shards here,
2862 // as sv might have been totally re-allocated above
2863 for (unsigned i = 0; i < shards.size(); i++) {
2864 shards[i].shard_info = &sv[i];
2865 }
2866
2867 // mark newly added shards as dirty
2868 for (unsigned i = si_begin; i < si_end; ++i) {
2869 shards[i].loaded = true;
2870 shards[i].dirty = true;
2871 }
2872 }
2873 dout(20) << __func__ << " fin " << sv << dendl;
2874 inline_bl.clear();
2875
2876 if (sv.empty()) {
2877 // no more shards; unspan all previously spanning blobs
2878 auto p = spanning_blob_map.begin();
2879 while (p != spanning_blob_map.end()) {
2880 p->second->id = -1;
2881 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2882 p = spanning_blob_map.erase(p);
2883 }
2884 } else {
2885 // identify new spanning blobs
2886 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2887 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2888 if (spanning_scan_begin < needs_reshard_begin) {
2889 fault_range(db, spanning_scan_begin,
2890 needs_reshard_begin - spanning_scan_begin);
2891 }
2892 if (spanning_scan_end > needs_reshard_end) {
2893 fault_range(db, needs_reshard_end,
2894 spanning_scan_end - needs_reshard_end);
2895 }
2896 auto sp = sv.begin() + si_begin;
2897 auto esp = sv.end();
2898 unsigned shard_start = sp->offset;
2899 unsigned shard_end;
2900 ++sp;
2901 if (sp == esp) {
2902 shard_end = OBJECT_MAX_SIZE;
2903 } else {
2904 shard_end = sp->offset;
2905 }
2906 Extent dummy(needs_reshard_begin);
2907
2908 bool was_too_many_blobs_check = false;
2909 auto too_many_blobs_threshold =
2910 g_conf()->bluestore_debug_too_many_blobs_threshold;
2911 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2912 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2913 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2914
2915 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2916 if (e->logical_offset >= needs_reshard_end) {
2917 break;
2918 }
2919 dout(30) << " extent " << *e << dendl;
2920 while (e->logical_offset >= shard_end) {
2921 shard_start = shard_end;
2922 ceph_assert(sp != esp);
2923 ++sp;
2924 if (sp == esp) {
2925 shard_end = OBJECT_MAX_SIZE;
2926 } else {
2927 shard_end = sp->offset;
2928 }
2929 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2930 << " to 0x" << shard_end << std::dec << dendl;
2931 }
2932
2933 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2934 if (!e->blob->is_spanning()) {
2935 // We have two options: (1) split the blob into pieces at the
2936 // shard boundaries (and adjust extents accordingly), or (2)
2937 // mark it spanning. We prefer to cut the blob if we can. Note that
2938 // we may have to split it multiple times--potentially at every
2939 // shard boundary.
2940 bool must_span = false;
2941 BlobRef b = e->blob;
2942 if (b->can_split()) {
2943 uint32_t bstart = e->blob_start();
2944 uint32_t bend = e->blob_end();
2945 for (const auto& sh : shards) {
2946 if (bstart < sh.shard_info->offset &&
2947 bend > sh.shard_info->offset) {
2948 uint32_t blob_offset = sh.shard_info->offset - bstart;
2949 if (b->can_split_at(blob_offset)) {
2950 dout(20) << __func__ << " splitting blob, bstart 0x"
2951 << std::hex << bstart << " blob_offset 0x"
2952 << blob_offset << std::dec << " " << *b << dendl;
2953 b = split_blob(b, blob_offset, sh.shard_info->offset);
2954 // switch b to the new right-hand side, in case it
2955 // *also* has to get split.
2956 bstart += blob_offset;
2957 onode->c->store->logger->inc(l_bluestore_blob_split);
2958 } else {
2959 must_span = true;
2960 break;
2961 }
2962 }
2963 }
2964 } else {
2965 must_span = true;
2966 }
2967 if (must_span) {
2968 auto bid = allocate_spanning_blob_id();
2969 b->id = bid;
2970 spanning_blob_map[b->id] = b;
2971 dout(20) << __func__ << " adding spanning " << *b << dendl;
2972 if (!was_too_many_blobs_check &&
2973 too_many_blobs_threshold &&
2974 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2975
2976 was_too_many_blobs_check = true;
2977 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2978 if (dumped_onodes[i].first == onode->oid) {
2979 oid_slot = &dumped_onodes[i];
2980 break;
2981 }
2982 if (!oldest_slot || (oldest_slot &&
2983 dumped_onodes[i].second < oldest_slot->second)) {
2984 oldest_slot = &dumped_onodes[i];
2985 }
2986 }
2987 }
2988 }
2989 }
2990 } else {
2991 if (e->blob->is_spanning()) {
2992 spanning_blob_map.erase(e->blob->id);
2993 e->blob->id = -1;
2994 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2995 }
2996 }
2997 }
2998 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2999 (oid_slot &&
3000 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
3001 if (do_dump) {
3002 dout(0) << __func__
3003 << " spanning blob count exceeds threshold, "
3004 << spanning_blob_map.size() << " spanning blobs"
3005 << dendl;
3006 _dump_onode<0>(cct, *onode);
3007 if (oid_slot) {
3008 oid_slot->second = mono_clock::now();
3009 } else {
3010 ceph_assert(oldest_slot);
3011 oldest_slot->first = onode->oid;
3012 oldest_slot->second = mono_clock::now();
3013 }
3014 }
3015 }
3016
3017 clear_needs_reshard();
3018 }
3019
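// Encode the extents within [offset, offset+length) into bl, returning the
// extent count via pn. Returns true (and requests a reshard) instead if a
// non-spanning blob escapes the range, since such a shard cannot be encoded
// as-is.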
3020 bool BlueStore::ExtentMap::encode_some(
3021 uint32_t offset,
3022 uint32_t length,
3023 bufferlist& bl,
3024 unsigned *pn)
3025 {
3026 Extent dummy(offset);
3027 auto start = extent_map.lower_bound(dummy);
3028 uint32_t end = offset + length;
3029
3030 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
3031 // serialization only. Hence there is no specific
3032 // handling at ExtentMap level.
3033
3034 unsigned n = 0;
3035 size_t bound = 0;
3036 bool must_reshard = false;
3037 for (auto p = start;
3038 p != extent_map.end() && p->logical_offset < end;
3039 ++p, ++n) {
3040 ceph_assert(p->logical_offset >= offset);
3041 p->blob->last_encoded_id = -1;
3042 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
3043 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3044 << std::dec << " hit new spanning blob " << *p << dendl;
3045 request_reshard(p->blob_start(), p->blob_end());
3046 must_reshard = true;
3047 }
3048 if (!must_reshard) {
3049 denc_varint(0, bound); // blobid
3050 denc_varint(0, bound); // logical_offset
3051 denc_varint(0, bound); // len
3052 denc_varint(0, bound); // blob_offset
3053
3054 p->blob->bound_encode(
3055 bound,
3056 struct_v,
3057 p->blob->shared_blob->get_sbid(),
3058 false);
3059 }
3060 }
3061 if (must_reshard) {
3062 return true;
3063 }
3064
3065 denc(struct_v, bound);
3066 denc_varint(0, bound); // number of extents
3067
3068 {
3069 auto app = bl.get_contiguous_appender(bound);
3070 denc(struct_v, app);
3071 denc_varint(n, app);
3072 if (pn) {
3073 *pn = n;
3074 }
3075
3076 n = 0;
3077 uint64_t pos = 0;
3078 uint64_t prev_len = 0;
3079 for (auto p = start;
3080 p != extent_map.end() && p->logical_offset < end;
3081 ++p, ++n) {
3082 unsigned blobid;
3083 bool include_blob = false;
3084 if (p->blob->is_spanning()) {
3085 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3086 blobid |= BLOBID_FLAG_SPANNING;
3087 } else if (p->blob->last_encoded_id < 0) {
3088 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3089 include_blob = true;
3090 blobid = 0; // the decoder will infer the id from n
3091 } else {
3092 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3093 }
3094 if (p->logical_offset == pos) {
3095 blobid |= BLOBID_FLAG_CONTIGUOUS;
3096 }
3097 if (p->blob_offset == 0) {
3098 blobid |= BLOBID_FLAG_ZEROOFFSET;
3099 }
3100 if (p->length == prev_len) {
3101 blobid |= BLOBID_FLAG_SAMELENGTH;
3102 } else {
3103 prev_len = p->length;
3104 }
3105 denc_varint(blobid, app);
3106 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3107 denc_varint_lowz(p->logical_offset - pos, app);
3108 }
3109 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3110 denc_varint_lowz(p->blob_offset, app);
3111 }
3112 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3113 denc_varint_lowz(p->length, app);
3114 }
3115 pos = p->logical_end();
3116 if (include_blob) {
3117 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3118 }
3119 }
3120 }
3121 /*derr << __func__ << bl << dendl;
3122 derr << __func__ << ":";
3123 bl.hexdump(*_dout);
3124 *_dout << dendl;
3125 */
3126 return false;
3127 }
3128
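// Decode one shard produced by encode_some: each extent carries a varint
// blobid whose low bits flag contiguity, zero blob_offset and repeated
// length, and whose high bits reference a spanning blob, a blob decoded
// earlier in this shard, or announce an inline blob encoding that follows.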
3129 unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3130 {
3131 /*
3132 derr << __func__ << ":";
3133 bl.hexdump(*_dout);
3134 *_dout << dendl;
3135 */
3136
3137 ceph_assert(bl.get_num_buffers() <= 1);
3138 auto p = bl.front().begin_deep();
3139 __u8 struct_v;
3140 denc(struct_v, p);
3141 // Version 2 differs from v1 in blob's ref_map
3142 // serialization only. Hence there is no specific
3143 // handling at ExtentMap level below.
3144 ceph_assert(struct_v == 1 || struct_v == 2);
3145
3146 uint32_t num;
3147 denc_varint(num, p);
3148 vector<BlobRef> blobs(num);
3149 uint64_t pos = 0;
3150 uint64_t prev_len = 0;
3151 unsigned n = 0;
3152
3153 while (!p.end()) {
3154 Extent *le = new Extent();
3155 uint64_t blobid;
3156 denc_varint(blobid, p);
3157 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3158 uint64_t gap;
3159 denc_varint_lowz(gap, p);
3160 pos += gap;
3161 }
3162 le->logical_offset = pos;
3163 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3164 denc_varint_lowz(le->blob_offset, p);
3165 } else {
3166 le->blob_offset = 0;
3167 }
3168 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3169 denc_varint_lowz(prev_len, p);
3170 }
3171 le->length = prev_len;
3172
3173 if (blobid & BLOBID_FLAG_SPANNING) {
3174 dout(30) << __func__ << " getting spanning blob "
3175 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3176 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3177 } else {
3178 blobid >>= BLOBID_SHIFT_BITS;
3179 if (blobid) {
3180 le->assign_blob(blobs[blobid - 1]);
3181 ceph_assert(le->blob);
3182 } else {
3183 Blob *b = new Blob();
3184 uint64_t sbid = 0;
3185 b->decode(onode->c, p, struct_v, &sbid, false);
3186 blobs[n] = b;
3187 onode->c->open_shared_blob(sbid, b);
3188 le->assign_blob(b);
3189 }
3190 // we build ref_map dynamically for non-spanning blobs
3191 le->blob->get_ref(
3192 onode->c,
3193 le->blob_offset,
3194 le->length);
3195 }
3196 pos += prev_len;
3197 ++n;
3198 extent_map.insert(*le);
3199 }
3200
3201 ceph_assert(n == num);
3202 return num;
3203 }
3204
3205 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3206 {
3207 // Version 2 differs from v1 in blob's ref_map
3208 // serialization only. Hence there is no specific
3209 // handling at ExtentMap level.
3210 __u8 struct_v = 2;
3211
3212 denc(struct_v, p);
3213 denc_varint((uint32_t)0, p);
3214 size_t key_size = 0;
3215 denc_varint((uint32_t)0, key_size);
3216 p += spanning_blob_map.size() * key_size;
3217 for (const auto& i : spanning_blob_map) {
3218 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3219 }
3220 }
3221
3222 void BlueStore::ExtentMap::encode_spanning_blobs(
3223 bufferlist::contiguous_appender& p)
3224 {
3225 // Version 2 differs from v1 in blob's ref_map
3226 // serialization only. Hence there is no specific
3227 // handling at ExtentMap level.
3228 __u8 struct_v = 2;
3229
3230 denc(struct_v, p);
3231 denc_varint(spanning_blob_map.size(), p);
3232 for (auto& i : spanning_blob_map) {
3233 denc_varint(i.second->id, p);
3234 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3235 }
3236 }
3237
3238 void BlueStore::ExtentMap::decode_spanning_blobs(
3239 bufferptr::const_iterator& p)
3240 {
3241 __u8 struct_v;
3242 denc(struct_v, p);
3243 // Version 2 differs from v1 in blob's ref_map
3244 // serialization only. Hence there is no specific
3245 // handling at ExtentMap level.
3246 ceph_assert(struct_v == 1 || struct_v == 2);
3247
3248 unsigned n;
3249 denc_varint(n, p);
3250 while (n--) {
3251 BlobRef b(new Blob());
3252 denc_varint(b->id, p);
3253 spanning_blob_map[b->id] = b;
3254 uint64_t sbid = 0;
3255 b->decode(onode->c, p, struct_v, &sbid, true);
3256 onode->c->open_shared_blob(sbid, b);
3257 }
3258 }
3259
3260 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3261 {
3262 shards.resize(onode->onode.extent_map_shards.size());
3263 unsigned i = 0;
3264 for (auto &s : onode->onode.extent_map_shards) {
3265 shards[i].shard_info = &s;
3266 shards[i].loaded = loaded;
3267 shards[i].dirty = dirty;
3268 ++i;
3269 }
3270 }
3271
3272 void BlueStore::ExtentMap::fault_range(
3273 KeyValueDB *db,
3274 uint32_t offset,
3275 uint32_t length)
3276 {
3277 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3278 << std::dec << dendl;
3279 auto start = seek_shard(offset);
3280 auto last = seek_shard(offset + length);
3281
3282 if (start < 0)
3283 return;
3284
3285 ceph_assert(last >= start);
3286 string key;
3287 while (start <= last) {
3288 ceph_assert((size_t)start < shards.size());
3289 auto p = &shards[start];
3290 if (!p->loaded) {
3291 dout(30) << __func__ << " opening shard 0x" << std::hex
3292 << p->shard_info->offset << std::dec << dendl;
3293 bufferlist v;
3294 generate_extent_shard_key_and_apply(
3295 onode->key, p->shard_info->offset, &key,
3296 [&](const string& final_key) {
3297 int r = db->get(PREFIX_OBJ, final_key, &v);
3298 if (r < 0) {
3299 derr << __func__ << " missing shard 0x" << std::hex
3300 << p->shard_info->offset << std::dec << " for " << onode->oid
3301 << dendl;
3302 ceph_assert(r >= 0);
3303 }
3304 }
3305 );
3306 p->extents = decode_some(v);
3307 p->loaded = true;
3308 dout(20) << __func__ << " open shard 0x" << std::hex
3309 << p->shard_info->offset
3310 << " for range 0x" << offset << "~" << length << std::dec
3311 << " (" << v.length() << " bytes)" << dendl;
3312 ceph_assert(p->dirty == false);
3313 ceph_assert(v.length() == p->shard_info->bytes);
3314 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3315 } else {
3316 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3317 }
3318 ++start;
3319 }
3320 }
3321
3322 void BlueStore::ExtentMap::dirty_range(
3323 uint32_t offset,
3324 uint32_t length)
3325 {
3326 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3327 << std::dec << dendl;
3328 if (shards.empty()) {
3329 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3330 inline_bl.clear();
3331 return;
3332 }
3333 auto start = seek_shard(offset);
3334 if (length == 0) {
3335 length = 1;
3336 }
3337 auto last = seek_shard(offset + length - 1);
3338 if (start < 0)
3339 return;
3340
3341 ceph_assert(last >= start);
3342 while (start <= last) {
3343 ceph_assert((size_t)start < shards.size());
3344 auto p = &shards[start];
3345 if (!p->loaded) {
3346       derr << __func__ << " on write 0x" << std::hex << offset
3347 << "~" << length << " shard 0x" << p->shard_info->offset
3348 << std::dec << " is not loaded, can't mark dirty" << dendl;
3349 ceph_abort_msg("can't mark unloaded shard dirty");
3350 }
3351 if (!p->dirty) {
3352 dout(20) << __func__ << " mark shard 0x" << std::hex
3353 << p->shard_info->offset << std::dec << " dirty" << dendl;
3354 p->dirty = true;
3355 }
3356 ++start;
3357 }
3358 }
3359
3360 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3361 uint64_t offset)
3362 {
3363 Extent dummy(offset);
3364 return extent_map.find(dummy);
3365 }
3366
3367 BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3368 uint64_t offset)
3369 {
3370 Extent dummy(offset);
3371 auto fp = extent_map.lower_bound(dummy);
3372 if (fp != extent_map.begin()) {
3373 --fp;
3374 if (fp->logical_end() <= offset) {
3375 ++fp;
3376 }
3377 }
3378 return fp;
3379 }
3380
3381 BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3382 uint64_t offset) const
3383 {
3384 Extent dummy(offset);
3385 auto fp = extent_map.lower_bound(dummy);
3386 if (fp != extent_map.begin()) {
3387 --fp;
3388 if (fp->logical_end() <= offset) {
3389 ++fp;
3390 }
3391 }
3392 return fp;
3393 }
3394
3395 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3396 {
3397 auto fp = seek_lextent(offset);
3398 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3399 return false;
3400 }
3401 return true;
3402 }
3403
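// Merge neighbouring lextents around [offset, offset+length) that are
// logically contiguous, share a blob and are contiguous within that blob,
// without merging across a shard boundary; returns the number of extents
// removed.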
3404 int BlueStore::ExtentMap::compress_extent_map(
3405 uint64_t offset,
3406 uint64_t length)
3407 {
3408 if (extent_map.empty())
3409 return 0;
3410 int removed = 0;
3411 auto p = seek_lextent(offset);
3412 if (p != extent_map.begin()) {
3413 --p; // start to the left of offset
3414 }
3415 // the caller should have just written to this region
3416 ceph_assert(p != extent_map.end());
3417
3418 // identify the *next* shard
3419 auto pshard = shards.begin();
3420 while (pshard != shards.end() &&
3421 p->logical_offset >= pshard->shard_info->offset) {
3422 ++pshard;
3423 }
3424 uint64_t shard_end;
3425 if (pshard != shards.end()) {
3426 shard_end = pshard->shard_info->offset;
3427 } else {
3428 shard_end = OBJECT_MAX_SIZE;
3429 }
3430
3431 auto n = p;
3432 for (++n; n != extent_map.end(); p = n++) {
3433 if (n->logical_offset > offset + length) {
3434 break; // stop after end
3435 }
3436 while (n != extent_map.end() &&
3437 p->logical_end() == n->logical_offset &&
3438 p->blob == n->blob &&
3439 p->blob_offset + p->length == n->blob_offset &&
3440 n->logical_offset < shard_end) {
3441 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3442 << " next shard 0x" << shard_end << std::dec
3443 << " merging " << *p << " and " << *n << dendl;
3444 p->length += n->length;
3445 rm(n++);
3446 ++removed;
3447 }
3448 if (n == extent_map.end()) {
3449 break;
3450 }
3451 if (n->logical_offset >= shard_end) {
3452 ceph_assert(pshard != shards.end());
3453 ++pshard;
3454 if (pshard != shards.end()) {
3455 shard_end = pshard->shard_info->offset;
3456 } else {
3457 shard_end = OBJECT_MAX_SIZE;
3458 }
3459 }
3460 }
3461 if (removed) {
3462 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3463 }
3464 return removed;
3465 }
3466
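// Remove the logical range [offset, offset+length) from the extent map:
// extents fully inside are dropped, extents straddling the edges are trimmed
// (or split when a single extent covers the whole hole), and every
// dereferenced piece is appended to old_extents for later cleanup.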
3467 void BlueStore::ExtentMap::punch_hole(
3468 CollectionRef &c,
3469 uint64_t offset,
3470 uint64_t length,
3471 old_extent_map_t *old_extents)
3472 {
3473 auto p = seek_lextent(offset);
3474 uint64_t end = offset + length;
3475 while (p != extent_map.end()) {
3476 if (p->logical_offset >= end) {
3477 break;
3478 }
3479 if (p->logical_offset < offset) {
3480 if (p->logical_end() > end) {
3481 // split and deref middle
3482 uint64_t front = offset - p->logical_offset;
3483 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3484 length, p->blob);
3485 old_extents->push_back(*oe);
3486 add(end,
3487 p->blob_offset + front + length,
3488 p->length - front - length,
3489 p->blob);
3490 p->length = front;
3491 break;
3492 } else {
3493 // deref tail
3494 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3495 uint64_t keep = offset - p->logical_offset;
3496 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3497 p->length - keep, p->blob);
3498 old_extents->push_back(*oe);
3499 p->length = keep;
3500 ++p;
3501 continue;
3502 }
3503 }
3504 if (p->logical_offset + p->length <= end) {
3505 // deref whole lextent
3506 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3507 p->length, p->blob);
3508 old_extents->push_back(*oe);
3509 rm(p++);
3510 continue;
3511 }
3512 // deref head
3513 uint64_t keep = p->logical_end() - end;
3514 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3515 p->length - keep, p->blob);
3516 old_extents->push_back(*oe);
3517
3518 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3519 rm(p);
3520 break;
3521 }
3522 }
3523
3524 BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3525 CollectionRef &c,
3526 uint64_t logical_offset,
3527 uint64_t blob_offset, uint64_t length, BlobRef b,
3528 old_extent_map_t *old_extents)
3529 {
3530   // We need a completely initialized Blob to increment its ref counters.
3531 ceph_assert(b->get_blob().get_logical_length() != 0);
3532
3533   // Do get_ref prior to punch_hole to prevent the reused blob from being put
3534   // into the old_extents list if we overwrite the blob totally.
3535   // This might happen during WAL overwrite.
3536 b->get_ref(onode->c, blob_offset, length);
3537
3538 if (old_extents) {
3539 punch_hole(c, logical_offset, length, old_extents);
3540 }
3541
3542 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3543 extent_map.insert(*le);
3544 if (spans_shard(logical_offset, length)) {
3545 request_reshard(logical_offset, logical_offset + length);
3546 }
3547 return le;
3548 }
3549
3550 BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3551 BlobRef lb,
3552 uint32_t blob_offset,
3553 uint32_t pos)
3554 {
3555 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3556 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3557 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3558 << dendl;
3559 BlobRef rb = onode->c->new_blob();
3560 lb->split(onode->c, blob_offset, rb.get());
3561
3562 for (auto ep = seek_lextent(pos);
3563 ep != extent_map.end() && ep->logical_offset < end_pos;
3564 ++ep) {
3565 if (ep->blob != lb) {
3566 continue;
3567 }
3568 if (ep->logical_offset < pos) {
3569 // split extent
3570 size_t left = pos - ep->logical_offset;
3571 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3572 extent_map.insert(*ne);
3573 ep->length = left;
3574 dout(30) << __func__ << " split " << *ep << dendl;
3575 dout(30) << __func__ << " to " << *ne << dendl;
3576 } else {
3577 // switch blob
3578 ceph_assert(ep->blob_offset >= blob_offset);
3579
3580 ep->blob = rb;
3581 ep->blob_offset -= blob_offset;
3582 dout(30) << __func__ << " adjusted " << *ep << dendl;
3583 }
3584 }
3585 return rb;
3586 }
3587
3588 // Onode
3589
3590 #undef dout_prefix
3591 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3592
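// Omap keys live under the prefix selected below; within it a key is the
// pool id (per-pool and per-pg layouts), the hash (per-pg only) and the nid,
// followed by a separator: '-' marks the header, '.' + user key an entry and
// '~' the tail, so one object's omap range sorts as header < entries < tail.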
3593 const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
3594 {
3595 if (bluestore_onode_t::is_pgmeta_omap(flags)) {
3596 return PREFIX_PGMETA_OMAP;
3597 }
3598 if (bluestore_onode_t::is_perpg_omap(flags)) {
3599 return PREFIX_PERPG_OMAP;
3600 }
3601 if (bluestore_onode_t::is_perpool_omap(flags)) {
3602 return PREFIX_PERPOOL_OMAP;
3603 }
3604 return PREFIX_OMAP;
3605 }
3606
3607 // '-' < '.' < '~'
3608 void BlueStore::Onode::calc_omap_header(
3609 uint8_t flags,
3610 const Onode* o,
3611 std::string* out)
3612 {
3613 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3614 if (bluestore_onode_t::is_perpg_omap(flags)) {
3615 _key_encode_u64(o->c->pool(), out);
3616 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3617 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3618 _key_encode_u64(o->c->pool(), out);
3619 }
3620 }
3621 _key_encode_u64(o->onode.nid, out);
3622 out->push_back('-');
3623 }
3624
3625 void BlueStore::Onode::calc_omap_key(uint8_t flags,
3626 const Onode* o,
3627 const std::string& key,
3628 std::string* out)
3629 {
3630 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3631 if (bluestore_onode_t::is_perpg_omap(flags)) {
3632 _key_encode_u64(o->c->pool(), out);
3633 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3634 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3635 _key_encode_u64(o->c->pool(), out);
3636 }
3637 }
3638 _key_encode_u64(o->onode.nid, out);
3639 out->push_back('.');
3640 out->append(key);
3641 }
3642
3643 void BlueStore::Onode::calc_omap_tail(
3644 uint8_t flags,
3645 const Onode* o,
3646 std::string* out)
3647 {
3648 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3649 if (bluestore_onode_t::is_perpg_omap(flags)) {
3650 _key_encode_u64(o->c->pool(), out);
3651 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3652 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3653 _key_encode_u64(o->c->pool(), out);
3654 }
3655 }
3656 _key_encode_u64(o->onode.nid, out);
3657 out->push_back('~');
3658 }
3659
3660 void BlueStore::Onode::get() {
3661 if (++nref >= 2 && !pinned) {
3662 OnodeCacheShard* ocs = c->get_onode_cache();
3663 ocs->lock.lock();
3664     // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
3665 while (ocs != c->get_onode_cache()) {
3666 ocs->lock.unlock();
3667 ocs = c->get_onode_cache();
3668 ocs->lock.lock();
3669 }
3670 bool was_pinned = pinned;
3671 pinned = nref >= 2;
3672 bool r = !was_pinned && pinned;
3673 if (cached && r) {
3674 ocs->_pin(this);
3675 }
3676 ocs->lock.unlock();
3677 }
3678 }
3679 void BlueStore::Onode::put() {
3680 ++put_nref;
3681 int n = --nref;
3682 if (n == 1) {
3683 OnodeCacheShard* ocs = c->get_onode_cache();
3684 ocs->lock.lock();
3685     // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
3686 while (ocs != c->get_onode_cache()) {
3687 ocs->lock.unlock();
3688 ocs = c->get_onode_cache();
3689 ocs->lock.lock();
3690 }
3691 bool need_unpin = pinned;
3692 pinned = pinned && nref >= 2;
3693 need_unpin = need_unpin && !pinned;
3694 if (cached && need_unpin) {
3695 if (exists) {
3696 ocs->_unpin(this);
3697 } else {
3698 ocs->_unpin_and_rm(this);
3699 // remove will also decrement nref
3700 c->onode_map._remove(oid);
3701 }
3702 }
3703 ocs->lock.unlock();
3704 }
3705 auto pn = --put_nref;
3706 if (nref == 0 && pn == 0) {
3707 delete this;
3708 }
3709 }
3710
3711 BlueStore::Onode* BlueStore::Onode::decode(
3712 CollectionRef c,
3713 const ghobject_t& oid,
3714 const string& key,
3715 const bufferlist& v)
3716 {
3717 Onode* on = new Onode(c.get(), oid, key);
3718 on->exists = true;
3719 auto p = v.front().begin_deep();
3720 on->onode.decode(p);
3721 for (auto& i : on->onode.attrs) {
3722 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3723 }
3724
3725 // initialize extent_map
3726 on->extent_map.decode_spanning_blobs(p);
3727 if (on->onode.extent_map_shards.empty()) {
3728 denc(on->extent_map.inline_bl, p);
3729 on->extent_map.decode_some(on->extent_map.inline_bl);
3730 on->extent_map.inline_bl.reassign_to_mempool(
3731 mempool::mempool_bluestore_cache_data);
3732 }
3733 else {
3734 on->extent_map.init_shards(false, false);
3735 }
3736 return on;
3737 }
3738
3739 void BlueStore::Onode::flush()
3740 {
3741 if (flushing_count.load()) {
3742 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
3743 waiting_count++;
3744 std::unique_lock l(flush_lock);
3745 while (flushing_count.load()) {
3746 flush_cond.wait(l);
3747 }
3748 waiting_count--;
3749 }
3750 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3751 }
3752
3753 void BlueStore::Onode::dump(Formatter* f) const
3754 {
3755 onode.dump(f);
3756 extent_map.dump(f);
3757 }
3758
3759 void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3760 {
3761 if (!onode.is_pgmeta_omap()) {
3762 if (onode.is_perpg_omap()) {
3763 _key_encode_u64(c->pool(), out);
3764 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3765 } else if (onode.is_perpool_omap()) {
3766 _key_encode_u64(c->pool(), out);
3767 }
3768 }
3769 _key_encode_u64(onode.nid, out);
3770 out->append(old.c_str() + out->length(), old.size() - out->length());
3771 }
3772
3773 void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3774 {
3775 size_t pos = sizeof(uint64_t) + 1;
3776 if (!onode.is_pgmeta_omap()) {
3777 if (onode.is_perpg_omap()) {
3778 pos += sizeof(uint64_t) + sizeof(uint32_t);
3779 } else if (onode.is_perpool_omap()) {
3780 pos += sizeof(uint64_t);
3781 }
3782 }
3783 *user_key = key.substr(pos);
3784 }
3785
3786 // =======================================================
3787 // WriteContext
3788
3789 /// Checks for writes to the same pextent within a blob
3790 bool BlueStore::WriteContext::has_conflict(
3791 BlobRef b,
3792 uint64_t loffs,
3793 uint64_t loffs_end,
3794 uint64_t min_alloc_size)
3795 {
3796 ceph_assert((loffs % min_alloc_size) == 0);
3797 ceph_assert((loffs_end % min_alloc_size) == 0);
3798 for (auto w : writes) {
3799 if (b == w.b) {
3800 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3801 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
3802 if ((loffs <= loffs2 && loffs_end > loffs2) ||
3803 (loffs >= loffs2 && loffs < loffs2_end)) {
3804 return true;
3805 }
3806 }
3807 }
3808 return false;
3809 }
3810
3811 // =======================================================
3812
3813 // DeferredBatch
3814 #undef dout_prefix
3815 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3816 #undef dout_context
3817 #define dout_context cct
3818
3819 void BlueStore::DeferredBatch::prepare_write(
3820 CephContext *cct,
3821 uint64_t seq, uint64_t offset, uint64_t length,
3822 bufferlist::const_iterator& blp)
3823 {
3824 _discard(cct, offset, length);
3825 auto i = iomap.insert(make_pair(offset, deferred_io()));
3826 ceph_assert(i.second); // this should be a new insertion
3827 i.first->second.seq = seq;
3828 blp.copy(length, i.first->second.bl);
3829 i.first->second.bl.reassign_to_mempool(
3830 mempool::mempool_bluestore_writing_deferred);
3831 dout(20) << __func__ << " seq " << seq
3832 << " 0x" << std::hex << offset << "~" << length
3833 << " crc " << i.first->second.bl.crc32c(-1)
3834 << std::dec << dendl;
3835 seq_bytes[seq] += length;
3836 #ifdef DEBUG_DEFERRED
3837 _audit(cct);
3838 #endif
3839 }
3840
3841 void BlueStore::DeferredBatch::_discard(
3842 CephContext *cct, uint64_t offset, uint64_t length)
3843 {
3844 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3845 << std::dec << dendl;
3846 auto p = iomap.lower_bound(offset);
3847 if (p != iomap.begin()) {
3848 --p;
3849 auto end = p->first + p->second.bl.length();
3850 if (end > offset) {
3851 bufferlist head;
3852 head.substr_of(p->second.bl, 0, offset - p->first);
3853 dout(20) << __func__ << " keep head " << p->second.seq
3854 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3855 << " -> 0x" << head.length() << std::dec << dendl;
3856 auto i = seq_bytes.find(p->second.seq);
3857 ceph_assert(i != seq_bytes.end());
3858 if (end > offset + length) {
3859 bufferlist tail;
3860 tail.substr_of(p->second.bl, offset + length - p->first,
3861 end - (offset + length));
3862 dout(20) << __func__ << " keep tail " << p->second.seq
3863 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3864 << " -> 0x" << tail.length() << std::dec << dendl;
3865 auto &n = iomap[offset + length];
3866 n.bl.swap(tail);
3867 n.seq = p->second.seq;
3868 i->second -= length;
3869 } else {
3870 i->second -= end - offset;
3871 }
3872 ceph_assert(i->second >= 0);
3873 p->second.bl.swap(head);
3874 }
3875 ++p;
3876 }
3877 while (p != iomap.end()) {
3878 if (p->first >= offset + length) {
3879 break;
3880 }
3881 auto i = seq_bytes.find(p->second.seq);
3882 ceph_assert(i != seq_bytes.end());
3883 auto end = p->first + p->second.bl.length();
3884 if (end > offset + length) {
3885 unsigned drop_front = offset + length - p->first;
3886 unsigned keep_tail = end - (offset + length);
3887 dout(20) << __func__ << " truncate front " << p->second.seq
3888 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3889 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3890 << " to 0x" << (offset + length) << "~" << keep_tail
3891 << std::dec << dendl;
3892 auto &s = iomap[offset + length];
3893 s.seq = p->second.seq;
3894 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3895 i->second -= drop_front;
3896 } else {
3897 dout(20) << __func__ << " drop " << p->second.seq
3898 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3899 << std::dec << dendl;
3900 i->second -= p->second.bl.length();
3901 }
3902 ceph_assert(i->second >= 0);
3903 p = iomap.erase(p);
3904 }
3905 }
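// Worked example (editorial sketch): suppose iomap holds one entry at offset
// 0x0 with an 0x8000-byte buffer for seq 5 and _discard(cct, 0x2000, 0x4000)
// is called.  The "keep head"/"keep tail" branch above trims the entry at 0x0
// down to its first 0x2000 bytes, re-inserts the remaining 0x2000-byte tail at
// offset 0x6000 (still seq 5), and subtracts the punched-out 0x4000 bytes from
// seq_bytes[5], so the invariant checked by _audit() below still holds.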
3906
3907 void BlueStore::DeferredBatch::_audit(CephContext *cct)
3908 {
3909 map<uint64_t,int> sb;
3910 for (auto p : seq_bytes) {
3911 sb[p.first] = 0; // make sure we have the same set of keys
3912 }
3913 uint64_t pos = 0;
3914 for (auto& p : iomap) {
3915 ceph_assert(p.first >= pos);
3916 sb[p.second.seq] += p.second.bl.length();
3917 pos = p.first + p.second.bl.length();
3918 }
3919 ceph_assert(sb == seq_bytes);
3920 }
3921
3922
3923 // Collection
3924
3925 #undef dout_prefix
3926 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3927
3928 BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3929 : CollectionImpl(store_->cct, cid),
3930 store(store_),
3931 cache(bc),
3932 exists(true),
3933 onode_map(oc),
3934 commit_queue(nullptr)
3935 {
3936 }
3937
3938 bool BlueStore::Collection::flush_commit(Context *c)
3939 {
3940 return osr->flush_commit(c);
3941 }
3942
3943 void BlueStore::Collection::flush()
3944 {
3945 osr->flush();
3946 }
3947
3948 void BlueStore::Collection::flush_all_but_last()
3949 {
3950 osr->flush_all_but_last();
3951 }
3952
3953 void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3954 {
3955 ceph_assert(!b->shared_blob);
3956 const bluestore_blob_t& blob = b->get_blob();
3957 if (!blob.is_shared()) {
3958 b->shared_blob = new SharedBlob(this);
3959 return;
3960 }
3961
3962 b->shared_blob = shared_blob_set.lookup(sbid);
3963 if (b->shared_blob) {
3964 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3965 << std::dec << " had " << *b->shared_blob << dendl;
3966 } else {
3967 b->shared_blob = new SharedBlob(sbid, this);
3968 shared_blob_set.add(this, b->shared_blob.get());
3969 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3970 << std::dec << " opened " << *b->shared_blob
3971 << dendl;
3972 }
3973 }
3974
3975 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3976 {
3977 if (!sb->is_loaded()) {
3978
3979 bufferlist v;
3980 string key;
3981 auto sbid = sb->get_sbid();
3982 get_shared_blob_key(sbid, &key);
3983 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3984 if (r < 0) {
3985 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3986 << std::dec << " not found at key "
3987 << pretty_binary_string(key) << dendl;
3988 ceph_abort_msg("uh oh, missing shared_blob");
3989 }
3990
3991 sb->loaded = true;
3992 sb->persistent = new bluestore_shared_blob_t(sbid);
3993 auto p = v.cbegin();
3994 decode(*(sb->persistent), p);
3995 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3996 << std::dec << " loaded shared_blob " << *sb << dendl;
3997 }
3998 }
3999
4000 void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
4001 {
4002 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
4003 ceph_assert(!b->shared_blob->is_loaded());
4004
4005 // update blob
4006 bluestore_blob_t& blob = b->dirty_blob();
4007 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
4008
4009 // update shared blob
4010 b->shared_blob->loaded = true;
4011 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
4012 shared_blob_set.add(this, b->shared_blob.get());
4013 for (auto p : blob.get_extents()) {
4014 if (p.is_valid()) {
4015 b->shared_blob->get_ref(
4016 p.offset,
4017 p.length);
4018 }
4019 }
4020 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
4021 }
4022
4023 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
4024 {
4025 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
4026 ceph_assert(sb->is_loaded());
4027
4028 uint64_t sbid = sb->get_sbid();
4029 shared_blob_set.remove(sb);
4030 sb->loaded = false;
4031 delete sb->persistent;
4032 sb->sbid_unloaded = 0;
4033 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
4034 return sbid;
4035 }
4036
4037 BlueStore::OnodeRef BlueStore::Collection::get_onode(
4038 const ghobject_t& oid,
4039 bool create,
4040 bool is_createop)
4041 {
4042 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
4043
4044 spg_t pgid;
4045 if (cid.is_pg(&pgid)) {
4046 if (!oid.match(cnode.bits, pgid.ps())) {
4047 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
4048 << pgid << " bits " << cnode.bits << dendl;
4049 ceph_abort();
4050 }
4051 }
4052
4053 OnodeRef o = onode_map.lookup(oid);
4054 if (o)
4055 return o;
4056
4057 string key;
4058 get_object_key(store->cct, oid, &key);
4059
4060 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
4061 << pretty_binary_string(key) << dendl;
4062
4063 bufferlist v;
4064 int r = -ENOENT;
4065 Onode *on;
4066 if (!is_createop) {
4067 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
4068 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4069 }
4070 if (v.length() == 0) {
4071 ceph_assert(r == -ENOENT);
4072 if (!create)
4073 return OnodeRef();
4074
4075 // new object, new onode
4076 on = new Onode(this, oid, key);
4077 } else {
4078 // loaded
4079 ceph_assert(r >= 0);
4080 on = Onode::decode(this, oid, key, v);
4081 }
4082 o.reset(on);
4083 return onode_map.add(oid, o);
4084 }
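// Editorial summary of the lookup path above: a cached onode is returned from
// onode_map directly; otherwise the object key is read from the kv store under
// PREFIX_OBJ (skipped entirely for create ops, which cannot have an existing
// record).  An empty result with create == false yields a null OnodeRef, an
// empty result with create == true produces a fresh in-memory Onode, and a
// non-empty value is decoded via Onode::decode() before being inserted into
// the cache.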
4085
4086 void BlueStore::Collection::split_cache(
4087 Collection *dest)
4088 {
4089 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4090
4091 auto *ocache = get_onode_cache();
4092 auto *ocache_dest = dest->get_onode_cache();
4093
4094 // lock cache shards
4095 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4096 std::lock_guard l(ocache->lock, std::adopt_lock);
4097 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4098 std::lock_guard l3(cache->lock, std::adopt_lock);
4099 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
4100
4101 int destbits = dest->cnode.bits;
4102 spg_t destpg;
4103 bool is_pg = dest->cid.is_pg(&destpg);
4104 ceph_assert(is_pg);
4105
4106 auto p = onode_map.onode_map.begin();
4107 while (p != onode_map.onode_map.end()) {
4108 OnodeRef o = p->second;
4109 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4110 // onode does not belong to this child
4111 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4112 << dendl;
4113 ++p;
4114 } else {
4115 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4116 << dendl;
4117
4118 // ensuring that nref is always >= 2 and hence onode is pinned and
4119 // physically out of cache during the transition
4120 OnodeRef o_pin = o;
4121 ceph_assert(o->pinned);
4122
4123 p = onode_map.onode_map.erase(p);
4124 dest->onode_map.onode_map[o->oid] = o;
4125 if (o->cached) {
4126 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
4127 }
4128 o->c = dest;
4129
4130 // move over shared blobs and buffers. cover shared blobs from
4131 // both extent map and spanning blob map (the full extent map
4132 // may not be faulted in)
4133 vector<SharedBlob*> sbvec;
4134 for (auto& e : o->extent_map.extent_map) {
4135 sbvec.push_back(e.blob->shared_blob.get());
4136 }
4137 for (auto& b : o->extent_map.spanning_blob_map) {
4138 sbvec.push_back(b.second->shared_blob.get());
4139 }
4140 for (auto sb : sbvec) {
4141 if (sb->coll == dest) {
4142 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4143 << dendl;
4144 continue;
4145 }
4146 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
4147 if (sb->get_sbid()) {
4148 ldout(store->cct, 20) << __func__
4149 << " moving registration " << *sb << dendl;
4150 shared_blob_set.remove(sb);
4151 dest->shared_blob_set.add(dest, sb);
4152 }
4153 sb->coll = dest;
4154 if (dest->cache != cache) {
4155 for (auto& i : sb->bc.buffer_map) {
4156 if (!i.second->is_writing()) {
4157 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4158 << dendl;
4159 dest->cache->_move(cache, i.second.get());
4160 }
4161 }
4162 }
4163 }
4164 }
4165 }
4166 dest->cache->_trim();
4167 }
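// Illustrative note (editorial sketch): ghobject_t::match(bits, ps) decides
// ownership during the split -- roughly, an onode moves to the child when the
// low `destbits` bits of its hash equal the child's placement seed.  For
// example, with destbits = 3 and a child pg whose ps() low bits are 0b101, an
// object whose hash ends in ...101 is migrated while one ending in ...001
// stays in the parent collection.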
4168
4169 // =======================================================
4170
4171 // MempoolThread
4172
4173 #undef dout_prefix
4174 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4175 #undef dout_context
4176 #define dout_context store->cct
4177
4178 void *BlueStore::MempoolThread::entry()
4179 {
4180 std::unique_lock l{lock};
4181
4182 uint32_t prev_config_change = store->config_changed.load();
4183 uint64_t base = store->osd_memory_base;
4184 double fragmentation = store->osd_memory_expected_fragmentation;
4185 uint64_t target = store->osd_memory_target;
4186 uint64_t min = store->osd_memory_cache_min;
4187 uint64_t max = min;
4188
4189 // When setting the maximum amount of memory to use for cache, first
4190 // assume some base amount of memory for the OSD and then fudge in
4191 // some overhead for fragmentation that scales with cache usage.
4192 uint64_t ltarget = (1.0 - fragmentation) * target;
4193 if (ltarget > base + min) {
4194 max = ltarget - base;
4195 }
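// Worked example (editorial sketch, arbitrary numbers): with
// osd_memory_target = 4 GiB, osd_memory_base = 768 MiB,
// osd_memory_expected_fragmentation = 0.15 and osd_memory_cache_min = 128 MiB,
// ltarget = 0.85 * 4 GiB ~= 3.4 GiB exceeds base + min, so the cache is
// allowed to grow up to max = 3.4 GiB - 768 MiB ~= 2.65 GiB while never being
// asked to shrink below the 128 MiB floor.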
4196
4197 binned_kv_cache = store->db->get_priority_cache();
4198 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
4199 if (store->cache_autotune && binned_kv_cache != nullptr) {
4200 pcm = std::make_shared<PriorityCache::Manager>(
4201 store->cct, min, max, target, true, "bluestore-pricache");
4202 pcm->insert("kv", binned_kv_cache, true);
4203 pcm->insert("meta", meta_cache, true);
4204 pcm->insert("data", data_cache, true);
4205 if (binned_kv_onode_cache != nullptr) {
4206 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4207 }
4208 }
4209
4210 utime_t next_balance = ceph_clock_now();
4211 utime_t next_resize = ceph_clock_now();
4212 utime_t next_bin_rotation = ceph_clock_now();
4213 utime_t next_deferred_force_submit = ceph_clock_now();
4214 utime_t alloc_stats_dump_clock = ceph_clock_now();
4215
4216 bool interval_stats_trim = false;
4217 while (!stop) {
4218 // Update pcm cache settings if related configuration was changed
4219 uint32_t cur_config_change = store->config_changed.load();
4220 if (cur_config_change != prev_config_change) {
4221 _update_cache_settings();
4222 prev_config_change = cur_config_change;
4223 }
4224
4225 // define various intervals for background work
4226 double age_bin_interval = store->cache_age_bin_interval;
4227 double autotune_interval = store->cache_autotune_interval;
4228 double resize_interval = store->osd_memory_cache_resize_interval;
4229 double max_defer_interval = store->max_defer_interval;
4230 double alloc_stats_dump_interval =
4231 store->cct->_conf->bluestore_alloc_stats_dump_interval;
4232
4233 // alloc stats dump
4234 if (alloc_stats_dump_interval > 0 &&
4235 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4236 store->_record_allocation_stats();
4237 alloc_stats_dump_clock = ceph_clock_now();
4238 }
4239 // cache age binning
4240 if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
4241 if (binned_kv_cache != nullptr) {
4242 binned_kv_cache->import_bins(store->kv_bins);
4243 }
4244 if (binned_kv_onode_cache != nullptr) {
4245 binned_kv_onode_cache->import_bins(store->kv_onode_bins);
4246 }
4247 meta_cache->import_bins(store->meta_bins);
4248 data_cache->import_bins(store->data_bins);
4249
4250 if (pcm != nullptr) {
4251 pcm->shift_bins();
4252 }
4253 next_bin_rotation = ceph_clock_now();
4254 next_bin_rotation += age_bin_interval;
4255 }
4256 // cache balancing
4257 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
4258 if (binned_kv_cache != nullptr) {
4259 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4260 }
4261 if (binned_kv_onode_cache != nullptr) {
4262 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4263 }
4264 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4265 data_cache->set_cache_ratio(store->cache_data_ratio);
4266
4267 // Log events at 5 instead of 20 when balance happens.
4268 interval_stats_trim = true;
4269
4270 if (pcm != nullptr) {
4271 pcm->balance();
4272 }
4273
4274 next_balance = ceph_clock_now();
4275 next_balance += autotune_interval;
4276 }
4277 // memory resizing (i.e. autotuning)
4278 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
4279 if (ceph_using_tcmalloc() && pcm != nullptr) {
4280 pcm->tune_memory();
4281 }
4282 next_resize = ceph_clock_now();
4283 next_resize += resize_interval;
4284 }
4285 // deferred force submit
4286 if (max_defer_interval > 0 &&
4287 next_deferred_force_submit < ceph_clock_now()) {
4288 if (store->get_deferred_last_submitted() + max_defer_interval <
4289 ceph_clock_now()) {
4290 store->deferred_try_submit();
4291 }
4292 next_deferred_force_submit = ceph_clock_now();
4293 next_deferred_force_submit += max_defer_interval/3;
4294 }
4295
4296 // Now resize the shards
4297 _resize_shards(interval_stats_trim);
4298 interval_stats_trim = false;
4299
4300 store->_update_cache_logger();
4301 auto wait = ceph::make_timespan(
4302 store->cct->_conf->bluestore_cache_trim_interval);
4303 cond.wait_for(l, wait);
4304 }
4305 // do final dump
4306 store->_record_allocation_stats();
4307 stop = false;
4308 pcm = nullptr;
4309 return NULL;
4310 }
4311
4312 void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
4313 {
4314 size_t onode_shards = store->onode_cache_shards.size();
4315 size_t buffer_shards = store->buffer_cache_shards.size();
4316 int64_t kv_used = store->db->get_cache_usage();
4317 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
4318 int64_t meta_used = meta_cache->_get_used_bytes();
4319 int64_t data_used = data_cache->_get_used_bytes();
4320
4321 uint64_t cache_size = store->cache_size;
4322 int64_t kv_alloc =
4323 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
4324 int64_t kv_onode_alloc =
4325 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
4326 int64_t meta_alloc =
4327 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
4328 int64_t data_alloc =
4329 static_cast<int64_t>(store->cache_data_ratio * cache_size);
4330
4331 if (pcm != nullptr && binned_kv_cache != nullptr) {
4332 cache_size = pcm->get_tuned_mem();
4333 kv_alloc = binned_kv_cache->get_committed_size();
4334 meta_alloc = meta_cache->get_committed_size();
4335 data_alloc = data_cache->get_committed_size();
4336 if (binned_kv_onode_cache != nullptr) {
4337 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4338 }
4339 }
4340
4341 if (interval_stats) {
4342 dout(5) << __func__ << " cache_size: " << cache_size
4343 << " kv_alloc: " << kv_alloc
4344 << " kv_used: " << kv_used
4345 << " kv_onode_alloc: " << kv_onode_alloc
4346 << " kv_onode_used: " << kv_onode_used
4347 << " meta_alloc: " << meta_alloc
4348 << " meta_used: " << meta_used
4349 << " data_alloc: " << data_alloc
4350 << " data_used: " << data_used << dendl;
4351 } else {
4352 dout(20) << __func__ << " cache_size: " << cache_size
4353 << " kv_alloc: " << kv_alloc
4354 << " kv_used: " << kv_used
4355 << " kv_onode_alloc: " << kv_onode_alloc
4356 << " kv_onode_used: " << kv_onode_used
4357 << " meta_alloc: " << meta_alloc
4358 << " meta_used: " << meta_used
4359 << " data_alloc: " << data_alloc
4360 << " data_used: " << data_used << dendl;
4361 }
4362
4363 uint64_t max_shard_onodes = static_cast<uint64_t>(
4364 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4365 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
4366
4367 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
4368 << " max_shard_buffer: " << max_shard_buffer << dendl;
4369
4370 for (auto i : store->onode_cache_shards) {
4371 i->set_max(max_shard_onodes);
4372 }
4373 for (auto i : store->buffer_cache_shards) {
4374 i->set_max(max_shard_buffer);
4375 }
4376 }
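// Worked example (editorial sketch, arbitrary numbers): with meta_alloc =
// 1 GiB spread over onode_shards = 8 and get_bytes_per_onode() = 4 KiB, each
// onode shard is capped at (1 GiB / 8) / 4 KiB = 32768 onodes; with
// data_alloc = 2 GiB and buffer_shards = 8, each buffer shard is capped at
// 256 MiB of cached data.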
4377
4378 void BlueStore::MempoolThread::_update_cache_settings()
4379 {
4380 // Nothing to do if pcm is not used.
4381 if (pcm == nullptr) {
4382 return;
4383 }
4384
4385 uint64_t target = store->osd_memory_target;
4386 uint64_t base = store->osd_memory_base;
4387 uint64_t min = store->osd_memory_cache_min;
4388 uint64_t max = min;
4389 double fragmentation = store->osd_memory_expected_fragmentation;
4390
4391 uint64_t ltarget = (1.0 - fragmentation) * target;
4392 if (ltarget > base + min) {
4393 max = ltarget - base;
4394 }
4395
4396 // set pcm cache levels
4397 pcm->set_target_memory(target);
4398 pcm->set_min_memory(min);
4399 pcm->set_max_memory(max);
4400
4401 dout(5) << __func__ << " updated pcm target: " << target
4402 << " pcm min: " << min
4403 << " pcm max: " << max
4404 << dendl;
4405 }
4406
4407 // =======================================================
4408
4409 // OmapIteratorImpl
4410
4411 #undef dout_prefix
4412 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4413
4414 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4415 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4416 : c(c), o(o), it(it)
4417 {
4418 std::shared_lock l(c->lock);
4419 if (o->onode.has_omap()) {
4420 o->get_omap_key(string(), &head);
4421 o->get_omap_tail(&tail);
4422 it->lower_bound(head);
4423 }
4424 }
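// Editorial note: `head` is the encoded omap key for the empty user key and
// `tail` is this onode's omap upper bound, so every key belonging to the
// object sorts within [head, tail).  valid() below relies on this, treating
// the iterator as exhausted once raw_key() reaches tail.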
4425
4426 string BlueStore::OmapIteratorImpl::_stringify() const
4427 {
4428 stringstream s;
4429 s << " omap_iterator(cid = " << c->cid
4430 <<", oid = " << o->oid << ")";
4431 return s.str();
4432 }
4433
4434 int BlueStore::OmapIteratorImpl::seek_to_first()
4435 {
4436 std::shared_lock l(c->lock);
4437 auto start1 = mono_clock::now();
4438 if (o->onode.has_omap()) {
4439 it->lower_bound(head);
4440 } else {
4441 it = KeyValueDB::Iterator();
4442 }
4443 c->store->log_latency(
4444 __func__,
4445 l_bluestore_omap_seek_to_first_lat,
4446 mono_clock::now() - start1,
4447 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4448
4449 return 0;
4450 }
4451
4452 int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4453 {
4454 std::shared_lock l(c->lock);
4455 auto start1 = mono_clock::now();
4456 if (o->onode.has_omap()) {
4457 string key;
4458 o->get_omap_key(after, &key);
4459 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4460 << pretty_binary_string(key) << dendl;
4461 it->upper_bound(key);
4462 } else {
4463 it = KeyValueDB::Iterator();
4464 }
4465 c->store->log_latency_fn(
4466 __func__,
4467 l_bluestore_omap_upper_bound_lat,
4468 mono_clock::now() - start1,
4469 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4470 [&] (const ceph::timespan& lat) {
4471 return ", after = " + after +
4472 _stringify();
4473 }
4474 );
4475 return 0;
4476 }
4477
4478 int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4479 {
4480 std::shared_lock l(c->lock);
4481 auto start1 = mono_clock::now();
4482 if (o->onode.has_omap()) {
4483 string key;
4484 o->get_omap_key(to, &key);
4485 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4486 << pretty_binary_string(key) << dendl;
4487 it->lower_bound(key);
4488 } else {
4489 it = KeyValueDB::Iterator();
4490 }
4491 c->store->log_latency_fn(
4492 __func__,
4493 l_bluestore_omap_lower_bound_lat,
4494 mono_clock::now() - start1,
4495 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4496 [&] (const ceph::timespan& lat) {
4497 return ", to = " + to +
4498 _stringify();
4499 }
4500 );
4501 return 0;
4502 }
4503
4504 bool BlueStore::OmapIteratorImpl::valid()
4505 {
4506 std::shared_lock l(c->lock);
4507 bool r = o->onode.has_omap() && it && it->valid() &&
4508 it->raw_key().second < tail;
4509 if (it && it->valid()) {
4510 ldout(c->store->cct,20) << __func__ << " is at "
4511 << pretty_binary_string(it->raw_key().second)
4512 << dendl;
4513 }
4514 return r;
4515 }
4516
4517 int BlueStore::OmapIteratorImpl::next()
4518 {
4519 int r = -1;
4520 std::shared_lock l(c->lock);
4521 auto start1 = mono_clock::now();
4522 if (o->onode.has_omap()) {
4523 it->next();
4524 r = 0;
4525 }
4526 c->store->log_latency(
4527 __func__,
4528 l_bluestore_omap_next_lat,
4529 mono_clock::now() - start1,
4530 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4531
4532 return r;
4533 }
4534
4535 string BlueStore::OmapIteratorImpl::key()
4536 {
4537 std::shared_lock l(c->lock);
4538 ceph_assert(it->valid());
4539 string db_key = it->raw_key().second;
4540 string user_key;
4541 o->decode_omap_key(db_key, &user_key);
4542
4543 return user_key;
4544 }
4545
4546 bufferlist BlueStore::OmapIteratorImpl::value()
4547 {
4548 std::shared_lock l(c->lock);
4549 ceph_assert(it->valid());
4550 return it->value();
4551 }
4552
4553
4554 // =====================================
4555
4556 #undef dout_prefix
4557 #define dout_prefix *_dout << "bluestore(" << path << ") "
4558 #undef dout_context
4559 #define dout_context cct
4560
4561
4562 static void aio_cb(void *priv, void *priv2)
4563 {
4564 BlueStore *store = static_cast<BlueStore*>(priv);
4565 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4566 c->aio_finish(store);
4567 }
4568
4569 static void discard_cb(void *priv, void *priv2)
4570 {
4571 BlueStore *store = static_cast<BlueStore*>(priv);
4572 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4573 store->handle_discard(*tmp);
4574 }
4575
4576 void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4577 {
4578 dout(10) << __func__ << dendl;
4579 ceph_assert(alloc);
4580 alloc->release(to_release);
4581 }
4582
4583 BlueStore::BlueStore(CephContext *cct, const string& path)
4584 : BlueStore(cct, path, 0) {}
4585
4586 BlueStore::BlueStore(CephContext *cct,
4587 const string& path,
4588 uint64_t _min_alloc_size)
4589 : ObjectStore(cct, path),
4590 throttle(cct),
4591 finisher(cct, "commit_finisher", "cfin"),
4592 kv_sync_thread(this),
4593 kv_finalize_thread(this),
4594 #ifdef HAVE_LIBZBD
4595 zoned_cleaner_thread(this),
4596 #endif
4597 min_alloc_size(_min_alloc_size),
4598 min_alloc_size_order(ctz(_min_alloc_size)),
4599 mempool_thread(this)
4600 {
4601 _init_logger();
4602 cct->_conf.add_observer(this);
4603 set_cache_shards(1);
4604 }
4605
4606 BlueStore::~BlueStore()
4607 {
4608 cct->_conf.remove_observer(this);
4609 _shutdown_logger();
4610 ceph_assert(!mounted);
4611 ceph_assert(db == NULL);
4612 ceph_assert(bluefs == NULL);
4613 ceph_assert(fsid_fd < 0);
4614 ceph_assert(path_fd < 0);
4615 for (auto i : onode_cache_shards) {
4616 delete i;
4617 }
4618 for (auto i : buffer_cache_shards) {
4619 delete i;
4620 }
4621 onode_cache_shards.clear();
4622 buffer_cache_shards.clear();
4623 }
4624
4625 const char **BlueStore::get_tracked_conf_keys() const
4626 {
4627 static const char* KEYS[] = {
4628 "bluestore_csum_type",
4629 "bluestore_compression_mode",
4630 "bluestore_compression_algorithm",
4631 "bluestore_compression_min_blob_size",
4632 "bluestore_compression_min_blob_size_ssd",
4633 "bluestore_compression_min_blob_size_hdd",
4634 "bluestore_compression_max_blob_size",
4635 "bluestore_compression_max_blob_size_ssd",
4636 "bluestore_compression_max_blob_size_hdd",
4637 "bluestore_compression_required_ratio",
4638 "bluestore_max_alloc_size",
4639 "bluestore_prefer_deferred_size",
4640 "bluestore_prefer_deferred_size_hdd",
4641 "bluestore_prefer_deferred_size_ssd",
4642 "bluestore_deferred_batch_ops",
4643 "bluestore_deferred_batch_ops_hdd",
4644 "bluestore_deferred_batch_ops_ssd",
4645 "bluestore_throttle_bytes",
4646 "bluestore_throttle_deferred_bytes",
4647 "bluestore_throttle_cost_per_io_hdd",
4648 "bluestore_throttle_cost_per_io_ssd",
4649 "bluestore_throttle_cost_per_io",
4650 "bluestore_max_blob_size",
4651 "bluestore_max_blob_size_ssd",
4652 "bluestore_max_blob_size_hdd",
4653 "osd_memory_target",
4654 "osd_memory_target_cgroup_limit_ratio",
4655 "osd_memory_base",
4656 "osd_memory_cache_min",
4657 "osd_memory_expected_fragmentation",
4658 "bluestore_cache_autotune",
4659 "bluestore_cache_autotune_interval",
4660 "bluestore_cache_age_bin_interval",
4661 "bluestore_cache_age_bins_kv",
4662 "bluestore_cache_age_bins_kv_onode",
4663 "bluestore_cache_age_bins_meta",
4664 "bluestore_cache_age_bins_data",
4665 "bluestore_warn_on_legacy_statfs",
4666 "bluestore_warn_on_no_per_pool_omap",
4667 "bluestore_warn_on_no_per_pg_omap",
4668 "bluestore_max_defer_interval",
4669 NULL
4670 };
4671 return KEYS;
4672 }
4673
4674 void BlueStore::handle_conf_change(const ConfigProxy& conf,
4675 const std::set<std::string> &changed)
4676 {
4677 if (changed.count("bluestore_warn_on_legacy_statfs")) {
4678 _check_legacy_statfs_alert();
4679 }
4680 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4681 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4682 _check_no_per_pg_or_pool_omap_alert();
4683 }
4684
4685 if (changed.count("bluestore_csum_type")) {
4686 _set_csum();
4687 }
4688 if (changed.count("bluestore_compression_mode") ||
4689 changed.count("bluestore_compression_algorithm") ||
4690 changed.count("bluestore_compression_min_blob_size") ||
4691 changed.count("bluestore_compression_max_blob_size")) {
4692 if (bdev) {
4693 _set_compression();
4694 }
4695 }
4696 if (changed.count("bluestore_max_blob_size") ||
4697 changed.count("bluestore_max_blob_size_ssd") ||
4698 changed.count("bluestore_max_blob_size_hdd")) {
4699 if (bdev) {
4700 // only after startup
4701 _set_blob_size();
4702 }
4703 }
4704 if (changed.count("bluestore_prefer_deferred_size") ||
4705 changed.count("bluestore_prefer_deferred_size_hdd") ||
4706 changed.count("bluestore_prefer_deferred_size_ssd") ||
4707 changed.count("bluestore_max_alloc_size") ||
4708 changed.count("bluestore_deferred_batch_ops") ||
4709 changed.count("bluestore_deferred_batch_ops_hdd") ||
4710 changed.count("bluestore_deferred_batch_ops_ssd")) {
4711 if (bdev) {
4712 // only after startup
4713 _set_alloc_sizes();
4714 }
4715 }
4716 if (changed.count("bluestore_throttle_cost_per_io") ||
4717 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4718 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4719 if (bdev) {
4720 _set_throttle_params();
4721 }
4722 }
4723 if (changed.count("bluestore_throttle_bytes") ||
4724 changed.count("bluestore_throttle_deferred_bytes") ||
4725 changed.count("bluestore_throttle_trace_rate")) {
4726 throttle.reset_throttle(conf);
4727 }
4728 if (changed.count("bluestore_max_defer_interval")) {
4729 if (bdev) {
4730 _set_max_defer_interval();
4731 }
4732 }
4733 if (changed.count("osd_memory_target") ||
4734 changed.count("osd_memory_base") ||
4735 changed.count("osd_memory_cache_min") ||
4736 changed.count("osd_memory_expected_fragmentation")) {
4737 _update_osd_memory_options();
4738 }
4739 }
4740
4741 void BlueStore::_set_compression()
4742 {
4743 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4744 if (m) {
4745 _clear_compression_alert();
4746 comp_mode = *m;
4747 } else {
4748 derr << __func__ << " unrecognized value '"
4749 << cct->_conf->bluestore_compression_mode
4750 << "' for bluestore_compression_mode, reverting to 'none'"
4751 << dendl;
4752 comp_mode = Compressor::COMP_NONE;
4753 string s("unknown mode: ");
4754 s += cct->_conf->bluestore_compression_mode;
4755 _set_compression_alert(true, s.c_str());
4756 }
4757
4758 compressor = nullptr;
4759
4760 if (cct->_conf->bluestore_compression_min_blob_size) {
4761 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
4762 } else {
4763 ceph_assert(bdev);
4764 if (_use_rotational_settings()) {
4765 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4766 } else {
4767 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4768 }
4769 }
4770
4771 if (cct->_conf->bluestore_compression_max_blob_size) {
4772 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4773 } else {
4774 ceph_assert(bdev);
4775 if (_use_rotational_settings()) {
4776 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4777 } else {
4778 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4779 }
4780 }
4781
4782 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4783 if (!alg_name.empty()) {
4784 compressor = Compressor::create(cct, alg_name);
4785 if (!compressor) {
4786 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4787 << dendl;
4788 _set_compression_alert(false, alg_name.c_str());
4789 }
4790 }
4791
4792 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4793 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
4794 << " min_blob " << comp_min_blob_size
4795 << " max_blob " << comp_max_blob_size
4796 << dendl;
4797 }
4798
4799 void BlueStore::_set_csum()
4800 {
4801 csum_type = Checksummer::CSUM_NONE;
4802 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4803 if (t > Checksummer::CSUM_NONE)
4804 csum_type = t;
4805
4806 dout(10) << __func__ << " csum_type "
4807 << Checksummer::get_csum_type_string(csum_type)
4808 << dendl;
4809 }
4810
4811 void BlueStore::_set_throttle_params()
4812 {
4813 if (cct->_conf->bluestore_throttle_cost_per_io) {
4814 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4815 } else {
4816 ceph_assert(bdev);
4817 if (_use_rotational_settings()) {
4818 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4819 } else {
4820 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4821 }
4822 }
4823
4824 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4825 << dendl;
4826 }
4827 void BlueStore::_set_blob_size()
4828 {
4829 if (cct->_conf->bluestore_max_blob_size) {
4830 max_blob_size = cct->_conf->bluestore_max_blob_size;
4831 } else {
4832 ceph_assert(bdev);
4833 if (_use_rotational_settings()) {
4834 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4835 } else {
4836 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4837 }
4838 }
4839 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4840 << std::dec << dendl;
4841 }
4842
4843 void BlueStore::_update_osd_memory_options()
4844 {
4845 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4846 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4847 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4848 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4849 config_changed++;
4850 dout(10) << __func__
4851 << " osd_memory_target " << osd_memory_target
4852 << " osd_memory_base " << osd_memory_base
4853 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4854 << " osd_memory_cache_min " << osd_memory_cache_min
4855 << dendl;
4856 }
4857
4858 int BlueStore::_set_cache_sizes()
4859 {
4860 ceph_assert(bdev);
4861 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
4862 cache_autotune_interval =
4863 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4864 cache_age_bin_interval =
4865 cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
4866 auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
4867 {
4868 std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
4869 std::istringstream interval_stream(intervals_str);
4870 std::copy(
4871 std::istream_iterator<uint64_t>(interval_stream),
4872 std::istream_iterator<uint64_t>(),
4873 std::back_inserter(*intervals));
4874 };
4875 _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
4876 _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
4877 _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
4878 _set_bin("bluestore_cache_age_bins_data", &data_bins);
4879
4880 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4881 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4882 osd_memory_expected_fragmentation =
4883 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4884 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4885 osd_memory_cache_resize_interval =
4886 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
4887
4888 if (cct->_conf->bluestore_cache_size) {
4889 cache_size = cct->_conf->bluestore_cache_size;
4890 } else {
4891 // choose global cache size based on backend type
4892 if (_use_rotational_settings()) {
4893 cache_size = cct->_conf->bluestore_cache_size_hdd;
4894 } else {
4895 cache_size = cct->_conf->bluestore_cache_size_ssd;
4896 }
4897 }
4898
4899 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
4900 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
4901 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4902 << ") must be in range [0,1.0]" << dendl;
4903 return -EINVAL;
4904 }
4905
4906 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
4907 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
4908 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
4909 << ") must be in range [0,1.0]" << dendl;
4910 return -EINVAL;
4911 }
4912
4913 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4914 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4915 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4916 << ") must be in range [0,1.0]" << dendl;
4917 return -EINVAL;
4918 }
4919
4920 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4921 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4922 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4923 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4924 << dendl;
4925 return -EINVAL;
4926 }
4927
4928 cache_data_ratio = (double)1.0 -
4929 (double)cache_meta_ratio -
4930 (double)cache_kv_ratio -
4931 (double)cache_kv_onode_ratio;
4932 if (cache_data_ratio < 0) {
4933 // deal with floating point imprecision
4934 cache_data_ratio = 0;
4935 }
4936
4937 dout(1) << __func__ << " cache_size " << cache_size
4938 << " meta " << cache_meta_ratio
4939 << " kv " << cache_kv_ratio
4940 << " data " << cache_data_ratio
4941 << dendl;
4942 return 0;
4943 }
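// Worked example (editorial sketch, illustrative values): with
// cache_meta_ratio = 0.45, cache_kv_ratio = 0.45 and
// cache_kv_onode_ratio = 0.04 the checks above all pass and
// cache_data_ratio = 1.0 - 0.45 - 0.45 - 0.04 = 0.06, i.e. 6% of cache_size is
// left for the data (buffer) cache when the autotuner is not managing the
// cache.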
4944
4945 int BlueStore::write_meta(const std::string& key, const std::string& value)
4946 {
4947 bluestore_bdev_label_t label;
4948 string p = path + "/block";
4949 int r = _read_bdev_label(cct, p, &label);
4950 if (r < 0) {
4951 return ObjectStore::write_meta(key, value);
4952 }
4953 label.meta[key] = value;
4954 r = _write_bdev_label(cct, p, label);
4955 ceph_assert(r == 0);
4956 return ObjectStore::write_meta(key, value);
4957 }
4958
4959 int BlueStore::read_meta(const std::string& key, std::string *value)
4960 {
4961 bluestore_bdev_label_t label;
4962 string p = path + "/block";
4963 int r = _read_bdev_label(cct, p, &label);
4964 if (r < 0) {
4965 return ObjectStore::read_meta(key, value);
4966 }
4967 auto i = label.meta.find(key);
4968 if (i == label.meta.end()) {
4969 return ObjectStore::read_meta(key, value);
4970 }
4971 *value = i->second;
4972 return 0;
4973 }
4974
4975 void BlueStore::_init_logger()
4976 {
4977 PerfCountersBuilder b(cct, "bluestore",
4978 l_bluestore_first, l_bluestore_last);
4979
4980 // space utilization stats
4981 //****************************************
4982 b.add_u64(l_bluestore_allocated, "allocated",
4983 "Sum for allocated bytes",
4984 "al_b",
4985 PerfCountersBuilder::PRIO_CRITICAL,
4986 unit_t(UNIT_BYTES));
4987 b.add_u64(l_bluestore_stored, "stored",
4988 "Sum for stored bytes",
4989 "st_b",
4990 PerfCountersBuilder::PRIO_CRITICAL,
4991 unit_t(UNIT_BYTES));
4992 b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
4993 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4994 b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
4995 "allocation unit size in bytes",
4996 "au_b",
4997 PerfCountersBuilder::PRIO_CRITICAL,
4998 unit_t(UNIT_BYTES));
4999 //****************************************
5000
5001 // Update op processing state latencies
5002 //****************************************
5003 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
5004 "Average prepare state latency",
5005 "sprl", PerfCountersBuilder::PRIO_USEFUL);
5006 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
5007 "Average aio_wait state latency",
5008 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
5009 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
5010 "Average io_done state latency",
5011 "sidl", PerfCountersBuilder::PRIO_USEFUL);
5012 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
5013 "Average kv_queued state latency",
5014 "skql", PerfCountersBuilder::PRIO_USEFUL);
5015 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
5016 "Average kv_committing state latency",
5017 "skcl", PerfCountersBuilder::PRIO_USEFUL);
5018 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
5019 "Average kv_done state latency",
5020 "skdl", PerfCountersBuilder::PRIO_USEFUL);
5021 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
5022 "Average finishing state latency",
5023 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
5024 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
5025 "Average done state latency",
5026 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
5027 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
5028 "Average deferred_queued state latency",
5029 "sdql", PerfCountersBuilder::PRIO_USEFUL);
5030 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
5031 "Average deferred_aio_wait state latency",
5032 "sdal", PerfCountersBuilder::PRIO_USEFUL);
5033 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
5034 "Average cleanup state latency",
5035 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
5036 //****************************************
5037
5038 // Update Transaction stats
5039 //****************************************
5040 b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
5041 "Average submit throttle latency",
5042 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
5043 b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
5044 "Average submit latency",
5045 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
5046 b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
5047 "Average commit latency",
5048 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
5049 b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
5050 //****************************************
5051
5052 // Read op stats
5053 //****************************************
5054 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
5055 "Average read onode metadata latency",
5056 "roml", PerfCountersBuilder::PRIO_USEFUL);
5057 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
5058 "Average read I/O waiting latency",
5059 "rwal", PerfCountersBuilder::PRIO_USEFUL);
5060 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
5061 "Average checksum latency",
5062 "csml", PerfCountersBuilder::PRIO_USEFUL);
5063 b.add_u64_counter(l_bluestore_read_eio, "read_eio",
5064 "Read EIO errors propagated to high level callers");
5065 b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
5066 "Read operations that required at least one retry due to failed checksum validation",
5067 "rd_r", PerfCountersBuilder::PRIO_USEFUL);
5068 b.add_time_avg(l_bluestore_read_lat, "read_lat",
5069 "Average read latency",
5070 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
5071 //****************************************
5072
5073 // kv_thread latencies
5074 //****************************************
5075 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
5076 "Average kv_thread flush latency",
5077 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
5078 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
5079 "Average kv_thread commit latency",
5080 "kcol", PerfCountersBuilder::PRIO_USEFUL);
5081 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
5082 "Average kv_sync thread latency",
5083 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
5084 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
5085 "Average kv_finalize thread latency",
5086 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
5087 //****************************************
5088
5089 // write op stats
5090 //****************************************
5091 b.add_u64_counter(l_bluestore_write_big, "write_big",
5092 "Large aligned writes into fresh blobs");
5093 b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
5094 "Large aligned writes into fresh blobs (bytes)",
5095 NULL,
5096 PerfCountersBuilder::PRIO_DEBUGONLY,
5097 unit_t(UNIT_BYTES));
5098 b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
5099 "Large aligned writes into fresh blobs (blobs)");
5100 b.add_u64_counter(l_bluestore_write_big_deferred,
5101 "write_big_deferred",
5102 "Big overwrites using deferred");
5103
5104 b.add_u64_counter(l_bluestore_write_small, "write_small",
5105 "Small writes into existing or sparse small blobs");
5106 b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
5107 "Small writes into existing or sparse small blobs (bytes)",
5108 NULL,
5109 PerfCountersBuilder::PRIO_DEBUGONLY,
5110 unit_t(UNIT_BYTES));
5111 b.add_u64_counter(l_bluestore_write_small_unused,
5112 "write_small_unused",
5113 "Small writes into unused portion of existing blob");
5114 b.add_u64_counter(l_bluestore_write_small_pre_read,
5115 "write_small_pre_read",
5116 "Small writes that required we read some data (possibly "
5117 "cached) to fill out the block");
5118
5119 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
5120 "Sum for write-op padded bytes",
5121 NULL,
5122 PerfCountersBuilder::PRIO_DEBUGONLY,
5123 unit_t(UNIT_BYTES));
5124 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
5125 "Sum for write penalty read ops");
5126 b.add_u64_counter(l_bluestore_write_new, "write_new",
5127 "Write into new blob");
5128
5129 b.add_u64_counter(l_bluestore_issued_deferred_writes,
5130 "issued_deferred_writes",
5131 "Total deferred writes issued");
5132 b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
5133 "issued_deferred_write_bytes",
5134 "Total bytes in issued deferred writes",
5135 NULL,
5136 PerfCountersBuilder::PRIO_DEBUGONLY,
5137 unit_t(UNIT_BYTES));
5138 b.add_u64_counter(l_bluestore_submitted_deferred_writes,
5139 "submitted_deferred_writes",
5140 "Total deferred writes submitted to disk");
5141 b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
5142 "submitted_deferred_write_bytes",
5143 "Total bytes submitted to disk by deferred writes",
5144 NULL,
5145 PerfCountersBuilder::PRIO_DEBUGONLY,
5146 unit_t(UNIT_BYTES));
5147
5148 b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
5149 "write_big_skipped_blobs",
5150 "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
5151 b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
5152 "write_big_skipped_bytes",
5153 "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
5154 b.add_u64_counter(l_bluestore_write_small_skipped,
5155 "write_small_skipped",
5156 "Small writes into existing or sparse small blobs skipped due to zero detection");
5157 b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
5158 "write_small_skipped_bytes",
5159 "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
5160 //****************************************
5161
5162 // compressions stats
5163 //****************************************
5164 b.add_u64(l_bluestore_compressed, "compressed",
5165 "Sum for stored compressed bytes",
5166 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5167 b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
5168 "Sum for bytes allocated for compressed data",
5169 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5170 b.add_u64(l_bluestore_compressed_original, "compressed_original",
5171 "Sum for original bytes that were compressed",
5172 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5173 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
5174 "Average compress latency",
5175 "_cpl", PerfCountersBuilder::PRIO_USEFUL);
5176 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
5177 "Average decompress latency",
5178 "dcpl", PerfCountersBuilder::PRIO_USEFUL);
5179 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
5180 "Sum for beneficial compress ops");
5181 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
5182 "Sum for compress ops rejected due to low net gain of space");
5183 //****************************************
5184
5185 // onode cache stats
5186 //****************************************
5187 b.add_u64(l_bluestore_onodes, "onodes",
5188 "Number of onodes in cache");
5189 b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
5190 "Number of pinned onodes in cache");
5191 b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
5192 "Count of onode cache lookup hits",
5193 "o_ht", PerfCountersBuilder::PRIO_USEFUL);
5194 b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
5195 "Count of onode cache lookup misses",
5196 "o_ms", PerfCountersBuilder::PRIO_USEFUL);
5197 b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
5198 "Count of onode shard cache lookups hits");
5199 b.add_u64_counter(l_bluestore_onode_shard_misses,
5200 "onode_shard_misses",
5201 "Count of onode shard cache lookups misses");
5202 b.add_u64(l_bluestore_extents, "onode_extents",
5203 "Number of extents in cache");
5204 b.add_u64(l_bluestore_blobs, "onode_blobs",
5205 "Number of blobs in cache");
5206 //****************************************
5207
5208 // buffer cache stats
5209 //****************************************
5210 b.add_u64(l_bluestore_buffers, "buffers",
5211 "Number of buffers in cache");
5212 b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
5213 "Number of buffer bytes in cache",
5214 NULL,
5215 PerfCountersBuilder::PRIO_DEBUGONLY,
5216 unit_t(UNIT_BYTES));
5217 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
5218 "Sum for bytes of read hit in the cache",
5219 NULL,
5220 PerfCountersBuilder::PRIO_DEBUGONLY,
5221 unit_t(UNIT_BYTES));
5222 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
5223 "Sum for bytes of read missed in the cache",
5224 NULL,
5225 PerfCountersBuilder::PRIO_DEBUGONLY,
5226 unit_t(UNIT_BYTES));
5227 //****************************************
5228
5229 // internal stats
5230 //****************************************
5231 b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
5232 "Onode extent map reshard events");
5233 b.add_u64_counter(l_bluestore_blob_split, "blob_split",
5234 "Sum for blob splitting due to resharding");
5235 b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
5236 "Sum for extents that have been removed due to compression");
5237 b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
5238 "Sum for extents that have been merged due to garbage "
5239 "collection");
5240 //****************************************
5241
5242 // other client ops latencies
5243 //****************************************
5244 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
5245 "Average omap iterator seek_to_first call latency",
5246 "osfl", PerfCountersBuilder::PRIO_USEFUL);
5247 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
5248 "Average omap iterator upper_bound call latency",
5249 "oubl", PerfCountersBuilder::PRIO_USEFUL);
5250 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
5251 "Average omap iterator lower_bound call latency",
5252 "olbl", PerfCountersBuilder::PRIO_USEFUL);
5253 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
5254 "Average omap iterator next call latency",
5255 "onxl", PerfCountersBuilder::PRIO_USEFUL);
5256 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
5257 "Average omap get_keys call latency",
5258 "ogkl", PerfCountersBuilder::PRIO_USEFUL);
5259 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
5260 "Average omap get_values call latency",
5261 "ogvl", PerfCountersBuilder::PRIO_USEFUL);
5262 b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
5263 "Average omap clear call latency");
5264 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
5265 "Average collection listing latency",
5266 "cl_l", PerfCountersBuilder::PRIO_USEFUL);
5267 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
5268 "Average removal latency",
5269 "rm_l", PerfCountersBuilder::PRIO_USEFUL);
5270 b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
5271 "Average truncate latency",
5272 "tr_l", PerfCountersBuilder::PRIO_USEFUL);
5273 //****************************************
5274
5275 // Resulting size axis configuration for op histograms, values are in bytes
5276 PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
5277 "Given size (bytes)",
5278 PerfHistogramCommon::SCALE_LOG2, ///< Given size in logarithmic scale
5279 0, ///< Start at 0
5280 4096, ///< Quantization unit
5281 13, ///< Enough to cover 4+M requests
5282 };
5283 // Req size axis configuration for op histograms, values are in bytes
5284 PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
5285 "Request size (bytes)",
5286 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
5287 0, ///< Start at 0
5288 4096, ///< Quantization unit
5289 13, ///< Enough to cover 4+M requests
5290 };
5291 b.add_u64_counter_histogram(
5292 l_bluestore_allocate_hist, "allocate_histogram",
5293 alloc_hist_x_axis_config, alloc_hist_y_axis_config,
5294 "Histogram of requested block allocations vs. given ones");
5295
5296 logger = b.create_perf_counters();
5297 cct->get_perfcounters_collection()->add(logger);
5298 }
5299
5300 int BlueStore::_reload_logger()
5301 {
5302 struct store_statfs_t store_statfs;
5303 int r = statfs(&store_statfs);
5304 if (r >= 0) {
5305 logger->set(l_bluestore_allocated, store_statfs.allocated);
5306 logger->set(l_bluestore_stored, store_statfs.data_stored);
5307 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5308 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5309 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
5310 }
5311 return r;
5312 }
5313
5314 void BlueStore::_shutdown_logger()
5315 {
5316 cct->get_perfcounters_collection()->remove(logger);
5317 delete logger;
5318 }
5319
5320 int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5321 uuid_d *fsid)
5322 {
5323 bluestore_bdev_label_t label;
5324 int r = _read_bdev_label(cct, path, &label);
5325 if (r < 0)
5326 return r;
5327 *fsid = label.osd_uuid;
5328 return 0;
5329 }
5330
5331 int BlueStore::_open_path()
5332 {
5333 // sanity check(s)
5334 ceph_assert(path_fd < 0);
5335 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
5336 if (path_fd < 0) {
5337 int r = -errno;
5338 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5339 << dendl;
5340 return r;
5341 }
5342 return 0;
5343 }
5344
5345 void BlueStore::_close_path()
5346 {
5347 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5348 path_fd = -1;
5349 }
5350
5351 int BlueStore::_write_bdev_label(CephContext *cct,
5352 const string &path, bluestore_bdev_label_t label)
5353 {
5354 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5355 bufferlist bl;
5356 encode(label, bl);
5357 uint32_t crc = bl.crc32c(-1);
5358 encode(crc, bl);
5359 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
5360 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5361 z.zero();
5362 bl.append(std::move(z));
5363
5364 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
5365 if (fd < 0) {
5366 fd = -errno;
5367 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5368 << dendl;
5369 return fd;
5370 }
5371 int r = bl.write_fd(fd);
5372 if (r < 0) {
5373 derr << __func__ << " failed to write to " << path
5374 << ": " << cpp_strerror(r) << dendl;
5375 goto out;
5376 }
5377 r = ::fsync(fd);
5378 if (r < 0) {
5379 derr << __func__ << " failed to fsync " << path
5380 << ": " << cpp_strerror(r) << dendl;
5381 }
5382 out:
5383 VOID_TEMP_FAILURE_RETRY(::close(fd));
5384 return r;
5385 }
5386
5387 int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
5388 bluestore_bdev_label_t *label)
5389 {
5390 dout(10) << __func__ << dendl;
5391 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5392 if (fd < 0) {
5393 fd = -errno;
5394 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5395 << dendl;
5396 return fd;
5397 }
5398 bufferlist bl;
5399 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5400 VOID_TEMP_FAILURE_RETRY(::close(fd));
5401 if (r < 0) {
5402 derr << __func__ << " failed to read from " << path
5403 << ": " << cpp_strerror(r) << dendl;
5404 return r;
5405 }
5406
5407 uint32_t crc, expected_crc;
5408 auto p = bl.cbegin();
5409 try {
5410 decode(*label, p);
5411 bufferlist t;
5412 t.substr_of(bl, 0, p.get_off());
5413 crc = t.crc32c(-1);
5414 decode(expected_crc, p);
5415 }
5416 catch (ceph::buffer::error& e) {
5417 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
5418 << ": " << e.what()
5419 << dendl;
5420 return -ENOENT;
5421 }
5422 if (crc != expected_crc) {
5423 derr << __func__ << " bad crc on label, expected " << expected_crc
5424 << " != actual " << crc << dendl;
5425 return -EIO;
5426 }
5427 dout(10) << __func__ << " got " << *label << dendl;
5428 return 0;
5429 }
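// Illustrative layout (editorial sketch) of the label block handled by the two
// helpers above, as implied by their encode/decode order:
//
//   byte 0 .............................. BDEV_LABEL_BLOCK_SIZE - 1
//   [ encoded bluestore_bdev_label_t ][ crc32c of those bytes (u32) ][ zero pad ]
//
// The crc covers exactly the bytes that precede it, which is why
// _read_bdev_label() re-extracts that prefix with substr_of() before comparing
// the stored and computed checksums.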
5430
5431 int BlueStore::_check_or_set_bdev_label(
5432 string path, uint64_t size, string desc, bool create)
5433 {
5434 bluestore_bdev_label_t label;
5435 if (create) {
5436 label.osd_uuid = fsid;
5437 label.size = size;
5438 label.btime = ceph_clock_now();
5439 label.description = desc;
5440 int r = _write_bdev_label(cct, path, label);
5441 if (r < 0)
5442 return r;
5443 } else {
5444 int r = _read_bdev_label(cct, path, &label);
5445 if (r < 0)
5446 return r;
5447 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5448 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5449 << " and fsid " << fsid << " check bypassed" << dendl;
5450 } else if (label.osd_uuid != fsid) {
5451 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5452 << " does not match our fsid " << fsid << dendl;
5453 return -EIO;
5454 }
5455 }
5456 return 0;
5457 }
5458
5459 void BlueStore::_set_alloc_sizes(void)
5460 {
5461 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5462
5463 #ifdef HAVE_LIBZBD
5464 ceph_assert(bdev);
5465 if (bdev->is_smr()) {
5466 prefer_deferred_size = 0;
5467 } else
5468 #endif
5469 if (cct->_conf->bluestore_prefer_deferred_size) {
5470 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5471 } else {
5472 if (_use_rotational_settings()) {
5473 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5474 } else {
5475 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5476 }
5477 }
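// roughly speaking, prefer_deferred_size is the threshold below which small
// writes take the deferred path (staged with the kv transaction and replayed to
// the device later) instead of being written directly; SMR devices force it to 0
// above, i.e. no deferred writes.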
5478
5479 if (cct->_conf->bluestore_deferred_batch_ops) {
5480 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5481 } else {
5482 if (_use_rotational_settings()) {
5483 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5484 } else {
5485 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5486 }
5487 }
5488
5489 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5490 << std::dec << " order " << (int)min_alloc_size_order
5491 << " max_alloc_size 0x" << std::hex << max_alloc_size
5492 << " prefer_deferred_size 0x" << prefer_deferred_size
5493 << std::dec
5494 << " deferred_batch_ops " << deferred_batch_ops
5495 << dendl;
5496 }
5497
5498 int BlueStore::_open_bdev(bool create)
5499 {
5500 ceph_assert(bdev == NULL);
5501 string p = path + "/block";
5502 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5503 int r = bdev->open(p);
5504 if (r < 0)
5505 goto fail;
5506
5507 if (create && cct->_conf->bdev_enable_discard) {
5508 bdev->discard(0, bdev->get_size());
5509 }
5510
5511 if (bdev->supported_bdev_label()) {
5512 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5513 if (r < 0)
5514 goto fail_close;
5515 }
5516
5517 // initialize global block parameters
5518 block_size = bdev->get_block_size();
5519 block_mask = ~(block_size - 1);
5520 block_size_order = ctz(block_size);
5521 ceph_assert(block_size == 1u << block_size_order);
5522 _set_max_defer_interval();
5523 // and set cache_size based on device type
5524 r = _set_cache_sizes();
5525 if (r < 0) {
5526 goto fail_close;
5527 }
5528 // get block dev optimal io size
5529 optimal_io_size = bdev->get_optimal_io_size();
5530
5531 return 0;
5532
5533 fail_close:
5534 bdev->close();
5535 fail:
5536 delete bdev;
5537 bdev = NULL;
5538 return r;
5539 }
5540
5541 void BlueStore::_validate_bdev()
5542 {
5543 ceph_assert(bdev);
5544 uint64_t dev_size = bdev->get_size();
5545 ceph_assert(dev_size > _get_ondisk_reserved());
5546 }
5547
5548 void BlueStore::_close_bdev()
5549 {
5550 ceph_assert(bdev);
5551 bdev->close();
5552 delete bdev;
5553 bdev = NULL;
5554 }
5555
5556 int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_restore)
5557 {
5558 int r;
5559
5560 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
5561 ceph_assert(fm == NULL);
5562 // fm_restore means we are transitioning from null-fm to bitmap-fm
5563 ceph_assert(!fm_restore || (freelist_type != "null"));
5564 // fm restore must pass in a valid transaction
5565 ceph_assert(!fm_restore || (t != nullptr));
5566
5567 // When allocation-info is stored in a single file we set freelist_type to "null"
5568 bool set_null_freemap = false;
5569 if (freelist_type == "null") {
5570 // use BitmapFreelistManager with the null option to stop allocations from going to RocksDB
5571 // we will store the allocation info in a single file during umount()
5572 freelist_type = "bitmap";
5573 set_null_freemap = true;
5574 }
5575 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5576 ceph_assert(fm);
5577 if (set_null_freemap) {
5578 fm->set_null_manager();
5579 }
5580 if (t) {
5581 // create mode. initialize freespace
5582 dout(20) << __func__ << " initializing freespace" << dendl;
5583 {
5584 bufferlist bl;
5585 bl.append(freelist_type);
5586 t->set(PREFIX_SUPER, "freelist_type", bl);
5587 }
5588 // being able to allocate in units less than bdev block size
5589 // seems to be a bad idea.
5590 ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
5591
5592 uint64_t alloc_size = min_alloc_size;
5593 #ifdef HAVE_LIBZBD
5594 if (bdev->is_smr()) {
5595 if (freelist_type != "zoned") {
5596 derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
5597 << dendl;
5598 return -EINVAL;
5599 }
5600 } else
5601 #endif
5602 if (freelist_type == "zoned") {
5603 derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
5604 << dendl;
5605 return -EINVAL;
5606 }
5607
5608 fm->create(bdev->get_size(), alloc_size,
5609 zone_size, first_sequential_zone,
5610 t);
5611
5612 // allocate superblock reserved space. note that we do not mark
5613 // bluefs space as allocated in the freelist; we instead rely on
5614 // bluefs doing that itself.
5615 auto reserved = _get_ondisk_reserved();
5616 if (fm_restore) {
5617 // we need to allocate the full space in restore case
5618 // as later we will add free-space marked in the allocator file
5619 fm->allocate(0, bdev->get_size(), t);
5620 } else {
5621 // allocate superblock reserved space. note that we do not mark
5622 // bluefs space as allocated in the freelist; we instead rely on
5623 // bluefs doing that itself.
5624 fm->allocate(0, reserved, t);
5625 }
5626 // debug code - not needed for NULL FM
5627 if (cct->_conf->bluestore_debug_prefill > 0) {
5628 uint64_t end = bdev->get_size() - reserved;
5629 dout(1) << __func__ << " pre-fragmenting freespace, using "
5630 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5631 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5632 uint64_t start = p2roundup(reserved, min_alloc_size);
5633 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5634 float r = cct->_conf->bluestore_debug_prefill;
5635 r /= 1.0 - r;
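// r is converted from the desired used fraction p into a used:free ratio
// p / (1 - p): for each free extent of length l we mark roughly u = r * l as
// used, so the used fraction of every (l, u) pair is u / (l + u) = p.
// e.g. bluestore_debug_prefill = 0.2 gives r = 0.25, i.e. one allocated unit
// for every four left free.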
5636 bool stop = false;
5637
5638 while (!stop && start < end) {
5639 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5640 if (start + l > end) {
5641 l = end - start;
5642 l = p2align(l, min_alloc_size);
5643 }
5644 ceph_assert(start + l <= end);
5645
5646 uint64_t u = 1 + (uint64_t)(r * (double)l);
5647 u = p2roundup(u, min_alloc_size);
5648 if (start + l + u > end) {
5649 u = end - (start + l);
5650 // trim to align so we don't overflow again
5651 u = p2align(u, min_alloc_size);
5652 stop = true;
5653 }
5654 ceph_assert(start + l + u <= end);
5655
5656 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5657 << " use 0x" << u << std::dec << dendl;
5658
5659 if (u == 0) {
5660 // break if u has been trimmed to nothing
5661 break;
5662 }
5663
5664 fm->allocate(start + l, u, t);
5665 start += l + u;
5666 }
5667 }
5668 r = _write_out_fm_meta(0);
5669 ceph_assert(r == 0);
5670 } else {
5671 r = fm->init(db, read_only,
5672 [&](const std::string& key, std::string* result) {
5673 return read_meta(key, result);
5674 });
5675 if (r < 0) {
5676 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5677 delete fm;
5678 fm = NULL;
5679 return r;
5680 }
5681 }
5682 // if the space size tracked by the freelist manager is higher than the actual
5683 // dev size one can hit out-of-space allocations which will result
5684 // in data loss and/or assertions.
5685 // Probably the user altered the device size somehow.
5686 // The only fix for now is to redeploy the OSD.
5687 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5688 ostringstream ss;
5689 ss << "slow device size mismatch detected, "
5690 << " fm size(" << fm->get_size()
5691 << ") > slow device size(" << bdev->get_size()
5692 << "), Please stop using this OSD as it might cause data loss.";
5693 _set_disk_size_mismatch_alert(ss.str());
5694 }
5695 return 0;
5696 }
5697
5698 void BlueStore::_close_fm()
5699 {
5700 dout(10) << __func__ << dendl;
5701 ceph_assert(fm);
5702 fm->shutdown();
5703 delete fm;
5704 fm = NULL;
5705 }
5706
5707 int BlueStore::_write_out_fm_meta(uint64_t target_size)
5708 {
5709 int r = 0;
5710 string p = path + "/block";
5711
5712 std::vector<std::pair<string, string>> fm_meta;
5713 fm->get_meta(target_size, &fm_meta);
5714
5715 for (auto& m : fm_meta) {
5716 r = write_meta(m.first, m.second);
5717 ceph_assert(r == 0);
5718 }
5719 return r;
5720 }
5721
5722 int BlueStore::_create_alloc()
5723 {
5724 ceph_assert(alloc == NULL);
5725 ceph_assert(shared_alloc.a == NULL);
5726 ceph_assert(bdev->get_size());
5727
5728 uint64_t alloc_size = min_alloc_size;
5729
5730 std::string allocator_type = cct->_conf->bluestore_allocator;
5731
5732 #ifdef HAVE_LIBZBD
5733 if (freelist_type == "zoned") {
5734 allocator_type = "zoned";
5735 }
5736 #endif
5737
5738 alloc = Allocator::create(
5739 cct, allocator_type,
5740 bdev->get_size(),
5741 alloc_size,
5742 zone_size,
5743 first_sequential_zone,
5744 "block");
5745 if (!alloc) {
5746 lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
5747 << dendl;
5748 return -EINVAL;
5749 }
5750
5751 #ifdef HAVE_LIBZBD
5752 if (freelist_type == "zoned") {
5753 Allocator *a = Allocator::create(
5754 cct, cct->_conf->bluestore_allocator,
5755 bdev->get_conventional_region_size(),
5756 alloc_size,
5757 0, 0,
5758 "zoned_block");
5759 if (!a) {
5760 lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
5761 << " allocator" << dendl;
5762 delete alloc;
5763 return -EINVAL;
5764 }
5765 shared_alloc.set(a);
5766 } else
5767 #endif
5768 {
5769 // BlueFS will share the same allocator
5770 shared_alloc.set(alloc);
5771 }
5772
5773 return 0;
5774 }
5775
5776 int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
5777 {
5778 int r = _create_alloc();
5779 if (r < 0) {
5780 return r;
5781 }
5782 ceph_assert(alloc != NULL);
5783
5784 #ifdef HAVE_LIBZBD
5785 if (bdev->is_smr()) {
5786 auto a = dynamic_cast<ZonedAllocator*>(alloc);
5787 ceph_assert(a);
5788 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5789 ceph_assert(f);
5790 vector<uint64_t> wp = bdev->get_zones();
5791 vector<zone_state_t> zones = f->get_zone_states(db);
5792 ceph_assert(wp.size() == zones.size());
5793
5794 // reconcile zone state
5795 auto num_zones = bdev->get_size() / zone_size;
5796 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
5797 ceph_assert(wp[i] >= i * zone_size);
5798 ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
5799 uint64_t p = wp[i] - i * zone_size;
5800 if (zones[i].write_pointer > p) {
5801 derr << __func__ << " zone 0x" << std::hex << i
5802 << " bluestore write pointer 0x" << zones[i].write_pointer
5803 << " > device write pointer 0x" << p
5804 << std::dec << " -- VERY SUSPICIOUS!" << dendl;
5805 } else if (zones[i].write_pointer < p) {
5806 // this is "normal" in that it can happen after any crash (if we have a
5807 // write in flight but did not manage to commit the transaction)
5808 auto delta = p - zones[i].write_pointer;
5809 dout(1) << __func__ << " zone 0x" << std::hex << i
5810 << " device write pointer 0x" << p
5811 << " > bluestore pointer 0x" << zones[i].write_pointer
5812 << ", advancing 0x" << delta << std::dec << dendl;
5813 (*zone_adjustments)[zones[i].write_pointer] = delta;
5814 zones[i].num_dead_bytes += delta;
5815 zones[i].write_pointer = p;
5816 }
5817 }
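// e.g. (illustrative numbers) with zone_size 0x10000000, zone i = 5 and a device
// write pointer wp[i] = 5 * 0x10000000 + 0x8000, the zone-relative pointer is
// p = 0x8000; if bluestore last recorded write_pointer 0x6000, the 0x2000 gap is
// counted as dead bytes and stashed in zone_adjustments so _post_init_alloc() can
// persist it via the zoned freelist manager once the DB is writable.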
5818
5819 // start with conventional zone "free" (bluefs may adjust this when it starts up)
5820 auto reserved = _get_ondisk_reserved();
5821 // for now we require a conventional zone
5822 ceph_assert(bdev->get_conventional_region_size());
5823 ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
5824 shared_alloc.a->init_add_free(
5825 reserved,
5826 p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
5827
5828 // init sequential zone based on the device's write pointers
5829 a->init_from_zone_pointers(std::move(zones));
5830 dout(1) << __func__
5831 << " loaded zone pointers: "
5832 << std::hex
5833 << ", allocator type " << alloc->get_type()
5834 << ", capacity 0x" << alloc->get_capacity()
5835 << ", block size 0x" << alloc->get_block_size()
5836 << ", free 0x" << alloc->get_free()
5837 << ", fragmentation " << alloc->get_fragmentation()
5838 << std::dec << dendl;
5839
5840 return 0;
5841 }
5842 #endif
5843
5844 uint64_t num = 0, bytes = 0;
5845 utime_t start_time = ceph_clock_now();
5846 if (!fm->is_null_manager()) {
5847 // This is the original path - loading the allocation map from RocksDB and feeding it into the allocator
5848 dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
5849 // initialize from freelist
5850 fm->enumerate_reset();
5851 uint64_t offset, length;
5852 while (fm->enumerate_next(db, &offset, &length)) {
5853 alloc->init_add_free(offset, length);
5854 ++num;
5855 bytes += length;
5856 }
5857 fm->enumerate_reset();
5858
5859 utime_t duration = ceph_clock_now() - start_time;
5860 dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
5861 alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
5862 } else {
5863 // This is the new path - reading the allocation map from a flat bluefs file and feeding it into the allocator
5864
5865 if (!cct->_conf->bluestore_allocation_from_file) {
5866 derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
5867 derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
5868 return -ENOTSUP; // Operation not supported
5869 }
5870
5871 if (restore_allocator(alloc, &num, &bytes) == 0) {
5872 dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
5873 } else {
5874 // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
5875 dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
5876 // if failed must recover from on-disk ONode internal state
5877 if (read_allocation_from_drive_on_startup() != 0) {
5878 derr << __func__ << "::NCB::Failed Recovery" << dendl;
5879 derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
5880 derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
5881 return -ENOTRECOVERABLE;
5882 }
5883 }
5884 }
5885 dout(1) << __func__
5886 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5887 << std::hex
5888 << ", allocator type " << alloc->get_type()
5889 << ", capacity 0x" << alloc->get_capacity()
5890 << ", block size 0x" << alloc->get_block_size()
5891 << ", free 0x" << alloc->get_free()
5892 << ", fragmentation " << alloc->get_fragmentation()
5893 << std::dec << dendl;
5894
5895 return 0;
5896 }
5897
5898 void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
5899 {
5900 #ifdef HAVE_LIBZBD
5901 assert(bdev->is_smr());
5902 dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
5903 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5904 ceph_assert(f);
5905 KeyValueDB::Transaction t = db->get_transaction();
5906 for (auto& i : zone_adjustments) {
5907 // allocate AND release since this gap is now dead space
5908 // note that the offset is imprecise, but we only need it to select the zone
5909 f->allocate(i.first, i.second, t);
5910 f->release(i.first, i.second, t);
5911 }
5912 int r = db->submit_transaction_sync(t);
5913 ceph_assert(r == 0);
5914 #endif
5915 }
5916
5917 void BlueStore::_close_alloc()
5918 {
5919 ceph_assert(bdev);
5920 bdev->discard_drain();
5921
5922 ceph_assert(alloc);
5923 alloc->shutdown();
5924 delete alloc;
5925
5926 ceph_assert(shared_alloc.a);
5927 if (alloc != shared_alloc.a) {
5928 shared_alloc.a->shutdown();
5929 delete shared_alloc.a;
5930 }
5931
5932 shared_alloc.reset();
5933 alloc = nullptr;
5934 }
5935
5936 int BlueStore::_open_fsid(bool create)
5937 {
5938 ceph_assert(fsid_fd < 0);
5939 int flags = O_RDWR|O_CLOEXEC;
5940 if (create)
5941 flags |= O_CREAT;
5942 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5943 if (fsid_fd < 0) {
5944 int err = -errno;
5945 derr << __func__ << " " << cpp_strerror(err) << dendl;
5946 return err;
5947 }
5948 return 0;
5949 }
5950
5951 int BlueStore::_read_fsid(uuid_d *uuid)
5952 {
5953 char fsid_str[40];
5954 memset(fsid_str, 0, sizeof(fsid_str));
5955 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5956 if (ret < 0) {
5957 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5958 return ret;
5959 }
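// a canonical uuid string is 36 characters; cap the terminator there so a
// trailing newline (written by _write_fsid) doesn't break parsing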
5960 if (ret > 36)
5961 fsid_str[36] = 0;
5962 else
5963 fsid_str[ret] = 0;
5964 if (!uuid->parse(fsid_str)) {
5965 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5966 return -EINVAL;
5967 }
5968 return 0;
5969 }
5970
5971 int BlueStore::_write_fsid()
5972 {
5973 int r = ::ftruncate(fsid_fd, 0);
5974 if (r < 0) {
5975 r = -errno;
5976 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5977 return r;
5978 }
5979 string str = stringify(fsid) + "\n";
5980 r = safe_write(fsid_fd, str.c_str(), str.length());
5981 if (r < 0) {
5982 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5983 return r;
5984 }
5985 r = ::fsync(fsid_fd);
5986 if (r < 0) {
5987 r = -errno;
5988 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5989 return r;
5990 }
5991 return 0;
5992 }
5993
5994 void BlueStore::_close_fsid()
5995 {
5996 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5997 fsid_fd = -1;
5998 }
5999
6000 int BlueStore::_lock_fsid()
6001 {
6002 struct flock l;
6003 memset(&l, 0, sizeof(l));
6004 l.l_type = F_WRLCK;
6005 l.l_whence = SEEK_SET;
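// l_start and l_len stay 0 from the memset, so this asks for an advisory write
// lock over the whole fsid file; if another ceph-osd already holds it, F_SETLK
// fails immediately (typically EAGAIN or EACCES) rather than blocking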
6006 int r = ::fcntl(fsid_fd, F_SETLK, &l);
6007 if (r < 0) {
6008 int err = errno;
6009 derr << __func__ << " failed to lock " << path << "/fsid"
6010 << " (is another ceph-osd still running?)"
6011 << cpp_strerror(err) << dendl;
6012 return -err;
6013 }
6014 return 0;
6015 }
6016
6017 bool BlueStore::is_rotational()
6018 {
6019 if (bdev) {
6020 return bdev->is_rotational();
6021 }
6022
6023 bool rotational = true;
6024 int r = _open_path();
6025 if (r < 0)
6026 goto out;
6027 r = _open_fsid(false);
6028 if (r < 0)
6029 goto out_path;
6030 r = _read_fsid(&fsid);
6031 if (r < 0)
6032 goto out_fsid;
6033 r = _lock_fsid();
6034 if (r < 0)
6035 goto out_fsid;
6036 r = _open_bdev(false);
6037 if (r < 0)
6038 goto out_fsid;
6039 rotational = bdev->is_rotational();
6040 _close_bdev();
6041 out_fsid:
6042 _close_fsid();
6043 out_path:
6044 _close_path();
6045 out:
6046 return rotational;
6047 }
6048
6049 bool BlueStore::is_journal_rotational()
6050 {
6051 if (!bluefs) {
6052 dout(5) << __func__ << " bluefs disabled, default to store media type"
6053 << dendl;
6054 return is_rotational();
6055 }
6056 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
6057 return bluefs->wal_is_rotational();
6058 }
6059
6060 bool BlueStore::is_db_rotational()
6061 {
6062 if (!bluefs) {
6063 dout(5) << __func__ << " bluefs disabled, default to store media type"
6064 << dendl;
6065 return is_rotational();
6066 }
6067 dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
6068 return bluefs->db_is_rotational();
6069 }
6070
6071 bool BlueStore::_use_rotational_settings()
6072 {
6073 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
6074 return true;
6075 }
6076 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
6077 return false;
6078 }
6079 return bdev->is_rotational();
6080 }
6081
6082 bool BlueStore::test_mount_in_use()
6083 {
6084 // most error conditions mean the mount is not in use (e.g., because
6085 // it doesn't exist). only if we fail to lock do we conclude it is
6086 // in use.
6087 bool ret = false;
6088 int r = _open_path();
6089 if (r < 0)
6090 return false;
6091 r = _open_fsid(false);
6092 if (r < 0)
6093 goto out_path;
6094 r = _lock_fsid();
6095 if (r < 0)
6096 ret = true; // if we can't lock, it is in use
6097 _close_fsid();
6098 out_path:
6099 _close_path();
6100 return ret;
6101 }
6102
6103 int BlueStore::_minimal_open_bluefs(bool create)
6104 {
6105 int r;
6106 bluefs = new BlueFS(cct);
6107
6108 string bfn;
6109 struct stat st;
6110
6111 bfn = path + "/block.db";
6112 if (::stat(bfn.c_str(), &st) == 0) {
6113 r = bluefs->add_block_device(
6114 BlueFS::BDEV_DB, bfn,
6115 create && cct->_conf->bdev_enable_discard,
6116 SUPER_RESERVED);
6117 if (r < 0) {
6118 derr << __func__ << " add block device(" << bfn << ") returned: "
6119 << cpp_strerror(r) << dendl;
6120 goto free_bluefs;
6121 }
6122
6123 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
6124 r = _check_or_set_bdev_label(
6125 bfn,
6126 bluefs->get_block_device_size(BlueFS::BDEV_DB),
6127 "bluefs db", create);
6128 if (r < 0) {
6129 derr << __func__
6130 << " check block device(" << bfn << ") label returned: "
6131 << cpp_strerror(r) << dendl;
6132 goto free_bluefs;
6133 }
6134 }
6135 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6136 bluefs_layout.dedicated_db = true;
6137 } else {
6138 r = -errno;
6139 if (::lstat(bfn.c_str(), &st) == -1) {
6140 r = 0;
6141 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6142 } else {
6143 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6144 << cpp_strerror(r) << dendl;
6145 goto free_bluefs;
6146 }
6147 }
6148
6149 // shared device
6150 bfn = path + "/block";
6151 // never trim here
6152 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
6153 0, // no need to provide valid 'reserved' for shared dev
6154 &shared_alloc);
6155 if (r < 0) {
6156 derr << __func__ << " add block device(" << bfn << ") returned: "
6157 << cpp_strerror(r) << dendl;
6158 goto free_bluefs;
6159 }
6160
6161 bfn = path + "/block.wal";
6162 if (::stat(bfn.c_str(), &st) == 0) {
6163 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
6164 create && cct->_conf->bdev_enable_discard,
6165 BDEV_LABEL_BLOCK_SIZE);
6166 if (r < 0) {
6167 derr << __func__ << " add block device(" << bfn << ") returned: "
6168 << cpp_strerror(r) << dendl;
6169 goto free_bluefs;
6170 }
6171
6172 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
6173 r = _check_or_set_bdev_label(
6174 bfn,
6175 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
6176 "bluefs wal", create);
6177 if (r < 0) {
6178 derr << __func__ << " check block device(" << bfn
6179 << ") label returned: " << cpp_strerror(r) << dendl;
6180 goto free_bluefs;
6181 }
6182 }
6183
6184 bluefs_layout.dedicated_wal = true;
6185 } else {
6186 r = 0;
6187 if (::lstat(bfn.c_str(), &st) != -1) {
6188 r = -errno;
6189 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6190 << cpp_strerror(r) << dendl;
6191 goto free_bluefs;
6192 }
6193 }
6194 return 0;
6195
6196 free_bluefs:
6197 ceph_assert(bluefs);
6198 delete bluefs;
6199 bluefs = NULL;
6200 return r;
6201 }
6202
6203 int BlueStore::_open_bluefs(bool create, bool read_only)
6204 {
6205 int r = _minimal_open_bluefs(create);
6206 if (r < 0) {
6207 return r;
6208 }
6209 BlueFSVolumeSelector* vselector = nullptr;
6210 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6211
6212 string options = cct->_conf->bluestore_rocksdb_options;
6213 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6214 if (!options_annex.empty()) {
6215 if (!options.empty() &&
6216 *options.rbegin() != ',') {
6217 options += ',';
6218 }
6219 options += options_annex;
6220 }
6221
6222 rocksdb::Options rocks_opts;
6223 r = RocksDBStore::ParseOptionsFromStringStatic(
6224 cct,
6225 options,
6226 rocks_opts,
6227 nullptr);
6228 if (r < 0) {
6229 return r;
6230 }
6231 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
6232 vselector = new FitToFastVolumeSelector(
6233 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6234 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6235 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
6236 } else {
6237 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
6238 vselector =
6239 new RocksDBBlueFSVolumeSelector(
6240 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6241 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6242 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
6243 1024 * 1024 * 1024, //FIXME: set expected l0 size here
6244 rocks_opts.max_bytes_for_level_base,
6245 rocks_opts.max_bytes_for_level_multiplier,
6246 reserved_factor,
6247 cct->_conf->bluestore_volume_selection_reserved,
6248 cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
6249 }
6250 }
6251 if (create) {
6252 bluefs->mkfs(fsid, bluefs_layout);
6253 }
6254 bluefs->set_volume_selector(vselector);
6255 r = bluefs->mount();
6256 if (r < 0) {
6257 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
6258 }
6259 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
6260 return r;
6261 }
6262
6263 void BlueStore::_close_bluefs()
6264 {
6265 bluefs->umount(db_was_opened_read_only);
6266 _minimal_close_bluefs();
6267 }
6268
6269 void BlueStore::_minimal_close_bluefs()
6270 {
6271 delete bluefs;
6272 bluefs = NULL;
6273 }
6274
6275 int BlueStore::_is_bluefs(bool create, bool* ret)
6276 {
6277 if (create) {
6278 *ret = cct->_conf->bluestore_bluefs;
6279 } else {
6280 string s;
6281 int r = read_meta("bluefs", &s);
6282 if (r < 0) {
6283 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
6284 return -EIO;
6285 }
6286 if (s == "1") {
6287 *ret = true;
6288 } else if (s == "0") {
6289 *ret = false;
6290 } else {
6291 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
6292 << dendl;
6293 return -EIO;
6294 }
6295 }
6296 return 0;
6297 }
6298
6299 /*
6300 * opens both DB and dependent super_meta, FreelistManager and allocator
6301 * in the proper order
6302 */
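// rough order, as implemented below: open path/fsid (and lock it), open the main
// bdev, open the DB read-only, load super meta, open the freelist manager, build
// the allocator from it, then close and re-open the DB in the requested mode and
// apply any pending SMR zone adjustments / NCB allocation-file handling.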
6303 int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
6304 {
6305 dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
6306 {
6307 string type;
6308 int r = read_meta("type", &type);
6309 if (r < 0) {
6310 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6311 << dendl;
6312 return r;
6313 }
6314
6315 if (type != "bluestore") {
6316 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6317 return -EIO;
6318 }
6319 }
6320
6321 // SMR devices may require a freelist adjustment, but that can only happen after
6322 // the db is read-write. we'll stash pending changes here.
6323 std::map<uint64_t, uint64_t> zone_adjustments;
6324
6325 int r = _open_path();
6326 if (r < 0)
6327 return r;
6328 r = _open_fsid(false);
6329 if (r < 0)
6330 goto out_path;
6331
6332 r = _read_fsid(&fsid);
6333 if (r < 0)
6334 goto out_fsid;
6335
6336 r = _lock_fsid();
6337 if (r < 0)
6338 goto out_fsid;
6339
6340 r = _open_bdev(false);
6341 if (r < 0)
6342 goto out_fsid;
6343
6344 // GBH: can probably skip the open_db step in Read-Only mode when operating in NULL-FM mode
6345 // (might need to open if failed to restore from file)
6346
6347 // open in read-only first to read FM list and init allocator
6348 // as they might be needed for some BlueFS procedures
6349 r = _open_db(false, false, true);
6350 if (r < 0)
6351 goto out_bdev;
6352
6353 r = _open_super_meta();
6354 if (r < 0) {
6355 goto out_db;
6356 }
6357
6358 r = _open_fm(nullptr, true);
6359 if (r < 0)
6360 goto out_db;
6361
6362 r = _init_alloc(&zone_adjustments);
6363 if (r < 0)
6364 goto out_fm;
6365
6366 // Re-open in the proper mode(s).
6367
6368 // Can't simply bypass second open for read-only mode as we need to
6369 // load allocated extents from bluefs into allocator.
6370 // And now it's time to do that
6371 //
6372 _close_db();
6373 r = _open_db(false, to_repair, read_only);
6374 if (r < 0) {
6375 goto out_alloc;
6376 }
6377
6378 if (!read_only && !zone_adjustments.empty()) {
6379 // for SMR devices that have freelist mismatch with device write pointers
6380 _post_init_alloc(zone_adjustments);
6381 }
6382
6383 // when the function is called in repair mode (to_repair=true) we skip db->open()/create()
6384 // we can't change bluestore allocation so there is no need to invalidate the allocation-file
6385 if (fm->is_null_manager() && !read_only && !to_repair) {
6386 // Now that we loaded the allocation map we need to invalidate the file as new allocations won't be reflected
6387 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
6388 // This means that we should not use the existing file in the failure case (unplanned shutdown) and must resort
6389 // to recovery from RocksDB::ONodes
6390 r = invalidate_allocation_file_on_bluefs();
6391 if (r != 0) {
6392 derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
6393 goto out_alloc;
6394 }
6395 }
6396
6397 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
6398 if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
6399 #ifdef HAVE_LIBZBD
6400 && !bdev->is_smr()
6401 #endif
6402 ) {
6403 dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
6404 commit_to_null_manager();
6405 need_to_destage_allocation_file = true;
6406 dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
6407 }
6408
6409 return 0;
6410
6411 out_alloc:
6412 _close_alloc();
6413 out_fm:
6414 _close_fm();
6415 out_db:
6416 _close_db();
6417 out_bdev:
6418 _close_bdev();
6419 out_fsid:
6420 _close_fsid();
6421 out_path:
6422 _close_path();
6423 return r;
6424 }
6425
6426 void BlueStore::_close_db_and_around()
6427 {
6428 if (db) {
6429 _close_db();
6430 }
6431 if (bluefs) {
6432 _close_bluefs();
6433 }
6434 _close_fm();
6435 _close_alloc();
6436 _close_bdev();
6437 _close_fsid();
6438 _close_path();
6439 }
6440
6441 int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
6442 {
6443 _kv_only = true;
6444 int r = _open_db_and_around(false, to_repair);
6445 if (r == 0) {
6446 *pdb = db;
6447 } else {
6448 *pdb = nullptr;
6449 }
6450 return r;
6451 }
6452
6453 int BlueStore::close_db_environment()
6454 {
6455 _close_db_and_around();
6456 return 0;
6457 }
6458
6459 /* gets access to bluefs supporting RocksDB */
6460 BlueFS* BlueStore::get_bluefs() {
6461 return bluefs;
6462 }
6463
6464 int BlueStore::_prepare_db_environment(bool create, bool read_only,
6465 std::string* _fn, std::string* _kv_backend)
6466 {
6467 int r;
6468 ceph_assert(!db);
6469 std::string& fn=*_fn;
6470 std::string& kv_backend=*_kv_backend;
6471 fn = path + "/db";
6472 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
6473
6474 if (create) {
6475 kv_backend = cct->_conf->bluestore_kvbackend;
6476 } else {
6477 r = read_meta("kv_backend", &kv_backend);
6478 if (r < 0) {
6479 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
6480 return -EIO;
6481 }
6482 }
6483 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
6484
6485 bool do_bluefs;
6486 r = _is_bluefs(create, &do_bluefs);
6487 if (r < 0) {
6488 return r;
6489 }
6490 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
6491
6492 map<string,string> kv_options;
6493 // force separate wal dir for all new deployments.
6494 kv_options["separate_wal_dir"] = 1;
6495 rocksdb::Env *env = NULL;
6496 if (do_bluefs) {
6497 dout(10) << __func__ << " initializing bluefs" << dendl;
6498 if (kv_backend != "rocksdb") {
6499 derr << " backend must be rocksdb to use bluefs" << dendl;
6500 return -EINVAL;
6501 }
6502
6503 r = _open_bluefs(create, read_only);
6504 if (r < 0) {
6505 return r;
6506 }
6507
6508 if (cct->_conf->bluestore_bluefs_env_mirror) {
6509 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6510 rocksdb::Env* b = rocksdb::Env::Default();
6511 if (create) {
6512 string cmd = "rm -rf " + path + "/db " +
6513 path + "/db.slow " +
6514 path + "/db.wal";
6515 int r = system(cmd.c_str());
6516 (void)r;
6517 }
6518 env = new rocksdb::EnvMirror(b, a, false, true);
6519 } else {
6520 env = new BlueRocksEnv(bluefs);
6521
6522 // simplify the dir names, too, as "seen" by rocksdb
6523 fn = "db";
6524 }
6525 BlueFSVolumeSelector::paths paths;
6526 bluefs->get_vselector_paths(fn, paths);
6527
6528 {
6529 ostringstream db_paths;
6530 bool first = true;
6531 for (auto& p : paths) {
6532 if (!first) {
6533 db_paths << " ";
6534 }
6535 first = false;
6536 db_paths << p.first << "," << p.second;
6537
6538 }
6539 kv_options["db_paths"] = db_paths.str();
6540 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6541 }
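// db_paths ends up as space-separated "<path>,<bytes>" pairs, one per vselector
// path, e.g. "db,64424509440 db.slow,963146416128" (sizes here are illustrative)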
6542
6543 if (create) {
6544 for (auto& p : paths) {
6545 env->CreateDir(p.first);
6546 }
6547 // Selectors don't provide a wal path so far, hence create it explicitly
6548 env->CreateDir(fn + ".wal");
6549 } else {
6550 std::vector<std::string> res;
6551 // check for dir presence
6552 auto r = env->GetChildren(fn+".wal", &res);
6553 if (r.IsNotFound()) {
6554 kv_options.erase("separate_wal_dir");
6555 }
6556 }
6557 } else {
6558 string walfn = path + "/db.wal";
6559
6560 if (create) {
6561 int r = ::mkdir(fn.c_str(), 0755);
6562 if (r < 0)
6563 r = -errno;
6564 if (r < 0 && r != -EEXIST) {
6565 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6566 << dendl;
6567 return r;
6568 }
6569
6570 // wal_dir, too!
6571 r = ::mkdir(walfn.c_str(), 0755);
6572 if (r < 0)
6573 r = -errno;
6574 if (r < 0 && r != -EEXIST) {
6575 derr << __func__ << " failed to create " << walfn
6576 << ": " << cpp_strerror(r)
6577 << dendl;
6578 return r;
6579 }
6580 } else {
6581 struct stat st;
6582 r = ::stat(walfn.c_str(), &st);
6583 if (r < 0 && errno == ENOENT) {
6584 kv_options.erase("separate_wal_dir");
6585 }
6586 }
6587 }
6588
6589
6590 db = KeyValueDB::create(cct,
6591 kv_backend,
6592 fn,
6593 kv_options,
6594 static_cast<void*>(env));
6595 if (!db) {
6596 derr << __func__ << " error creating db" << dendl;
6597 if (bluefs) {
6598 _close_bluefs();
6599 }
6600 // delete env manually here since we can't depend on db to do this
6601 // in this case
6602 delete env;
6603 env = NULL;
6604 return -EIO;
6605 }
6606
6607 FreelistManager::setup_merge_operators(db, freelist_type);
6608 db->set_merge_operator(PREFIX_STAT, merge_op);
6609 db->set_cache_size(cache_kv_ratio * cache_size);
6610 return 0;
6611 }
6612
6613 int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6614 {
6615 int r;
6616 ceph_assert(!(create && read_only));
6617 string options;
6618 string options_annex;
6619 stringstream err;
6620 string kv_dir_fn;
6621 string kv_backend;
6622 std::string sharding_def;
6623 // prevent write attempts to BlueFS in case we failed before BlueFS was opened
6624 db_was_opened_read_only = true;
6625 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6626 if (r < 0) {
6627 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6628 return -EIO;
6629 }
6630 // if reached here then BlueFS is already opened
6631 db_was_opened_read_only = read_only;
6632 dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
6633 if (kv_backend == "rocksdb") {
6634 options = cct->_conf->bluestore_rocksdb_options;
6635 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6636 if (!options_annex.empty()) {
6637 if (!options.empty() &&
6638 *options.rbegin() != ',') {
6639 options += ',';
6640 }
6641 options += options_annex;
6642 }
6643
6644 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6645 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
6646 }
6647 }
6648
6649 db->init(options);
6650 if (to_repair_db)
6651 return 0;
6652 if (create) {
6653 r = db->create_and_open(err, sharding_def);
6654 } else {
6655 // we pass in cf list here, but it is only used if the db already has
6656 // column families created.
6657 r = read_only ?
6658 db->open_read_only(err, sharding_def) :
6659 db->open(err, sharding_def);
6660 }
6661 if (r) {
6662 derr << __func__ << " error opening db: " << err.str() << dendl;
6663 _close_db();
6664 return -EIO;
6665 }
6666 dout(1) << __func__ << " opened " << kv_backend
6667 << " path " << kv_dir_fn << " options " << options << dendl;
6668 return 0;
6669 }
6670
6671 void BlueStore::_close_db_leave_bluefs()
6672 {
6673 ceph_assert(db);
6674 delete db;
6675 db = nullptr;
6676 }
6677
6678 void BlueStore::_close_db()
6679 {
6680 dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
6681 _close_db_leave_bluefs();
6682
6683 if (need_to_destage_allocation_file) {
6684 ceph_assert(fm && fm->is_null_manager());
6685 int ret = store_allocator(alloc);
6686 if (ret != 0) {
6687 derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
6688 }
6689 }
6690
6691 if (bluefs) {
6692 _close_bluefs();
6693 }
6694 }
6695
6696 void BlueStore::_dump_alloc_on_failure()
6697 {
6698 auto dump_interval =
6699 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6700 if (dump_interval > 0 &&
6701 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6702 shared_alloc.a->dump();
6703 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6704 next_dump_on_bluefs_alloc_failure += dump_interval;
6705 }
6706 }
6707
6708 int BlueStore::_open_collections()
6709 {
6710 if (!coll_map.empty()) {
6711 // could be opened from another path
6712 dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
6713 return 0;
6714 }
6715
6716 dout(10) << __func__ << dendl;
6717 collections_had_errors = false;
6718 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6719 size_t load_cnt = 0;
6720 for (it->upper_bound(string());
6721 it->valid();
6722 it->next()) {
6723 coll_t cid;
6724 if (cid.parse(it->key())) {
6725 auto c = ceph::make_ref<Collection>(
6726 this,
6727 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6728 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6729 cid);
6730 bufferlist bl = it->value();
6731 auto p = bl.cbegin();
6732 try {
6733 decode(c->cnode, p);
6734 } catch (ceph::buffer::error& e) {
6735 derr << __func__ << " failed to decode cnode, key:"
6736 << pretty_binary_string(it->key()) << dendl;
6737 return -EIO;
6738 }
6739 dout(20) << __func__ << " opened " << cid << " " << c
6740 << " " << c->cnode << dendl;
6741 _osr_attach(c.get());
6742 coll_map[cid] = c;
6743 load_cnt++;
6744 } else {
6745 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6746 collections_had_errors = true;
6747 }
6748 }
6749 dout(10) << __func__ << " collections loaded: " << load_cnt
6750 << dendl;
6751 return 0;
6752 }
6753
6754 void BlueStore::_fsck_collections(int64_t* errors)
6755 {
6756 if (collections_had_errors) {
6757 dout(10) << __func__ << dendl;
6758 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
6759 for (it->upper_bound(string());
6760 it->valid();
6761 it->next()) {
6762 coll_t cid;
6763 if (!cid.parse(it->key())) {
6764 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6765 if (errors) {
6766 (*errors)++;
6767 }
6768 }
6769 }
6770 }
6771 }
6772
6773 void BlueStore::_set_per_pool_omap()
6774 {
6775 per_pool_omap = OMAP_BULK;
6776 bufferlist bl;
6777 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6778 if (bl.length()) {
6779 auto s = bl.to_str();
6780 if (s == stringify(OMAP_PER_POOL)) {
6781 per_pool_omap = OMAP_PER_POOL;
6782 } else if (s == stringify(OMAP_PER_PG)) {
6783 per_pool_omap = OMAP_PER_PG;
6784 } else {
6785 ceph_assert(s == stringify(OMAP_BULK));
6786 }
6787 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
6788 } else {
6789 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6790 }
6791 _check_no_per_pg_or_pool_omap_alert();
6792 }
6793
6794 void BlueStore::_open_statfs()
6795 {
6796 osd_pools.clear();
6797 vstatfs.reset();
6798
6799 bufferlist bl;
6800 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6801 if (r >= 0) {
6802 per_pool_stat_collection = false;
6803 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6804 auto it = bl.cbegin();
6805 vstatfs.decode(it);
6806 dout(10) << __func__ << " store_statfs is found" << dendl;
6807 } else {
6808 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6809 }
6810 _check_legacy_statfs_alert();
6811 } else {
6812 per_pool_stat_collection = true;
6813 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6814 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6815 for (it->upper_bound(string());
6816 it->valid();
6817 it->next()) {
6818
6819 uint64_t pool_id;
6820 int r = get_key_pool_stat(it->key(), &pool_id);
6821 ceph_assert(r == 0);
6822
6823 bufferlist bl;
6824 bl = it->value();
6825 auto p = bl.cbegin();
6826 auto& st = osd_pools[pool_id];
6827 try {
6828 st.decode(p);
6829 vstatfs += st;
6830
6831 dout(30) << __func__ << " pool " << pool_id
6832 << " statfs " << st << dendl;
6833 } catch (ceph::buffer::error& e) {
6834 derr << __func__ << " failed to decode pool stats, key:"
6835 << pretty_binary_string(it->key()) << dendl;
6836 }
6837 }
6838 }
6839 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6840
6841 }
6842
6843 int BlueStore::_setup_block_symlink_or_file(
6844 string name,
6845 string epath,
6846 uint64_t size,
6847 bool create)
6848 {
6849 dout(20) << __func__ << " name " << name << " path " << epath
6850 << " size " << size << " create=" << (int)create << dendl;
6851 int r = 0;
6852 int flags = O_RDWR|O_CLOEXEC;
6853 if (create)
6854 flags |= O_CREAT;
6855 if (epath.length()) {
6856 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6857 if (r < 0) {
6858 r = -errno;
6859 derr << __func__ << " failed to create " << name << " symlink to "
6860 << epath << ": " << cpp_strerror(r) << dendl;
6861 return r;
6862 }
6863
6864 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6865 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6866 if (fd < 0) {
6867 r = -errno;
6868 derr << __func__ << " failed to open " << epath << " file: "
6869 << cpp_strerror(r) << dendl;
6870 return r;
6871 }
6872 // write the Transport ID of the NVMe device
6873 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6874 // where "0000:02:00.0" is the selector of a PCI device, see
6875 // the first column of "lspci -mm -n -D"
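// e.g. an epath of "spdk:0000:02:00.0" (assuming SPDK_PREFIX is the "spdk:"
// scheme) ends up writing "trtype:PCIe traddr:0000:02:00.0" into the file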
6876 string trid{"trtype:PCIe "};
6877 trid += "traddr:";
6878 trid += epath.substr(strlen(SPDK_PREFIX));
6879 r = ::write(fd, trid.c_str(), trid.size());
6880 ceph_assert(r == static_cast<int>(trid.size()));
6881 dout(1) << __func__ << " created " << name << " symlink to "
6882 << epath << dendl;
6883 VOID_TEMP_FAILURE_RETRY(::close(fd));
6884 }
6885 }
6886 if (size) {
6887 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6888 if (fd >= 0) {
6889 // block file is present
6890 struct stat st;
6891 int r = ::fstat(fd, &st);
6892 if (r == 0 &&
6893 S_ISREG(st.st_mode) && // if it is a regular file
6894 st.st_size == 0) { // and is 0 bytes
6895 r = ::ftruncate(fd, size);
6896 if (r < 0) {
6897 r = -errno;
6898 derr << __func__ << " failed to resize " << name << " file to "
6899 << size << ": " << cpp_strerror(r) << dendl;
6900 VOID_TEMP_FAILURE_RETRY(::close(fd));
6901 return r;
6902 }
6903
6904 if (cct->_conf->bluestore_block_preallocate_file) {
6905 r = ::ceph_posix_fallocate(fd, 0, size);
6906 if (r > 0) {
6907 derr << __func__ << " failed to prefallocate " << name << " file to "
6908 << size << ": " << cpp_strerror(r) << dendl;
6909 VOID_TEMP_FAILURE_RETRY(::close(fd));
6910 return -r;
6911 }
6912 }
6913 dout(1) << __func__ << " resized " << name << " file to "
6914 << byte_u_t(size) << dendl;
6915 }
6916 VOID_TEMP_FAILURE_RETRY(::close(fd));
6917 } else {
6918 int r = -errno;
6919 if (r != -ENOENT) {
6920 derr << __func__ << " failed to open " << name << " file: "
6921 << cpp_strerror(r) << dendl;
6922 return r;
6923 }
6924 }
6925 }
6926 return 0;
6927 }
6928
6929 int BlueStore::mkfs()
6930 {
6931 dout(1) << __func__ << " path " << path << dendl;
6932 int r;
6933 uuid_d old_fsid;
6934 uint64_t reserved;
6935 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6936 derr << __func__ << " osd_max_object_size "
6937 << cct->_conf->osd_max_object_size << " > bluestore max "
6938 << OBJECT_MAX_SIZE << dendl;
6939 return -EINVAL;
6940 }
6941
6942 {
6943 string done;
6944 r = read_meta("mkfs_done", &done);
6945 if (r == 0) {
6946 dout(1) << __func__ << " already created" << dendl;
6947 if (cct->_conf->bluestore_fsck_on_mkfs) {
6948 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6949 if (r < 0) {
6950 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6951 << dendl;
6952 return r;
6953 }
6954 if (r > 0) {
6955 derr << __func__ << " fsck found " << r << " errors" << dendl;
6956 r = -EIO;
6957 }
6958 }
6959 return r; // idempotent
6960 }
6961 }
6962
6963 {
6964 string type;
6965 r = read_meta("type", &type);
6966 if (r == 0) {
6967 if (type != "bluestore") {
6968 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6969 return -EIO;
6970 }
6971 } else {
6972 r = write_meta("type", "bluestore");
6973 if (r < 0)
6974 return r;
6975 }
6976 }
6977
6978 r = _open_path();
6979 if (r < 0)
6980 return r;
6981
6982 r = _open_fsid(true);
6983 if (r < 0)
6984 goto out_path_fd;
6985
6986 r = _lock_fsid();
6987 if (r < 0)
6988 goto out_close_fsid;
6989
6990 r = _read_fsid(&old_fsid);
6991 if (r < 0 || old_fsid.is_zero()) {
6992 if (fsid.is_zero()) {
6993 fsid.generate_random();
6994 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6995 } else {
6996 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6997 }
6998 // we'll write it later.
6999 } else {
7000 if (!fsid.is_zero() && fsid != old_fsid) {
7001 derr << __func__ << " on-disk fsid " << old_fsid
7002 << " != provided " << fsid << dendl;
7003 r = -EINVAL;
7004 goto out_close_fsid;
7005 }
7006 fsid = old_fsid;
7007 }
7008
7009 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
7010 cct->_conf->bluestore_block_size,
7011 cct->_conf->bluestore_block_create);
7012 if (r < 0)
7013 goto out_close_fsid;
7014 if (cct->_conf->bluestore_bluefs) {
7015 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
7016 cct->_conf->bluestore_block_wal_size,
7017 cct->_conf->bluestore_block_wal_create);
7018 if (r < 0)
7019 goto out_close_fsid;
7020 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
7021 cct->_conf->bluestore_block_db_size,
7022 cct->_conf->bluestore_block_db_create);
7023 if (r < 0)
7024 goto out_close_fsid;
7025 }
7026
7027 r = _open_bdev(true);
7028 if (r < 0)
7029 goto out_close_fsid;
7030
7031 // choose freelist manager
7032 #ifdef HAVE_LIBZBD
7033 if (bdev->is_smr()) {
7034 freelist_type = "zoned";
7035 zone_size = bdev->get_zone_size();
7036 first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
7037 bdev->reset_all_zones();
7038 } else
7039 #endif
7040 {
7041 freelist_type = "bitmap";
7042 }
7043 dout(10) << " freelist_type " << freelist_type << dendl;
7044
7045 // choose min_alloc_size
7046 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7047 << " block_size: 0x" << block_size << std::dec << dendl;
7048 if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
7049 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7050 << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
7051 min_alloc_size = optimal_io_size;
7052 }
7053 else if (cct->_conf->bluestore_min_alloc_size) {
7054 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
7055 } else {
7056 ceph_assert(bdev);
7057 if (_use_rotational_settings()) {
7058 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
7059 } else {
7060 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
7061 }
7062 }
7063 _validate_bdev();
7064
7065 // make sure min_alloc_size is power of 2 aligned.
7066 if (!isp2(min_alloc_size)) {
7067 derr << __func__ << " min_alloc_size 0x"
7068 << std::hex << min_alloc_size << std::dec
7069 << " is not power of 2 aligned!"
7070 << dendl;
7071 r = -EINVAL;
7072 goto out_close_bdev;
7073 }
7074
7075 // make sure min_alloc_size is >= block_size and aligned with it
7076 if (min_alloc_size % block_size != 0) {
7077 derr << __func__ << " min_alloc_size 0x"
7078 << std::hex << min_alloc_size
7079 << " is less or not aligned with block_size: 0x"
7080 << block_size << std::dec << dendl;
7081 r = -EINVAL;
7082 goto out_close_bdev;
7083 }
7084
7085 r = _create_alloc();
7086 if (r < 0) {
7087 goto out_close_bdev;
7088 }
7089
7090 reserved = _get_ondisk_reserved();
7091 alloc->init_add_free(reserved,
7092 p2align(bdev->get_size(), min_alloc_size) - reserved);
7093 #ifdef HAVE_LIBZBD
7094 if (bdev->is_smr() && alloc != shared_alloc.a) {
7095 shared_alloc.a->init_add_free(reserved,
7096 p2align(bdev->get_conventional_region_size(),
7097 min_alloc_size) - reserved);
7098 }
7099 #endif
7100
7101 r = _open_db(true);
7102 if (r < 0)
7103 goto out_close_alloc;
7104
7105 {
7106 KeyValueDB::Transaction t = db->get_transaction();
7107 r = _open_fm(t, true);
7108 if (r < 0)
7109 goto out_close_db;
7110 {
7111 bufferlist bl;
7112 encode((uint64_t)0, bl);
7113 t->set(PREFIX_SUPER, "nid_max", bl);
7114 t->set(PREFIX_SUPER, "blobid_max", bl);
7115 }
7116
7117 {
7118 bufferlist bl;
7119 encode((uint64_t)min_alloc_size, bl);
7120 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7121 }
7122 {
7123 bufferlist bl;
7124 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
7125 bl.append(stringify(OMAP_BULK));
7126 } else {
7127 bl.append(stringify(OMAP_PER_PG));
7128 }
7129 t->set(PREFIX_SUPER, "per_pool_omap", bl);
7130 }
7131
7132 #ifdef HAVE_LIBZBD
7133 if (bdev->is_smr()) {
7134 {
7135 bufferlist bl;
7136 encode((uint64_t)zone_size, bl);
7137 t->set(PREFIX_SUPER, "zone_size", bl);
7138 }
7139 {
7140 bufferlist bl;
7141 encode((uint64_t)first_sequential_zone, bl);
7142 t->set(PREFIX_SUPER, "first_sequential_zone", bl);
7143 }
7144 }
7145 #endif
7146
7147 ondisk_format = latest_ondisk_format;
7148 _prepare_ondisk_format_super(t);
7149 db->submit_transaction_sync(t);
7150 }
7151
7152 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
7153 if (r < 0)
7154 goto out_close_fm;
7155
7156 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7157 if (r < 0)
7158 goto out_close_fm;
7159
7160 if (fsid != old_fsid) {
7161 r = _write_fsid();
7162 if (r < 0) {
7163 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
7164 goto out_close_fm;
7165 }
7166 }
7167
7168 out_close_fm:
7169 _close_fm();
7170 out_close_db:
7171 _close_db();
7172 out_close_alloc:
7173 _close_alloc();
7174 out_close_bdev:
7175 _close_bdev();
7176 out_close_fsid:
7177 _close_fsid();
7178 out_path_fd:
7179 _close_path();
7180
7181 if (r == 0 &&
7182 cct->_conf->bluestore_fsck_on_mkfs) {
7183 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7184 if (rc < 0)
7185 return rc;
7186 if (rc > 0) {
7187 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7188 r = -EIO;
7189 }
7190 }
7191
7192 if (r == 0) {
7193 // indicate success by writing the 'mkfs_done' file
7194 r = write_meta("mkfs_done", "yes");
7195 }
7196
7197 if (r < 0) {
7198 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7199 } else {
7200 dout(0) << __func__ << " success" << dendl;
7201 }
7202 return r;
7203 }
7204
7205 int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
7206 {
7207 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7208 int r;
7209 ceph_assert(path_fd < 0);
7210
7211 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7212
7213 if (!cct->_conf->bluestore_bluefs) {
7214 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7215 return -EIO;
7216 }
7217 dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
7218 r = _open_db_and_around(true);
7219 if (r < 0) {
7220 return r;
7221 }
7222
7223 if (id == BlueFS::BDEV_NEWWAL) {
7224 string p = path + "/block.wal";
7225 r = _setup_block_symlink_or_file("block.wal", dev_path,
7226 cct->_conf->bluestore_block_wal_size,
7227 true);
7228 ceph_assert(r == 0);
7229
7230 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
7231 cct->_conf->bdev_enable_discard,
7232 BDEV_LABEL_BLOCK_SIZE);
7233 ceph_assert(r == 0);
7234
7235 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7236 r = _check_or_set_bdev_label(
7237 p,
7238 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7239 "bluefs wal",
7240 true);
7241 ceph_assert(r == 0);
7242 }
7243
7244 bluefs_layout.dedicated_wal = true;
7245 } else if (id == BlueFS::BDEV_NEWDB) {
7246 string p = path + "/block.db";
7247 r = _setup_block_symlink_or_file("block.db", dev_path,
7248 cct->_conf->bluestore_block_db_size,
7249 true);
7250 ceph_assert(r == 0);
7251
7252 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
7253 cct->_conf->bdev_enable_discard,
7254 SUPER_RESERVED);
7255 ceph_assert(r == 0);
7256
7257 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7258 r = _check_or_set_bdev_label(
7259 p,
7260 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7261 "bluefs db",
7262 true);
7263 ceph_assert(r == 0);
7264 }
7265 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7266 bluefs_layout.dedicated_db = true;
7267 }
7268 bluefs->umount();
7269 bluefs->mount();
7270
7271 r = bluefs->prepare_new_device(id, bluefs_layout);
7272 ceph_assert(r == 0);
7273
7274 if (r < 0) {
7275 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7276 } else {
7277 dout(0) << __func__ << " success" << dendl;
7278 }
7279
7280 _close_db_and_around();
7281 return r;
7282 }
7283
7284 int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
7285 int id)
7286 {
7287 dout(10) << __func__ << " id:" << id << dendl;
7288 ceph_assert(path_fd < 0);
7289
7290 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
7291
7292 if (!cct->_conf->bluestore_bluefs) {
7293 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7294 return -EIO;
7295 }
7296
7297 int r = _open_db_and_around(true);
7298 if (r < 0) {
7299 return r;
7300 }
7301 auto close_db = make_scope_guard([&] {
7302 _close_db_and_around();
7303 });
7304 uint64_t used_space = 0;
7305 for(auto src_id : devs_source) {
7306 used_space += bluefs->get_used(src_id);
7307 }
7308 uint64_t target_free = bluefs->get_free(id);
7309 if (target_free < used_space) {
7310 derr << __func__
7311 << " can't migrate, free space at target: " << target_free
7312 << " is less than required space: " << used_space
7313 << dendl;
7314 return -ENOSPC;
7315 }
7316 if (devs_source.count(BlueFS::BDEV_DB)) {
7317 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7318 bluefs_layout.dedicated_db = false;
7319 }
7320 if (devs_source.count(BlueFS::BDEV_WAL)) {
7321 bluefs_layout.dedicated_wal = false;
7322 }
7323 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
7324 if (r < 0) {
7325 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7326 return r;
7327 }
7328
7329 if (devs_source.count(BlueFS::BDEV_DB)) {
7330 r = unlink(string(path + "/block.db").c_str());
7331 ceph_assert(r == 0);
7332 }
7333 if (devs_source.count(BlueFS::BDEV_WAL)) {
7334 r = unlink(string(path + "/block.wal").c_str());
7335 ceph_assert(r == 0);
7336 }
7337 return r;
7338 }
7339
7340 int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7341 int id,
7342 const string& dev_path)
7343 {
7344 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7345 ceph_assert(path_fd < 0);
7346
7347 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7348
7349 if (!cct->_conf->bluestore_bluefs) {
7350 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7351 return -EIO;
7352 }
7353
7354 int r = _open_db_and_around(true);
7355 if (r < 0) {
7356 return r;
7357 }
7358 auto close_db = make_scope_guard([&] {
7359 _close_db_and_around();
7360 });
7361
7362 string link_db;
7363 string link_wal;
7364 if (devs_source.count(BlueFS::BDEV_DB) &&
7365 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
7366 link_db = path + "/block.db";
7367 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7368 bluefs_layout.dedicated_db = false;
7369 }
7370 if (devs_source.count(BlueFS::BDEV_WAL)) {
7371 link_wal = path + "/block.wal";
7372 bluefs_layout.dedicated_wal = false;
7373 }
7374
7375 size_t target_size = 0;
7376 string target_name;
7377 if (id == BlueFS::BDEV_NEWWAL) {
7378 target_name = "block.wal";
7379 target_size = cct->_conf->bluestore_block_wal_size;
7380 bluefs_layout.dedicated_wal = true;
7381
7382 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
7383 cct->_conf->bdev_enable_discard,
7384 BDEV_LABEL_BLOCK_SIZE);
7385 ceph_assert(r == 0);
7386
7387 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7388 r = _check_or_set_bdev_label(
7389 dev_path,
7390 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7391 "bluefs wal",
7392 true);
7393 ceph_assert(r == 0);
7394 }
7395 } else if (id == BlueFS::BDEV_NEWDB) {
7396 target_name = "block.db";
7397 target_size = cct->_conf->bluestore_block_db_size;
7398 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7399 bluefs_layout.dedicated_db = true;
7400
7401 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
7402 cct->_conf->bdev_enable_discard,
7403 SUPER_RESERVED);
7404 ceph_assert(r == 0);
7405
7406 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7407 r = _check_or_set_bdev_label(
7408 dev_path,
7409 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7410 "bluefs db",
7411 true);
7412 ceph_assert(r == 0);
7413 }
7414 }
7415
7416 bluefs->umount();
7417 bluefs->mount();
7418
7419 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
7420
7421 if (r < 0) {
7422 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7423 return r;
7424 }
7425
7426 if (!link_db.empty()) {
7427 r = unlink(link_db.c_str());
7428 ceph_assert(r == 0);
7429 }
7430 if (!link_wal.empty()) {
7431 r = unlink(link_wal.c_str());
7432 ceph_assert(r == 0);
7433 }
7434 r = _setup_block_symlink_or_file(
7435 target_name,
7436 dev_path,
7437 target_size,
7438 true);
7439 ceph_assert(r == 0);
7440 dout(0) << __func__ << " success" << dendl;
7441
7442 return r;
7443 }
7444
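// Maps a BlueFS device id to the corresponding symlink in the OSD data
// directory; the shared (slow) device resolves to "block".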
7445 string BlueStore::get_device_path(unsigned id)
7446 {
7447 string res;
7448 if (id < BlueFS::MAX_BDEV) {
7449 switch (id) {
7450 case BlueFS::BDEV_WAL:
7451 res = path + "/block.wal";
7452 break;
7453 case BlueFS::BDEV_DB:
7454 if (id == bluefs_layout.shared_bdev) {
7455 res = path + "/block";
7456 } else {
7457 res = path + "/block.db";
7458 }
7459 break;
7460 case BlueFS::BDEV_SLOW:
7461 res = path + "/block";
7462 break;
7463 }
7464 }
7465 return res;
7466 }
7467
7468 int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
7469 {
7470 bluestore_bdev_label_t label;
7471 int r = _read_bdev_label(cct, path, &label);
7472 if (r < 0) {
7473 derr << "unable to read label for " << path << ": "
7474 << cpp_strerror(r) << dendl;
7475 } else {
7476 label.size = size;
7477 r = _write_bdev_label(cct, path, label);
7478 if (r < 0) {
7479 derr << "unable to write label for " << path << ": "
7480 << cpp_strerror(r) << dendl;
7481 }
7482 }
7483 return r;
7484 }
7485
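// Grows dedicated DB/WAL devices and the main (shared) device to their
// current physical size: updates the bdev size labels, rewrites the
// freelist metadata for the main device and remounts read/write so the
// expansion gets synced.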
7486 int BlueStore::expand_devices(ostream& out)
7487 {
7488 int r = _open_db_and_around(true);
7489 ceph_assert(r == 0);
7490 bluefs->dump_block_extents(out);
7491 out << "Expanding DB/WAL..." << std::endl;
7492 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
7493 if (devid == bluefs_layout.shared_bdev ) {
7494 continue;
7495 }
7496 uint64_t size = bluefs->get_block_device_size(devid);
7497 if (size == 0) {
7498 // no bdev
7499 continue;
7500 }
7501
7502 out << devid
7503 << " : expanding to 0x" << std::hex << size << std::dec << std::endl;
7504 string p = get_device_path(devid);
7505 if (p.empty()) {
7507 derr << devid
7508 <<": can't find device path " << dendl;
7509 continue;
7510 }
7511 if (bluefs->bdev_support_label(devid)) {
7512 if (_set_bdev_label_size(p, size) >= 0) {
7513 out << devid
7514 << " : size label updated to " << size
7515 << std::endl;
7516 }
7517 }
7518 }
7519 uint64_t size0 = fm->get_size();
7520 uint64_t size = bdev->get_size();
7521 if (size0 < size) {
7522 out << bluefs_layout.shared_bdev
7523 << " : expanding " << " from 0x" << std::hex
7524 << size0 << " to 0x" << size << std::dec << std::endl;
7525 _write_out_fm_meta(size);
7526 if (bdev->supported_bdev_label()) {
7527 if (_set_bdev_label_size(path, size) >= 0) {
7528 out << bluefs_layout.shared_bdev
7529 << " : size label updated to " << size
7530 << std::endl;
7531 }
7532 }
7533
7534 if (fm && fm->is_null_manager()) {
7535 // we grow the allocation range, must reflect it in the allocation file
7536 alloc->init_add_free(size0, size - size0);
7537 need_to_destage_allocation_file = true;
7538 }
7539 _close_db_and_around();
7540
7541 // mount in read/write to sync expansion changes
7542 r = _mount();
7543 ceph_assert(r == 0);
7544 umount();
7545 } else {
7546 _close_db_and_around();
7547 }
7548 return r;
7549 }
7550
7551 int BlueStore::dump_bluefs_sizes(ostream& out)
7552 {
7553 int r = _open_db_and_around(true);
7554 ceph_assert(r == 0);
7555 bluefs->dump_block_extents(out);
7556 _close_db_and_around();
7557 return r;
7558 }
7559
7560 void BlueStore::set_cache_shards(unsigned num)
7561 {
7562 dout(10) << __func__ << " " << num << dendl;
7563 size_t oold = onode_cache_shards.size();
7564 size_t bold = buffer_cache_shards.size();
7565 ceph_assert(num >= oold && num >= bold);
7566 onode_cache_shards.resize(num);
7567 buffer_cache_shards.resize(num);
7568 for (unsigned i = oold; i < num; ++i) {
7569 onode_cache_shards[i] =
7570 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7571 logger);
7572 }
7573 for (unsigned i = bold; i < num; ++i) {
7574 buffer_cache_shards[i] =
7575 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7576 logger);
7577 }
7578 }
7579
7580 //---------------------------------------------
7581 bool BlueStore::has_null_manager()
7582 {
7583 return (fm && fm->is_null_manager());
7584 }
7585
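// Full mount sequence: optional fsck, open DB and supporting structures,
// upgrade the superblock, open collections, start kv/mempool threads and
// replay deferred writes; scope guards undo everything if a step fails.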
7586 int BlueStore::_mount()
7587 {
7588 dout(5) << __func__ << "::NCB:: path " << path << dendl;
7589
7590 _kv_only = false;
7591 if (cct->_conf->bluestore_fsck_on_mount) {
7592 dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
7593 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7594 if (rc < 0)
7595 return rc;
7596 if (rc > 0) {
7597 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7598 return -EIO;
7599 }
7600 }
7601
7602 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7603 derr << __func__ << " osd_max_object_size "
7604 << cct->_conf->osd_max_object_size << " > bluestore max "
7605 << OBJECT_MAX_SIZE << dendl;
7606 return -EINVAL;
7607 }
7608
7609 dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
7610 int r = _open_db_and_around(false);
7611 if (r < 0) {
7612 return r;
7613 }
7614 auto close_db = make_scope_guard([&] {
7615 if (!mounted) {
7616 _close_db_and_around();
7617 }
7618 });
7619
7620 r = _upgrade_super();
7621 if (r < 0) {
7622 return r;
7623 }
7624
7625 // The recovery process for allocation-map needs to open collection early
7626 r = _open_collections();
7627 if (r < 0) {
7628 return r;
7629 }
7630 auto shutdown_cache = make_scope_guard([&] {
7631 if (!mounted) {
7632 _shutdown_cache();
7633 }
7634 });
7635
7636 r = _reload_logger();
7637 if (r < 0) {
7638 return r;
7639 }
7640
7641 _kv_start();
7642 auto stop_kv = make_scope_guard([&] {
7643 if (!mounted) {
7644 _kv_stop();
7645 }
7646 });
7647
7648 r = _deferred_replay();
7649 if (r < 0) {
7650 return r;
7651 }
7652
7653 #ifdef HAVE_LIBZBD
7654 if (bdev->is_smr()) {
7655 _zoned_cleaner_start();
7656 }
7657 #endif
7658
7659 mempool_thread.init();
7660
7661 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
7662 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7663
7664 auto was_per_pool_omap = per_pool_omap;
7665
7666 dout(1) << __func__ << " quick-fix on mount" << dendl;
7667 _fsck_on_open(FSCK_SHALLOW, true);
7668
7669 //reread statfs
7670 //FIXME minor: replace with actual open/close?
7671 _open_statfs();
7672 _check_legacy_statfs_alert();
7673
7674 //set again as hopefully it has been fixed
7675 if (was_per_pool_omap != OMAP_PER_PG) {
7676 _set_per_pool_omap();
7677 }
7678 }
7679
7680 mounted = true;
7681 return 0;
7682 }
7683
7684 int BlueStore::umount()
7685 {
7686 ceph_assert(_kv_only || mounted);
7687 _osr_drain_all();
7688
7689 mounted = false;
7690
7691 ceph_assert(alloc);
7692
7693 if (!_kv_only) {
7694 mempool_thread.shutdown();
7695 #ifdef HAVE_LIBZBD
7696 if (bdev->is_smr()) {
7697 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7698 _zoned_cleaner_stop();
7699 }
7700 #endif
7701 dout(20) << __func__ << " stopping kv thread" << dendl;
7702 _kv_stop();
7703 // skip cache cleanup step on fast shutdown
7704 if (likely(!m_fast_shutdown)) {
7705 _shutdown_cache();
7706 }
7707 dout(20) << __func__ << " closing" << dendl;
7708 }
7709 _close_db_and_around();
7710 // disable fsck on fast-shutdown
7711 if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
7712 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7713 if (rc < 0)
7714 return rc;
7715 if (rc > 0) {
7716 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7717 return -EIO;
7718 }
7719 }
7720 return 0;
7721 }
7722
7723 int BlueStore::cold_open()
7724 {
7725 return _open_db_and_around(true);
7726 }
7727
7728 int BlueStore::cold_close()
7729 {
7730 _close_db_and_around();
7731 return 0;
7732 }
7733
7734 // derr wrapper to limit enormous output and avoid log flooding.
7735 // For now it is used only in a few places where such output is expected.
7736 #define fsck_derr(err_cnt, threshold) \
7737 if (err_cnt <= threshold) { \
7738 bool need_skip_print = err_cnt == threshold; \
7739 derr
7740
7741 #define fsck_dendl \
7742 dendl; \
7743 if (need_skip_print) \
7744 derr << "more error lines skipped..." << dendl; \
7745 }
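// Typical usage (illustrative):
//   fsck_derr(err_cnt, MAX_FSCK_ERROR_LINES)
//     << "fsck error: ..." << fsck_dendl;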
7746
7747 int _fsck_sum_extents(
7748 const PExtentVector& extents,
7749 bool compressed,
7750 store_statfs_t& expected_statfs)
7751 {
7752 for (auto e : extents) {
7753 if (!e.is_valid())
7754 continue;
7755 expected_statfs.allocated += e.length;
7756 if (compressed) {
7757 expected_statfs.data_compressed_allocated += e.length;
7758 }
7759 }
7760 return 0;
7761 }
7762
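// Verifies a blob's physical extents for regular/deep fsck: accumulates
// expected statfs, marks the blocks in used_blocks and reports extents
// that are already allocated (misreferenced) or lie past the device end.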
7763 int BlueStore::_fsck_check_extents(
7764 std::string_view ctx_descr,
7765 const PExtentVector& extents,
7766 bool compressed,
7767 mempool_dynamic_bitset &used_blocks,
7768 uint64_t granularity,
7769 BlueStoreRepairer* repairer,
7770 store_statfs_t& expected_statfs,
7771 FSCKDepth depth)
7772 {
7773 dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
7774 int errors = 0;
7775 for (auto e : extents) {
7776 if (!e.is_valid())
7777 continue;
7778 expected_statfs.allocated += e.length;
7779 if (compressed) {
7780 expected_statfs.data_compressed_allocated += e.length;
7781 }
7782 if (depth != FSCK_SHALLOW) {
7783 bool already = false;
7784 apply_for_bitset_range(
7785 e.offset, e.length, granularity, used_blocks,
7786 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7787 if (bs.test(pos)) {
7788 if (repairer) {
7789 repairer->note_misreference(
7790 pos * min_alloc_size, min_alloc_size, !already);
7791 }
7792 if (!already) {
7793 derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
7794 << " or a subset is already allocated (misreferenced)" << dendl;
7795 ++errors;
7796 already = true;
7797 }
7798 }
7799 else
7800 bs.set(pos);
7801 });
7802
7803 if (e.end() > bdev->get_size()) {
7804 derr << "fsck error: " << ctx_descr << ", extent " << e
7805 << " past end of block device" << dendl;
7806 ++errors;
7807 }
7808 }
7809 }
7810 return errors;
7811 }
7812
7813 void BlueStore::_fsck_check_pool_statfs(
7814 BlueStore::per_pool_statfs& expected_pool_statfs,
7815 int64_t& errors,
7816 int64_t& warnings,
7817 BlueStoreRepairer* repairer)
7818 {
7819 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
7820 if (it) {
7821 for (it->lower_bound(string()); it->valid(); it->next()) {
7822 string key = it->key();
7823 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7824 if (repairer) {
7825 ++errors;
7826 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7827 derr << "fsck error: " << "legacy statfs record found, removing"
7828 << dendl;
7829 }
7830 continue;
7831 }
7832 uint64_t pool_id;
7833 if (get_key_pool_stat(key, &pool_id) < 0) {
7834 derr << "fsck error: bad key " << key
7835 << " in statfs namespace" << dendl;
7836 if (repairer) {
7837 repairer->remove_key(db, PREFIX_STAT, key);
7838 }
7839 ++errors;
7840 continue;
7841 }
7842
7843 volatile_statfs vstatfs;
7844 bufferlist bl = it->value();
7845 auto blp = bl.cbegin();
7846 try {
7847 vstatfs.decode(blp);
7848 } catch (ceph::buffer::error& e) {
7849 derr << "fsck error: failed to decode Pool StatFS record"
7850 << pretty_binary_string(key) << dendl;
7851 if (repairer) {
7852 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7853 << pretty_binary_string(key)
7854 << "', removing" << dendl;
7855 repairer->remove_key(db, PREFIX_STAT, key);
7856 }
7857 ++errors;
7858 vstatfs.reset();
7859 }
7860 auto stat_it = expected_pool_statfs.find(pool_id);
7861 if (stat_it == expected_pool_statfs.end()) {
7862 if (vstatfs.is_empty()) {
7863 // we don't consider that as an error since empty pool statfs
7864 // are left in DB for now
7865 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7866 << std::hex << pool_id << std::dec << dendl;
7867 if (repairer) {
7868 // but we need to increment error count in case of repair
7869 // to have proper counters at the end
7870 // (as repairer increments recovery counter anyway).
7871 ++errors;
7872 }
7873 } else {
7874 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7875 << std::hex << pool_id << std::dec << dendl;
7876 ++errors;
7877 }
7878 if (repairer) {
7879 repairer->remove_key(db, PREFIX_STAT, key);
7880 }
7881 continue;
7882 }
7883 store_statfs_t statfs;
7884 vstatfs.publish(&statfs);
7885 if (!(stat_it->second == statfs)) {
7886 derr << "fsck error: actual " << statfs
7887 << " != expected " << stat_it->second
7888 << " for pool "
7889 << std::hex << pool_id << std::dec << dendl;
7890 if (repairer) {
7891 repairer->fix_statfs(db, key, stat_it->second);
7892 }
7893 ++errors;
7894 }
7895 expected_pool_statfs.erase(stat_it);
7896 }
7897 } // if (it)
7898 for (auto& s : expected_pool_statfs) {
7899 if (s.second.is_zero()) {
7900 // we might lack empty statfs recs in DB
7901 continue;
7902 }
7903 derr << "fsck error: missing Pool StatFS record for pool "
7904 << std::hex << s.first << std::dec << dendl;
7905 if (repairer) {
7906 string key;
7907 get_pool_stat_key(s.first, &key);
7908 repairer->fix_statfs(db, key, s.second);
7909 }
7910 ++errors;
7911 }
7912 if (!per_pool_stat_collection &&
7913 repairer) {
7914 // by virtue of running this method, we correct the top-level
7915 // error of having global stats
7916 repairer->inc_repaired();
7917 }
7918 }
7919
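// Repairs shared blob records in three passes: find sbids whose ref
// counts don't match (broken), rebuild their ref maps from the object
// extents, then rewrite the records and drop stray ones in batched
// transactions.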
7920 void BlueStore::_fsck_repair_shared_blobs(
7921 BlueStoreRepairer& repairer,
7922 shared_blob_2hash_tracker_t& sb_ref_counts,
7923 sb_info_space_efficient_map_t& sb_info)
7924 {
7925 auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
7926 dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
7927 << sb_ref_mismatches << dendl;
7928 if (!sb_ref_mismatches) // not expected to happen, just in case
7929 return;
7930
7931
7932 auto foreach_shared_blob = [&](std::function<
7933 void (coll_t,
7934 ghobject_t,
7935 uint64_t,
7936 const bluestore_blob_t&)> cb) {
7937 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
7938 if (it) {
7939 CollectionRef c;
7940 spg_t pgid;
7941 for (it->lower_bound(string()); it->valid(); it->next()) {
7942 dout(30) << __func__ << " key "
7943 << pretty_binary_string(it->key())
7944 << dendl;
7945 if (is_extent_shard_key(it->key())) {
7946 continue;
7947 }
7948
7949 ghobject_t oid;
7950 int r = get_key_object(it->key(), &oid);
7951 if (r < 0) {
7952 continue;
7953 }
7954
7955 if (!c ||
7956 oid.shard_id != pgid.shard ||
7957 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7958 !c->contains(oid)) {
7959 c = nullptr;
7960 for (auto& p : coll_map) {
7961 if (p.second->contains(oid)) {
7962 c = p.second;
7963 break;
7964 }
7965 }
7966 if (!c) {
7967 continue;
7968 }
7969 }
7970 dout(20) << __func__
7971 << " inspecting shared blob refs for col:" << c->cid
7972 << " obj:" << oid
7973 << dendl;
7974
7975 OnodeRef o;
7976 o.reset(Onode::decode(c, oid, it->key(), it->value()));
7977 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7978
7979 _dump_onode<30>(cct, *o);
7980
7981 mempool::bluestore_fsck::set<BlobRef> passed_sbs;
7982 for (auto& e : o->extent_map.extent_map) {
7983 auto& b = e.blob->get_blob();
7984 if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
7985 auto sbid = e.blob->shared_blob->get_sbid();
7986 cb(c->cid, oid, sbid, b);
7987 passed_sbs.emplace(e.blob);
7988 }
7989 } // for ... extent_map
7990 } // for ... it->valid
7991 } //if (it(PREFIX_OBJ))
7992 }; //foreach_shared_blob fn declaration
7993
7994 mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
7995
7996 // first iteration over objects to identify all the broken sbids
7997 foreach_shared_blob( [&](coll_t cid,
7998 ghobject_t oid,
7999 uint64_t sbid,
8000 const bluestore_blob_t& b) {
8001 auto it = refs_map.lower_bound(sbid);
8002 if(it != refs_map.end() && it->first == sbid) {
8003 return;
8004 }
8005 for (auto& p : b.get_extents()) {
8006 if (p.is_valid() &&
8007 !sb_ref_counts.test_all_zero_range(sbid,
8008 p.offset,
8009 p.length)) {
8010 refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
8011 dout(20) << __func__
8012 << " broken shared blob found for col:" << cid
8013 << " obj:" << oid
8014 << " sbid 0x " << std::hex << sbid << std::dec
8015 << dendl;
8016 break;
8017 }
8018 }
8019 });
8020
8021 // second iteration over objects to build new ref map for the broken sbids
8022 foreach_shared_blob( [&](coll_t cid,
8023 ghobject_t oid,
8024 uint64_t sbid,
8025 const bluestore_blob_t& b) {
8026 auto it = refs_map.find(sbid);
8027 if(it == refs_map.end()) {
8028 return;
8029 }
8030 for (auto& p : b.get_extents()) {
8031 if (p.is_valid()) {
8032 it->second.get(p.offset, p.length);
8033 break;
8034 }
8035 }
8036 });
8037
8038 // update shared blob records
8039 auto ref_it = refs_map.begin();
8040 while (ref_it != refs_map.end()) {
8041 size_t cnt = 0;
8042 const size_t max_transactions = 4096;
8043 KeyValueDB::Transaction txn = db->get_transaction();
8044 for (cnt = 0;
8045 cnt < max_transactions && ref_it != refs_map.end();
8046 ref_it++) {
8047 auto sbid = ref_it->first;
8048 dout(20) << __func__ << " repaired shared_blob 0x"
8049 << std::hex << sbid << std::dec
8050 << ref_it->second << dendl;
8051 repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
8052 cnt++;
8053 }
8054 if (cnt) {
8055 db->submit_transaction_sync(txn);
8056 cnt = 0;
8057 }
8058 }
8059 // remove stray shared blob records
8060 size_t cnt = 0;
8061 const size_t max_transactions = 4096;
8062 KeyValueDB::Transaction txn = db->get_transaction();
8063 sb_info.foreach_stray([&](const sb_info_t& sbi) {
8064 auto sbid = sbi.get_sbid();
8065 dout(20) << __func__ << " removing stray shared_blob 0x"
8066 << std::hex << sbid << std::dec
8067 << dendl;
8068 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
8069 cnt++;
8070 if (cnt >= max_transactions) {
8071 db->submit_transaction_sync(txn);
8072 txn = db->get_transaction();
8073 cnt = 0;
8074 }
8074 });
8075 if (cnt > 0) {
8076 db->submit_transaction_sync(txn);
8077 }
8078
8079 // report the number of repairs as equal to the previously
8080 // determined error estimate, not the actual number of updated shared blobs
8081 repairer.inc_repaired(sb_ref_mismatches);
8082 }
8083
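// Per-object check shared by all fsck depths: decodes the onode, walks
// shards, lextents and blobs, accumulates statfs/usage info and detects
// overlaps, zombie spanning blobs, shared blob and omap inconsistencies.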
8084 BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
8085 BlueStore::FSCKDepth depth,
8086 int64_t pool_id,
8087 BlueStore::CollectionRef c,
8088 const ghobject_t& oid,
8089 const string& key,
8090 const bufferlist& value,
8091 mempool::bluestore_fsck::list<string>* expecting_shards,
8092 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
8093 const BlueStore::FSCK_ObjectCtx& ctx)
8094 {
8095 auto& errors = ctx.errors;
8096 auto& num_objects = ctx.num_objects;
8097 auto& num_extents = ctx.num_extents;
8098 auto& num_blobs = ctx.num_blobs;
8099 auto& num_sharded_objects = ctx.num_sharded_objects;
8100 auto& num_spanning_blobs = ctx.num_spanning_blobs;
8101 auto used_blocks = ctx.used_blocks;
8102 auto sb_info_lock = ctx.sb_info_lock;
8103 auto& sb_info = ctx.sb_info;
8104 auto& sb_ref_counts = ctx.sb_ref_counts;
8105 auto repairer = ctx.repairer;
8106
8107 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
8108 &ctx.expected_pool_statfs[pool_id] :
8109 &ctx.expected_store_statfs;
8110
8111 map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
8112
8113 dout(10) << __func__ << " " << oid << dendl;
8114 OnodeRef o;
8115 o.reset(Onode::decode(c, oid, key, value));
8116 ++num_objects;
8117
8118 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
8119
8120 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8121 _dump_onode<30>(cct, *o);
8122 // shards
8123 if (!o->extent_map.shards.empty()) {
8124 ++num_sharded_objects;
8125 if (depth != FSCK_SHALLOW) {
8126 ceph_assert(expecting_shards);
8127 for (auto& s : o->extent_map.shards) {
8128 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
8129 expecting_shards->push_back(string());
8130 get_extent_shard_key(o->key, s.shard_info->offset,
8131 &expecting_shards->back());
8132 if (s.shard_info->offset >= o->onode.size) {
8133 derr << "fsck error: " << oid << " shard 0x" << std::hex
8134 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
8135 << std::dec << dendl;
8136 ++errors;
8137 }
8138 }
8139 }
8140 }
8141
8142 // lextents
8143 uint64_t pos = 0;
8144 mempool::bluestore_fsck::map<BlobRef,
8145 bluestore_blob_use_tracker_t> ref_map;
8146 for (auto& l : o->extent_map.extent_map) {
8147 dout(20) << __func__ << " " << l << dendl;
8148 if (l.logical_offset < pos) {
8149 derr << "fsck error: " << oid << " lextent at 0x"
8150 << std::hex << l.logical_offset
8151 << " overlaps with the previous, which ends at 0x" << pos
8152 << std::dec << dendl;
8153 ++errors;
8154 }
8155 if (depth != FSCK_SHALLOW &&
8156 o->extent_map.spans_shard(l.logical_offset, l.length)) {
8157 derr << "fsck error: " << oid << " lextent at 0x"
8158 << std::hex << l.logical_offset << "~" << l.length
8159 << " spans a shard boundary"
8160 << std::dec << dendl;
8161 ++errors;
8162 }
8163 pos = l.logical_offset + l.length;
8164 res_statfs->data_stored += l.length;
8165 ceph_assert(l.blob);
8166 const bluestore_blob_t& blob = l.blob->get_blob();
8167
8168 #ifdef HAVE_LIBZBD
8169 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8170 for (auto& e : blob.get_extents()) {
8171 if (e.is_valid()) {
8172 uint32_t zone = e.offset / zone_size;
8173 uint64_t offset = e.offset % zone_size;
8174 auto p = zone_first_offsets.find(zone);
8175 if (p == zone_first_offsets.end() || p->second > offset) {
8176 // FIXME: use iterator for guided insert?
8177 zone_first_offsets[zone] = offset;
8178 }
8179 }
8180 }
8181 }
8182 #endif
8183
8184 auto& ref = ref_map[l.blob];
8185 if (ref.is_empty()) {
8186 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
8187 uint32_t l = blob.get_logical_length();
8188 ref.init(l, min_release_size);
8189 }
8190 ref.get(
8191 l.blob_offset,
8192 l.length);
8193 ++num_extents;
8194 if (depth != FSCK_SHALLOW &&
8195 blob.has_unused()) {
8196 ceph_assert(referenced);
8197 auto p = referenced->find(l.blob);
8198 bluestore_blob_t::unused_t* pu;
8199 if (p == referenced->end()) {
8200 pu = &(*referenced)[l.blob];
8201 }
8202 else {
8203 pu = &p->second;
8204 }
8205 uint64_t blob_len = blob.get_logical_length();
8206 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
8207 ceph_assert(l.blob_offset + l.length <= blob_len);
8208 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
8209 uint64_t start = l.blob_offset / chunk_size;
8210 uint64_t end =
8211 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
8212 for (auto i = start; i < end; ++i) {
8213 (*pu) |= (1u << i);
8214 }
8215 }
8216 } //for (auto& l : o->extent_map.extent_map)
8217
8218 for (auto& i : ref_map) {
8219 ++num_blobs;
8220 const bluestore_blob_t& blob = i.first->get_blob();
8221 bool equal =
8222 depth == FSCK_SHALLOW ? true :
8223 i.first->get_blob_use_tracker().equal(i.second);
8224 if (!equal) {
8225 derr << "fsck error: " << oid << " blob " << *i.first
8226 << " doesn't match expected ref_map " << i.second << dendl;
8227 ++errors;
8228 }
8229 if (blob.is_compressed()) {
8230 res_statfs->data_compressed += blob.get_compressed_payload_length();
8231 res_statfs->data_compressed_original +=
8232 i.first->get_referenced_bytes();
8233 }
8234 if (depth != FSCK_SHALLOW && repairer) {
8235 for (auto e : blob.get_extents()) {
8236 if (!e.is_valid())
8237 continue;
8238 repairer->set_space_used(e.offset, e.length, c->cid, oid);
8239 }
8240 }
8241 if (blob.is_shared()) {
8242 if (i.first->shared_blob->get_sbid() > blobid_max) {
8243 derr << "fsck error: " << oid << " blob " << blob
8244 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
8245 << blobid_max << dendl;
8246 ++errors;
8247 } else if (i.first->shared_blob->get_sbid() == 0) {
8248 derr << "fsck error: " << oid << " blob " << blob
8249 << " marked as shared but has uninitialized sbid"
8250 << dendl;
8251 ++errors;
8252 }
8253 // the below lock is optional and provided in multithreading mode only
8254 if (sb_info_lock) {
8255 sb_info_lock->lock();
8256 }
8257 auto sbid = i.first->shared_blob->get_sbid();
8258 sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
8259 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
8260 sbi.pool_id == oid.hobj.get_logical_pool());
8261 sbi.pool_id = oid.hobj.get_logical_pool();
8262 bool compressed = blob.is_compressed();
8263 for (auto e : blob.get_extents()) {
8264 if (e.is_valid()) {
8265 if (compressed) {
8266 ceph_assert(sbi.allocated_chunks <= 0);
8267 sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
8268 } else {
8269 ceph_assert(sbi.allocated_chunks >= 0);
8270 sbi.allocated_chunks += (e.length >> min_alloc_size_order);
8271 }
8272 sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
8273 }
8274 }
8275 if (sb_info_lock) {
8276 sb_info_lock->unlock();
8277 }
8278 } else if (depth != FSCK_SHALLOW) {
8279 ceph_assert(used_blocks);
8280 string ctx_descr = " oid " + stringify(oid);
8281 errors += _fsck_check_extents(ctx_descr,
8282 blob.get_extents(),
8283 blob.is_compressed(),
8284 *used_blocks,
8285 fm->get_alloc_size(),
8286 repairer,
8287 *res_statfs,
8288 depth);
8289 } else {
8290 errors += _fsck_sum_extents(
8291 blob.get_extents(),
8292 blob.is_compressed(),
8293 *res_statfs);
8294 }
8295 } // for (auto& i : ref_map)
8296
8297 {
8298 auto &sbm = o->extent_map.spanning_blob_map;
8299 size_t broken = 0;
8300 BlobRef first_broken;
8301 for (auto it = sbm.begin(); it != sbm.end();) {
8302 auto it1 = it++;
8303 if (ref_map.count(it1->second) == 0) {
8304 if (!broken) {
8305 first_broken = it1->second;
8306 ++errors;
8307 }
8308 broken++;
8309 if (repairer) {
8310 sbm.erase(it1);
8311 }
8312 }
8313 }
8314
8315 #ifdef HAVE_LIBZBD
8316 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8317 for (auto& [zone, first_offset] : zone_first_offsets) {
8318 auto p = (*ctx.zone_refs)[zone].find(oid);
8319 if (p != (*ctx.zone_refs)[zone].end()) {
8320 if (first_offset < p->second) {
8321 dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
8322 << " offset 0x" << p->second
8323 << " but first offset is 0x" << first_offset
8324 << "; this can happen due to clone_range"
8325 << dendl;
8326 } else {
8327 dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
8328 << " <= first offset 0x" << first_offset
8329 << std::dec << dendl;
8330 }
8331 (*ctx.zone_refs)[zone].erase(p);
8332 } else {
8333 derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
8334 << " but there is no zone ref" << std::dec << dendl;
8335 // FIXME: add repair
8336 ++errors;
8337 }
8338 }
8339 }
8340 #endif
8341
8342 if (broken) {
8343 derr << "fsck error: " << oid << " - " << broken
8344 << " zombie spanning blob(s) found, the first one: "
8345 << *first_broken << dendl;
8346 if(repairer) {
8347 repairer->fix_spanning_blobs(
8348 db,
8349 [&](KeyValueDB::Transaction txn) {
8350 _record_onode(o, txn);
8351 });
8352 }
8353 }
8354 }
8355
8356 if (o->onode.has_omap()) {
8357 _fsck_check_object_omap(depth, o, ctx);
8358 }
8359
8360 return o;
8361 }
8362
8363 #include "common/WorkQueue.h"
8364
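// Thread pool with a batching work queue used to run shallow fsck object
// checks in parallel; results are merged back into the caller's
// FSCK_ObjectCtx in finalize().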
8365 class ShallowFSCKThreadPool : public ThreadPool
8366 {
8367 public:
8368 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
8369 ThreadPool(cct_, nm, tn, n) {
8370 }
8371 void worker(ThreadPool::WorkThread* wt) override {
8372 int next_wq = 0;
8373 while (!_stop) {
8374 next_wq %= work_queues.size();
8375 WorkQueue_ *wq = work_queues[next_wq++];
8376
8377 void* item = wq->_void_dequeue();
8378 if (item) {
8379 processing++;
8380 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
8381 wq->_void_process(item, tp_handle);
8382 processing--;
8383 }
8384 }
8385 }
8386 template <size_t BatchLen>
8387 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
8388 {
8389 struct Entry {
8390 int64_t pool_id;
8391 BlueStore::CollectionRef c;
8392 ghobject_t oid;
8393 string key;
8394 bufferlist value;
8395 };
8396 struct Batch {
8397 std::atomic<size_t> running = { 0 };
8398 size_t entry_count = 0;
8399 std::array<Entry, BatchLen> entries;
8400
8401 int64_t errors = 0;
8402 int64_t warnings = 0;
8403 uint64_t num_objects = 0;
8404 uint64_t num_extents = 0;
8405 uint64_t num_blobs = 0;
8406 uint64_t num_sharded_objects = 0;
8407 uint64_t num_spanning_blobs = 0;
8408 store_statfs_t expected_store_statfs;
8409 BlueStore::per_pool_statfs expected_pool_statfs;
8410 };
8411
8412 size_t batchCount;
8413 BlueStore* store = nullptr;
8414
8415 ceph::mutex* sb_info_lock = nullptr;
8416 sb_info_space_efficient_map_t* sb_info = nullptr;
8417 shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
8418 BlueStoreRepairer* repairer = nullptr;
8419
8420 Batch* batches = nullptr;
8421 size_t last_batch_pos = 0;
8422 bool batch_acquired = false;
8423
8424 FSCKWorkQueue(std::string n,
8425 size_t _batchCount,
8426 BlueStore* _store,
8427 ceph::mutex* _sb_info_lock,
8428 sb_info_space_efficient_map_t& _sb_info,
8429 shared_blob_2hash_tracker_t& _sb_ref_counts,
8430 BlueStoreRepairer* _repairer) :
8431 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
8432 batchCount(_batchCount),
8433 store(_store),
8434 sb_info_lock(_sb_info_lock),
8435 sb_info(&_sb_info),
8436 sb_ref_counts(&_sb_ref_counts),
8437 repairer(_repairer)
8438 {
8439 batches = new Batch[batchCount];
8440 }
8441 ~FSCKWorkQueue() {
8442 delete[] batches;
8443 }
8444
8445 /// Remove all work items from the queue.
8446 void _clear() override {
8447 //do nothing
8448 }
8449 /// Check whether there is anything to do.
8450 bool _empty() override {
8451 ceph_assert(false);
8452 }
8453
8454 /// Get the next work item to process.
8455 void* _void_dequeue() override {
8456 size_t pos = rand() % batchCount;
8457 size_t pos0 = pos;
8458 do {
8459 auto& batch = batches[pos];
8460 if (batch.running.fetch_add(1) == 0) {
8461 if (batch.entry_count) {
8462 return &batch;
8463 }
8464 }
8465 batch.running--;
8466 pos++;
8467 pos %= batchCount;
8468 } while (pos != pos0);
8469 return nullptr;
8470 }
8471 /** @brief Process the work item.
8472 * This function will be called several times in parallel
8473 * and must therefore be thread-safe. */
8474 void _void_process(void* item, TPHandle& handle) override {
8475 Batch* batch = (Batch*)item;
8476
8477 BlueStore::FSCK_ObjectCtx ctx(
8478 batch->errors,
8479 batch->warnings,
8480 batch->num_objects,
8481 batch->num_extents,
8482 batch->num_blobs,
8483 batch->num_sharded_objects,
8484 batch->num_spanning_blobs,
8485 nullptr, // used_blocks
8486 nullptr, //used_omap_head
8487 nullptr,
8488 sb_info_lock,
8489 *sb_info,
8490 *sb_ref_counts,
8491 batch->expected_store_statfs,
8492 batch->expected_pool_statfs,
8493 repairer);
8494
8495 for (size_t i = 0; i < batch->entry_count; i++) {
8496 auto& entry = batch->entries[i];
8497
8498 store->fsck_check_objects_shallow(
8499 BlueStore::FSCK_SHALLOW,
8500 entry.pool_id,
8501 entry.c,
8502 entry.oid,
8503 entry.key,
8504 entry.value,
8505 nullptr, // expecting_shards - this will need protection if passed
8506 nullptr, // referenced
8507 ctx);
8508 }
8509 batch->entry_count = 0;
8510 batch->running--;
8511 }
8512 /** @brief Synchronously finish processing a work item.
8513 * This function is called after _void_process with the global thread pool lock held,
8514 * so at most one copy will execute simultaneously for a given thread pool.
8515 * It can be used for non-thread-safe finalization. */
8516 void _void_process_finish(void*) override {
8517 ceph_assert(false);
8518 }
8519
8520 bool queue(
8521 int64_t pool_id,
8522 BlueStore::CollectionRef c,
8523 const ghobject_t& oid,
8524 const string& key,
8525 const bufferlist& value) {
8526 bool res = false;
8527 size_t pos0 = last_batch_pos;
8528 if (!batch_acquired) {
8529 do {
8530 auto& batch = batches[last_batch_pos];
8531 if (batch.running.fetch_add(1) == 0) {
8532 if (batch.entry_count < BatchLen) {
8533 batch_acquired = true;
8534 break;
8535 }
8536 }
8537 batch.running.fetch_sub(1);
8538 last_batch_pos++;
8539 last_batch_pos %= batchCount;
8540 } while (last_batch_pos != pos0);
8541 }
8542 if (batch_acquired) {
8543 auto& batch = batches[last_batch_pos];
8544 ceph_assert(batch.running);
8545 ceph_assert(batch.entry_count < BatchLen);
8546
8547 auto& entry = batch.entries[batch.entry_count];
8548 entry.pool_id = pool_id;
8549 entry.c = c;
8550 entry.oid = oid;
8551 entry.key = key;
8552 entry.value = value;
8553
8554 ++batch.entry_count;
8555 if (batch.entry_count == BatchLen) {
8556 batch_acquired = false;
8557 batch.running.fetch_sub(1);
8558 last_batch_pos++;
8559 last_batch_pos %= batchCount;
8560 }
8561 res = true;
8562 }
8563 return res;
8564 }
8565
8566 void finalize(ThreadPool& tp,
8567 BlueStore::FSCK_ObjectCtx& ctx) {
8568 if (batch_acquired) {
8569 auto& batch = batches[last_batch_pos];
8570 ceph_assert(batch.running);
8571 batch.running.fetch_sub(1);
8572 }
8573 tp.stop();
8574
8575 for (size_t i = 0; i < batchCount; i++) {
8576 auto& batch = batches[i];
8577
8578 //process leftovers if any
8579 if (batch.entry_count) {
8580 TPHandle tp_handle(store->cct,
8581 nullptr,
8582 timeout_interval,
8583 suicide_interval);
8584 ceph_assert(batch.running == 0);
8585
8586 batch.running++; // just to be on-par with the regular call
8587 _void_process(&batch, tp_handle);
8588 }
8589 ceph_assert(batch.entry_count == 0);
8590
8591 ctx.errors += batch.errors;
8592 ctx.warnings += batch.warnings;
8593 ctx.num_objects += batch.num_objects;
8594 ctx.num_extents += batch.num_extents;
8595 ctx.num_blobs += batch.num_blobs;
8596 ctx.num_sharded_objects += batch.num_sharded_objects;
8597 ctx.num_spanning_blobs += batch.num_spanning_blobs;
8598
8599 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8600
8601 for (auto it = batch.expected_pool_statfs.begin();
8602 it != batch.expected_pool_statfs.end();
8603 it++) {
8604 ctx.expected_pool_statfs[it->first].add(it->second);
8605 }
8606 }
8607 }
8608 };
8609 };
8610
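// Checks that an object's omap uses the expected per-pool/per-pg key
// format and, in repair mode, converts legacy omap data to the per-pg
// scheme, submitting transactions in ~16MB portions.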
8611 void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8612 OnodeRef& o,
8613 const BlueStore::FSCK_ObjectCtx& ctx)
8614 {
8615 auto& errors = ctx.errors;
8616 auto& warnings = ctx.warnings;
8617 auto repairer = ctx.repairer;
8618
8619 ceph_assert(o->onode.has_omap());
8620 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
8621 if (per_pool_omap == OMAP_PER_POOL) {
8622 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8623 << "fsck error: " << o->oid
8624 << " has omap that is not per-pool or pgmeta"
8625 << fsck_dendl;
8626 ++errors;
8627 } else {
8628 const char* w;
8629 int64_t num;
8630 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8631 ++errors;
8632 num = errors;
8633 w = "error";
8634 } else {
8635 ++warnings;
8636 num = warnings;
8637 w = "warning";
8638 }
8639 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8640 << "fsck " << w << ": " << o->oid
8641 << " has omap that is not per-pool or pgmeta"
8642 << fsck_dendl;
8643 }
8644 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
8645 if (per_pool_omap == OMAP_PER_PG) {
8646 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8647 << "fsck error: " << o->oid
8648 << " has omap that is not per-pg or pgmeta"
8649 << fsck_dendl;
8650 ++errors;
8651 } else {
8652 const char* w;
8653 int64_t num;
8654 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
8655 ++errors;
8656 num = errors;
8657 w = "error";
8658 } else {
8659 ++warnings;
8660 num = warnings;
8661 w = "warning";
8662 }
8663 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8664 << "fsck " << w << ": " << o->oid
8665 << " has omap that is not per-pg or pgmeta"
8666 << fsck_dendl;
8667 }
8668 }
8669 if (repairer &&
8670 !o->onode.is_perpg_omap() &&
8671 !o->onode.is_pgmeta_omap()) {
8672 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
8673 bufferlist header;
8674 map<string, bufferlist> kv;
8675 {
8676 KeyValueDB::Transaction txn = db->get_transaction();
8677 uint64_t txn_cost = 0;
8678 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
8679 uint8_t new_flags = o->onode.flags |
8680 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8681 bluestore_onode_t::FLAG_PERPG_OMAP;
8682 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
8683
8684 KeyValueDB::Iterator it = db->get_iterator(prefix);
8685 string head, tail;
8686 o->get_omap_header(&head);
8687 o->get_omap_tail(&tail);
8688 it->lower_bound(head);
8689 // head
8690 if (it->valid() && it->key() == head) {
8691 dout(30) << __func__ << " got header" << dendl;
8692 header = it->value();
8693 if (header.length()) {
8694 string new_head;
8695 Onode::calc_omap_header(new_flags, o.get(), &new_head);
8696 txn->set(new_omap_prefix, new_head, header);
8697 txn_cost += new_head.length() + header.length();
8698 }
8699 it->next();
8700 }
8701 // tail
8702 {
8703 string new_tail;
8704 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
8705 bufferlist empty;
8706 txn->set(new_omap_prefix, new_tail, empty);
8707 txn_cost += new_tail.length();
8708 }
8709 // values
8710 string final_key;
8711 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
8712 size_t base_key_len = final_key.size();
8713 while (it->valid() && it->key() < tail) {
8714 string user_key;
8715 o->decode_omap_key(it->key(), &user_key);
8716 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
8717 << " -> " << user_key << dendl;
8718
8719 final_key.resize(base_key_len);
8720 final_key += user_key;
8721 auto v = it->value();
8722 txn->set(new_omap_prefix, final_key, v);
8723 txn_cost += final_key.length() + v.length();
8724
8725 // submit a portion if cost exceeds 16MB
8726 if (txn_cost >= 16 * (1 << 20) ) {
8727 db->submit_transaction_sync(txn);
8728 txn = db->get_transaction();
8729 txn_cost = 0;
8730 }
8731 it->next();
8732 }
8733 if (txn_cost > 0) {
8734 db->submit_transaction_sync(txn);
8735 }
8736 }
8737 // finalize: remove legacy data
8738 {
8739 KeyValueDB::Transaction txn = db->get_transaction();
8740 // remove old keys
8741 const string& old_omap_prefix = o->get_omap_prefix();
8742 string old_head, old_tail;
8743 o->get_omap_header(&old_head);
8744 o->get_omap_tail(&old_tail);
8745 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8746 txn->rmkey(old_omap_prefix, old_tail);
8747 // set flag
8748 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
8749 _record_onode(o, txn);
8750 db->submit_transaction_sync(txn);
8751 repairer->inc_repaired();
8752 repairer->request_compaction();
8753 }
8754 }
8755 }
8756
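// Walks the whole object keyspace (PREFIX_OBJ): validates extent shard
// keys, resolves each object's collection and runs the per-object check,
// optionally offloading shallow checks to the thread pool above.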
8757 void BlueStore::_fsck_check_objects(
8758 FSCKDepth depth,
8759 BlueStore::FSCK_ObjectCtx& ctx)
8760 {
8761 auto& errors = ctx.errors;
8762 auto sb_info_lock = ctx.sb_info_lock;
8763 auto& sb_info = ctx.sb_info;
8764 auto& sb_ref_counts = ctx.sb_ref_counts;
8765 auto repairer = ctx.repairer;
8766
8767 uint64_t_btree_t used_nids;
8768
8769 size_t processed_myself = 0;
8770
8771 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8772 mempool::bluestore_fsck::list<string> expecting_shards;
8773 if (it) {
8774 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8775 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8776 std::unique_ptr<WQ> wq(
8777 new WQ(
8778 "FSCKWorkQueue",
8779 (thread_count ? : 1) * 32,
8780 this,
8781 sb_info_lock,
8782 sb_info,
8783 sb_ref_counts,
8784 repairer));
8785
8786 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8787
8788 thread_pool.add_work_queue(wq.get());
8789 if (depth == FSCK_SHALLOW && thread_count > 0) {
8790 //not the best place but let's check anyway
8791 ceph_assert(sb_info_lock);
8792 thread_pool.start();
8793 }
8794
8795 // fill global if not overridden below
8796 CollectionRef c;
8797 int64_t pool_id = -1;
8798 spg_t pgid;
8799 for (it->lower_bound(string()); it->valid(); it->next()) {
8800 dout(30) << __func__ << " key "
8801 << pretty_binary_string(it->key()) << dendl;
8802 if (is_extent_shard_key(it->key())) {
8803 if (depth == FSCK_SHALLOW) {
8804 continue;
8805 }
8806 while (!expecting_shards.empty() &&
8807 expecting_shards.front() < it->key()) {
8808 derr << "fsck error: missing shard key "
8809 << pretty_binary_string(expecting_shards.front())
8810 << dendl;
8811 ++errors;
8812 expecting_shards.pop_front();
8813 }
8814 if (!expecting_shards.empty() &&
8815 expecting_shards.front() == it->key()) {
8816 // all good
8817 expecting_shards.pop_front();
8818 continue;
8819 }
8820
8821 uint32_t offset;
8822 string okey;
8823 get_key_extent_shard(it->key(), &okey, &offset);
8824 derr << "fsck error: stray shard 0x" << std::hex << offset
8825 << std::dec << dendl;
8826 if (expecting_shards.empty()) {
8827 derr << "fsck error: " << pretty_binary_string(it->key())
8828 << " is unexpected" << dendl;
8829 ++errors;
8830 continue;
8831 }
8832 while (expecting_shards.front() > it->key()) {
8833 derr << "fsck error: saw " << pretty_binary_string(it->key())
8834 << dendl;
8835 derr << "fsck error: exp "
8836 << pretty_binary_string(expecting_shards.front()) << dendl;
8837 ++errors;
8838 expecting_shards.pop_front();
8839 if (expecting_shards.empty()) {
8840 break;
8841 }
8842 }
8843 continue;
8844 }
8845
8846 ghobject_t oid;
8847 int r = get_key_object(it->key(), &oid);
8848 if (r < 0) {
8849 derr << "fsck error: bad object key "
8850 << pretty_binary_string(it->key()) << dendl;
8851 ++errors;
8852 continue;
8853 }
8854 if (!c ||
8855 oid.shard_id != pgid.shard ||
8856 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8857 !c->contains(oid)) {
8858 c = nullptr;
8859 for (auto& p : coll_map) {
8860 if (p.second->contains(oid)) {
8861 c = p.second;
8862 break;
8863 }
8864 }
8865 if (!c) {
8866 derr << "fsck error: stray object " << oid
8867 << " not owned by any collection" << dendl;
8868 ++errors;
8869 continue;
8870 }
8871 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8872 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8873 << dendl;
8874 }
8875
8876 if (depth != FSCK_SHALLOW &&
8877 !expecting_shards.empty()) {
8878 for (auto& k : expecting_shards) {
8879 derr << "fsck error: missing shard key "
8880 << pretty_binary_string(k) << dendl;
8881 }
8882 ++errors;
8883 expecting_shards.clear();
8884 }
8885
8886 bool queued = false;
8887 if (depth == FSCK_SHALLOW && thread_count > 0) {
8888 queued = wq->queue(
8889 pool_id,
8890 c,
8891 oid,
8892 it->key(),
8893 it->value());
8894 }
8895 OnodeRef o;
8896 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8897
8898 if (!queued) {
8899 ++processed_myself;
8900 o = fsck_check_objects_shallow(
8901 depth,
8902 pool_id,
8903 c,
8904 oid,
8905 it->key(),
8906 it->value(),
8907 &expecting_shards,
8908 &referenced,
8909 ctx);
8910 }
8911
8912 if (depth != FSCK_SHALLOW) {
8913 ceph_assert(o != nullptr);
8914 if (o->onode.nid) {
8915 if (o->onode.nid > nid_max) {
8916 derr << "fsck error: " << oid << " nid " << o->onode.nid
8917 << " > nid_max " << nid_max << dendl;
8918 ++errors;
8919 }
8920 if (used_nids.count(o->onode.nid)) {
8921 derr << "fsck error: " << oid << " nid " << o->onode.nid
8922 << " already in use" << dendl;
8923 ++errors;
8924 continue; // go for next object
8925 }
8926 used_nids.insert(o->onode.nid);
8927 }
8928 for (auto& i : referenced) {
8929 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8930 << std::dec << " for " << *i.first << dendl;
8931 const bluestore_blob_t& blob = i.first->get_blob();
8932 if (i.second & blob.unused) {
8933 derr << "fsck error: " << oid << " blob claims unused 0x"
8934 << std::hex << blob.unused
8935 << " but extents reference 0x" << i.second << std::dec
8936 << " on blob " << *i.first << dendl;
8937 ++errors;
8938 }
8939 if (blob.has_csum()) {
8940 uint64_t blob_len = blob.get_logical_length();
8941 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8942 unsigned csum_count = blob.get_csum_count();
8943 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8944 for (unsigned p = 0; p < csum_count; ++p) {
8945 unsigned pos = p * csum_chunk_size;
8946 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8947 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8948 unsigned mask = 1u << firstbit;
8949 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8950 mask |= 1u << b;
8951 }
8952 if ((blob.unused & mask) == mask) {
8953 // this csum chunk region is marked unused
8954 if (blob.get_csum_item(p) != 0) {
8955 derr << "fsck error: " << oid
8956 << " blob claims csum chunk 0x" << std::hex << pos
8957 << "~" << csum_chunk_size
8958 << " is unused (mask 0x" << mask << " of unused 0x"
8959 << blob.unused << ") but csum is non-zero 0x"
8960 << blob.get_csum_item(p) << std::dec << " on blob "
8961 << *i.first << dendl;
8962 ++errors;
8963 }
8964 }
8965 }
8966 }
8967 }
8968 // omap
8969 if (o->onode.has_omap()) {
8970 ceph_assert(ctx.used_omap_head);
8971 if (ctx.used_omap_head->count(o->onode.nid)) {
8972 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8973 << " already in use" << dendl;
8974 ++errors;
8975 } else {
8976 ctx.used_omap_head->insert(o->onode.nid);
8977 }
8978 } // if (o->onode.has_omap())
8979 if (depth == FSCK_DEEP) {
8980 bufferlist bl;
8981 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8982 uint64_t offset = 0;
8983 do {
8984 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8985 int r = _do_read(c.get(), o, offset, l, bl,
8986 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8987 if (r < 0) {
8988 ++errors;
8989 derr << "fsck error: " << oid << std::hex
8990 << " error during read: "
8991 << " " << offset << "~" << l
8992 << " " << cpp_strerror(r) << std::dec
8993 << dendl;
8994 break;
8995 }
8996 offset += l;
8997 } while (offset < o->onode.size);
8998 } // deep
8999 } //if (depth != FSCK_SHALLOW)
9000 } // for (it->lower_bound(string()); it->valid(); it->next())
9001 if (depth == FSCK_SHALLOW && thread_count > 0) {
9002 wq->finalize(thread_pool, ctx);
9003 if (processed_myself) {
9004 // maybe it needs more threads?
9005 dout(0) << __func__ << " partial offload"
9006 << ", done myself " << processed_myself
9007 << " of " << ctx.num_objects
9008 << " objects, threads " << thread_count
9009 << dendl;
9010 }
9011 }
9012 } // if (it)
9013 }
9014 /**
9015 An overview of the currently implemented repair logic,
9016 performed by fsck in two stages: detection (+ preparation) and commit.
9017 Detection stage (in processing order):
9018 (Issue -> Repair action to schedule)
9019 - Detect undecodable keys for Shared Blobs -> Remove
9020 - Detect undecodable records for Shared Blobs -> Remove
9021 (might trigger missed Shared Blob detection below)
9022 - Detect stray records for Shared Blobs -> Remove
9023 - Detect misreferenced pextents -> Fix
9024 Prepare Bloom-like filter to track cid/oid -> pextent
9025 Prepare list of extents that are improperly referenced
9026 Enumerate Onode records that might use 'misreferenced' pextents
9027 (Bloom-like filter applied to reduce computation)
9028 For each questionable Onode enumerate all blobs and identify broken ones
9029 (i.e. blobs having 'misreferences')
9030 Rewrite each broken blob data by allocating another extents and
9031 copying data there
9032 If blob is shared - unshare it and mark corresponding Shared Blob
9033 for removal
9034 Release previously allocated space
9035 Update Extent Map
9036 - Detect missed Shared Blobs -> Recreate
9037 - Detect undecodable deferred transaction -> Remove
9038 - Detect Freelist Manager's 'false free' entries -> Mark as used
9039 - Detect Freelist Manager's leaked entries -> Mark as free
9040 - Detect statfs inconsistency - Update
9041 Commit stage (separate DB commit for each step):
9042 - Apply leaked FM entries fix
9043 - Apply 'false free' FM entries fix
9044 - Apply 'Remove' actions
9045 - Apply fix for misreference pextents
9046 - Apply Shared Blob recreate
9047 (can be merged with the step above if misreferences were detected)
9048 - Apply StatFS update
9049 */
9050 int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
9051 {
9052 dout(5) << __func__
9053 << (repair ? " repair" : " check")
9054 << (depth == FSCK_DEEP ? " (deep)" :
9055 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9056 << dendl;
9057
9058 // in deep mode we need R/W write access to be able to replay deferred ops
9059 const bool read_only = !(repair || depth == FSCK_DEEP);
9060 int r = _open_db_and_around(read_only);
9061 if (r < 0) {
9062 return r;
9063 }
9064 auto close_db = make_scope_guard([&] {
9065 _close_db_and_around();
9066 });
9067
9068 if (!read_only) {
9069 r = _upgrade_super();
9070 if (r < 0) {
9071 return r;
9072 }
9073 }
9074
9075 // NullFreelistManager needs to open collection early
9076 r = _open_collections();
9077 if (r < 0) {
9078 return r;
9079 }
9080
9081 mempool_thread.init();
9082 auto stop_mempool = make_scope_guard([&] {
9083 mempool_thread.shutdown();
9084 _shutdown_cache();
9085 });
9086 // we need finisher and kv_{sync,finalize}_thread *just* for replay
9087 // enable in repair or deep fsck modes only
9088 if (!read_only) {
9089 _kv_start();
9090 r = _deferred_replay();
9091 _kv_stop();
9092 }
9093
9094 if (r < 0) {
9095 return r;
9096 }
9097 return _fsck_on_open(depth, repair);
9098 }
9099
9100 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
9101 {
9102 uint64_t sb_hash_size = uint64_t(
9103 cct->_conf.get_val<Option::size_t>("osd_memory_target") *
9104 cct->_conf.get_val<double>(
9105 "bluestore_fsck_shared_blob_tracker_size"));
9106
9107 dout(1) << __func__
9108 << " <<<START>>>"
9109 << (repair ? " repair" : " check")
9110 << (depth == FSCK_DEEP ? " (deep)" :
9111 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9112 << " start sb_tracker_hash_size:" << sb_hash_size
9113 << dendl;
9114 int64_t errors = 0;
9115 int64_t warnings = 0;
9116 unsigned repaired = 0;
9117
9118 uint64_t_btree_t used_omap_head;
9119 uint64_t_btree_t used_sbids;
9120
9121 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
9122 KeyValueDB::Iterator it;
9123 store_statfs_t expected_store_statfs, actual_statfs;
9124 per_pool_statfs expected_pool_statfs;
9125
9126 sb_info_space_efficient_map_t sb_info;
9127 shared_blob_2hash_tracker_t sb_ref_counts(
9128 sb_hash_size,
9129 min_alloc_size);
9130 size_t sb_ref_mismatches = 0;
9131
9132 /// map of oid -> (first_)offset for each zone
9133 std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
9134
9135 uint64_t num_objects = 0;
9136 uint64_t num_extents = 0;
9137 uint64_t num_blobs = 0;
9138 uint64_t num_spanning_blobs = 0;
9139 uint64_t num_shared_blobs = 0;
9140 uint64_t num_sharded_objects = 0;
9141 BlueStoreRepairer repairer;
9142
9143 auto alloc_size = fm->get_alloc_size();
9144
9145 utime_t start = ceph_clock_now();
9146
9147 _fsck_collections(&errors);
9148 used_blocks.resize(fm->get_alloc_units());
9149
9150 if (bluefs) {
9151 interval_set<uint64_t> bluefs_extents;
9152
9153 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
9154 ceph_assert(r == 0);
9155 for (auto [start, len] : bluefs_extents) {
9156 apply_for_bitset_range(start, len, alloc_size, used_blocks,
9157 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
9158 ceph_assert(pos < bs.size());
9159 bs.set(pos);
9160 }
9161 );
9162 }
9163 }
9164
9165 bluefs_used_blocks = used_blocks;
9166
9167 apply_for_bitset_range(
9168 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
9169 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9170 bs.set(pos);
9171 }
9172 );
9173
9174
9175 if (repair) {
9176 repairer.init_space_usage_tracker(
9177 bdev->get_size(),
9178 min_alloc_size);
9179 }
9180
9181 if (bluefs) {
9182 int r = bluefs->fsck();
9183 if (r < 0) {
9184 return r;
9185 }
9186 if (r > 0)
9187 errors += r;
9188 }
9189
9190 if (!per_pool_stat_collection) {
9191 const char *w;
9192 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
9193 w = "error";
9194 ++errors;
9195 } else {
9196 w = "warning";
9197 ++warnings;
9198 }
9199 derr << "fsck " << w << ": store not yet converted to per-pool stats"
9200 << dendl;
9201 }
9202 if (per_pool_omap != OMAP_PER_PG) {
9203 const char *w;
9204 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
9205 w = "error";
9206 ++errors;
9207 } else {
9208 w = "warning";
9209 ++warnings;
9210 }
9211 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9212 << dendl;
9213 }
9214
9215 // get expected statfs; reset unaffected fields to be able to compare
9216 // structs
9217 statfs(&actual_statfs);
9218 actual_statfs.total = 0;
9219 actual_statfs.internally_reserved = 0;
9220 actual_statfs.available = 0;
9221 actual_statfs.internal_metadata = 0;
9222 actual_statfs.omap_allocated = 0;
9223
9224 if (g_conf()->bluestore_debug_fsck_abort) {
9225 dout(1) << __func__ << " debug abort" << dendl;
9226 goto out_scan;
9227 }
9228
9229 #ifdef HAVE_LIBZBD
9230 if (bdev->is_smr()) {
9231 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9232 ceph_assert(a);
9233 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
9234 ceph_assert(f);
9235 vector<uint64_t> wp = bdev->get_zones();
9236 vector<zone_state_t> zones = f->get_zone_states(db);
9237 ceph_assert(wp.size() == zones.size());
9238 auto num_zones = bdev->get_size() / zone_size;
9239 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
9240 uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
9241 if (zones[i].write_pointer > p &&
9242 zones[i].num_dead_bytes < zones[i].write_pointer) {
9243 derr << "fsck error: zone 0x" << std::hex << i
9244 << " bluestore write pointer 0x" << zones[i].write_pointer
9245 << " > device write pointer 0x" << p
9246 << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
9247 << std::dec << dendl;
9248 ++errors;
9249 }
9250 }
9251
9252 if (depth != FSCK_SHALLOW) {
9253 // load zone refs
9254 zone_refs.resize(bdev->get_size() / zone_size);
9255 it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
9256 if (it) {
9257 for (it->lower_bound(string());
9258 it->valid();
9259 it->next()) {
9260 uint32_t zone = 0;
9261 uint64_t offset = 0;
9262 ghobject_t oid;
9263 string key = it->key();
9264 int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
9265 if (r < 0) {
9266 derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
9267 << dendl;
9268 if (repair) {
9269 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9270 }
9271 ++errors;
9272 continue;
9273 }
9274 dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
9275 << " -> " << std::dec << oid << dendl;
9276 if (zone_refs[zone].count(oid)) {
9277 derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
9278 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
9279 if (repair) {
9280 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9281 }
9282 ++errors;
9283 continue;
9284 }
9285 zone_refs[zone][oid] = offset;
9286 }
9287 }
9288 }
9289 }
9290 #endif
9291
9292 dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
9293 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9294 if (it) {
9295 for (it->lower_bound(string()); it->valid(); it->next()) {
9296 string key = it->key();
9297 uint64_t sbid;
9298 if (get_key_shared_blob(key, &sbid) < 0) {
9299 // Failed to parse the key.
9300 // This will be handled in the second stage
9301 continue;
9302 }
9303 bluestore_shared_blob_t shared_blob(sbid);
9304 bufferlist bl = it->value();
9305 auto blp = bl.cbegin();
9306 try {
9307 decode(shared_blob, blp);
9308 }
9309 catch (ceph::buffer::error& e) {
9310 // this will be handled in the second stage
9311 continue;
9312 }
9313 dout(20) << __func__ << " " << shared_blob << dendl;
9314 auto& sbi = sb_info.add_maybe_stray(sbid);
9315
9316 // primarily to silence the 'unused' warning
9317 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
9318
9319 for (auto& r : shared_blob.ref_map.ref_map) {
9320 sb_ref_counts.inc_range(
9321 sbid,
9322 r.first,
9323 r.second.length,
9324 -r.second.refs);
9325 }
9326 }
9327 } // if (it) //checking shared_blobs (phase1)
9328
9329 // walk PREFIX_OBJ
9330 {
9331 dout(1) << __func__ << " walking object keyspace" << dendl;
9332 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
9333 BlueStore::FSCK_ObjectCtx ctx(
9334 errors,
9335 warnings,
9336 num_objects,
9337 num_extents,
9338 num_blobs,
9339 num_sharded_objects,
9340 num_spanning_blobs,
9341 &used_blocks,
9342 &used_omap_head,
9343 &zone_refs,
9344 // no need for the lock below when in non-shallow mode, as
9345 // there is no multithreading in this case
9346 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
9347 sb_info,
9348 sb_ref_counts,
9349 expected_store_statfs,
9350 expected_pool_statfs,
9351 repair ? &repairer : nullptr);
9352
9353 _fsck_check_objects(depth, ctx);
9354 }
9355
9356 #ifdef HAVE_LIBZBD
9357 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
9358 dout(1) << __func__ << " checking for leaked zone refs" << dendl;
9359 for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
9360 for (auto& [oid, offset] : zone_refs[zone]) {
9361 derr << "fsck error: stray zone ref 0x" << std::hex << zone
9362 << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
9363 // FIXME: add repair
9364 ++errors;
9365 }
9366 }
9367 }
9368 #endif
9369
9370 sb_ref_mismatches = sb_ref_counts.count_non_zero();
9371 if (sb_ref_mismatches != 0) {
9372 derr << "fsck error: shared blob references aren't matching, at least "
9373 << sb_ref_mismatches << " found" << dendl;
9374 errors += sb_ref_mismatches;
9375 }
9376
9377 if (depth != FSCK_SHALLOW && repair) {
9378 _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
9379 }
9380 dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
9381 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9382 if (it) {
9383 // FIXME minor: perhaps simplify for shallow mode?
9384 // fill global if not overridden below
9385 auto expected_statfs = &expected_store_statfs;
9386 for (it->lower_bound(string()); it->valid(); it->next()) {
9387 string key = it->key();
9388 uint64_t sbid;
9389 if (get_key_shared_blob(key, &sbid)) {
9390 derr << "fsck error: bad key '" << key
9391 << "' in shared blob namespace" << dendl;
9392 if (repair) {
9393 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9394 }
9395 ++errors;
9396 continue;
9397 }
9398 auto p = sb_info.find(sbid);
9399 if (p == sb_info.end()) {
9400 if (sb_ref_mismatches > 0) {
9401 // highly likely this has already been reported, ignoring...
9402 dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
9403 << std::hex << sbid << std::dec << dendl;
9404 } else {
9405 derr << "fsck error: found stray shared blob data for sbid 0x"
9406 << std::hex << sbid << std::dec << dendl;
9407 ++errors;
9408 if (repair) {
9409 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9410 }
9411 }
9412 } else {
9413 ++num_shared_blobs;
9414 sb_info_t& sbi = *p;
9415 bluestore_shared_blob_t shared_blob(sbid);
9416 bufferlist bl = it->value();
9417 auto blp = bl.cbegin();
9418 try {
9419 decode(shared_blob, blp);
9420 }
9421 catch (ceph::buffer::error& e) {
9422 ++errors;
9423
9424 derr << "fsck error: failed to decode Shared Blob"
9425 << pretty_binary_string(key) << dendl;
9426 if (repair) {
9427 dout(20) << __func__ << " undecodable Shared Blob, key:'"
9428 << pretty_binary_string(key)
9429 << "', removing" << dendl;
9430 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9431 }
9432 continue;
9433 }
9434 dout(20) << __func__ << " " << shared_blob << dendl;
9435 PExtentVector extents;
9436 for (auto& r : shared_blob.ref_map.ref_map) {
9437 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
9438 }
9439 if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
9440 (per_pool_stat_collection || repair)) {
9441 expected_statfs = &expected_pool_statfs[sbi.pool_id];
9442 }
9443 std::stringstream ss;
9444 ss << "sbid 0x" << std::hex << sbid << std::dec;
9445 errors += _fsck_check_extents(ss.str(),
9446 extents,
9447 sbi.allocated_chunks < 0,
9448 used_blocks,
9449 fm->get_alloc_size(),
9450 repair ? &repairer : nullptr,
9451 *expected_statfs,
9452 depth);
9453 }
9454 }
9455 } // if (it) /* checking shared_blobs (phase 2)*/
9456
9457 if (repair && repairer.preprocess_misreference(db)) {
9458
9459 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
9460 auto& misref_extents = repairer.get_misreferences();
9461 interval_set<uint64_t> to_release;
9462 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
9463 if (it) {
9464 // fill global if not overridden below
9465 auto expected_statfs = &expected_store_statfs;
9466
9467 CollectionRef c;
9468 spg_t pgid;
9469 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
9470 bool bypass_rest = false;
9471 for (it->lower_bound(string()); it->valid() && !bypass_rest;
9472 it->next()) {
9473 dout(30) << __func__ << " key "
9474 << pretty_binary_string(it->key()) << dendl;
9475 if (is_extent_shard_key(it->key())) {
9476 continue;
9477 }
9478
9479 ghobject_t oid;
9480 int r = get_key_object(it->key(), &oid);
9481 if (r < 0 || !repairer.is_used(oid)) {
9482 continue;
9483 }
9484
9485 if (!c ||
9486 oid.shard_id != pgid.shard ||
9487 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
9488 !c->contains(oid)) {
9489 c = nullptr;
9490 for (auto& p : coll_map) {
9491 if (p.second->contains(oid)) {
9492 c = p.second;
9493 break;
9494 }
9495 }
9496 if (!c) {
9497 continue;
9498 }
9499 if (per_pool_stat_collection || repair) {
9500 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
9501 expected_statfs = &expected_pool_statfs[pool_id];
9502 }
9503 }
9504 if (!repairer.is_used(c->cid)) {
9505 continue;
9506 }
9507
9508 dout(20) << __func__ << " check misreference for col:" << c->cid
9509 << " obj:" << oid << dendl;
9510
9511 OnodeRef o;
9512 o.reset(Onode::decode(c, oid, it->key(), it->value()));
9513 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9514 mempool::bluestore_fsck::set<BlobRef> blobs;
9515
9516 for (auto& e : o->extent_map.extent_map) {
9517 blobs.insert(e.blob);
9518 }
9519 bool need_onode_update = false;
9520 bool first_dump = true;
9521 for(auto b : blobs) {
9522 bool broken_blob = false;
9523 auto& pextents = b->dirty_blob().dirty_extents();
9524 for (auto& e : pextents) {
9525 if (!e.is_valid()) {
9526 continue;
9527 }
9528 // for the sake of simplicity and proper shared blob handling
9529 // always rewrite the whole blob even when it's partially
9530 // misreferenced.
9531 if (misref_extents.intersects(e.offset, e.length)) {
9532 if (first_dump) {
9533 first_dump = false;
9534 _dump_onode<10>(cct, *o);
9535 }
9536 broken_blob = true;
9537 break;
9538 }
9539 }
9540 if (!broken_blob)
9541 continue;
9542 bool compressed = b->get_blob().is_compressed();
9543 need_onode_update = true;
9544 dout(10) << __func__
9545 << " fix misreferences in oid:" << oid
9546 << " " << *b << dendl;
9547 uint64_t b_off = 0;
9548 PExtentVector pext_to_release;
9549 pext_to_release.reserve(pextents.size());
9550 // rewriting all valid pextents
9551 for (auto e = pextents.begin(); e != pextents.end();
9552 e++) {
9553 auto b_off_cur = b_off;
9554 b_off += e->length;
9555 if (!e->is_valid()) {
9556 continue;
9557 }
9558 PExtentVector exts;
9559 dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
9560 int64_t alloc_len =
9561 alloc->allocate(e->length, min_alloc_size,
9562 0, 0, &exts);
9563 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
9564 derr << __func__
9565 << " failed to allocate 0x" << std::hex << e->length
9566 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
9567 << " min_alloc_size 0x" << min_alloc_size
9568 << " available 0x " << alloc->get_free()
9569 << std::dec << dendl;
9570 if (alloc_len > 0) {
9571 alloc->release(exts);
9572 }
9573 bypass_rest = true;
9574 break;
9575 }
9576 expected_statfs->allocated += e->length;
9577 if (compressed) {
9578 expected_statfs->data_compressed_allocated += e->length;
9579 }
9580
9581 bufferlist bl;
9582 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9583 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
9584 if (r < 0) {
9585 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
9586 <<"~" << e->length << std::dec << dendl;
9587 ceph_abort_msg("read failed, wtf");
9588 }
9589 pext_to_release.push_back(*e);
9590 e = pextents.erase(e);
9591 e = pextents.insert(e, exts.begin(), exts.end());
9592 b->get_blob().map_bl(
9593 b_off_cur, bl,
9594 [&](uint64_t offset, bufferlist& t) {
9595 int r = bdev->write(offset, t, false);
9596 ceph_assert(r == 0);
9597 });
9598 e += exts.size() - 1;
9599 for (auto& p : exts) {
9600 fm->allocate(p.offset, p.length, txn);
9601 }
9602 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
9603
9604 if (b->get_blob().is_shared()) {
9605 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
9606
9607 auto sbid = b->shared_blob->get_sbid();
9608 auto sb_it = sb_info.find(sbid);
9609 ceph_assert(sb_it != sb_info.end());
9610 sb_info_t& sbi = *sb_it;
9611
9612 if (sbi.allocated_chunks < 0) {
9613 // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
9614 // as we originally used that value while accumulating
9615 // expected_statfs
9616 expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9617 expected_statfs->data_compressed_allocated -=
9618 uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9619 } else {
9620 expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
9621 }
9622 sbi.allocated_chunks = 0;
9623 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
9624
9625 // relying on blob's pextents to decide what to release.
9626 for (auto& p : pext_to_release) {
9627 to_release.union_insert(p.offset, p.length);
9628 }
9629 } else {
9630 for (auto& p : pext_to_release) {
9631 expected_statfs->allocated -= p.length;
9632 if (compressed) {
9633 expected_statfs->data_compressed_allocated -= p.length;
9634 }
9635 to_release.union_insert(p.offset, p.length);
9636 }
9637 }
9638 if (bypass_rest) {
9639 break;
9640 }
9641 } // for(auto b : blobs)
9642 if (need_onode_update) {
9643 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
9644 _record_onode(o, txn);
9645 }
9646 } // for (it->lower_bound(string()); it->valid(); it->next())
9647
9648 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
9649 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9650 << "~" << it.get_len() << std::dec << dendl;
9651 fm->release(it.get_start(), it.get_len(), txn);
9652 }
9653 alloc->release(to_release);
9654 to_release.clear();
9655 } // if (it) {
9656 } //if (repair && repairer.preprocess_misreference()) {
9657 sb_info.clear();
9658 sb_ref_counts.reset();
9659
9660 // check global stats only if fscking (not repairing) w/o per-pool stats
9661 if (!per_pool_stat_collection &&
9662 !repair &&
9663 !(actual_statfs == expected_store_statfs)) {
9664 derr << "fsck error: actual " << actual_statfs
9665 << " != expected " << expected_store_statfs << dendl;
9666 if (repair) {
9667 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
9668 expected_store_statfs);
9669 }
9670 ++errors;
9671 }
9672
9673 dout(1) << __func__ << " checking pool_statfs" << dendl;
9674 _fsck_check_pool_statfs(expected_pool_statfs,
9675 errors, warnings, repair ? &repairer : nullptr);
9676
9677 if (depth != FSCK_SHALLOW) {
9678 dout(1) << __func__ << " checking for stray omap data " << dendl;
9679 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9680 if (it) {
9681 uint64_t last_omap_head = 0;
9682 for (it->lower_bound(string()); it->valid(); it->next()) {
9683 uint64_t omap_head;
9684
9685 _key_decode_u64(it->key().c_str(), &omap_head);
9686
9687 if (used_omap_head.count(omap_head) == 0 &&
9688 omap_head != last_omap_head) {
9689 pair<string,string> rk = it->raw_key();
9690 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9691 << "fsck error: found stray omap data on omap_head "
9692 << omap_head << " " << last_omap_head
9693 << " prefix/key: " << url_escape(rk.first)
9694 << " " << url_escape(rk.second)
9695 << fsck_dendl;
9696 ++errors;
9697 last_omap_head = omap_head;
9698 }
9699 }
9700 }
9701 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9702 if (it) {
9703 uint64_t last_omap_head = 0;
9704 for (it->lower_bound(string()); it->valid(); it->next()) {
9705 uint64_t omap_head;
9706 _key_decode_u64(it->key().c_str(), &omap_head);
9707 if (used_omap_head.count(omap_head) == 0 &&
9708 omap_head != last_omap_head) {
9709 pair<string,string> rk = it->raw_key();
9710 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9711 << "fsck error: found stray (pgmeta) omap data on omap_head "
9712 << omap_head << " " << last_omap_head
9713 << " prefix/key: " << url_escape(rk.first)
9714 << " " << url_escape(rk.second)
9715 << fsck_dendl;
9716 last_omap_head = omap_head;
9717 ++errors;
9718 }
9719 }
9720 }
9721 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9722 if (it) {
9723 uint64_t last_omap_head = 0;
9724 for (it->lower_bound(string()); it->valid(); it->next()) {
9725 uint64_t pool;
9726 uint64_t omap_head;
9727 string k = it->key();
9728 const char *c = k.c_str();
9729 c = _key_decode_u64(c, &pool);
9730 c = _key_decode_u64(c, &omap_head);
9731 if (used_omap_head.count(omap_head) == 0 &&
9732 omap_head != last_omap_head) {
9733 pair<string,string> rk = it->raw_key();
9734 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9735 << "fsck error: found stray (per-pool) omap data on omap_head "
9736 << omap_head << " " << last_omap_head
9737 << " prefix/key: " << url_escape(rk.first)
9738 << " " << url_escape(rk.second)
9739 << fsck_dendl;
9740 ++errors;
9741 last_omap_head = omap_head;
9742 }
9743 }
9744 }
9745 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9746 if (it) {
9747 uint64_t last_omap_head = 0;
9748 for (it->lower_bound(string()); it->valid(); it->next()) {
9749 uint64_t pool;
9750 uint32_t hash;
9751 uint64_t omap_head;
9752 string k = it->key();
9753 const char* c = k.c_str();
9754 c = _key_decode_u64(c, &pool);
9755 c = _key_decode_u32(c, &hash);
9756 c = _key_decode_u64(c, &omap_head);
9757 if (used_omap_head.count(omap_head) == 0 &&
9758 omap_head != last_omap_head) {
9759 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9760 << "fsck error: found stray (per-pg) omap data on omap_head "
9761 << " key " << pretty_binary_string(it->key())
9762 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9763 ++errors;
9764 last_omap_head = omap_head;
9765 }
9766 }
9767 }
9768 dout(1) << __func__ << " checking deferred events" << dendl;
9769 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
9770 if (it) {
9771 for (it->lower_bound(string()); it->valid(); it->next()) {
9772 bufferlist bl = it->value();
9773 auto p = bl.cbegin();
9774 bluestore_deferred_transaction_t wt;
9775 try {
9776 decode(wt, p);
9777 } catch (ceph::buffer::error& e) {
9778 derr << "fsck error: failed to decode deferred txn "
9779 << pretty_binary_string(it->key()) << dendl;
9780 if (repair) {
9781 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9782 << pretty_binary_string(it->key())
9783 << "', removing" << dendl;
9784 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9785 }
9786 continue;
9787 }
9788 dout(20) << __func__ << " deferred " << wt.seq
9789 << " ops " << wt.ops.size()
9790 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9791 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9792 apply_for_bitset_range(
9793 e.get_start(), e.get_len(), alloc_size, used_blocks,
9794 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9795 bs.set(pos);
9796 }
9797 );
9798 }
9799 }
9800 }
9801
9802 // skip freelist vs allocated compare when we have Null fm
9803 if (!fm->is_null_manager()) {
9804 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9805 #ifdef HAVE_LIBZBD
9806 if (freelist_type == "zoned") {
9807 // verify per-zone state
9808 // - verify no allocations beyond write pointer
9809 // - verify num_dead_bytes count (neither allocated nor
9810 // free space past the write pointer)
9811 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9812 auto num_zones = bdev->get_size() / zone_size;
9813
9814 // mark the free space past the write pointer
9815 for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
9816 auto wp = a->get_write_pointer(zone);
9817 uint64_t offset = zone_size * zone + wp;
9818 uint64_t length = zone_size - wp;
9819 if (!length) {
9820 continue;
9821 }
9822 bool intersects = false;
9823 dout(10) << " marking zone 0x" << std::hex << zone
9824 << " region after wp 0x" << offset << "~" << length
9825 << std::dec << dendl;
9826 apply_for_bitset_range(
9827 offset, length, alloc_size, used_blocks,
9828 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9829 if (bs.test(pos)) {
9830 derr << "fsck error: zone 0x" << std::hex << zone
9831 << " has used space at 0x" << pos * alloc_size
9832 << " beyond write pointer 0x" << wp
9833 << std::dec << dendl;
9834 intersects = true;
9835 } else {
9836 bs.set(pos);
9837 }
9838 }
9839 );
9840 if (intersects) {
9841 ++errors;
9842 }
9843 }
9844
9845 used_blocks.flip();
9846
9847 // skip conventional zones
9848 uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
9849 pos = used_blocks.find_next(pos);
9850
9851 uint64_t zone_dead = 0;
9852 for (uint32_t zone = first_sequential_zone;
9853 zone < num_zones;
9854 ++zone, zone_dead = 0) {
9855 while (pos != decltype(used_blocks)::npos &&
9856 (pos * min_alloc_size) / zone_size == zone) {
9857 dout(40) << " zone 0x" << std::hex << zone
9858 << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
9859 << std::dec << dendl;
9860 zone_dead += min_alloc_size;
9861 pos = used_blocks.find_next(pos);
9862 }
9863 dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
9864 << std::dec << dendl;
9865 // cross-check dead bytes against zone state
9866 if (a->get_dead_bytes(zone) != zone_dead) {
9867 derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
9868 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
9869 << dendl;
9870 ++errors;
9871 // TODO: repair
9872 }
9873 }
9874 used_blocks.flip();
9875 } else
9876 #endif
9877 {
9878 fm->enumerate_reset();
9879 uint64_t offset, length;
9880 while (fm->enumerate_next(db, &offset, &length)) {
9881 bool intersects = false;
9882 apply_for_bitset_range(
9883 offset, length, alloc_size, used_blocks,
9884 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9885 ceph_assert(pos < bs.size());
9886 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
9887 if (offset == SUPER_RESERVED &&
9888 length == min_alloc_size - SUPER_RESERVED) {
9889 // this is due to the change just after luminous to min_alloc_size
9890 // granularity allocations, and our baked in assumption at the top
9891 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9892 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9893 // since we will never allocate this region below min_alloc_size.
9894 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9895 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9896 << length << std::dec << dendl;
9897 } else {
9898 intersects = true;
9899 if (repair) {
9900 repairer.fix_false_free(db, fm,
9901 pos * min_alloc_size,
9902 min_alloc_size);
9903 }
9904 }
9905 } else {
9906 bs.set(pos);
9907 }
9908 }
9909 );
9910 if (intersects) {
9911 derr << "fsck error: free extent 0x" << std::hex << offset
9912 << "~" << length << std::dec
9913 << " intersects allocated blocks" << dendl;
9914 ++errors;
9915 }
9916 }
9917 fm->enumerate_reset();
9918
9919 // check for leaked extents
9920 size_t count = used_blocks.count();
9921 if (used_blocks.size() != count) {
9922 ceph_assert(used_blocks.size() > count);
9923 used_blocks.flip();
9924 size_t start = used_blocks.find_first();
9925 while (start != decltype(used_blocks)::npos) {
9926 size_t cur = start;
9927 while (true) {
9928 size_t next = used_blocks.find_next(cur);
9929 if (next != cur + 1) {
9930 ++errors;
9931 derr << "fsck error: leaked extent 0x" << std::hex
9932 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9933 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9934 << dendl;
9935 if (repair) {
9936 repairer.fix_leaked(db,
9937 fm,
9938 start * min_alloc_size,
9939 (cur + 1 - start) * min_alloc_size);
9940 }
9941 start = next;
9942 break;
9943 }
9944 cur = next;
9945 }
9946 }
9947 used_blocks.flip();
9948 }
9949 }
9950 }
9951 }
9952 if (repair) {
9953 if (per_pool_omap != OMAP_PER_PG) {
9954 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
9955 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
9956 }
9957
9958 dout(5) << __func__ << " applying repair results" << dendl;
9959 repaired = repairer.apply(db);
9960 dout(5) << __func__ << " repair applied" << dendl;
9961 }
9962
9963 out_scan:
9964 dout(2) << __func__ << " " << num_objects << " objects, "
9965 << num_sharded_objects << " of them sharded. "
9966 << dendl;
9967 dout(2) << __func__ << " " << num_extents << " extents to "
9968 << num_blobs << " blobs, "
9969 << num_spanning_blobs << " spanning, "
9970 << num_shared_blobs << " shared."
9971 << dendl;
9972
9973 utime_t duration = ceph_clock_now() - start;
9974 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9975 << warnings << " warnings, "
9976 << repaired << " repaired, "
9977 << (errors + warnings - (int)repaired) << " remaining in "
9978 << duration << " seconds" << dendl;
9979
9980 // In non-repair mode we return only the error count, as
9981 // it indicates whether the store status is OK.
9982 // In repair mode both errors and warnings are taken into account,
9983 // since the repaired counter relates to both.
9984 return repair ? errors + warnings - (int)repaired : errors;
9985 }
9986
9987 /// methods to inject various errors fsck can repair
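// Overwrite the shared blob record stored under 'key' with the
// caller-supplied bufferlist (typically undecodable garbage), so a
// subsequent fsck can report and repair a broken shared blob entry.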
9988 void BlueStore::inject_broken_shared_blob_key(const string& key,
9989 const bufferlist& bl)
9990 {
9991 KeyValueDB::Transaction txn;
9992 txn = db->get_transaction();
9993 txn->set(PREFIX_SHARED_BLOB, key, bl);
9994 db->submit_transaction_sync(txn);
9995 };
9996
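// Delete the shared blob record for the most recently used sbid
// (blobid_last), leaving a blob that references a missing shared blob
// for fsck to detect.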
9997 void BlueStore::inject_no_shared_blob_key()
9998 {
9999 KeyValueDB::Transaction txn;
10000 txn = db->get_transaction();
10001 ceph_assert(blobid_last > 0);
10002 // kill the last used sbid; this can be broken by blobid preallocation
10003 // in rare cases, but we leave it as-is for the sake of simplicity
10004 uint64_t sbid = blobid_last;
10005
10006 string key;
10007 dout(5) << __func__ << " " << sbid << dendl;
10008 get_shared_blob_key(sbid, &key);
10009 txn->rmkey(PREFIX_SHARED_BLOB, key);
10010 db->submit_transaction_sync(txn);
10011 };
10012
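// Write a shared blob record with a dummy ref_map under 'sbid' even
// though nothing references it, producing a stray shared blob entry
// for fsck to detect.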
10013 void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
10014 {
10015 KeyValueDB::Transaction txn;
10016 txn = db->get_transaction();
10017
10018 dout(5) << __func__ << " " << sbid << dendl;
10019
10020 string key;
10021 get_shared_blob_key(sbid, &key);
10022 bluestore_shared_blob_t persistent(sbid);
10023 persistent.ref_map.get(0xdead0000, 0x1000);
10024 bufferlist bl;
10025 encode(persistent, bl);
10026 dout(20) << __func__ << " sbid " << sbid
10027 << " takes " << bl.length() << " bytes, updating"
10028 << dendl;
10029
10030 txn->set(PREFIX_SHARED_BLOB, key, bl);
10031 db->submit_transaction_sync(txn);
10032 };
10033
10034
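// Allocate 'len' bytes and mark them used in the freelist without
// attaching them to any object, i.e. simulate a leaked extent.
// A hypothetical test might call store->inject_leaked(0x10000) and
// then expect fsck to report a leaked-extent error.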
10035 void BlueStore::inject_leaked(uint64_t len)
10036 {
10037 PExtentVector exts;
10038 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
10039 min_alloc_size * 256, 0, &exts);
10040
10041 if (fm->is_null_manager()) {
10042 return;
10043 }
10044
10045 KeyValueDB::Transaction txn;
10046 txn = db->get_transaction();
10047
10048 ceph_assert(alloc_len >= (int64_t)len);
10049 for (auto& p : exts) {
10050 fm->allocate(p.offset, p.length, txn);
10051 }
10052 db->submit_transaction_sync(txn);
10053 }
10054
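// Release still-referenced physical extents (taken from the object's
// first and last blobs), creating "false free" freelist
// inconsistencies for fsck to detect.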
10055 void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
10056 {
10057 ceph_assert(!fm->is_null_manager());
10058
10059 KeyValueDB::Transaction txn;
10060 OnodeRef o;
10061 CollectionRef c = _get_collection(cid);
10062 ceph_assert(c);
10063 {
10064 std::unique_lock l{c->lock}; // just to avoid internal asserts
10065 o = c->get_onode(oid, false);
10066 ceph_assert(o);
10067 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10068 }
10069
10070 bool injected = false;
10071 txn = db->get_transaction();
10072 auto& em = o->extent_map.extent_map;
10073 std::vector<const PExtentVector*> v;
10074 if (em.size()) {
10075 v.push_back(&em.begin()->blob->get_blob().get_extents());
10076 }
10077 if (em.size() > 1) {
10078 auto it = em.end();
10079 --it;
10080 v.push_back(&(it->blob->get_blob().get_extents()));
10081 }
10082 for (auto pext : v) {
10083 if (pext->size()) {
10084 auto p = pext->begin();
10085 while (p != pext->end()) {
10086 if (p->is_valid()) {
10087 dout(20) << __func__ << " release 0x" << std::hex << p->offset
10088 << "~" << p->length << std::dec << dendl;
10089 fm->release(p->offset, p->length, txn);
10090 injected = true;
10091 break;
10092 }
10093 ++p;
10094 }
10095 }
10096 }
10097 ceph_assert(injected);
10098 db->submit_transaction_sync(txn);
10099 }
10100
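// Mark the whole store as using the legacy bulk omap layout by
// dropping the per_pool_omap marker from the superblock.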
10101 void BlueStore::inject_legacy_omap()
10102 {
10103 dout(1) << __func__ << dendl;
10104 per_pool_omap = OMAP_BULK;
10105 KeyValueDB::Transaction txn;
10106 txn = db->get_transaction();
10107 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
10108 db->submit_transaction_sync(txn);
10109 }
10110
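// Clear the per-pg/per-pool/pgmeta omap flags on a single onode so
// its omap data appears to use the legacy layout.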
10111 void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
10112 {
10113 dout(1) << __func__ << " "
10114 << cid << " " << oid
10115 << dendl;
10116 KeyValueDB::Transaction txn;
10117 OnodeRef o;
10118 CollectionRef c = _get_collection(cid);
10119 ceph_assert(c);
10120 {
10121 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10122 o = c->get_onode(oid, false);
10123 ceph_assert(o);
10124 }
10125 o->onode.clear_flag(
10126 bluestore_onode_t::FLAG_PERPG_OMAP |
10127 bluestore_onode_t::FLAG_PERPOOL_OMAP |
10128 bluestore_onode_t::FLAG_PGMETA_OMAP);
10129 txn = db->get_transaction();
10130 _record_onode(o, txn);
10131 db->submit_transaction_sync(txn);
10132 }
10133
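// Insert an omap key under the given head (presumably an unused one)
// that no onode references, producing stray omap data for fsck to
// detect.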
10134 void BlueStore::inject_stray_omap(uint64_t head, const string& name)
10135 {
10136 dout(1) << __func__ << dendl;
10137 KeyValueDB::Transaction txn = db->get_transaction();
10138
10139 string key;
10140 bufferlist bl;
10141 _key_encode_u64(head, &key);
10142 key.append(name);
10143 txn->set(PREFIX_OMAP, key, bl);
10144
10145 db->submit_transaction_sync(txn);
10146 }
10147
10148 void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
10149 {
10150 BlueStoreRepairer repairer;
10151 repairer.fix_statfs(db, key, new_statfs);
10152 repairer.apply(db);
10153 }
10154
10155 void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
10156 {
10157 KeyValueDB::Transaction t = db->get_transaction();
10158 volatile_statfs v;
10159 v = new_statfs;
10160 bufferlist bl;
10161 v.encode(bl);
10162 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
10163 db->submit_transaction_sync(t);
10164 }
10165
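// Point two objects' extents at the same physical blob so that they
// misreference each other's space; fsck should then report the
// misreference (and the resulting space-leak errors).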
10166 void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
10167 coll_t cid2, ghobject_t oid2,
10168 uint64_t offset)
10169 {
10170 OnodeRef o1;
10171 CollectionRef c1 = _get_collection(cid1);
10172 ceph_assert(c1);
10173 {
10174 std::unique_lock l{c1->lock}; // just to avoid internal asserts
10175 o1 = c1->get_onode(oid1, false);
10176 ceph_assert(o1);
10177 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10178 }
10179 OnodeRef o2;
10180 CollectionRef c2 = _get_collection(cid2);
10181 ceph_assert(c2);
10182 {
10183 std::unique_lock l{c2->lock}; // just to avoid internal asserts
10184 o2 = c2->get_onode(oid2, false);
10185 ceph_assert(o2);
10186 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10187 }
10188 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
10189 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
10190
10191 // require onode/extent layout to be the same (and simple)
10192 // to make things easier
10193 ceph_assert(o1->onode.extent_map_shards.empty());
10194 ceph_assert(o2->onode.extent_map_shards.empty());
10195 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
10196 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
10197 ceph_assert(e1.logical_offset == e2.logical_offset);
10198 ceph_assert(e1.length == e2.length);
10199 ceph_assert(e1.blob_offset == e2.blob_offset);
10200
10201 KeyValueDB::Transaction txn;
10202 txn = db->get_transaction();
10203
10204 // along with the misreference error this will create space leak errors
10205 e2.blob->dirty_blob() = e1.blob->get_blob();
10206 o2->extent_map.dirty_range(offset, e2.length);
10207 o2->extent_map.update(txn, false);
10208
10209 _record_onode(o2, txn);
10210 db->submit_transaction_sync(txn);
10211 }
10212
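// Attach an empty spanning blob with the given id to the onode's
// spanning blob map; since no extent uses it, fsck should flag it as
// a zombie spanning blob.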
10213 void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
10214 int16_t blob_id)
10215 {
10216 OnodeRef o;
10217 CollectionRef c = _get_collection(cid);
10218 ceph_assert(c);
10219 {
10220 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10221 o = c->get_onode(oid, false);
10222 ceph_assert(o);
10223 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10224 }
10225
10226 BlobRef b = c->new_blob();
10227 b->id = blob_id;
10228 o->extent_map.spanning_blob_map[blob_id] = b;
10229
10230 KeyValueDB::Transaction txn;
10231 txn = db->get_transaction();
10232
10233 _record_onode(o, txn);
10234 db->submit_transaction_sync(txn);
10235 }
10236
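// Write a BlueFS file of new_size '0' bytes, presumably so tests can
// manipulate BlueFS space usage.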
10237 void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
10238 {
10239 ceph_assert(bluefs);
10240
10241 BlueFS::FileWriter* p_handle = nullptr;
10242 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
10243 ceph_assert(ret == 0);
10244
10245 std::string s(new_size, '0'); // new_size copies of '0'
10246 bufferlist bl;
10247 bl.append(s);
10248 p_handle->append(bl);
10249
10250 bluefs->fsync(p_handle);
10251 bluefs->close_writer(p_handle);
10252 }
10253
10254 void BlueStore::collect_metadata(map<string,string> *pm)
10255 {
10256 dout(10) << __func__ << dendl;
10257 bdev->collect_metadata("bluestore_bdev_", pm);
10258 if (bluefs) {
10259 (*pm)["bluefs"] = "1";
10260 // this value is for backward compatibility only
10261 (*pm)["bluefs_single_shared_device"] = \
10262 stringify((int)bluefs_layout.single_shared_device());
10263 (*pm)["bluefs_dedicated_db"] = \
10264 stringify((int)bluefs_layout.dedicated_db);
10265 (*pm)["bluefs_dedicated_wal"] = \
10266 stringify((int)bluefs_layout.dedicated_wal);
10267 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
10268 } else {
10269 (*pm)["bluefs"] = "0";
10270 }
10271
10272 // report numa mapping for underlying devices
10273 int node = -1;
10274 set<int> nodes;
10275 set<string> failed;
10276 int r = get_numa_node(&node, &nodes, &failed);
10277 if (r >= 0) {
10278 if (!failed.empty()) {
10279 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
10280 }
10281 if (!nodes.empty()) {
10282 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
10283 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
10284 }
10285 if (node >= 0) {
10286 (*pm)["objectstore_numa_node"] = stringify(node);
10287 }
10288 }
10289 }
10290
10291 int BlueStore::get_numa_node(
10292 int *final_node,
10293 set<int> *out_nodes,
10294 set<string> *out_failed)
10295 {
10296 int node = -1;
10297 set<string> devices;
10298 get_devices(&devices);
10299 set<int> nodes;
10300 set<string> failed;
10301 for (auto& devname : devices) {
10302 int n;
10303 BlkDev bdev(devname);
10304 int r = bdev.get_numa_node(&n);
10305 if (r < 0) {
10306 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
10307 << dendl;
10308 failed.insert(devname);
10309 continue;
10310 }
10311 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
10312 << dendl;
10313 nodes.insert(n);
10314 if (node < 0) {
10315 node = n;
10316 }
10317 }
10318 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
10319 *final_node = node;
10320 }
10321 if (out_nodes) {
10322 *out_nodes = nodes;
10323 }
10324 if (out_failed) {
10325 *out_failed = failed;
10326 }
10327 return 0;
10328 }
10329
10330 void BlueStore::prepare_for_fast_shutdown()
10331 {
10332 m_fast_shutdown = true;
10333 }
10334
10335 int BlueStore::get_devices(set<string> *ls)
10336 {
10337 if (bdev) {
10338 bdev->get_devices(ls);
10339 if (bluefs) {
10340 bluefs->get_devices(ls);
10341 }
10342 return 0;
10343 }
10344
10345 // grumble, we haven't started up yet.
10346 if (int r = _open_path(); r < 0) {
10347 return r;
10348 }
10349 auto close_path = make_scope_guard([&] {
10350 _close_path();
10351 });
10352 if (int r = _open_fsid(false); r < 0) {
10353 return r;
10354 }
10355 auto close_fsid = make_scope_guard([&] {
10356 _close_fsid();
10357 });
10358 if (int r = _read_fsid(&fsid); r < 0) {
10359 return r;
10360 }
10361 if (int r = _lock_fsid(); r < 0) {
10362 return r;
10363 }
10364 if (int r = _open_bdev(false); r < 0) {
10365 return r;
10366 }
10367 auto close_bdev = make_scope_guard([&] {
10368 _close_bdev();
10369 });
10370 if (int r = _minimal_open_bluefs(false); r < 0) {
10371 return r;
10372 }
10373 bdev->get_devices(ls);
10374 if (bluefs) {
10375 bluefs->get_devices(ls);
10376 }
10377 _minimal_close_bluefs();
10378 return 0;
10379 }
10380
10381 void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
10382 {
10383 buf->reset();
10384
10385 auto prefix = per_pool_omap == OMAP_BULK ?
10386 PREFIX_OMAP :
10387 per_pool_omap == OMAP_PER_POOL ?
10388 PREFIX_PERPOOL_OMAP :
10389 PREFIX_PERPG_OMAP;
10390 buf->omap_allocated =
10391 db->estimate_prefix_size(prefix, string());
10392
10393 uint64_t bfree = alloc->get_free();
10394
10395 if (bluefs) {
10396 buf->internally_reserved = 0;
10397 // include dedicated db, too, if that isn't the shared device.
10398 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
10399 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
10400 }
10401 // call any non-omap bluefs space "internal metadata"
10402 buf->internal_metadata =
10403 bluefs->get_used()
10404 - buf->omap_allocated;
10405 }
10406
10407 uint64_t thin_total, thin_avail;
10408 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
10409 buf->total += thin_total;
10410
10411 // we are limited by both the size of the virtual device and the
10412 // underlying physical device.
10413 bfree = std::min(bfree, thin_avail);
10414
10415 buf->allocated = thin_total - thin_avail;
10416 } else {
10417 buf->total += bdev->get_size();
10418 }
10419 buf->available = bfree;
10420 }
10421
10422 int BlueStore::statfs(struct store_statfs_t *buf,
10423 osd_alert_list_t* alerts)
10424 {
10425 if (alerts) {
10426 alerts->clear();
10427 _log_alerts(*alerts);
10428 }
10429 _get_statfs_overall(buf);
10430 {
10431 std::lock_guard l(vstatfs_lock);
10432 buf->allocated = vstatfs.allocated();
10433 buf->data_stored = vstatfs.stored();
10434 buf->data_compressed = vstatfs.compressed();
10435 buf->data_compressed_original = vstatfs.compressed_original();
10436 buf->data_compressed_allocated = vstatfs.compressed_allocated();
10437 }
10438
10439 dout(20) << __func__ << " " << *buf << dendl;
10440 return 0;
10441 }
10442
10443 int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
10444 bool *out_per_pool_omap)
10445 {
10446 dout(20) << __func__ << " pool " << pool_id<< dendl;
10447
10448 if (!per_pool_stat_collection) {
10449 dout(20) << __func__ << " not supported in legacy mode " << dendl;
10450 return -ENOTSUP;
10451 }
10452 buf->reset();
10453
10454 {
10455 std::lock_guard l(vstatfs_lock);
10456 osd_pools[pool_id].publish(buf);
10457 }
10458
10459 string key_prefix;
10460 _key_encode_u64(pool_id, &key_prefix);
10461 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
10462 // guard against calls made after the db has been closed
10463 if (*out_per_pool_omap && db) {
10464 auto prefix = per_pool_omap == OMAP_PER_POOL ?
10465 PREFIX_PERPOOL_OMAP :
10466 PREFIX_PERPG_OMAP;
10467 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
10468 }
10469
10470 dout(10) << __func__ << " " << *buf << dendl;
10471 return 0;
10472 }
10473
10474 void BlueStore::_check_legacy_statfs_alert()
10475 {
10476 string s;
10477 if (!per_pool_stat_collection &&
10478 cct->_conf->bluestore_warn_on_legacy_statfs) {
10479 s = "legacy statfs reporting detected, "
10480 "suggest to run store repair to get consistent statistic reports";
10481 }
10482 std::lock_guard l(qlock);
10483 legacy_statfs_alert = s;
10484 }
10485
10486 void BlueStore::_check_no_per_pg_or_pool_omap_alert()
10487 {
10488 string per_pg, per_pool;
10489 if (per_pool_omap != OMAP_PER_PG) {
10490 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
10491 per_pg = "legacy (not per-pg) omap detected, "
10492 "suggest to run store repair to benefit from faster PG removal";
10493 }
10494 if (per_pool_omap != OMAP_PER_POOL) {
10495 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
10496 per_pool = "legacy (not per-pool) omap detected, "
10497 "suggest to run store repair to benefit from per-pool omap usage statistics";
10498 }
10499 }
10500 }
10501 std::lock_guard l(qlock);
10502 no_per_pg_omap_alert = per_pg;
10503 no_per_pool_omap_alert = per_pool;
10504 }
10505
10506 // ---------------
10507 // cache
10508
10509 BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
10510 {
10511 std::shared_lock l(coll_lock);
10512 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
10513 if (cp == coll_map.end())
10514 return CollectionRef();
10515 return cp->second;
10516 }
10517
10518 BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
10519 {
10520 std::shared_lock l(coll_lock);
10521
10522 // FIXME: we must replace this with something more efficient
10523
10524 for (auto& i : coll_map) {
10525 spg_t spgid;
10526 if (i.first.is_pg(&spgid) &&
10527 i.second->contains(oid)) {
10528 return i.second;
10529 }
10530 }
10531 return CollectionRef();
10532 }
10533
10534 void BlueStore::_queue_reap_collection(CollectionRef& c)
10535 {
10536 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
10537 // _reap_collections and this run in the same thread,
10538 // so no lock is needed.
10539 removed_collections.push_back(c);
10540 }
10541
10542 void BlueStore::_reap_collections()
10543 {
10544
10545 list<CollectionRef> removed_colls;
10546 {
10547 // _queue_reap_collection and this run in the same thread,
10548 // so no lock is needed.
10549 if (!removed_collections.empty())
10550 removed_colls.swap(removed_collections);
10551 else
10552 return;
10553 }
10554
10555 list<CollectionRef>::iterator p = removed_colls.begin();
10556 while (p != removed_colls.end()) {
10557 CollectionRef c = *p;
10558 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
10559 if (c->onode_map.map_any([&](Onode* o) {
10560 ceph_assert(!o->exists);
10561 if (o->flushing_count.load()) {
10562 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
10563 << " flush_txns " << o->flushing_count << dendl;
10564 return true;
10565 }
10566 return false;
10567 })) {
10568 ++p;
10569 continue;
10570 }
10571 c->onode_map.clear();
10572 p = removed_colls.erase(p);
10573 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
10574 }
10575 if (removed_colls.empty()) {
10576 dout(10) << __func__ << " all reaped" << dendl;
10577 } else {
10578 removed_collections.splice(removed_collections.begin(), removed_colls);
10579 }
10580 }
10581
10582 void BlueStore::_update_cache_logger()
10583 {
10584 uint64_t num_onodes = 0;
10585 uint64_t num_pinned_onodes = 0;
10586 uint64_t num_extents = 0;
10587 uint64_t num_blobs = 0;
10588 uint64_t num_buffers = 0;
10589 uint64_t num_buffer_bytes = 0;
10590 for (auto c : onode_cache_shards) {
10591 c->add_stats(&num_onodes, &num_pinned_onodes);
10592 }
10593 for (auto c : buffer_cache_shards) {
10594 c->add_stats(&num_extents, &num_blobs,
10595 &num_buffers, &num_buffer_bytes);
10596 }
10597 logger->set(l_bluestore_onodes, num_onodes);
10598 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
10599 logger->set(l_bluestore_extents, num_extents);
10600 logger->set(l_bluestore_blobs, num_blobs);
10601 logger->set(l_bluestore_buffers, num_buffers);
10602 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
10603 }
10604
10605 // ---------------
10606 // read operations
10607
10608 ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
10609 {
10610 return _get_collection(cid);
10611 }
10612
10613 ObjectStore::CollectionHandle BlueStore::create_new_collection(
10614 const coll_t& cid)
10615 {
10616 std::unique_lock l{coll_lock};
10617 auto c = ceph::make_ref<Collection>(
10618 this,
10619 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
10620 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
10621 cid);
10622 new_coll_map[cid] = c;
10623 _osr_attach(c.get());
10624 return c;
10625 }
10626
10627 void BlueStore::set_collection_commit_queue(
10628 const coll_t& cid,
10629 ContextQueue *commit_queue)
10630 {
10631 if (commit_queue) {
10632 std::shared_lock l(coll_lock);
10633 if (coll_map.count(cid)) {
10634 coll_map[cid]->commit_queue = commit_queue;
10635 } else if (new_coll_map.count(cid)) {
10636 new_coll_map[cid]->commit_queue = commit_queue;
10637 }
10638 }
10639 }
10640
10641
10642 bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
10643 {
10644 Collection *c = static_cast<Collection *>(c_.get());
10645 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
10646 if (!c->exists)
10647 return false;
10648
10649 bool r = true;
10650
10651 {
10652 std::shared_lock l(c->lock);
10653 OnodeRef o = c->get_onode(oid, false);
10654 if (!o || !o->exists)
10655 r = false;
10656 }
10657
10658 return r;
10659 }
10660
10661 int BlueStore::stat(
10662 CollectionHandle &c_,
10663 const ghobject_t& oid,
10664 struct stat *st,
10665 bool allow_eio)
10666 {
10667 Collection *c = static_cast<Collection *>(c_.get());
10668 if (!c->exists)
10669 return -ENOENT;
10670 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10671
10672 {
10673 std::shared_lock l(c->lock);
10674 OnodeRef o = c->get_onode(oid, false);
10675 if (!o || !o->exists)
10676 return -ENOENT;
10677 st->st_size = o->onode.size;
10678 st->st_blksize = 4096;
10679 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
10680 st->st_nlink = 1;
10681 }
10682
10683 int r = 0;
10684 if (_debug_mdata_eio(oid)) {
10685 r = -EIO;
10686 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10687 }
10688 return r;
10689 }
10690 int BlueStore::set_collection_opts(
10691 CollectionHandle& ch,
10692 const pool_opts_t& opts)
10693 {
10694 Collection *c = static_cast<Collection *>(ch.get());
10695 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
10696 if (!c->exists)
10697 return -ENOENT;
10698 std::unique_lock l{c->lock};
10699 c->pool_opts = opts;
10700 return 0;
10701 }
10702
10703 int BlueStore::read(
10704 CollectionHandle &c_,
10705 const ghobject_t& oid,
10706 uint64_t offset,
10707 size_t length,
10708 bufferlist& bl,
10709 uint32_t op_flags)
10710 {
10711 auto start = mono_clock::now();
10712 Collection *c = static_cast<Collection *>(c_.get());
10713 const coll_t &cid = c->get_cid();
10714 dout(15) << __func__ << " " << cid << " " << oid
10715 << " 0x" << std::hex << offset << "~" << length << std::dec
10716 << dendl;
10717 if (!c->exists)
10718 return -ENOENT;
10719
10720 bl.clear();
10721 int r;
10722 {
10723 std::shared_lock l(c->lock);
10724 auto start1 = mono_clock::now();
10725 OnodeRef o = c->get_onode(oid, false);
10726 log_latency("get_onode@read",
10727 l_bluestore_read_onode_meta_lat,
10728 mono_clock::now() - start1,
10729 cct->_conf->bluestore_log_op_age);
10730 if (!o || !o->exists) {
10731 r = -ENOENT;
10732 goto out;
10733 }
10734
10735 if (offset == length && offset == 0)
10736 length = o->onode.size;
10737
10738 r = _do_read(c, o, offset, length, bl, op_flags);
10739 if (r == -EIO) {
10740 logger->inc(l_bluestore_read_eio);
10741 }
10742 }
10743
10744 out:
10745 if (r >= 0 && _debug_data_eio(oid)) {
10746 r = -EIO;
10747 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10748 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10749 cct->_conf->bluestore_debug_random_read_err &&
10750 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10751 100.0)) == 0) {
10752 dout(0) << __func__ << ": inject random EIO" << dendl;
10753 r = -EIO;
10754 }
10755 dout(10) << __func__ << " " << cid << " " << oid
10756 << " 0x" << std::hex << offset << "~" << length << std::dec
10757 << " = " << r << dendl;
10758 log_latency(__func__,
10759 l_bluestore_read_lat,
10760 mono_clock::now() - start,
10761 cct->_conf->bluestore_log_op_age);
10762 return r;
10763 }
10764
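// Scan the extent map for [offset, offset+length) and satisfy as much
// of the read as possible from the buffer cache (ready_regions); the
// remainder is accumulated per blob in blobs2read, with reads rounded
// out to chunk boundaries and adjacent requests merged.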
10765 void BlueStore::_read_cache(
10766 OnodeRef o,
10767 uint64_t offset,
10768 size_t length,
10769 int read_cache_policy,
10770 ready_regions_t& ready_regions,
10771 blobs2read_t& blobs2read)
10772 {
10773 // build a blob-wise list of stuff to read (that isn't cached)
10774 unsigned left = length;
10775 uint64_t pos = offset;
10776 auto lp = o->extent_map.seek_lextent(offset);
10777 while (left > 0 && lp != o->extent_map.extent_map.end()) {
10778 if (pos < lp->logical_offset) {
10779 unsigned hole = lp->logical_offset - pos;
10780 if (hole >= left) {
10781 break;
10782 }
10783 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
10784 << std::dec << dendl;
10785 pos += hole;
10786 left -= hole;
10787 }
10788 BlobRef& bptr = lp->blob;
10789 unsigned l_off = pos - lp->logical_offset;
10790 unsigned b_off = l_off + lp->blob_offset;
10791 unsigned b_len = std::min(left, lp->length - l_off);
10792
10793 ready_regions_t cache_res;
10794 interval_set<uint32_t> cache_interval;
10795 bptr->shared_blob->bc.read(
10796 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
10797 read_cache_policy);
10798 dout(20) << __func__ << " blob " << *bptr << std::hex
10799 << " need 0x" << b_off << "~" << b_len
10800 << " cache has 0x" << cache_interval
10801 << std::dec << dendl;
10802
10803 auto pc = cache_res.begin();
10804 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
10805 while (b_len > 0) {
10806 unsigned l;
10807 if (pc != cache_res.end() &&
10808 pc->first == b_off) {
10809 l = pc->second.length();
10810 ready_regions[pos] = std::move(pc->second);
10811 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
10812 << b_off << "~" << l << std::dec << dendl;
10813 ++pc;
10814 } else {
10815 l = b_len;
10816 if (pc != cache_res.end()) {
10817 ceph_assert(pc->first > b_off);
10818 l = pc->first - b_off;
10819 }
10820 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
10821 << b_off << "~" << l << std::dec << dendl;
10822 // merge regions
10823 {
10824 uint64_t r_off = b_off;
10825 uint64_t r_len = l;
10826 uint64_t front = r_off % chunk_size;
10827 if (front) {
10828 r_off -= front;
10829 r_len += front;
10830 }
10831 unsigned tail = r_len % chunk_size;
10832 if (tail) {
10833 r_len += chunk_size - tail;
10834 }
10835 bool merged = false;
10836 regions2read_t& r2r = blobs2read[bptr];
10837 if (r2r.size()) {
10838 read_req_t& pre = r2r.back();
10839 if (r_off <= (pre.r_off + pre.r_len)) {
10840 front += (r_off - pre.r_off);
10841 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
10842 pre.regs.emplace_back(region_t(pos, b_off, l, front));
10843 merged = true;
10844 }
10845 }
10846 if (!merged) {
10847 read_req_t req(r_off, r_len);
10848 req.regs.emplace_back(region_t(pos, b_off, l, front));
10849 r2r.emplace_back(std::move(req));
10850 }
10851 }
10852 }
10853 pos += l;
10854 b_off += l;
10855 left -= l;
10856 b_len -= l;
10857 }
10858 ++lp;
10859 }
10860 }
10861
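// Queue aio reads for everything in blobs2read: compressed blobs are
// read in full into compressed_blob_bls, other blobs are read
// piecewise into each request's bl. Only EIO is propagated to the
// caller; any other read error asserts.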
10862 int BlueStore::_prepare_read_ioc(
10863 blobs2read_t& blobs2read,
10864 vector<bufferlist>* compressed_blob_bls,
10865 IOContext* ioc)
10866 {
10867 for (auto& p : blobs2read) {
10868 const BlobRef& bptr = p.first;
10869 regions2read_t& r2r = p.second;
10870 dout(20) << __func__ << " blob " << *bptr << " need "
10871 << r2r << dendl;
10872 if (bptr->get_blob().is_compressed()) {
10873 // read the whole thing
10874 if (compressed_blob_bls->empty()) {
10875 // ensure we avoid any reallocation on subsequent blobs
10876 compressed_blob_bls->reserve(blobs2read.size());
10877 }
10878 compressed_blob_bls->push_back(bufferlist());
10879 bufferlist& bl = compressed_blob_bls->back();
10880 auto r = bptr->get_blob().map(
10881 0, bptr->get_blob().get_ondisk_length(),
10882 [&](uint64_t offset, uint64_t length) {
10883 int r = bdev->aio_read(offset, length, &bl, ioc);
10884 if (r < 0)
10885 return r;
10886 return 0;
10887 });
10888 if (r < 0) {
10889 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10890 if (r == -EIO) {
10891 // propagate EIO to caller
10892 return r;
10893 }
10894 ceph_assert(r == 0);
10895 }
10896 } else {
10897 // read the pieces
10898 for (auto& req : r2r) {
10899 dout(20) << __func__ << " region 0x" << std::hex
10900 << req.regs.front().logical_offset
10901 << ": 0x" << req.regs.front().blob_xoffset
10902 << " reading 0x" << req.r_off
10903 << "~" << req.r_len << std::dec
10904 << dendl;
10905
10906 // read it
10907 auto r = bptr->get_blob().map(
10908 req.r_off, req.r_len,
10909 [&](uint64_t offset, uint64_t length) {
10910 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10911 if (r < 0)
10912 return r;
10913 return 0;
10914 });
10915 if (r < 0) {
10916 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
10917 << dendl;
10918 if (r == -EIO) {
10919 // propagate EIO to caller
10920 return r;
10921 }
10922 ceph_assert(r == 0);
10923 }
10924 ceph_assert(req.bl.length() == req.r_len);
10925 }
10926 }
10927 }
10928 return 0;
10929 }
10930
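// Verify checksums (decompressing compressed blobs), optionally
// populate the buffer cache when 'buffered' is set, and assemble
// ready_regions into 'bl', zero-filling any holes, so that the result
// covers exactly [offset, offset+length).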
10931 int BlueStore::_generate_read_result_bl(
10932 OnodeRef o,
10933 uint64_t offset,
10934 size_t length,
10935 ready_regions_t& ready_regions,
10936 vector<bufferlist>& compressed_blob_bls,
10937 blobs2read_t& blobs2read,
10938 bool buffered,
10939 bool* csum_error,
10940 bufferlist& bl)
10941 {
10942 // enumerate and decompress desired blobs
10943 auto p = compressed_blob_bls.begin();
10944 blobs2read_t::iterator b2r_it = blobs2read.begin();
10945 while (b2r_it != blobs2read.end()) {
10946 const BlobRef& bptr = b2r_it->first;
10947 regions2read_t& r2r = b2r_it->second;
10948 dout(20) << __func__ << " blob " << *bptr << " need "
10949 << r2r << dendl;
10950 if (bptr->get_blob().is_compressed()) {
10951 ceph_assert(p != compressed_blob_bls.end());
10952 bufferlist& compressed_bl = *p++;
10953 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
10954 r2r.front().regs.front().logical_offset) < 0) {
10955 *csum_error = true;
10956 return -EIO;
10957 }
10958 bufferlist raw_bl;
10959 auto r = _decompress(compressed_bl, &raw_bl);
10960 if (r < 0)
10961 return r;
10962 if (buffered) {
10963 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
10964 raw_bl);
10965 }
10966 for (auto& req : r2r) {
10967 for (auto& r : req.regs) {
10968 ready_regions[r.logical_offset].substr_of(
10969 raw_bl, r.blob_xoffset, r.length);
10970 }
10971 }
10972 } else {
10973 for (auto& req : r2r) {
10974 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
10975 req.regs.front().logical_offset) < 0) {
10976 *csum_error = true;
10977 return -EIO;
10978 }
10979 if (buffered) {
10980 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
10981 req.r_off, req.bl);
10982 }
10983
10984 // prune and keep result
10985 for (const auto& r : req.regs) {
10986 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
10987 }
10988 }
10989 }
10990 ++b2r_it;
10991 }
10992
10993 // generate a resulting buffer
10994 auto pr = ready_regions.begin();
10995 auto pr_end = ready_regions.end();
10996 uint64_t pos = 0;
10997 while (pos < length) {
10998 if (pr != pr_end && pr->first == pos + offset) {
10999 dout(30) << __func__ << " assemble 0x" << std::hex << pos
11000 << ": data from 0x" << pr->first << "~" << pr->second.length()
11001 << std::dec << dendl;
11002 pos += pr->second.length();
11003 bl.claim_append(pr->second);
11004 ++pr;
11005 } else {
11006 uint64_t l = length - pos;
11007 if (pr != pr_end) {
11008 ceph_assert(pr->first > pos + offset);
11009 l = pr->first - (pos + offset);
11010 }
11011 dout(30) << __func__ << " assemble 0x" << std::hex << pos
11012 << ": zeros for 0x" << (pos + offset) << "~" << l
11013 << std::dec << dendl;
11014 bl.append_zero(l);
11015 pos += l;
11016 }
11017 }
11018 ceph_assert(bl.length() == length);
11019 ceph_assert(pos == length);
11020 ceph_assert(pr == pr_end);
11021 return 0;
11022 }
11023
11024 int BlueStore::_do_read(
11025 Collection *c,
11026 OnodeRef o,
11027 uint64_t offset,
11028 size_t length,
11029 bufferlist& bl,
11030 uint32_t op_flags,
11031 uint64_t retry_count)
11032 {
11033 FUNCTRACE(cct);
11034 int r = 0;
11035 int read_cache_policy = 0; // do not bypass clean or dirty cache
11036
11037 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11038 << " size 0x" << o->onode.size << " (" << std::dec
11039 << o->onode.size << ")" << dendl;
11040 bl.clear();
11041
11042 if (offset >= o->onode.size) {
11043 return r;
11044 }
11045
11046 // generally, don't buffer anything, unless the client explicitly requests
11047 // it.
11048 bool buffered = false;
11049 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11050 dout(20) << __func__ << " will do buffered read" << dendl;
11051 buffered = true;
11052 } else if (cct->_conf->bluestore_default_buffered_read &&
11053 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11054 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11055 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11056 buffered = true;
11057 }
11058
11059 if (offset + length > o->onode.size) {
11060 length = o->onode.size - offset;
11061 }
11062
11063 auto start = mono_clock::now();
11064 o->extent_map.fault_range(db, offset, length);
11065 log_latency(__func__,
11066 l_bluestore_read_onode_meta_lat,
11067 mono_clock::now() - start,
11068 cct->_conf->bluestore_log_op_age);
11069 _dump_onode<30>(cct, *o);
11070
11071 // for deep-scrub, we only read dirty cache and bypass clean cache in
11072 // order to read underlying block device in case there are silent disk errors.
11073 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
11074 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
11075 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
11076 }
11077
11078 // build a blob-wise list of stuff to read (that isn't cached)
11079 ready_regions_t ready_regions;
11080 blobs2read_t blobs2read;
11081 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
11082
11083
11084 // read raw blob data.
11085 start = mono_clock::now(); // for the sake of simplicity,
11086 // measure the whole block below;
11087 // the resulting timing error is negligible.
11088 vector<bufferlist> compressed_blob_bls;
11089 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11090 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
11091 // we always issue aio for reading, so errors other than EIO are not allowed
11092 if (r < 0)
11093 return r;
11094
11095 int64_t num_ios = blobs2read.size();
11096 if (ioc.has_pending_aios()) {
11097 num_ios = ioc.get_num_ios();
11098 bdev->aio_submit(&ioc);
11099 dout(20) << __func__ << " waiting for aio" << dendl;
11100 ioc.aio_wait();
11101 r = ioc.get_return_value();
11102 if (r < 0) {
11103 ceph_assert(r == -EIO); // no other errors allowed
11104 return -EIO;
11105 }
11106 }
11107 log_latency_fn(__func__,
11108 l_bluestore_read_wait_aio_lat,
11109 mono_clock::now() - start,
11110 cct->_conf->bluestore_log_op_age,
11111 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11112 );
11113
11114 bool csum_error = false;
11115 r = _generate_read_result_bl(o, offset, length, ready_regions,
11116 compressed_blob_bls, blobs2read,
11117 buffered && !ioc.skip_cache(),
11118 &csum_error, bl);
11119 if (csum_error) {
11120 // Handles spurious read errors caused by a kernel bug.
11121 // We sometimes get all-zero pages as a result of the read under
11122 // high memory pressure. Retrying the failing read succeeds in most
11123 // cases.
11124 // See also: http://tracker.ceph.com/issues/22464
11125 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11126 return -EIO;
11127 }
11128 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
11129 }
11130 r = bl.length();
11131 if (retry_count) {
11132 logger->inc(l_bluestore_reads_with_retries);
11133 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
11134 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
11135 stringstream s;
11136 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
11137 _set_spurious_read_errors_alert(s.str());
11138 }
11139 return r;
11140 }
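// Example (illustrative only): how a caller's fadvise hints interact with the
// buffering decision made at the top of _do_read().  The flag names are the
// real CEPH_OSD_OP_FLAG_* constants; the call shape is just a sketch and
// assumes a CollectionHandle `ch` and an object `oid`.
//
//   bufferlist bl;
//   // WILLNEED: ask BlueStore to keep the data in its buffer cache
//   store->read(ch, oid, 0, 0x1000, bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
//   // DONTNEED/NOCACHE: suppress caching even if
//   // bluestore_default_buffered_read is true
//   store->read(ch, oid, 0, 0x1000, bl, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);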
11141
11142 int BlueStore::_verify_csum(OnodeRef& o,
11143 const bluestore_blob_t* blob, uint64_t blob_xoffset,
11144 const bufferlist& bl,
11145 uint64_t logical_offset) const
11146 {
11147 int bad;
11148 uint64_t bad_csum;
11149 auto start = mono_clock::now();
11150 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
11151 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
11152 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
11153 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
11154 bad = blob_xoffset;
11155 r = -1;
11156 bad_csum = 0xDEADBEEF;
11157 }
11158 if (r < 0) {
11159 if (r == -1) {
11160 PExtentVector pex;
11161 blob->map(
11162 bad,
11163 blob->get_csum_chunk_size(),
11164 [&](uint64_t offset, uint64_t length) {
11165 pex.emplace_back(bluestore_pextent_t(offset, length));
11166 return 0;
11167 });
11168 derr << __func__ << " bad "
11169 << Checksummer::get_csum_type_string(blob->csum_type)
11170 << "/0x" << std::hex << blob->get_csum_chunk_size()
11171 << " checksum at blob offset 0x" << bad
11172 << ", got 0x" << bad_csum << ", expected 0x"
11173 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
11174 << ", device location " << pex
11175 << ", logical extent 0x" << std::hex
11176 << (logical_offset + bad - blob_xoffset) << "~"
11177 << blob->get_csum_chunk_size() << std::dec
11178 << ", object " << o->oid
11179 << dendl;
11180 } else {
11181 derr << __func__ << " failed with error: " << cpp_strerror(r) << dendl;
11182 }
11183 }
11184 log_latency(__func__,
11185 l_bluestore_csum_lat,
11186 mono_clock::now() - start,
11187 cct->_conf->bluestore_log_op_age);
11188 if (cct->_conf->bluestore_ignore_data_csum) {
11189 return 0;
11190 }
11191 return r;
11192 }
11193
11194 int BlueStore::_decompress(bufferlist& source, bufferlist* result)
11195 {
11196 int r = 0;
11197 auto start = mono_clock::now();
11198 auto i = source.cbegin();
11199 bluestore_compression_header_t chdr;
11200 decode(chdr, i);
11201 int alg = int(chdr.type);
11202 CompressorRef cp = compressor;
11203 if (!cp || (int)cp->get_type() != alg) {
11204 cp = Compressor::create(cct, alg);
11205 }
11206
11207 if (!cp.get()) {
11208 // if the compressor isn't available we must fail: we cannot return
11209 // the decompressed data
11210
11211 const char* alg_name = Compressor::get_comp_alg_name(alg);
11212 derr << __func__ << " can't load decompressor " << alg_name << dendl;
11213 _set_compression_alert(false, alg_name);
11214 r = -EIO;
11215 } else {
11216 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
11217 if (r < 0) {
11218 derr << __func__ << " decompression failed with error code " << r << dendl;
11219 r = -EIO;
11220 }
11221 }
11222 log_latency(__func__,
11223 l_bluestore_decompress_lat,
11224 mono_clock::now() - start,
11225 cct->_conf->bluestore_log_op_age);
11226 return r;
11227 }
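// Example (illustrative only): the layout _decompress() expects in `source`
// is a bluestore_compression_header_t (algorithm type, compressed payload
// length and an optional compressor_message) followed by the compressed
// payload itself.  Decoding it by hand would look roughly like:
//
//   auto it = source.cbegin();
//   bluestore_compression_header_t chdr;
//   decode(chdr, it);                                  // header
//   CompressorRef cp = Compressor::create(cct, int(chdr.type));
//   bufferlist out;
//   cp->decompress(it, chdr.length, out, chdr.compressor_message);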
11228
11229 // this stores the fiemap result into an interval_set; the other
11230 // fiemap() variants use it internally
11231 int BlueStore::_fiemap(
11232 CollectionHandle &c_,
11233 const ghobject_t& oid,
11234 uint64_t offset,
11235 size_t length,
11236 interval_set<uint64_t>& destset)
11237 {
11238 Collection *c = static_cast<Collection *>(c_.get());
11239 if (!c->exists)
11240 return -ENOENT;
11241 {
11242 std::shared_lock l(c->lock);
11243
11244 OnodeRef o = c->get_onode(oid, false);
11245 if (!o || !o->exists) {
11246 return -ENOENT;
11247 }
11248 _dump_onode<30>(cct, *o);
11249
11250 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11251 << " size 0x" << o->onode.size << std::dec << dendl;
11252
11253 boost::intrusive::set<Extent>::iterator ep, eend;
11254 if (offset >= o->onode.size)
11255 goto out;
11256
11257 if (offset + length > o->onode.size) {
11258 length = o->onode.size - offset;
11259 }
11260
11261 o->extent_map.fault_range(db, offset, length);
11262 eend = o->extent_map.extent_map.end();
11263 ep = o->extent_map.seek_lextent(offset);
11264 while (length > 0) {
11265 dout(20) << __func__ << " offset " << offset << dendl;
11266 if (ep != eend && ep->logical_offset + ep->length <= offset) {
11267 ++ep;
11268 continue;
11269 }
11270
11271 uint64_t x_len = length;
11272 if (ep != eend && ep->logical_offset <= offset) {
11273 uint64_t x_off = offset - ep->logical_offset;
11274 x_len = std::min(x_len, ep->length - x_off);
11275 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
11276 << x_len << std::dec << " blob " << ep->blob << dendl;
11277 destset.insert(offset, x_len);
11278 length -= x_len;
11279 offset += x_len;
11280 if (x_off + x_len == ep->length)
11281 ++ep;
11282 continue;
11283 }
11284 if (ep != eend &&
11285 ep->logical_offset > offset &&
11286 ep->logical_offset - offset < x_len) {
11287 x_len = ep->logical_offset - offset;
11288 }
11289 offset += x_len;
11290 length -= x_len;
11291 }
11292 }
11293
11294 out:
11295 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11296 << " size = 0x(" << destset << ")" << std::dec << dendl;
11297 return 0;
11298 }
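// Example (illustrative only): for an object of size 0x6000 with data written
// at 0x0~0x1000 and 0x4000~0x1000, _fiemap(c, oid, 0, 0x6000, m) leaves
// m == {0x0~0x1000, 0x4000~0x1000}; the unwritten middle and tail of the
// requested range are simply not reported.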
11299
11300 int BlueStore::fiemap(
11301 CollectionHandle &c_,
11302 const ghobject_t& oid,
11303 uint64_t offset,
11304 size_t length,
11305 bufferlist& bl)
11306 {
11307 interval_set<uint64_t> m;
11308 int r = _fiemap(c_, oid, offset, length, m);
11309 if (r >= 0) {
11310 encode(m, bl);
11311 }
11312 return r;
11313 }
11314
11315 int BlueStore::fiemap(
11316 CollectionHandle &c_,
11317 const ghobject_t& oid,
11318 uint64_t offset,
11319 size_t length,
11320 map<uint64_t, uint64_t>& destmap)
11321 {
11322 interval_set<uint64_t> m;
11323 int r = _fiemap(c_, oid, offset, length, m);
11324 if (r >= 0) {
11325 destmap = std::move(m).detach();
11326 }
11327 return r;
11328 }
11329
11330 int BlueStore::readv(
11331 CollectionHandle &c_,
11332 const ghobject_t& oid,
11333 interval_set<uint64_t>& m,
11334 bufferlist& bl,
11335 uint32_t op_flags)
11336 {
11337 auto start = mono_clock::now();
11338 Collection *c = static_cast<Collection *>(c_.get());
11339 const coll_t &cid = c->get_cid();
11340 dout(15) << __func__ << " " << cid << " " << oid
11341 << " fiemap " << m
11342 << dendl;
11343 if (!c->exists)
11344 return -ENOENT;
11345
11346 bl.clear();
11347 int r;
11348 {
11349 std::shared_lock l(c->lock);
11350 auto start1 = mono_clock::now();
11351 OnodeRef o = c->get_onode(oid, false);
11352 log_latency("get_onode@read",
11353 l_bluestore_read_onode_meta_lat,
11354 mono_clock::now() - start1,
11355 cct->_conf->bluestore_log_op_age);
11356 if (!o || !o->exists) {
11357 r = -ENOENT;
11358 goto out;
11359 }
11360
11361 if (m.empty()) {
11362 r = 0;
11363 goto out;
11364 }
11365
11366 r = _do_readv(c, o, m, bl, op_flags);
11367 if (r == -EIO) {
11368 logger->inc(l_bluestore_read_eio);
11369 }
11370 }
11371
11372 out:
11373 if (r >= 0 && _debug_data_eio(oid)) {
11374 r = -EIO;
11375 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11376 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
11377 cct->_conf->bluestore_debug_random_read_err &&
11378 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
11379 100.0)) == 0) {
11380 dout(0) << __func__ << ": inject random EIO" << dendl;
11381 r = -EIO;
11382 }
11383 dout(10) << __func__ << " " << cid << " " << oid
11384 << " fiemap " << m << std::dec
11385 << " = " << r << dendl;
11386 log_latency(__func__,
11387 l_bluestore_read_lat,
11388 mono_clock::now() - start,
11389 cct->_conf->bluestore_log_op_age);
11390 return r;
11391 }
11392
11393 int BlueStore::_do_readv(
11394 Collection *c,
11395 OnodeRef o,
11396 const interval_set<uint64_t>& m,
11397 bufferlist& bl,
11398 uint32_t op_flags,
11399 uint64_t retry_count)
11400 {
11401 FUNCTRACE(cct);
11402 int r = 0;
11403 int read_cache_policy = 0; // do not bypass clean or dirty cache
11404
11405 dout(20) << __func__ << " fiemap " << m << std::hex
11406 << " size 0x" << o->onode.size << " (" << std::dec
11407 << o->onode.size << ")" << dendl;
11408
11409 // generally, don't buffer anything, unless the client explicitly requests
11410 // it.
11411 bool buffered = false;
11412 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11413 dout(20) << __func__ << " will do buffered read" << dendl;
11414 buffered = true;
11415 } else if (cct->_conf->bluestore_default_buffered_read &&
11416 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11417 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11418 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11419 buffered = true;
11420 }
11421 // this method must be idempotent since we may call it several times
11422 // before we finally read the expected result.
11423 bl.clear();
11424
11425 // call fiemap first!
11426 ceph_assert(m.range_start() <= o->onode.size);
11427 ceph_assert(m.range_end() <= o->onode.size);
11428 auto start = mono_clock::now();
11429 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
11430 log_latency(__func__,
11431 l_bluestore_read_onode_meta_lat,
11432 mono_clock::now() - start,
11433 cct->_conf->bluestore_log_op_age);
11434 _dump_onode<30>(cct, *o);
11435
11436 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11437 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
11438 raw_results.reserve(m.num_intervals());
11439 int i = 0;
11440 for (auto p = m.begin(); p != m.end(); p++, i++) {
11441 raw_results.push_back({});
11442 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
11443 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
11444 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
11445 // we always issue aio for reading, so errors other than EIO are not allowed
11446 if (r < 0)
11447 return r;
11448 }
11449
11450 auto num_ios = m.size();
11451 if (ioc.has_pending_aios()) {
11452 num_ios = ioc.get_num_ios();
11453 bdev->aio_submit(&ioc);
11454 dout(20) << __func__ << " waiting for aio" << dendl;
11455 ioc.aio_wait();
11456 r = ioc.get_return_value();
11457 if (r < 0) {
11458 ceph_assert(r == -EIO); // no other errors allowed
11459 return -EIO;
11460 }
11461 }
11462 log_latency_fn(__func__,
11463 l_bluestore_read_wait_aio_lat,
11464 mono_clock::now() - start,
11465 cct->_conf->bluestore_log_op_age,
11466 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11467 );
11468
11469 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
11470 i = 0;
11471 for (auto p = m.begin(); p != m.end(); p++, i++) {
11472 bool csum_error = false;
11473 bufferlist t;
11474 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
11475 std::get<0>(raw_results[i]),
11476 std::get<1>(raw_results[i]),
11477 std::get<2>(raw_results[i]),
11478 buffered, &csum_error, t);
11479 if (csum_error) {
11480 // Handles spurious read errors caused by a kernel bug.
11481 // We sometimes get all-zero pages as a result of the read under
11482 // high memory pressure. Retrying the failing read succeeds in most
11483 // cases.
11484 // See also: http://tracker.ceph.com/issues/22464
11485 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11486 return -EIO;
11487 }
11488 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
11489 }
11490 bl.claim_append(t);
11491 }
11492 if (retry_count) {
11493 logger->inc(l_bluestore_reads_with_retries);
11494 dout(5) << __func__ << " read fiemap " << m
11495 << " failed " << retry_count << " times before succeeding"
11496 << dendl;
11497 }
11498 return bl.length();
11499 }
11500
11501 int BlueStore::dump_onode(CollectionHandle &c_,
11502 const ghobject_t& oid,
11503 const string& section_name,
11504 Formatter *f)
11505 {
11506 Collection *c = static_cast<Collection *>(c_.get());
11507 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11508 if (!c->exists)
11509 return -ENOENT;
11510
11511 int r;
11512 {
11513 std::shared_lock l(c->lock);
11514
11515 OnodeRef o = c->get_onode(oid, false);
11516 if (!o || !o->exists) {
11517 r = -ENOENT;
11518 goto out;
11519 }
11520 // FIXME minor: actually the next line isn't enough to
11521 // load shared blobs. Leaving as is for now..
11522 //
11523 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
11524
11525 _dump_onode<0>(cct, *o);
11526 f->open_object_section(section_name.c_str());
11527 o->dump(f);
11528 f->close_section();
11529 r = 0;
11530 }
11531 out:
11532 dout(10) << __func__ << " " << c->cid << " " << oid
11533 << " = " << r << dendl;
11534 return r;
11535 }
11536
11537 int BlueStore::getattr(
11538 CollectionHandle &c_,
11539 const ghobject_t& oid,
11540 const char *name,
11541 bufferptr& value)
11542 {
11543 Collection *c = static_cast<Collection *>(c_.get());
11544 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
11545 if (!c->exists)
11546 return -ENOENT;
11547
11548 int r;
11549 {
11550 std::shared_lock l(c->lock);
11551 mempool::bluestore_cache_meta::string k(name);
11552
11553 OnodeRef o = c->get_onode(oid, false);
11554 if (!o || !o->exists) {
11555 r = -ENOENT;
11556 goto out;
11557 }
11558
11559 if (!o->onode.attrs.count(k)) {
11560 r = -ENODATA;
11561 goto out;
11562 }
11563 value = o->onode.attrs[k];
11564 r = 0;
11565 }
11566 out:
11567 if (r == 0 && _debug_mdata_eio(oid)) {
11568 r = -EIO;
11569 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11570 }
11571 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
11572 << " = " << r << dendl;
11573 return r;
11574 }
11575
11576 int BlueStore::getattrs(
11577 CollectionHandle &c_,
11578 const ghobject_t& oid,
11579 map<string,bufferptr,less<>>& aset)
11580 {
11581 Collection *c = static_cast<Collection *>(c_.get());
11582 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11583 if (!c->exists)
11584 return -ENOENT;
11585
11586 int r;
11587 {
11588 std::shared_lock l(c->lock);
11589
11590 OnodeRef o = c->get_onode(oid, false);
11591 if (!o || !o->exists) {
11592 r = -ENOENT;
11593 goto out;
11594 }
11595 for (auto& i : o->onode.attrs) {
11596 aset.emplace(i.first.c_str(), i.second);
11597 }
11598 r = 0;
11599 }
11600
11601 out:
11602 if (r == 0 && _debug_mdata_eio(oid)) {
11603 r = -EIO;
11604 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11605 }
11606 dout(10) << __func__ << " " << c->cid << " " << oid
11607 << " = " << r << dendl;
11608 return r;
11609 }
11610
11611 int BlueStore::list_collections(vector<coll_t>& ls)
11612 {
11613 std::shared_lock l(coll_lock);
11614 ls.reserve(coll_map.size());
11615 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
11616 p != coll_map.end();
11617 ++p)
11618 ls.push_back(p->first);
11619 return 0;
11620 }
11621
11622 bool BlueStore::collection_exists(const coll_t& c)
11623 {
11624 std::shared_lock l(coll_lock);
11625 return coll_map.count(c);
11626 }
11627
11628 int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
11629 {
11630 dout(15) << __func__ << " " << ch->cid << dendl;
11631 vector<ghobject_t> ls;
11632 ghobject_t next;
11633 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
11634 &ls, &next);
11635 if (r < 0) {
11636 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
11637 << dendl;
11638 return r;
11639 }
11640 *empty = ls.empty();
11641 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
11642 return 0;
11643 }
11644
11645 int BlueStore::collection_bits(CollectionHandle& ch)
11646 {
11647 dout(15) << __func__ << " " << ch->cid << dendl;
11648 Collection *c = static_cast<Collection*>(ch.get());
11649 std::shared_lock l(c->lock);
11650 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
11651 return c->cnode.bits;
11652 }
11653
11654 int BlueStore::collection_list(
11655 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11656 vector<ghobject_t> *ls, ghobject_t *pnext)
11657 {
11658 Collection *c = static_cast<Collection *>(c_.get());
11659 c->flush();
11660 dout(15) << __func__ << " " << c->cid
11661 << " start " << start << " end " << end << " max " << max << dendl;
11662 int r;
11663 {
11664 std::shared_lock l(c->lock);
11665 r = _collection_list(c, start, end, max, false, ls, pnext);
11666 }
11667
11668 dout(10) << __func__ << " " << c->cid
11669 << " start " << start << " end " << end << " max " << max
11670 << " = " << r << ", ls.size() = " << ls->size()
11671 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11672 return r;
11673 }
11674
11675 int BlueStore::collection_list_legacy(
11676 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11677 vector<ghobject_t> *ls, ghobject_t *pnext)
11678 {
11679 Collection *c = static_cast<Collection *>(c_.get());
11680 c->flush();
11681 dout(15) << __func__ << " " << c->cid
11682 << " start " << start << " end " << end << " max " << max << dendl;
11683 int r;
11684 {
11685 std::shared_lock l(c->lock);
11686 r = _collection_list(c, start, end, max, true, ls, pnext);
11687 }
11688
11689 dout(10) << __func__ << " " << c->cid
11690 << " start " << start << " end " << end << " max " << max
11691 << " = " << r << ", ls.size() = " << ls->size()
11692 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11693 return r;
11694 }
11695
11696 int BlueStore::_collection_list(
11697 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
11698 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
11699 {
11700
11701 if (!c->exists)
11702 return -ENOENT;
11703
11704 ghobject_t static_next;
11705 std::unique_ptr<CollectionListIterator> it;
11706 ghobject_t coll_range_temp_start, coll_range_temp_end;
11707 ghobject_t coll_range_start, coll_range_end;
11708 ghobject_t pend;
11709 bool temp;
11710
11711 if (!pnext)
11712 pnext = &static_next;
11713
11714 auto log_latency = make_scope_guard(
11715 [&, start_time = mono_clock::now(), func_name = __func__] {
11716 log_latency_fn(
11717 func_name,
11718 l_bluestore_remove_lat,
11719 mono_clock::now() - start_time,
11720 cct->_conf->bluestore_log_collection_list_age,
11721 [&](const ceph::timespan& lat) {
11722 ostringstream ostr;
11723 ostr << ", lat = " << timespan_str(lat)
11724 << " cid =" << c->cid
11725 << " start " << start << " end " << end
11726 << " max " << max;
11727 return ostr.str();
11728 });
11729 });
11730
11731 if (start.is_max() || start.hobj.is_max()) {
11732 *pnext = ghobject_t::get_max();
11733 return 0;
11734 }
11735 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
11736 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
11737 dout(20) << __func__
11738 << " range " << coll_range_temp_start
11739 << " to " << coll_range_temp_end
11740 << " and " << coll_range_start
11741 << " to " << coll_range_end
11742 << " start " << start << dendl;
11743 if (legacy) {
11744 it = std::make_unique<SimpleCollectionListIterator>(
11745 cct, db->get_iterator(PREFIX_OBJ));
11746 } else {
11747 it = std::make_unique<SortedCollectionListIterator>(
11748 db->get_iterator(PREFIX_OBJ));
11749 }
11750 if (start == ghobject_t() ||
11751 start.hobj == hobject_t() ||
11752 start == c->cid.get_min_hobj()) {
11753 it->upper_bound(coll_range_temp_start);
11754 temp = true;
11755 } else {
11756 if (start.hobj.is_temp()) {
11757 temp = true;
11758 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
11759 } else {
11760 temp = false;
11761 ceph_assert(start >= coll_range_start && start < coll_range_end);
11762 }
11763 dout(20) << __func__ << " temp=" << (int)temp << dendl;
11764 it->lower_bound(start);
11765 }
11766 if (end.hobj.is_max()) {
11767 pend = temp ? coll_range_temp_end : coll_range_end;
11768 } else {
11769 if (end.hobj.is_temp()) {
11770 if (temp) {
11771 pend = end;
11772 } else {
11773 *pnext = ghobject_t::get_max();
11774 return 0;
11775 }
11776 } else {
11777 pend = temp ? coll_range_temp_end : end;
11778 }
11779 }
11780 dout(20) << __func__ << " pend " << pend << dendl;
11781 while (true) {
11782 if (!it->valid() || it->is_ge(pend)) {
11783 if (!it->valid())
11784 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
11785 else
11786 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
11787 if (temp) {
11788 if (end.hobj.is_temp()) {
11789 if (it->valid() && it->is_lt(coll_range_temp_end)) {
11790 *pnext = it->oid();
11791 return 0;
11792 }
11793 break;
11794 }
11795 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
11796 temp = false;
11797 it->upper_bound(coll_range_start);
11798 if (end.hobj.is_max())
11799 pend = coll_range_end;
11800 else
11801 pend = end;
11802 dout(30) << __func__ << " pend " << pend << dendl;
11803 continue;
11804 }
11805 if (it->valid() && it->is_lt(coll_range_end)) {
11806 *pnext = it->oid();
11807 return 0;
11808 }
11809 break;
11810 }
11811 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
11812 if (ls->size() >= (unsigned)max) {
11813 dout(20) << __func__ << " reached max " << max << dendl;
11814 *pnext = it->oid();
11815 return 0;
11816 }
11817 ls->push_back(it->oid());
11818 it->next();
11819 }
11820 *pnext = ghobject_t::get_max();
11821 return 0;
11822 }
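// Note on the listing order above (illustrative summary): each collection
// owns two disjoint key ranges in PREFIX_OBJ, a "temp" range for temporary
// (e.g. recovery) objects and the regular range.  A single _collection_list()
// call starts in the temp range (unless `start` already points past it) and,
// once the temp range is exhausted, re-seeks to the start of the regular
// range, so one result vector may contain temp objects followed by regular
// ones.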
11823
11824 int BlueStore::omap_get(
11825 CollectionHandle &c_, ///< [in] Collection containing oid
11826 const ghobject_t &oid, ///< [in] Object containing omap
11827 bufferlist *header, ///< [out] omap header
11828 map<string, bufferlist> *out ///< [out] Key to value map
11829 )
11830 {
11831 Collection *c = static_cast<Collection *>(c_.get());
11832 return _omap_get(c, oid, header, out);
11833 }
11834
11835 int BlueStore::_omap_get(
11836 Collection *c, ///< [in] Collection containing oid
11837 const ghobject_t &oid, ///< [in] Object containing omap
11838 bufferlist *header, ///< [out] omap header
11839 map<string, bufferlist> *out ///< [out] Key to value map
11840 )
11841 {
11842 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11843 if (!c->exists)
11844 return -ENOENT;
11845 std::shared_lock l(c->lock);
11846 int r = 0;
11847 OnodeRef o = c->get_onode(oid, false);
11848 if (!o || !o->exists) {
11849 r = -ENOENT;
11850 goto out;
11851 }
11852 r = _onode_omap_get(o, header, out);
11853 out:
11854 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11855 << dendl;
11856 return r;
11857 }
11858
11859 int BlueStore::_onode_omap_get(
11860 const OnodeRef &o, ///< [in] Object containing omap
11861 bufferlist *header, ///< [out] omap header
11862 map<string, bufferlist> *out ///< [out] Key to value map
11863 )
11864 {
11865 int r = 0;
11866 if (!o || !o->exists) {
11867 r = -ENOENT;
11868 goto out;
11869 }
11870 if (!o->onode.has_omap())
11871 goto out;
11872 o->flush();
11873 {
11874 const string& prefix = o->get_omap_prefix();
11875 KeyValueDB::Iterator it = db->get_iterator(prefix);
11876 string head, tail;
11877 o->get_omap_header(&head);
11878 o->get_omap_tail(&tail);
11879 it->lower_bound(head);
11880 while (it->valid()) {
11881 if (it->key() == head) {
11882 dout(30) << __func__ << " got header" << dendl;
11883 *header = it->value();
11884 } else if (it->key() >= tail) {
11885 dout(30) << __func__ << " reached tail" << dendl;
11886 break;
11887 } else {
11888 string user_key;
11889 o->decode_omap_key(it->key(), &user_key);
11890 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11891 << " -> " << user_key << dendl;
11892 (*out)[user_key] = it->value();
11893 }
11894 it->next();
11895 }
11896 }
11897 out:
11898 return r;
11899 }
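// Note on key ordering (illustrative summary): within the object's omap
// prefix the header key sorts before every user key, and every user key
// sorts before the tail sentinel produced by get_omap_tail().  That is why
// the single lower_bound(head) scan above can return the header (if any)
// plus all user keys and stop as soon as it reaches `tail`.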
11900
11901 int BlueStore::omap_get_header(
11902 CollectionHandle &c_, ///< [in] Collection containing oid
11903 const ghobject_t &oid, ///< [in] Object containing omap
11904 bufferlist *header, ///< [out] omap header
11905 bool allow_eio ///< [in] don't assert on eio
11906 )
11907 {
11908 Collection *c = static_cast<Collection *>(c_.get());
11909 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11910 if (!c->exists)
11911 return -ENOENT;
11912 std::shared_lock l(c->lock);
11913 int r = 0;
11914 OnodeRef o = c->get_onode(oid, false);
11915 if (!o || !o->exists) {
11916 r = -ENOENT;
11917 goto out;
11918 }
11919 if (!o->onode.has_omap())
11920 goto out;
11921 o->flush();
11922 {
11923 string head;
11924 o->get_omap_header(&head);
11925 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
11926 dout(30) << __func__ << " got header" << dendl;
11927 } else {
11928 dout(30) << __func__ << " no header" << dendl;
11929 }
11930 }
11931 out:
11932 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11933 << dendl;
11934 return r;
11935 }
11936
11937 int BlueStore::omap_get_keys(
11938 CollectionHandle &c_, ///< [in] Collection containing oid
11939 const ghobject_t &oid, ///< [in] Object containing omap
11940 set<string> *keys ///< [out] Keys defined on oid
11941 )
11942 {
11943 Collection *c = static_cast<Collection *>(c_.get());
11944 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11945 if (!c->exists)
11946 return -ENOENT;
11947 auto start1 = mono_clock::now();
11948 std::shared_lock l(c->lock);
11949 int r = 0;
11950 OnodeRef o = c->get_onode(oid, false);
11951 if (!o || !o->exists) {
11952 r = -ENOENT;
11953 goto out;
11954 }
11955 if (!o->onode.has_omap())
11956 goto out;
11957 o->flush();
11958 {
11959 const string& prefix = o->get_omap_prefix();
11960 KeyValueDB::Iterator it = db->get_iterator(prefix);
11961 string head, tail;
11962 o->get_omap_key(string(), &head);
11963 o->get_omap_tail(&tail);
11964 it->lower_bound(head);
11965 while (it->valid()) {
11966 if (it->key() >= tail) {
11967 dout(30) << __func__ << " reached tail" << dendl;
11968 break;
11969 }
11970 string user_key;
11971 o->decode_omap_key(it->key(), &user_key);
11972 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11973 << " -> " << user_key << dendl;
11974 keys->insert(user_key);
11975 it->next();
11976 }
11977 }
11978 out:
11979 c->store->log_latency(
11980 __func__,
11981 l_bluestore_omap_get_keys_lat,
11982 mono_clock::now() - start1,
11983 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11984
11985 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11986 << dendl;
11987 return r;
11988 }
11989
11990 int BlueStore::omap_get_values(
11991 CollectionHandle &c_, ///< [in] Collection containing oid
11992 const ghobject_t &oid, ///< [in] Object containing omap
11993 const set<string> &keys, ///< [in] Keys to get
11994 map<string, bufferlist> *out ///< [out] Returned keys and values
11995 )
11996 {
11997 Collection *c = static_cast<Collection *>(c_.get());
11998 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11999 if (!c->exists)
12000 return -ENOENT;
12001 std::shared_lock l(c->lock);
12002 auto start1 = mono_clock::now();
12003 int r = 0;
12004 string final_key;
12005 OnodeRef o = c->get_onode(oid, false);
12006 if (!o || !o->exists) {
12007 r = -ENOENT;
12008 goto out;
12009 }
12010 if (!o->onode.has_omap()) {
12011 goto out;
12012 }
12013 o->flush();
12014 {
12015 const string& prefix = o->get_omap_prefix();
12016 o->get_omap_key(string(), &final_key);
12017 size_t base_key_len = final_key.size();
12018 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
12019 final_key.resize(base_key_len); // keep prefix
12020 final_key += *p;
12021 bufferlist val;
12022 if (db->get(prefix, final_key, &val) >= 0) {
12023 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
12024 << " -> " << *p << dendl;
12025 out->insert(make_pair(*p, val));
12026 }
12027 }
12028 }
12029 out:
12030 c->store->log_latency(
12031 __func__,
12032 l_bluestore_omap_get_values_lat,
12033 mono_clock::now() - start1,
12034 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12035
12036 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12037 << dendl;
12038 return r;
12039 }
12040
12041 #ifdef WITH_SEASTAR
12042 int BlueStore::omap_get_values(
12043 CollectionHandle &c_, ///< [in] Collection containing oid
12044 const ghobject_t &oid, ///< [in] Object containing omap
12045 const std::optional<string> &start_after, ///< [in] Keys to get
12046 map<string, bufferlist> *output ///< [out] Returned keys and values
12047 )
12048 {
12049 Collection *c = static_cast<Collection *>(c_.get());
12050 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12051 if (!c->exists)
12052 return -ENOENT;
12053 std::shared_lock l(c->lock);
12054 int r = 0;
12055 OnodeRef o = c->get_onode(oid, false);
12056 if (!o || !o->exists) {
12057 r = -ENOENT;
12058 goto out;
12059 }
12060 if (!o->onode.has_omap()) {
12061 goto out;
12062 }
12063 o->flush();
12064 {
12065 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
12066 if (!iter) {
12067 r = -ENOENT;
12068 goto out;
12069 }
12070 iter->upper_bound(*start_after);
12071 for (; iter->valid(); iter->next()) {
12072 output->insert(make_pair(iter->key(), iter->value()));
12073 }
12074 }
12075
12076 out:
12077 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12078 << dendl;
12079 return r;
12080 }
12081 #endif
12082
12083 int BlueStore::omap_check_keys(
12084 CollectionHandle &c_, ///< [in] Collection containing oid
12085 const ghobject_t &oid, ///< [in] Object containing omap
12086 const set<string> &keys, ///< [in] Keys to check
12087 set<string> *out ///< [out] Subset of keys defined on oid
12088 )
12089 {
12090 Collection *c = static_cast<Collection *>(c_.get());
12091 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12092 if (!c->exists)
12093 return -ENOENT;
12094 std::shared_lock l(c->lock);
12095 int r = 0;
12096 string final_key;
12097 OnodeRef o = c->get_onode(oid, false);
12098 if (!o || !o->exists) {
12099 r = -ENOENT;
12100 goto out;
12101 }
12102 if (!o->onode.has_omap()) {
12103 goto out;
12104 }
12105 o->flush();
12106 {
12107 const string& prefix = o->get_omap_prefix();
12108 o->get_omap_key(string(), &final_key);
12109 size_t base_key_len = final_key.size();
12110 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
12111 final_key.resize(base_key_len); // keep prefix
12112 final_key += *p;
12113 bufferlist val;
12114 if (db->get(prefix, final_key, &val) >= 0) {
12115 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
12116 << " -> " << *p << dendl;
12117 out->insert(*p);
12118 } else {
12119 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
12120 << " -> " << *p << dendl;
12121 }
12122 }
12123 }
12124 out:
12125 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12126 << dendl;
12127 return r;
12128 }
12129
12130 ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
12131 CollectionHandle &c_, ///< [in] collection
12132 const ghobject_t &oid ///< [in] object
12133 )
12134 {
12135 Collection *c = static_cast<Collection *>(c_.get());
12136 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
12137 if (!c->exists) {
12138 return ObjectMap::ObjectMapIterator();
12139 }
12140 std::shared_lock l(c->lock);
12141 OnodeRef o = c->get_onode(oid, false);
12142 if (!o || !o->exists) {
12143 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
12144 return ObjectMap::ObjectMapIterator();
12145 }
12146 o->flush();
12147 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
12148 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
12149 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
12150 }
12151
12152 // -----------------
12153 // write helpers
12154
12155 uint64_t BlueStore::_get_ondisk_reserved() const {
12156 ceph_assert(min_alloc_size);
12157 return round_up_to(
12158 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
12159 }
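// Example (illustrative arithmetic, assuming the usual SUPER_RESERVED of
// 8192): with min_alloc_size = 0x1000 the reserved area is
// round_up_to(max(8192, 4096), 4096) = 8192 bytes; with min_alloc_size =
// 0x10000 it grows to round_up_to(max(8192, 65536), 65536) = 65536 bytes.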
12160
12161 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
12162 {
12163 dout(10) << __func__ << " ondisk_format " << ondisk_format
12164 << " min_compat_ondisk_format " << min_compat_ondisk_format
12165 << dendl;
12166 ceph_assert(ondisk_format == latest_ondisk_format);
12167 {
12168 bufferlist bl;
12169 encode(ondisk_format, bl);
12170 t->set(PREFIX_SUPER, "ondisk_format", bl);
12171 }
12172 {
12173 bufferlist bl;
12174 encode(min_compat_ondisk_format, bl);
12175 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
12176 }
12177 }
12178
12179 int BlueStore::_open_super_meta()
12180 {
12181 // nid
12182 {
12183 nid_max = 0;
12184 bufferlist bl;
12185 db->get(PREFIX_SUPER, "nid_max", &bl);
12186 auto p = bl.cbegin();
12187 try {
12188 uint64_t v;
12189 decode(v, p);
12190 nid_max = v;
12191 } catch (ceph::buffer::error& e) {
12192 derr << __func__ << " unable to read nid_max" << dendl;
12193 return -EIO;
12194 }
12195 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
12196 nid_last = nid_max.load();
12197 }
12198
12199 // blobid
12200 {
12201 blobid_max = 0;
12202 bufferlist bl;
12203 db->get(PREFIX_SUPER, "blobid_max", &bl);
12204 auto p = bl.cbegin();
12205 try {
12206 uint64_t v;
12207 decode(v, p);
12208 blobid_max = v;
12209 } catch (ceph::buffer::error& e) {
12210 derr << __func__ << " unable to read blobid_max" << dendl;
12211 return -EIO;
12212 }
12213 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
12214 blobid_last = blobid_max.load();
12215 }
12216
12217 // freelist
12218 {
12219 bufferlist bl;
12220 db->get(PREFIX_SUPER, "freelist_type", &bl);
12221 if (bl.length()) {
12222 freelist_type = std::string(bl.c_str(), bl.length());
12223 } else {
12224 ceph_abort_msg("unsupported extent freelist manager");
12225 }
12226 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
12227 }
12228 // ondisk format
12229 int32_t compat_ondisk_format = 0;
12230 {
12231 bufferlist bl;
12232 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
12233 if (r < 0) {
12234 // base case: kraken bluestore is v1 and readable by v1
12235 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
12236 << dendl;
12237 ondisk_format = 1;
12238 compat_ondisk_format = 1;
12239 } else {
12240 auto p = bl.cbegin();
12241 try {
12242 decode(ondisk_format, p);
12243 } catch (ceph::buffer::error& e) {
12244 derr << __func__ << " unable to read ondisk_format" << dendl;
12245 return -EIO;
12246 }
12247 bl.clear();
12248 {
12249 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
12250 ceph_assert(!r);
12251 auto p = bl.cbegin();
12252 try {
12253 decode(compat_ondisk_format, p);
12254 } catch (ceph::buffer::error& e) {
12255 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
12256 return -EIO;
12257 }
12258 }
12259 }
12260 dout(1) << __func__ << " ondisk_format " << ondisk_format
12261 << " compat_ondisk_format " << compat_ondisk_format
12262 << dendl;
12263 }
12264
12265 if (latest_ondisk_format < compat_ondisk_format) {
12266 derr << __func__ << " compat_ondisk_format is "
12267 << compat_ondisk_format << " but we only understand version "
12268 << latest_ondisk_format << dendl;
12269 return -EPERM;
12270 }
12271
12272 {
12273 bufferlist bl;
12274 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
12275 auto p = bl.cbegin();
12276 try {
12277 uint64_t val;
12278 decode(val, p);
12279 min_alloc_size = val;
12280 min_alloc_size_order = ctz(val);
12281 min_alloc_size_mask = min_alloc_size - 1;
12282
12283 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
12284 } catch (ceph::buffer::error& e) {
12285 derr << __func__ << " unable to read min_alloc_size" << dendl;
12286 return -EIO;
12287 }
12288 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
12289 << std::dec << dendl;
12290 logger->set(l_bluestore_alloc_unit, min_alloc_size);
12291 }
12292
12293 // smr fields
12294 {
12295 bufferlist bl;
12296 int r = db->get(PREFIX_SUPER, "zone_size", &bl);
12297 if (r >= 0) {
12298 auto p = bl.cbegin();
12299 decode(zone_size, p);
12300 dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
12301 ceph_assert(bdev->is_smr());
12302 } else {
12303 ceph_assert(!bdev->is_smr());
12304 }
12305 }
12306 {
12307 bufferlist bl;
12308 int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
12309 if (r >= 0) {
12310 auto p = bl.cbegin();
12311 decode(first_sequential_zone, p);
12312 dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
12313 << first_sequential_zone << std::dec << dendl;
12314 ceph_assert(bdev->is_smr());
12315 } else {
12316 ceph_assert(!bdev->is_smr());
12317 }
12318 }
12319
12320 _set_per_pool_omap();
12321
12322 _open_statfs();
12323 _set_alloc_sizes();
12324 _set_throttle_params();
12325
12326 _set_csum();
12327 _set_compression();
12328 _set_blob_size();
12329
12330 _validate_bdev();
12331 return 0;
12332 }
12333
12334 int BlueStore::_upgrade_super()
12335 {
12336 dout(1) << __func__ << " from " << ondisk_format << ", latest "
12337 << latest_ondisk_format << dendl;
12338 if (ondisk_format < latest_ondisk_format) {
12339 ceph_assert(ondisk_format > 0);
12340 ceph_assert(ondisk_format < latest_ondisk_format);
12341
12342 KeyValueDB::Transaction t = db->get_transaction();
12343 if (ondisk_format == 1) {
12344 // changes:
12345 // - super: added ondisk_format
12346 // - super: added min_readable_ondisk_format
12347 // - super: added min_compat_ondisk_format
12348 // - super: added min_alloc_size
12349 // - super: removed min_min_alloc_size
12350 {
12351 bufferlist bl;
12352 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
12353 auto p = bl.cbegin();
12354 try {
12355 uint64_t val;
12356 decode(val, p);
12357 min_alloc_size = val;
12358 } catch (ceph::buffer::error& e) {
12359 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
12360 return -EIO;
12361 }
12362 t->set(PREFIX_SUPER, "min_alloc_size", bl);
12363 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
12364 }
12365 ondisk_format = 2;
12366 }
12367 if (ondisk_format == 2) {
12368 // changes:
12369 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
12370 // onodes are using the per-pool prefix until a repair is run; at that
12371 // point the per_pool_omap=1 key will be set.
12372 // - super: added per_pool_omap key, which indicates that *all* objects
12373 // are using the new prefix and key format
12374 ondisk_format = 3;
12375 }
12376 if (ondisk_format == 3) {
12377 // changes:
12378 // - FreelistManager keeps meta within bdev label
12379 int r = _write_out_fm_meta(0);
12380 ceph_assert(r == 0);
12381 ondisk_format = 4;
12382 }
12383 // This has to be the last operation
12384 _prepare_ondisk_format_super(t);
12385 int r = db->submit_transaction_sync(t);
12386 ceph_assert(r == 0);
12387 }
12388 // done
12389 dout(1) << __func__ << " done" << dendl;
12390 return 0;
12391 }
12392
12393 void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
12394 {
12395 if (o->onode.nid) {
12396 ceph_assert(o->exists);
12397 return;
12398 }
12399 uint64_t nid = ++nid_last;
12400 dout(20) << __func__ << " " << nid << dendl;
12401 o->onode.nid = nid;
12402 txc->last_nid = nid;
12403 o->exists = true;
12404 }
12405
12406 uint64_t BlueStore::_assign_blobid(TransContext *txc)
12407 {
12408 uint64_t bid = ++blobid_last;
12409 dout(20) << __func__ << " " << bid << dendl;
12410 txc->last_blobid = bid;
12411 return bid;
12412 }
12413
12414 void BlueStore::get_db_statistics(Formatter *f)
12415 {
12416 db->get_statistics(f);
12417 }
12418
12419 BlueStore::TransContext *BlueStore::_txc_create(
12420 Collection *c, OpSequencer *osr,
12421 list<Context*> *on_commits,
12422 TrackedOpRef osd_op)
12423 {
12424 TransContext *txc = new TransContext(cct, c, osr, on_commits);
12425 txc->t = db->get_transaction();
12426
12427 #ifdef WITH_BLKIN
12428 if (osd_op && osd_op->pg_trace) {
12429 txc->trace.init("TransContext", &trace_endpoint,
12430 &osd_op->pg_trace);
12431 txc->trace.event("txc create");
12432 txc->trace.keyval("txc seq", txc->seq);
12433 }
12434 #endif
12435
12436 osr->queue_new(txc);
12437 dout(20) << __func__ << " osr " << osr << " = " << txc
12438 << " seq " << txc->seq << dendl;
12439 return txc;
12440 }
12441
12442 void BlueStore::_txc_calc_cost(TransContext *txc)
12443 {
12444 // one "io" for the kv commit
12445 auto ios = 1 + txc->ioc.get_num_ios();
12446 auto cost = throttle_cost_per_io.load();
12447 txc->cost = ios * cost + txc->bytes;
12448 txc->ios = ios;
12449 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
12450 << ios << " ios * " << cost << " + " << txc->bytes
12451 << " bytes)" << dendl;
12452 }
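// Example (illustrative arithmetic): a transaction that queued 2 data aios
// counts 3 ios once the kv commit is added.  Assuming a throttle_cost_per_io
// of 4000 (the ssd default for bluestore_throttle_cost_per_io_ssd; treat the
// exact value as an assumption) and 8192 bytes of payload,
// cost = 3 * 4000 + 8192 = 20192.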
12453
12454 void BlueStore::_txc_update_store_statfs(TransContext *txc)
12455 {
12456 if (txc->statfs_delta.is_empty())
12457 return;
12458
12459 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
12460 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
12461 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
12462 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
12463 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
12464
12465 bufferlist bl;
12466 txc->statfs_delta.encode(bl);
12467 if (per_pool_stat_collection) {
12468 string key;
12469 get_pool_stat_key(txc->osd_pool_id, &key);
12470 txc->t->merge(PREFIX_STAT, key, bl);
12471
12472 std::lock_guard l(vstatfs_lock);
12473 auto& stats = osd_pools[txc->osd_pool_id];
12474 stats += txc->statfs_delta;
12475
12476 vstatfs += txc->statfs_delta; //non-persistent in this mode
12477
12478 } else {
12479 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
12480
12481 std::lock_guard l(vstatfs_lock);
12482 vstatfs += txc->statfs_delta;
12483 }
12484 txc->statfs_delta.reset();
12485 }
12486
12487 void BlueStore::_txc_state_proc(TransContext *txc)
12488 {
12489 while (true) {
12490 dout(10) << __func__ << " txc " << txc
12491 << " " << txc->get_state_name() << dendl;
12492 switch (txc->get_state()) {
12493 case TransContext::STATE_PREPARE:
12494 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
12495 if (txc->ioc.has_pending_aios()) {
12496 txc->set_state(TransContext::STATE_AIO_WAIT);
12497 #ifdef WITH_BLKIN
12498 if (txc->trace) {
12499 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
12500 }
12501 #endif
12502 txc->had_ios = true;
12503 _txc_aio_submit(txc);
12504 return;
12505 }
12506 // ** fall-thru **
12507
12508 case TransContext::STATE_AIO_WAIT:
12509 {
12510 mono_clock::duration lat = throttle.log_state_latency(
12511 *txc, logger, l_bluestore_state_aio_wait_lat);
12512 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
12513 dout(0) << __func__ << " slow aio_wait, txc = " << txc
12514 << ", latency = " << lat
12515 << dendl;
12516 }
12517 }
12518
12519 _txc_finish_io(txc); // may trigger blocked txc's too
12520 return;
12521
12522 case TransContext::STATE_IO_DONE:
12523 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
12524 if (txc->had_ios) {
12525 ++txc->osr->txc_with_unstable_io;
12526 }
12527 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
12528 txc->set_state(TransContext::STATE_KV_QUEUED);
12529 if (cct->_conf->bluestore_sync_submit_transaction) {
12530 if (txc->last_nid >= nid_max ||
12531 txc->last_blobid >= blobid_max) {
12532 dout(20) << __func__
12533 << " last_{nid,blobid} exceeds max, submit via kv thread"
12534 << dendl;
12535 } else if (txc->osr->kv_committing_serially) {
12536 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
12537 << dendl;
12538 // note: this is starvation-prone. once we have a txc in a busy
12539 // sequencer that is committing serially it is possible to keep
12540 // submitting new transactions fast enough that we get stuck doing
12541 // so. the alternative is to block here... fixme?
12542 } else if (txc->osr->txc_with_unstable_io) {
12543 dout(20) << __func__ << " prior txc(s) with unstable ios "
12544 << txc->osr->txc_with_unstable_io.load() << dendl;
12545 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
12546 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
12547 == 0) {
12548 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
12549 << dendl;
12550 } else {
12551 _txc_apply_kv(txc, true);
12552 }
12553 }
12554 {
12555 std::lock_guard l(kv_lock);
12556 kv_queue.push_back(txc);
12557 if (!kv_sync_in_progress) {
12558 kv_sync_in_progress = true;
12559 kv_cond.notify_one();
12560 }
12561 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
12562 kv_queue_unsubmitted.push_back(txc);
12563 ++txc->osr->kv_committing_serially;
12564 }
12565 if (txc->had_ios)
12566 kv_ios++;
12567 kv_throttle_costs += txc->cost;
12568 }
12569 return;
12570 case TransContext::STATE_KV_SUBMITTED:
12571 _txc_committed_kv(txc);
12572 // ** fall-thru **
12573
12574 case TransContext::STATE_KV_DONE:
12575 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
12576 if (txc->deferred_txn) {
12577 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
12578 _deferred_queue(txc);
12579 return;
12580 }
12581 txc->set_state(TransContext::STATE_FINISHING);
12582 break;
12583
12584 case TransContext::STATE_DEFERRED_CLEANUP:
12585 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
12586 txc->set_state(TransContext::STATE_FINISHING);
12587 // ** fall-thru **
12588
12589 case TransContext::STATE_FINISHING:
12590 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
12591 _txc_finish(txc);
12592 return;
12593
12594 default:
12595 derr << __func__ << " unexpected txc " << txc
12596 << " state " << txc->get_state_name() << dendl;
12597 ceph_abort_msg("unexpected txc state");
12598 return;
12599 }
12600 }
12601 }
12602
12603 void BlueStore::_txc_finish_io(TransContext *txc)
12604 {
12605 dout(20) << __func__ << " " << txc << dendl;
12606
12607 /*
12608 * we need to preserve the order of kv transactions,
12609 * even though aio will complete in any order.
12610 */
12611
12612 OpSequencer *osr = txc->osr.get();
12613 std::lock_guard l(osr->qlock);
12614 txc->set_state(TransContext::STATE_IO_DONE);
12615 txc->ioc.release_running_aios();
12616 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
12617 while (p != osr->q.begin()) {
12618 --p;
12619 if (p->get_state() < TransContext::STATE_IO_DONE) {
12620 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
12621 << p->get_state_name() << dendl;
12622 return;
12623 }
12624 if (p->get_state() > TransContext::STATE_IO_DONE) {
12625 ++p;
12626 break;
12627 }
12628 }
12629 do {
12630 _txc_state_proc(&*p++);
12631 } while (p != osr->q.end() &&
12632 p->get_state() == TransContext::STATE_IO_DONE);
12633
12634 if (osr->kv_submitted_waiters) {
12635 osr->qcond.notify_all();
12636 }
12637 }
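// Example (illustrative only) of the ordering rule above: with
// osr->q = [ A(KV_QUEUED), B(AIO_WAIT), C ] and C just reaching IO_DONE,
// the backward scan finds B still in AIO_WAIT and returns, leaving C parked.
// When B's aio later completes, the forward loop advances B and then C, so
// kv submission still happens in the original A, B, C order even though C's
// aio finished first.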
12638
12639 void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
12640 {
12641 dout(20) << __func__ << " txc " << txc
12642 << " onodes " << txc->onodes
12643 << " shared_blobs " << txc->shared_blobs
12644 << dendl;
12645
12646 // finalize onodes
12647 for (auto o : txc->onodes) {
12648 _record_onode(o, t);
12649 o->flushing_count++;
12650 }
12651
12652 // objects we modified but didn't affect the onode
12653 auto p = txc->modified_objects.begin();
12654 while (p != txc->modified_objects.end()) {
12655 if (txc->onodes.count(*p) == 0) {
12656 (*p)->flushing_count++;
12657 ++p;
12658 } else {
12659 // remove dups with onodes list to avoid problems in _txc_finish
12660 p = txc->modified_objects.erase(p);
12661 }
12662 }
12663
12664 // finalize shared_blobs
12665 for (auto sb : txc->shared_blobs) {
12666 string key;
12667 auto sbid = sb->get_sbid();
12668 get_shared_blob_key(sbid, &key);
12669 if (sb->persistent->empty()) {
12670 dout(20) << __func__ << " shared_blob 0x"
12671 << std::hex << sbid << std::dec
12672 << " is empty" << dendl;
12673 t->rmkey(PREFIX_SHARED_BLOB, key);
12674 } else {
12675 bufferlist bl;
12676 encode(*(sb->persistent), bl);
12677 dout(20) << __func__ << " shared_blob 0x"
12678 << std::hex << sbid << std::dec
12679 << " is " << bl.length() << " " << *sb << dendl;
12680 t->set(PREFIX_SHARED_BLOB, key, bl);
12681 }
12682 }
12683 }
12684
12685 void BlueStore::BSPerfTracker::update_from_perfcounters(
12686 PerfCounters &logger)
12687 {
12688 os_commit_latency_ns.consume_next(
12689 logger.get_tavg_ns(
12690 l_bluestore_commit_lat));
12691 os_apply_latency_ns.consume_next(
12692 logger.get_tavg_ns(
12693 l_bluestore_commit_lat));
12694 }
12695
12696 void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
12697 {
12698 dout(20) << __func__ << " txc " << txc << std::hex
12699 << " allocated 0x" << txc->allocated
12700 << " released 0x" << txc->released
12701 << std::dec << dendl;
12702
12703 if (!fm->is_null_manager())
12704 {
12705 // We have to handle the case where we allocate *and* deallocate the
12706 // same region in this transaction. The freelist doesn't like that.
12707 // (Actually, the only thing that cares is the BitmapFreelistManager
12708 // debug check. But that's important.)
12709 interval_set<uint64_t> tmp_allocated, tmp_released;
12710 interval_set<uint64_t> *pallocated = &txc->allocated;
12711 interval_set<uint64_t> *preleased = &txc->released;
12712 if (!txc->allocated.empty() && !txc->released.empty()) {
12713 interval_set<uint64_t> overlap;
12714 overlap.intersection_of(txc->allocated, txc->released);
12715 if (!overlap.empty()) {
12716 tmp_allocated = txc->allocated;
12717 tmp_allocated.subtract(overlap);
12718 tmp_released = txc->released;
12719 tmp_released.subtract(overlap);
12720 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
12721 << ", new allocated 0x" << tmp_allocated
12722 << " released 0x" << tmp_released << std::dec
12723 << dendl;
12724 pallocated = &tmp_allocated;
12725 preleased = &tmp_released;
12726 }
12727 }
12728
12729 // update freelist with non-overlap sets
12730 for (interval_set<uint64_t>::iterator p = pallocated->begin();
12731 p != pallocated->end();
12732 ++p) {
12733 fm->allocate(p.get_start(), p.get_len(), t);
12734 }
12735 for (interval_set<uint64_t>::iterator p = preleased->begin();
12736 p != preleased->end();
12737 ++p) {
12738 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
12739 << "~" << p.get_len() << std::dec << dendl;
12740 fm->release(p.get_start(), p.get_len(), t);
12741 }
12742 }
12743
12744 #ifdef HAVE_LIBZBD
12745 if (bdev->is_smr()) {
12746 for (auto& i : txc->old_zone_offset_refs) {
12747 dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
12748 << " offset 0x" << i.second << std::dec
12749 << " -> " << i.first.first->oid << dendl;
12750 string key;
12751 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12752 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
12753 }
12754 for (auto& i : txc->new_zone_offset_refs) {
12755 // (zone, offset) -> oid
12756 dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
12757 << " offset 0x" << i.second << std::dec
12758 << " -> " << i.first.first->oid << dendl;
12759 string key;
12760 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12761 bufferlist v;
12762 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
12763 }
12764 }
12765 #endif
12766
12767 _txc_update_store_statfs(txc);
12768 }
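// Example (illustrative only) of the overlap handling above: if a transaction
// allocated 0x10000~0x8000 and also released 0x14000~0x8000, the overlap is
// 0x14000~0x4000; the freelist then only sees allocate(0x10000, 0x4000) and
// release(0x18000, 0x4000), keeping the BitmapFreelistManager's bookkeeping
// consistent.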
12769
12770 void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
12771 {
12772 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
12773 {
12774 #if defined(WITH_LTTNG)
12775 auto start = mono_clock::now();
12776 #endif
12777
12778 #ifdef WITH_BLKIN
12779 if (txc->trace) {
12780 txc->trace.event("db async submit");
12781 }
12782 #endif
12783
12784 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
12785 ceph_assert(r == 0);
12786 txc->set_state(TransContext::STATE_KV_SUBMITTED);
12787 if (txc->osr->kv_submitted_waiters) {
12788 std::lock_guard l(txc->osr->qlock);
12789 txc->osr->qcond.notify_all();
12790 }
12791
12792 #if defined(WITH_LTTNG)
12793 if (txc->tracing) {
12794 tracepoint(
12795 bluestore,
12796 transaction_kv_submit_latency,
12797 txc->osr->get_sequencer_id(),
12798 txc->seq,
12799 sync_submit_transaction,
12800 ceph::to_seconds<double>(mono_clock::now() - start));
12801 }
12802 #endif
12803 }
12804
12805 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
12806 for (auto& o : *ls) {
12807 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
12808 << dendl;
12809 if (--o->flushing_count == 0 && o->waiting_count.load()) {
12810 std::lock_guard l(o->flush_lock);
12811 o->flush_cond.notify_all();
12812 }
12813 }
12814 }
12815 }
12816
12817 void BlueStore::_txc_committed_kv(TransContext *txc)
12818 {
12819 dout(20) << __func__ << " txc " << txc << dendl;
12820 throttle.complete_kv(*txc);
12821 {
12822 std::lock_guard l(txc->osr->qlock);
12823 txc->set_state(TransContext::STATE_KV_DONE);
12824 if (txc->ch->commit_queue) {
12825 txc->ch->commit_queue->queue(txc->oncommits);
12826 } else {
12827 finisher.queue(txc->oncommits);
12828 }
12829 }
12830 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
12831 log_latency_fn(
12832 __func__,
12833 l_bluestore_commit_lat,
12834 mono_clock::now() - txc->start,
12835 cct->_conf->bluestore_log_op_age,
12836 [&](auto lat) {
12837 return ", txc = " + stringify(txc);
12838 }
12839 );
12840 }
12841
12842 void BlueStore::_txc_finish(TransContext *txc)
12843 {
12844 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
12845 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
12846
12847 for (auto& sb : txc->shared_blobs_written) {
12848 sb->finish_write(txc->seq);
12849 }
12850 txc->shared_blobs_written.clear();
12851
12852 while (!txc->removed_collections.empty()) {
12853 _queue_reap_collection(txc->removed_collections.front());
12854 txc->removed_collections.pop_front();
12855 }
12856
12857 OpSequencerRef osr = txc->osr;
12858 bool empty = false;
12859 bool submit_deferred = false;
12860 OpSequencer::q_list_t releasing_txc;
12861 {
12862 std::lock_guard l(osr->qlock);
12863 txc->set_state(TransContext::STATE_DONE);
12864 bool notify = false;
12865 while (!osr->q.empty()) {
12866 TransContext *txc = &osr->q.front();
12867 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
12868 << dendl;
12869 if (txc->get_state() != TransContext::STATE_DONE) {
12870 if (txc->get_state() == TransContext::STATE_PREPARE &&
12871 deferred_aggressive) {
12872 // for _osr_drain_preceding()
12873 notify = true;
12874 }
12875 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
12876 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
12877 submit_deferred = true;
12878 }
12879 break;
12880 }
12881
12882 osr->q.pop_front();
12883 releasing_txc.push_back(*txc);
12884 }
12885
12886 if (osr->q.empty()) {
12887 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
12888 empty = true;
12889 }
12890
12891 // only drain()/drain_preceding() need wakeup,
12892 // other cases use kv_submitted_waiters
12893 if (notify || empty) {
12894 osr->qcond.notify_all();
12895 }
12896 }
12897
12898 while (!releasing_txc.empty()) {
12899 // release to allocator only after all preceding txc's have also
12900 // finished any deferred writes that potentially land in these
12901 // blocks
12902 auto txc = &releasing_txc.front();
12903 _txc_release_alloc(txc);
12904 releasing_txc.pop_front();
12905 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
12906 throttle.complete(*txc);
12907 delete txc;
12908 }
12909
12910 if (submit_deferred) {
12911 // we're pinning memory; flush! we could be more fine-grained here but
12912 // i'm not sure it's worth the bother.
12913 deferred_try_submit();
12914 }
12915
12916 if (empty && osr->zombie) {
12917 std::lock_guard l(zombie_osr_lock);
12918 if (zombie_osr_set.erase(osr->cid)) {
12919 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12920 } else {
12921 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
12922 << dendl;
12923 }
12924 }
12925 }
12926
12927 void BlueStore::_txc_release_alloc(TransContext *txc)
12928 {
12929 // it's expected we're called with lazy_release_lock already taken!
12930 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
12931 int r = 0;
12932 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12933 r = bdev->queue_discard(txc->released);
12934 if (r == 0) {
12935 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
12936 << txc->released << std::dec << dendl;
12937 goto out;
12938 }
12939 } else if (cct->_conf->bdev_enable_discard) {
12940 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
12941 bdev->discard(p.get_start(), p.get_len());
12942 }
12943 }
12944 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
12945 << txc->released << std::dec << dendl;
12946 alloc->release(txc->released);
12947 }
12948
12949 out:
12950 txc->allocated.clear();
12951 txc->released.clear();
12952 }
12953
12954 void BlueStore::_osr_attach(Collection *c)
12955 {
12956 // note: caller has coll_lock
12957 auto q = coll_map.find(c->cid);
12958 if (q != coll_map.end()) {
12959 c->osr = q->second->osr;
12960 ldout(cct, 10) << __func__ << " " << c->cid
12961 << " reusing osr " << c->osr << " from existing coll "
12962 << q->second << dendl;
12963 } else {
12964 std::lock_guard l(zombie_osr_lock);
12965 auto p = zombie_osr_set.find(c->cid);
12966 if (p == zombie_osr_set.end()) {
12967 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
12968 ldout(cct, 10) << __func__ << " " << c->cid
12969 << " fresh osr " << c->osr << dendl;
12970 } else {
12971 c->osr = p->second;
12972 zombie_osr_set.erase(p);
12973 ldout(cct, 10) << __func__ << " " << c->cid
12974 << " resurrecting zombie osr " << c->osr << dendl;
12975 c->osr->zombie = false;
12976 }
12977 }
12978 }
12979
12980 void BlueStore::_osr_register_zombie(OpSequencer *osr)
12981 {
12982 std::lock_guard l(zombie_osr_lock);
12983 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
12984 osr->zombie = true;
12985 auto i = zombie_osr_set.emplace(osr->cid, osr);
12986 // this is either a new insertion or the same osr is already there
12987 ceph_assert(i.second || i.first->second == osr);
12988 }
12989
12990 void BlueStore::_osr_drain_preceding(TransContext *txc)
12991 {
12992 OpSequencer *osr = txc->osr.get();
12993 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
12994 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
12995 {
12996 // submit anything pending
12997 osr->deferred_lock.lock();
12998 if (osr->deferred_pending && !osr->deferred_running) {
12999 _deferred_submit_unlock(osr);
13000 } else {
13001 osr->deferred_lock.unlock();
13002 }
13003 }
13004 {
13005 // wake up any previously finished deferred events
13006 std::lock_guard l(kv_lock);
13007 if (!kv_sync_in_progress) {
13008 kv_sync_in_progress = true;
13009 kv_cond.notify_one();
13010 }
13011 }
13012 osr->drain_preceding(txc);
13013 --deferred_aggressive;
13014 dout(10) << __func__ << " " << osr << " done" << dendl;
13015 }
13016
13017 void BlueStore::_osr_drain(OpSequencer *osr)
13018 {
13019 dout(10) << __func__ << " " << osr << dendl;
13020 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13021 {
13022 // submit anything pending
13023 osr->deferred_lock.lock();
13024 if (osr->deferred_pending && !osr->deferred_running) {
13025 _deferred_submit_unlock(osr);
13026 } else {
13027 osr->deferred_lock.unlock();
13028 }
13029 }
13030 {
13031 // wake up any previously finished deferred events
13032 std::lock_guard l(kv_lock);
13033 if (!kv_sync_in_progress) {
13034 kv_sync_in_progress = true;
13035 kv_cond.notify_one();
13036 }
13037 }
13038 osr->drain();
13039 --deferred_aggressive;
13040 dout(10) << __func__ << " " << osr << " done" << dendl;
13041 }
13042
13043 void BlueStore::_osr_drain_all()
13044 {
13045 dout(10) << __func__ << dendl;
13046
13047 set<OpSequencerRef> s;
13048 vector<OpSequencerRef> zombies;
13049 {
13050 std::shared_lock l(coll_lock);
13051 for (auto& i : coll_map) {
13052 s.insert(i.second->osr);
13053 }
13054 }
13055 {
13056 std::lock_guard l(zombie_osr_lock);
13057 for (auto& i : zombie_osr_set) {
13058 s.insert(i.second);
13059 zombies.push_back(i.second);
13060 }
13061 }
13062 dout(20) << __func__ << " osr_set " << s << dendl;
13063
13064 ++deferred_aggressive;
13065 {
13066 // submit anything pending
13067 deferred_try_submit();
13068 }
13069 {
13070 // wake up any previously finished deferred events
13071 std::lock_guard l(kv_lock);
13072 kv_cond.notify_one();
13073 }
13074 {
13075 std::lock_guard l(kv_finalize_lock);
13076 kv_finalize_cond.notify_one();
13077 }
13078 for (auto osr : s) {
13079 dout(20) << __func__ << " drain " << osr << dendl;
13080 osr->drain();
13081 }
13082 --deferred_aggressive;
13083
13084 {
13085 std::lock_guard l(zombie_osr_lock);
13086 for (auto& osr : zombies) {
13087 if (zombie_osr_set.erase(osr->cid)) {
13088 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13089 ceph_assert(osr->q.empty());
13090 } else if (osr->zombie) {
13091 dout(10) << __func__ << " empty zombie osr " << osr
13092 << " already reaped" << dendl;
13093 ceph_assert(osr->q.empty());
13094 } else {
13095 dout(10) << __func__ << " empty zombie osr " << osr
13096 << " resurrected" << dendl;
13097 }
13098 }
13099 }
13100
13101 dout(10) << __func__ << " done" << dendl;
13102 }
13103
13104
13105 void BlueStore::_kv_start()
13106 {
13107 dout(10) << __func__ << dendl;
13108
13109 finisher.start();
13110 kv_sync_thread.create("bstore_kv_sync");
13111 kv_finalize_thread.create("bstore_kv_final");
13112 }
13113
13114 void BlueStore::_kv_stop()
13115 {
13116 dout(10) << __func__ << dendl;
13117 {
13118 std::unique_lock l{kv_lock};
13119 while (!kv_sync_started) {
13120 kv_cond.wait(l);
13121 }
13122 kv_stop = true;
13123 kv_cond.notify_all();
13124 }
13125 {
13126 std::unique_lock l{kv_finalize_lock};
13127 while (!kv_finalize_started) {
13128 kv_finalize_cond.wait(l);
13129 }
13130 kv_finalize_stop = true;
13131 kv_finalize_cond.notify_all();
13132 }
13133 kv_sync_thread.join();
13134 kv_finalize_thread.join();
13135 ceph_assert(removed_collections.empty());
13136 {
13137 std::lock_guard l(kv_lock);
13138 kv_stop = false;
13139 }
13140 {
13141 std::lock_guard l(kv_finalize_lock);
13142 kv_finalize_stop = false;
13143 }
13144 dout(10) << __func__ << " stopping finishers" << dendl;
13145 finisher.wait_for_empty();
13146 finisher.stop();
13147 dout(10) << __func__ << " stopped" << dendl;
13148 }
13149
13150 void BlueStore::_kv_sync_thread()
13151 {
13152 dout(10) << __func__ << " start" << dendl;
13153 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
13154 std::unique_lock l{kv_lock};
13155 ceph_assert(!kv_sync_started);
13156 kv_sync_started = true;
13157 kv_cond.notify_all();
13158
13159 auto t0 = mono_clock::now();
13160 timespan twait = ceph::make_timespan(0);
13161 size_t kv_submitted = 0;
13162
13163 while (true) {
13164 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
13165 auto observation_period =
13166 ceph::make_timespan(period);
13167 auto elapsed = mono_clock::now() - t0;
13168 if (period && elapsed >= observation_period) {
13169 dout(5) << __func__ << " utilization: idle "
13170 << twait << " of " << elapsed
13171 << ", submitted: " << kv_submitted
13172 <<dendl;
13173 t0 = mono_clock::now();
13174 twait = ceph::make_timespan(0);
13175 kv_submitted = 0;
13176 }
13177 ceph_assert(kv_committing.empty());
13178 if (kv_queue.empty() &&
13179 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
13180 !deferred_aggressive)) {
13181 if (kv_stop)
13182 break;
13183 dout(20) << __func__ << " sleep" << dendl;
13184 auto t = mono_clock::now();
13185 kv_sync_in_progress = false;
13186 kv_cond.wait(l);
13187 twait += mono_clock::now() - t;
13188
13189 dout(20) << __func__ << " wake" << dendl;
13190 } else {
13191 deque<TransContext*> kv_submitting;
13192 deque<DeferredBatch*> deferred_done, deferred_stable;
13193 uint64_t aios = 0, costs = 0;
13194
13195 dout(20) << __func__ << " committing " << kv_queue.size()
13196 << " submitting " << kv_queue_unsubmitted.size()
13197 << " deferred done " << deferred_done_queue.size()
13198 << " stable " << deferred_stable_queue.size()
13199 << dendl;
13200 kv_committing.swap(kv_queue);
13201 kv_submitting.swap(kv_queue_unsubmitted);
13202 deferred_done.swap(deferred_done_queue);
13203 deferred_stable.swap(deferred_stable_queue);
13204 aios = kv_ios;
13205 costs = kv_throttle_costs;
13206 kv_ios = 0;
13207 kv_throttle_costs = 0;
13208 l.unlock();
13209
13210 dout(30) << __func__ << " committing " << kv_committing << dendl;
13211 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
13212 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
13213 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
13214
13215 auto start = mono_clock::now();
13216
13217 bool force_flush = false;
13218 // if bluefs is sharing the same device as data (only), then we
13219 // can rely on the bluefs commit to flush the device and make
13220 // deferred aios stable. that means that if we do have completed deferred
13221 // txcs AND we are not on a single shared device, we must force a flush.
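// A hedged summary of the rule implemented just below (not a separate code
// path): on a single shared device, flush if there were aios, if there is
// nothing else to commit, or if deferred_aggressive is set; on a separate DB
// device, flush whenever there were aios or newly finished deferred batches
// that must be made durable before their keys can be cleaned up.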
13222 if (bluefs && bluefs_layout.single_shared_device()) {
13223 if (aios) {
13224 force_flush = true;
13225 } else if (kv_committing.empty() && deferred_stable.empty()) {
13226 force_flush = true; // there's nothing else to commit!
13227 } else if (deferred_aggressive) {
13228 force_flush = true;
13229 }
13230 } else {
13231 if (aios || !deferred_done.empty()) {
13232 force_flush = true;
13233 } else {
13234 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
13235 }
13236 }
13237
13238 if (force_flush) {
13239 dout(20) << __func__ << " num_aios=" << aios
13240 << " force_flush=" << (int)force_flush
13241 << ", flushing, deferred done->stable" << dendl;
13242 // flush/barrier on block device
13243 bdev->flush();
13244
13245 // if we flush then deferred done are now deferred stable
13246 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
13247 deferred_done.end());
13248 deferred_done.clear();
13249 }
13250 auto after_flush = mono_clock::now();
13251
13252 // we will use one final transaction to force a sync
13253 KeyValueDB::Transaction synct = db->get_transaction();
13254
13255 // increase {nid,blobid}_max? note that this covers both the
13256 // case where we are approaching the max and the case where we have
13257 // already passed it. in either case, we increase the max in the earliest
13258 // txn we submit.
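// Worked example (purely illustrative numbers, assuming something like
// bluestore_nid_prealloc = 1024): with nid_max = 1024 and nid_last = 600,
// 600 + 1024/2 = 1112 > 1024, so this pass persists
// new_nid_max = 600 + 1024 = 1624 well before the ids actually run out.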
13259 uint64_t new_nid_max = 0, new_blobid_max = 0;
13260 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
13261 KeyValueDB::Transaction t =
13262 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13263 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
13264 bufferlist bl;
13265 encode(new_nid_max, bl);
13266 t->set(PREFIX_SUPER, "nid_max", bl);
13267 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
13268 }
13269 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
13270 KeyValueDB::Transaction t =
13271 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13272 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
13273 bufferlist bl;
13274 encode(new_blobid_max, bl);
13275 t->set(PREFIX_SUPER, "blobid_max", bl);
13276 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
13277 }
13278
13279 for (auto txc : kv_committing) {
13280 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
13281 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
13282 ++kv_submitted;
13283 _txc_apply_kv(txc, false);
13284 --txc->osr->kv_committing_serially;
13285 } else {
13286 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
13287 }
13288 if (txc->had_ios) {
13289 --txc->osr->txc_with_unstable_io;
13290 }
13291 }
13292
13293 // release throttle *before* we commit. this allows new ops
13294 // to be prepared and enter the pipeline while we are waiting on
13295 // the kv commit sync/flush. then hopefully on the next
13296 // iteration there will already be ops awake. otherwise, we
13297 // end up going to sleep, and then wake up when the very first
13298 // transaction is ready for commit.
13299 throttle.release_kv_throttle(costs);
13300
13301 // clean up now-stable deferred keys (via synct)
13302 for (auto b : deferred_stable) {
13303 for (auto& txc : b->txcs) {
13304 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
13305 ceph_assert(wt.released.empty()); // only kraken did this
13306 string key;
13307 get_deferred_key(wt.seq, &key);
13308 synct->rm_single_key(PREFIX_DEFERRED, key);
13309 }
13310 }
13311
13312 #if defined(WITH_LTTNG)
13313 auto sync_start = mono_clock::now();
13314 #endif
13315 // submit synct synchronously (block and wait for it to commit)
13316 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
13317 ceph_assert(r == 0);
13318
13319 #ifdef WITH_BLKIN
13320 for (auto txc : kv_committing) {
13321 if (txc->trace) {
13322 txc->trace.event("db sync submit");
13323 txc->trace.keyval("kv_committing size", kv_committing.size());
13324 }
13325 }
13326 #endif
13327
13328 int committing_size = kv_committing.size();
13329 int deferred_size = deferred_stable.size();
13330
13331 #if defined(WITH_LTTNG)
13332 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
13333 for (auto txc: kv_committing) {
13334 if (txc->tracing) {
13335 tracepoint(
13336 bluestore,
13337 transaction_kv_sync_latency,
13338 txc->osr->get_sequencer_id(),
13339 txc->seq,
13340 kv_committing.size(),
13341 deferred_done.size(),
13342 deferred_stable.size(),
13343 sync_latency);
13344 }
13345 }
13346 #endif
13347
13348 {
13349 std::unique_lock m{kv_finalize_lock};
13350 if (kv_committing_to_finalize.empty()) {
13351 kv_committing_to_finalize.swap(kv_committing);
13352 } else {
13353 kv_committing_to_finalize.insert(
13354 kv_committing_to_finalize.end(),
13355 kv_committing.begin(),
13356 kv_committing.end());
13357 kv_committing.clear();
13358 }
13359 if (deferred_stable_to_finalize.empty()) {
13360 deferred_stable_to_finalize.swap(deferred_stable);
13361 } else {
13362 deferred_stable_to_finalize.insert(
13363 deferred_stable_to_finalize.end(),
13364 deferred_stable.begin(),
13365 deferred_stable.end());
13366 deferred_stable.clear();
13367 }
13368 if (!kv_finalize_in_progress) {
13369 kv_finalize_in_progress = true;
13370 kv_finalize_cond.notify_one();
13371 }
13372 }
13373
13374 if (new_nid_max) {
13375 nid_max = new_nid_max;
13376 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
13377 }
13378 if (new_blobid_max) {
13379 blobid_max = new_blobid_max;
13380 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
13381 }
13382
13383 {
13384 auto finish = mono_clock::now();
13385 ceph::timespan dur_flush = after_flush - start;
13386 ceph::timespan dur_kv = finish - after_flush;
13387 ceph::timespan dur = finish - start;
13388 dout(20) << __func__ << " committed " << committing_size
13389 << " cleaned " << deferred_size
13390 << " in " << dur
13391 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
13392 << dendl;
13393 log_latency("kv_flush",
13394 l_bluestore_kv_flush_lat,
13395 dur_flush,
13396 cct->_conf->bluestore_log_op_age);
13397 log_latency("kv_commit",
13398 l_bluestore_kv_commit_lat,
13399 dur_kv,
13400 cct->_conf->bluestore_log_op_age);
13401 log_latency("kv_sync",
13402 l_bluestore_kv_sync_lat,
13403 dur,
13404 cct->_conf->bluestore_log_op_age);
13405 }
13406
13407 l.lock();
13408 // previously deferred "done" are now "stable" by virtue of this
13409 // commit cycle.
13410 deferred_stable_queue.swap(deferred_done);
13411 }
13412 }
13413 dout(10) << __func__ << " finish" << dendl;
13414 kv_sync_started = false;
13415 }
13416
13417 void BlueStore::_kv_finalize_thread()
13418 {
13419 deque<TransContext*> kv_committed;
13420 deque<DeferredBatch*> deferred_stable;
13421 dout(10) << __func__ << " start" << dendl;
13422 std::unique_lock l(kv_finalize_lock);
13423 ceph_assert(!kv_finalize_started);
13424 kv_finalize_started = true;
13425 kv_finalize_cond.notify_all();
13426 while (true) {
13427 ceph_assert(kv_committed.empty());
13428 ceph_assert(deferred_stable.empty());
13429 if (kv_committing_to_finalize.empty() &&
13430 deferred_stable_to_finalize.empty()) {
13431 if (kv_finalize_stop)
13432 break;
13433 dout(20) << __func__ << " sleep" << dendl;
13434 kv_finalize_in_progress = false;
13435 kv_finalize_cond.wait(l);
13436 dout(20) << __func__ << " wake" << dendl;
13437 } else {
13438 kv_committed.swap(kv_committing_to_finalize);
13439 deferred_stable.swap(deferred_stable_to_finalize);
13440 l.unlock();
13441 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
13442 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
13443
13444 auto start = mono_clock::now();
13445
13446 while (!kv_committed.empty()) {
13447 TransContext *txc = kv_committed.front();
13448 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
13449 _txc_state_proc(txc);
13450 kv_committed.pop_front();
13451 }
13452
13453 for (auto b : deferred_stable) {
13454 auto p = b->txcs.begin();
13455 while (p != b->txcs.end()) {
13456 TransContext *txc = &*p;
13457 p = b->txcs.erase(p); // unlink here because
13458 _txc_state_proc(txc); // this may destroy txc
13459 }
13460 delete b;
13461 }
13462 deferred_stable.clear();
13463
13464 if (!deferred_aggressive) {
13465 if (deferred_queue_size >= deferred_batch_ops.load() ||
13466 throttle.should_submit_deferred()) {
13467 deferred_try_submit();
13468 }
13469 }
13470
13471 // this is as good a place as any ...
13472 _reap_collections();
13473
13474 logger->set(l_bluestore_fragmentation,
13475 (uint64_t)(alloc->get_fragmentation() * 1000));
13476
13477 log_latency("kv_final",
13478 l_bluestore_kv_final_lat,
13479 mono_clock::now() - start,
13480 cct->_conf->bluestore_log_op_age);
13481
13482 l.lock();
13483 }
13484 }
13485 dout(10) << __func__ << " finish" << dendl;
13486 kv_finalize_started = false;
13487 }
13488
13489 #ifdef HAVE_LIBZBD
13490 void BlueStore::_zoned_cleaner_start()
13491 {
13492 dout(10) << __func__ << dendl;
13493 zoned_cleaner_thread.create("bstore_zcleaner");
13494 }
13495
13496 void BlueStore::_zoned_cleaner_stop()
13497 {
13498 dout(10) << __func__ << dendl;
13499 {
13500 std::unique_lock l{zoned_cleaner_lock};
13501 while (!zoned_cleaner_started) {
13502 zoned_cleaner_cond.wait(l);
13503 }
13504 zoned_cleaner_stop = true;
13505 zoned_cleaner_cond.notify_all();
13506 }
13507 zoned_cleaner_thread.join();
13508 {
13509 std::lock_guard l{zoned_cleaner_lock};
13510 zoned_cleaner_stop = false;
13511 }
13512 dout(10) << __func__ << " done" << dendl;
13513 }
13514
13515 void BlueStore::_zoned_cleaner_thread()
13516 {
13517 dout(10) << __func__ << " start" << dendl;
13518 std::unique_lock l{zoned_cleaner_lock};
13519 ceph_assert(!zoned_cleaner_started);
13520 zoned_cleaner_started = true;
13521 zoned_cleaner_cond.notify_all();
13522 auto a = dynamic_cast<ZonedAllocator*>(alloc);
13523 ceph_assert(a);
13524 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
13525 ceph_assert(f);
13526 while (true) {
13527 // thresholds to trigger cleaning
13528 // FIXME
13529 float min_score = .05; // score: bytes saved / bytes moved
13530 uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning
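// Illustrative numbers only (actual values depend on zone_size): with a
// 256 MiB zone, min_saved = 8 MiB; a zone holding 32 MiB live and 4 MiB dead
// data scores 4/32 = 0.125 >= min_score, but is still skipped because only
// 4 MiB < min_saved would be reclaimed.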
13531 auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
13532 if (zone_to_clean < 0) {
13533 if (zoned_cleaner_stop) {
13534 break;
13535 }
13536 auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
13537 dout(20) << __func__ << " sleep for " << period << dendl;
13538 zoned_cleaner_cond.wait_for(l, period);
13539 dout(20) << __func__ << " wake" << dendl;
13540 } else {
13541 l.unlock();
13542 a->set_cleaning_zone(zone_to_clean);
13543 _zoned_clean_zone(zone_to_clean, a, f);
13544 a->clear_cleaning_zone(zone_to_clean);
13545 l.lock();
13546 }
13547 }
13548 dout(10) << __func__ << " finish" << dendl;
13549 zoned_cleaner_started = false;
13550 }
13551
13552 void BlueStore::_zoned_clean_zone(
13553 uint64_t zone,
13554 ZonedAllocator *a,
13555 ZonedFreelistManager *f
13556 )
13557 {
13558 dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
13559
13560 KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
13561 std::string zone_start;
13562 get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
13563 for (it->lower_bound(zone_start); it->valid(); it->next()) {
13564 uint32_t z;
13565 uint64_t offset;
13566 ghobject_t oid;
13567 string k = it->key();
13568 int r = get_key_zone_offset_object(k, &z, &offset, &oid);
13569 if (r < 0) {
13570 derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
13571 << dendl;
13572 continue;
13573 }
13574 if (zone != z) {
13575 dout(10) << __func__ << " reached end of zone refs" << dendl;
13576 break;
13577 }
13578 dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
13579 << std::dec << " " << oid << dendl;
13580 _clean_some(oid, zone);
13581 }
13582
13583 if (a->get_live_bytes(zone) > 0) {
13584 derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
13585 << " live bytes" << std::dec << dendl;
13586 // should we do something else here to avoid a live-lock in the event of a problem?
13587 return;
13588 }
13589
13590 // make sure transactions flush/drain/commit (and data is all rewritten
13591 // safely elsewhere) before we blow away the cleaned zone
13592 _osr_drain_all();
13593
13594 // reset the device zone
13595 dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
13596 bdev->reset_zone(zone);
13597
13598 // record that we can now write there
13599 f->mark_zone_to_clean_free(zone, db);
13600 bdev->flush();
13601
13602 // then allow ourselves to start allocating there
13603 dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
13604 << dendl;
13605 a->reset_zone(zone);
13606 }
13607
13608 void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
13609 {
13610 dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
13611 << dendl;
13612
13613 CollectionRef cref = _get_collection_by_oid(oid);
13614 if (!cref) {
13615 dout(10) << __func__ << " can't find collection for " << oid << dendl;
13616 return;
13617 }
13618 Collection *c = cref.get();
13619
13620 // serialize io dispatch vs other transactions
13621 std::lock_guard l(atomic_alloc_and_submit_lock);
13622 std::unique_lock l2(c->lock);
13623
13624 auto o = c->get_onode(oid, false);
13625 if (!o) {
13626 dout(10) << __func__ << " can't find " << oid << dendl;
13627 return;
13628 }
13629
13630 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
13631 _dump_onode<30>(cct, *o);
13632
13633 // NOTE: This is a naive rewrite strategy. If any blobs are
13634 // shared, they will be duplicated for each object that references
13635 // them. That means any cloned/snapshotted objects will explode
13636 // their utilization. This won't matter for RGW workloads, but
13637 // for RBD and CephFS it is completely unacceptable, and it's
13638 // entirely reasonable to have "archival" data workloads on SMR
13639 // for CephFS and (possibly/probably) RBD.
13640 //
13641 // At some point we need to replace this with something more
13642 // sophisticated that ensures that a shared blob gets moved once
13643 // and all referencing objects get updated to point to the new
13644 // location.
13645
13646 map<uint32_t, uint32_t> to_move;
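// The loop below flags for rewrite every logical extent whose blob has at
// least one valid pextent inside the zone being cleaned; e.g., assuming
// zone_size = 256 MiB, a pextent at offset 0x30000000 (768 MiB) maps to
// zone 3 and would mark its logical extent for rewriting.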
13647 for (auto& e : o->extent_map.extent_map) {
13648 bool touches_zone = false;
13649 for (auto& be : e.blob->get_blob().get_extents()) {
13650 if (be.is_valid()) {
13651 uint32_t z = be.offset / zone_size;
13652 if (z == zone) {
13653 touches_zone = true;
13654 break;
13655 }
13656 }
13657 }
13658 if (touches_zone) {
13659 to_move[e.logical_offset] = e.length;
13660 }
13661 }
13662 if (to_move.empty()) {
13663 dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
13664 << std::dec << " from " << oid << dendl;
13665 return;
13666 }
13667
13668 dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
13669 << std::dec << dendl;
13670 OpSequencer *osr = c->osr.get();
13671 TransContext *txc = _txc_create(c, osr, nullptr);
13672
13673 spg_t pgid;
13674 if (c->cid.is_pg(&pgid)) {
13675 txc->osd_pool_id = pgid.pool();
13676 }
13677
13678 for (auto& [offset, length] : to_move) {
13679 bufferlist bl;
13680 int r = _do_read(c, o, offset, length, bl, 0);
13681 ceph_assert(r == (int)length);
13682
13683 r = _do_write(txc, cref, o, offset, length, bl, 0);
13684 ceph_assert(r >= 0);
13685 }
13686 txc->write_onode(o);
13687
13688 _txc_write_nodes(txc, txc->t);
13689 _txc_finalize_kv(txc, txc->t);
13690 _txc_state_proc(txc);
13691 }
13692 #endif
13693
13694 bluestore_deferred_op_t *BlueStore::_get_deferred_op(
13695 TransContext *txc, uint64_t len)
13696 {
13697 if (!txc->deferred_txn) {
13698 txc->deferred_txn = new bluestore_deferred_transaction_t;
13699 }
13700 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
13701 logger->inc(l_bluestore_issued_deferred_writes);
13702 logger->inc(l_bluestore_issued_deferred_write_bytes, len);
13703 return &txc->deferred_txn->ops.back();
13704 }
13705
13706 void BlueStore::_deferred_queue(TransContext *txc)
13707 {
13708 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
13709
13710 DeferredBatch *tmp;
13711 txc->osr->deferred_lock.lock();
13712 {
13713 if (!txc->osr->deferred_pending) {
13714 tmp = new DeferredBatch(cct, txc->osr.get());
13715 } else {
13716 tmp = txc->osr->deferred_pending;
13717 }
13718 }
13719
13720 tmp->txcs.push_back(*txc);
13721 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
13722 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
13723 const auto& op = *opi;
13724 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
13725 bufferlist::const_iterator p = op.data.begin();
13726 for (auto e : op.extents) {
13727 tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
13728 }
13729 }
13730
13731 {
13732 ++deferred_queue_size;
13733 txc->osr->deferred_pending = tmp;
13734 // condition "tmp->txcs.size() == 1" means deferred_pending was originally empty,
13735 // so we should add the osr to deferred_queue.
13736 if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
13737 deferred_lock.lock();
13738 deferred_queue.push_back(*txc->osr);
13739 deferred_lock.unlock();
13740 }
13741
13742 if (deferred_aggressive &&
13743 !txc->osr->deferred_running) {
13744 _deferred_submit_unlock(txc->osr.get());
13745 } else {
13746 txc->osr->deferred_lock.unlock();
13747 }
13748 }
13749 }
13750
13751 void BlueStore::deferred_try_submit()
13752 {
13753 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
13754 << deferred_queue_size << " txcs" << dendl;
13755 vector<OpSequencerRef> osrs;
13756
13757 {
13758 std::lock_guard l(deferred_lock);
13759 osrs.reserve(deferred_queue.size());
13760 for (auto& osr : deferred_queue) {
13761 osrs.push_back(&osr);
13762 }
13763 }
13764
13765 for (auto& osr : osrs) {
13766 osr->deferred_lock.lock();
13767 if (osr->deferred_pending) {
13768 if (!osr->deferred_running) {
13769 _deferred_submit_unlock(osr.get());
13770 } else {
13771 osr->deferred_lock.unlock();
13772 dout(20) << __func__ << " osr " << osr << " already has running"
13773 << dendl;
13774 }
13775 } else {
13776 osr->deferred_lock.unlock();
13777 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
13778 }
13779 }
13780
13781 {
13782 std::lock_guard l(deferred_lock);
13783 deferred_last_submitted = ceph_clock_now();
13784 }
13785 }
13786
13787 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
13788 {
13789 dout(10) << __func__ << " osr " << osr
13790 << " " << osr->deferred_pending->iomap.size() << " ios pending "
13791 << dendl;
13792 ceph_assert(osr->deferred_pending);
13793 ceph_assert(!osr->deferred_running);
13794
13795 auto b = osr->deferred_pending;
13796 deferred_queue_size -= b->seq_bytes.size();
13797 ceph_assert(deferred_queue_size >= 0);
13798
13799 osr->deferred_running = osr->deferred_pending;
13800 osr->deferred_pending = nullptr;
13801
13802 osr->deferred_lock.unlock();
13803
13804 for (auto& txc : b->txcs) {
13805 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
13806 }
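// Coalesce the batch's iomap into the largest possible contiguous writes:
// adjacent entries are appended into 'bl' and a single aio_write() is issued
// whenever a gap in the offsets (or the end of the map) is reached.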
13807 uint64_t start = 0, pos = 0;
13808 bufferlist bl;
13809 auto i = b->iomap.begin();
13810 while (true) {
13811 if (i == b->iomap.end() || i->first != pos) {
13812 if (bl.length()) {
13813 dout(20) << __func__ << " write 0x" << std::hex
13814 << start << "~" << bl.length()
13815 << " crc " << bl.crc32c(-1) << std::dec << dendl;
13816 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13817 logger->inc(l_bluestore_submitted_deferred_writes);
13818 logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
13819 int r = bdev->aio_write(start, bl, &b->ioc, false);
13820 ceph_assert(r == 0);
13821 }
13822 }
13823 if (i == b->iomap.end()) {
13824 break;
13825 }
13826 start = 0;
13827 pos = i->first;
13828 bl.clear();
13829 }
13830 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
13831 << std::hex << pos << "~" << i->second.bl.length() << std::dec
13832 << dendl;
13833 if (!bl.length()) {
13834 start = pos;
13835 }
13836 pos += i->second.bl.length();
13837 bl.claim_append(i->second.bl);
13838 ++i;
13839 }
13840
13841 bdev->aio_submit(&b->ioc);
13842 }
13843
13844 struct C_DeferredTrySubmit : public Context {
13845 BlueStore *store;
13846 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
13847 void finish(int r) {
13848 store->deferred_try_submit();
13849 }
13850 };
13851
13852 void BlueStore::_deferred_aio_finish(OpSequencer *osr)
13853 {
13854 dout(10) << __func__ << " osr " << osr << dendl;
13855 ceph_assert(osr->deferred_running);
13856 DeferredBatch *b = osr->deferred_running;
13857
13858 {
13859 osr->deferred_lock.lock();
13860 ceph_assert(osr->deferred_running == b);
13861 osr->deferred_running = nullptr;
13862 if (!osr->deferred_pending) {
13863 dout(20) << __func__ << " dequeueing" << dendl;
13864 {
13865 deferred_lock.lock();
13866 auto q = deferred_queue.iterator_to(*osr);
13867 deferred_queue.erase(q);
13868 deferred_lock.unlock();
13869 }
13870 osr->deferred_lock.unlock();
13871 } else {
13872 osr->deferred_lock.unlock();
13873 if (deferred_aggressive) {
13874 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
13875 finisher.queue(new C_DeferredTrySubmit(this));
13876 } else {
13877 dout(20) << __func__ << " leaving queued, more pending" << dendl;
13878 }
13879 }
13880 }
13881
13882 {
13883 uint64_t costs = 0;
13884 {
13885 for (auto& i : b->txcs) {
13886 TransContext *txc = &i;
13887 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
13888 txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
13889 costs += txc->cost;
13890 }
13891 }
13892 throttle.release_deferred_throttle(costs);
13893 }
13894
13895 {
13896 std::lock_guard l(kv_lock);
13897 deferred_done_queue.emplace_back(b);
13898
13899 // in the normal case, do not bother waking up the kv thread; it will
13900 // catch us on the next commit anyway.
13901 if (deferred_aggressive && !kv_sync_in_progress) {
13902 kv_sync_in_progress = true;
13903 kv_cond.notify_one();
13904 }
13905 }
13906 }
13907
13908 int BlueStore::_deferred_replay()
13909 {
13910 dout(10) << __func__ << " start" << dendl;
13911 int count = 0;
13912 int r = 0;
13913 CollectionRef ch = _get_collection(coll_t::meta());
13914 bool fake_ch = false;
13915 if (!ch) {
13916 // hmm, replaying initial mkfs?
13917 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
13918 fake_ch = true;
13919 }
13920 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
13921 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
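// Each surviving PREFIX_DEFERRED key is decoded into a deferred txn and fed
// through the normal state machine starting from STATE_KV_DONE, so only the
// deferred-write stages (aio submit + key cleanup) are re-executed on replay.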
13922 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
13923 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
13924 << dendl;
13925 bluestore_deferred_transaction_t *deferred_txn =
13926 new bluestore_deferred_transaction_t;
13927 bufferlist bl = it->value();
13928 auto p = bl.cbegin();
13929 try {
13930 decode(*deferred_txn, p);
13931 } catch (ceph::buffer::error& e) {
13932 derr << __func__ << " failed to decode deferred txn "
13933 << pretty_binary_string(it->key()) << dendl;
13934 delete deferred_txn;
13935 r = -EIO;
13936 goto out;
13937 }
13938 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
13939 txc->deferred_txn = deferred_txn;
13940 txc->set_state(TransContext::STATE_KV_DONE);
13941 _txc_state_proc(txc);
13942 }
13943 out:
13944 dout(20) << __func__ << " draining osr" << dendl;
13945 _osr_register_zombie(osr);
13946 _osr_drain_all();
13947 if (fake_ch) {
13948 new_coll_map.clear();
13949 }
13950 dout(10) << __func__ << " completed " << count << " events" << dendl;
13951 return r;
13952 }
13953
13954 // ---------------------------
13955 // transactions
13956
13957 int BlueStore::queue_transactions(
13958 CollectionHandle& ch,
13959 vector<Transaction>& tls,
13960 TrackedOpRef op,
13961 ThreadPool::TPHandle *handle)
13962 {
13963 FUNCTRACE(cct);
13964 list<Context *> on_applied, on_commit, on_applied_sync;
13965 ObjectStore::Transaction::collect_contexts(
13966 tls, &on_applied, &on_commit, &on_applied_sync);
13967
13968 auto start = mono_clock::now();
13969
13970 Collection *c = static_cast<Collection*>(ch.get());
13971 OpSequencer *osr = c->osr.get();
13972 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
13973
13974 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
13975 // submission to happen atomically because if I/O submission happens in a
13976 // different order than I/O allocation, we end up issuing non-sequential
13977 // writes to the drive. This is a temporary solution until ZONE APPEND
13978 // support matures in the kernel. For more information please see:
13979 // https://www.usenix.org/conference/vault20/presentation/bjorling
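// In practice the lock taken here spans txc creation, the allocations done
// while encoding the transaction, and the aio submission kicked off by
// _txc_state_proc(); it is dropped right after that call further down.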
13980 if (bdev->is_smr()) {
13981 atomic_alloc_and_submit_lock.lock();
13982 }
13983
13984 // prepare
13985 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
13986 &on_commit, op);
13987
13988 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
13989 txc->bytes += (*p).get_num_bytes();
13990 _txc_add_transaction(txc, &(*p));
13991 }
13992 _txc_calc_cost(txc);
13993
13994 _txc_write_nodes(txc, txc->t);
13995
13996 // journal deferred items
13997 if (txc->deferred_txn) {
13998 txc->deferred_txn->seq = ++deferred_seq;
13999 bufferlist bl;
14000 encode(*txc->deferred_txn, bl);
14001 string key;
14002 get_deferred_key(txc->deferred_txn->seq, &key);
14003 txc->t->set(PREFIX_DEFERRED, key, bl);
14004 }
14005
14006 _txc_finalize_kv(txc, txc->t);
14007
14008 #ifdef WITH_BLKIN
14009 if (txc->trace) {
14010 txc->trace.event("txc encode finished");
14011 }
14012 #endif
14013
14014 if (handle)
14015 handle->suspend_tp_timeout();
14016
14017 auto tstart = mono_clock::now();
14018
14019 if (!throttle.try_start_transaction(
14020 *db,
14021 *txc,
14022 tstart)) {
14023 // ensure we do not block here because of deferred writes
14024 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
14025 << dendl;
14026 ++deferred_aggressive;
14027 deferred_try_submit();
14028 {
14029 // wake up any previously finished deferred events
14030 std::lock_guard l(kv_lock);
14031 if (!kv_sync_in_progress) {
14032 kv_sync_in_progress = true;
14033 kv_cond.notify_one();
14034 }
14035 }
14036 throttle.finish_start_transaction(*db, *txc, tstart);
14037 --deferred_aggressive;
14038 }
14039 auto tend = mono_clock::now();
14040
14041 if (handle)
14042 handle->reset_tp_timeout();
14043
14044 logger->inc(l_bluestore_txc);
14045
14046 // execute (start)
14047 _txc_state_proc(txc);
14048
14049 if (bdev->is_smr()) {
14050 atomic_alloc_and_submit_lock.unlock();
14051 }
14052
14053 // we're immediately readable (unlike FileStore)
14054 for (auto c : on_applied_sync) {
14055 c->complete(0);
14056 }
14057 if (!on_applied.empty()) {
14058 if (c->commit_queue) {
14059 c->commit_queue->queue(on_applied);
14060 } else {
14061 finisher.queue(on_applied);
14062 }
14063 }
14064
14065 #ifdef WITH_BLKIN
14066 if (txc->trace) {
14067 txc->trace.event("txc applied");
14068 }
14069 #endif
14070
14071 log_latency("submit_transact",
14072 l_bluestore_submit_lat,
14073 mono_clock::now() - start,
14074 cct->_conf->bluestore_log_op_age);
14075 log_latency("throttle_transact",
14076 l_bluestore_throttle_lat,
14077 tend - tstart,
14078 cct->_conf->bluestore_log_op_age);
14079 return 0;
14080 }
14081
14082 void BlueStore::_txc_aio_submit(TransContext *txc)
14083 {
14084 dout(10) << __func__ << " txc " << txc << dendl;
14085 bdev->aio_submit(&txc->ioc);
14086 }
14087
14088 void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
14089 {
14090 Transaction::iterator i = t->begin();
14091
14092 _dump_transaction<30>(cct, t);
14093
14094 vector<CollectionRef> cvec(i.colls.size());
14095 unsigned j = 0;
14096 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
14097 ++p, ++j) {
14098 cvec[j] = _get_collection(*p);
14099 }
14100
14101 vector<OnodeRef> ovec(i.objects.size());
14102
14103 for (int pos = 0; i.have_op(); ++pos) {
14104 Transaction::Op *op = i.decode_op();
14105 int r = 0;
14106
14107 // no coll or obj
14108 if (op->op == Transaction::OP_NOP)
14109 continue;
14110
14111
14112 // collection operations
14113 CollectionRef &c = cvec[op->cid];
14114
14115 // initialize osd_pool_id and do a smoke test that all collections belong
14116 // to the same pool
14117 spg_t pgid;
14118 if (!!c ? c->cid.is_pg(&pgid) : false) {
14119 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
14120 txc->osd_pool_id == pgid.pool());
14121 txc->osd_pool_id = pgid.pool();
14122 }
14123
14124 switch (op->op) {
14125 case Transaction::OP_RMCOLL:
14126 {
14127 const coll_t &cid = i.get_cid(op->cid);
14128 r = _remove_collection(txc, cid, &c);
14129 if (!r)
14130 continue;
14131 }
14132 break;
14133
14134 case Transaction::OP_MKCOLL:
14135 {
14136 ceph_assert(!c);
14137 const coll_t &cid = i.get_cid(op->cid);
14138 r = _create_collection(txc, cid, op->split_bits, &c);
14139 if (!r)
14140 continue;
14141 }
14142 break;
14143
14144 case Transaction::OP_SPLIT_COLLECTION:
14145 ceph_abort_msg("deprecated");
14146 break;
14147
14148 case Transaction::OP_SPLIT_COLLECTION2:
14149 {
14150 uint32_t bits = op->split_bits;
14151 uint32_t rem = op->split_rem;
14152 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
14153 if (!r)
14154 continue;
14155 }
14156 break;
14157
14158 case Transaction::OP_MERGE_COLLECTION:
14159 {
14160 uint32_t bits = op->split_bits;
14161 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
14162 if (!r)
14163 continue;
14164 }
14165 break;
14166
14167 case Transaction::OP_COLL_HINT:
14168 {
14169 uint32_t type = op->hint;
14170 bufferlist hint;
14171 i.decode_bl(hint);
14172 auto hiter = hint.cbegin();
14173 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
14174 uint32_t pg_num;
14175 uint64_t num_objs;
14176 decode(pg_num, hiter);
14177 decode(num_objs, hiter);
14178 dout(10) << __func__ << " collection hint objects is a no-op, "
14179 << " pg_num " << pg_num << " num_objects " << num_objs
14180 << dendl;
14181 } else {
14182 // Ignore the hint
14183 dout(10) << __func__ << " unknown collection hint " << type << dendl;
14184 }
14185 continue;
14186 }
14187 break;
14188
14189 case Transaction::OP_COLL_SETATTR:
14190 r = -EOPNOTSUPP;
14191 break;
14192
14193 case Transaction::OP_COLL_RMATTR:
14194 r = -EOPNOTSUPP;
14195 break;
14196
14197 case Transaction::OP_COLL_RENAME:
14198 ceph_abort_msg("not implemented");
14199 break;
14200 }
14201 if (r < 0) {
14202 derr << __func__ << " error " << cpp_strerror(r)
14203 << " not handled on operation " << op->op
14204 << " (op " << pos << ", counting from 0)" << dendl;
14205 _dump_transaction<0>(cct, t);
14206 ceph_abort_msg("unexpected error");
14207 }
14208
14209 // these operations implicitly create the object
14210 bool create = false;
14211 if (op->op == Transaction::OP_TOUCH ||
14212 op->op == Transaction::OP_CREATE ||
14213 op->op == Transaction::OP_WRITE ||
14214 op->op == Transaction::OP_ZERO) {
14215 create = true;
14216 }
14217
14218 // object operations
14219 std::unique_lock l(c->lock);
14220 OnodeRef &o = ovec[op->oid];
14221 if (!o) {
14222 ghobject_t oid = i.get_oid(op->oid);
14223 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
14224 }
14225 if (!create && (!o || !o->exists)) {
14226 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
14227 << i.get_oid(op->oid) << dendl;
14228 r = -ENOENT;
14229 goto endop;
14230 }
14231
14232 switch (op->op) {
14233 case Transaction::OP_CREATE:
14234 case Transaction::OP_TOUCH:
14235 r = _touch(txc, c, o);
14236 break;
14237
14238 case Transaction::OP_WRITE:
14239 {
14240 uint64_t off = op->off;
14241 uint64_t len = op->len;
14242 uint32_t fadvise_flags = i.get_fadvise_flags();
14243 bufferlist bl;
14244 i.decode_bl(bl);
14245 r = _write(txc, c, o, off, len, bl, fadvise_flags);
14246 }
14247 break;
14248
14249 case Transaction::OP_ZERO:
14250 {
14251 uint64_t off = op->off;
14252 uint64_t len = op->len;
14253 r = _zero(txc, c, o, off, len);
14254 }
14255 break;
14256
14257 case Transaction::OP_TRIMCACHE:
14258 {
14259 // deprecated, no-op
14260 }
14261 break;
14262
14263 case Transaction::OP_TRUNCATE:
14264 {
14265 uint64_t off = op->off;
14266 r = _truncate(txc, c, o, off);
14267 }
14268 break;
14269
14270 case Transaction::OP_REMOVE:
14271 {
14272 r = _remove(txc, c, o);
14273 }
14274 break;
14275
14276 case Transaction::OP_SETATTR:
14277 {
14278 string name = i.decode_string();
14279 bufferptr bp;
14280 i.decode_bp(bp);
14281 r = _setattr(txc, c, o, name, bp);
14282 }
14283 break;
14284
14285 case Transaction::OP_SETATTRS:
14286 {
14287 map<string, bufferptr> aset;
14288 i.decode_attrset(aset);
14289 r = _setattrs(txc, c, o, aset);
14290 }
14291 break;
14292
14293 case Transaction::OP_RMATTR:
14294 {
14295 string name = i.decode_string();
14296 r = _rmattr(txc, c, o, name);
14297 }
14298 break;
14299
14300 case Transaction::OP_RMATTRS:
14301 {
14302 r = _rmattrs(txc, c, o);
14303 }
14304 break;
14305
14306 case Transaction::OP_CLONE:
14307 {
14308 OnodeRef& no = ovec[op->dest_oid];
14309 if (!no) {
14310 const ghobject_t& noid = i.get_oid(op->dest_oid);
14311 no = c->get_onode(noid, true);
14312 }
14313 r = _clone(txc, c, o, no);
14314 }
14315 break;
14316
14317 case Transaction::OP_CLONERANGE:
14318 ceph_abort_msg("deprecated");
14319 break;
14320
14321 case Transaction::OP_CLONERANGE2:
14322 {
14323 OnodeRef& no = ovec[op->dest_oid];
14324 if (!no) {
14325 const ghobject_t& noid = i.get_oid(op->dest_oid);
14326 no = c->get_onode(noid, true);
14327 }
14328 uint64_t srcoff = op->off;
14329 uint64_t len = op->len;
14330 uint64_t dstoff = op->dest_off;
14331 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
14332 }
14333 break;
14334
14335 case Transaction::OP_COLL_ADD:
14336 ceph_abort_msg("not implemented");
14337 break;
14338
14339 case Transaction::OP_COLL_REMOVE:
14340 ceph_abort_msg("not implemented");
14341 break;
14342
14343 case Transaction::OP_COLL_MOVE:
14344 ceph_abort_msg("deprecated");
14345 break;
14346
14347 case Transaction::OP_COLL_MOVE_RENAME:
14348 case Transaction::OP_TRY_RENAME:
14349 {
14350 ceph_assert(op->cid == op->dest_cid);
14351 const ghobject_t& noid = i.get_oid(op->dest_oid);
14352 OnodeRef& no = ovec[op->dest_oid];
14353 if (!no) {
14354 no = c->get_onode(noid, false);
14355 }
14356 r = _rename(txc, c, o, no, noid);
14357 }
14358 break;
14359
14360 case Transaction::OP_OMAP_CLEAR:
14361 {
14362 r = _omap_clear(txc, c, o);
14363 }
14364 break;
14365 case Transaction::OP_OMAP_SETKEYS:
14366 {
14367 bufferlist aset_bl;
14368 i.decode_attrset_bl(&aset_bl);
14369 r = _omap_setkeys(txc, c, o, aset_bl);
14370 }
14371 break;
14372 case Transaction::OP_OMAP_RMKEYS:
14373 {
14374 bufferlist keys_bl;
14375 i.decode_keyset_bl(&keys_bl);
14376 r = _omap_rmkeys(txc, c, o, keys_bl);
14377 }
14378 break;
14379 case Transaction::OP_OMAP_RMKEYRANGE:
14380 {
14381 string first, last;
14382 first = i.decode_string();
14383 last = i.decode_string();
14384 r = _omap_rmkey_range(txc, c, o, first, last);
14385 }
14386 break;
14387 case Transaction::OP_OMAP_SETHEADER:
14388 {
14389 bufferlist bl;
14390 i.decode_bl(bl);
14391 r = _omap_setheader(txc, c, o, bl);
14392 }
14393 break;
14394
14395 case Transaction::OP_SETALLOCHINT:
14396 {
14397 r = _set_alloc_hint(txc, c, o,
14398 op->expected_object_size,
14399 op->expected_write_size,
14400 op->hint);
14401 }
14402 break;
14403
14404 default:
14405 derr << __func__ << " bad op " << op->op << dendl;
14406 ceph_abort();
14407 }
14408
14409 endop:
14410 if (r < 0) {
14411 bool ok = false;
14412
14413 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
14414 op->op == Transaction::OP_CLONE ||
14415 op->op == Transaction::OP_CLONERANGE2 ||
14416 op->op == Transaction::OP_COLL_ADD ||
14417 op->op == Transaction::OP_SETATTR ||
14418 op->op == Transaction::OP_SETATTRS ||
14419 op->op == Transaction::OP_RMATTR ||
14420 op->op == Transaction::OP_OMAP_SETKEYS ||
14421 op->op == Transaction::OP_OMAP_RMKEYS ||
14422 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
14423 op->op == Transaction::OP_OMAP_SETHEADER))
14424 // -ENOENT is usually okay
14425 ok = true;
14426 if (r == -ENODATA)
14427 ok = true;
14428
14429 if (!ok) {
14430 const char *msg = "unexpected error code";
14431
14432 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
14433 op->op == Transaction::OP_CLONE ||
14434 op->op == Transaction::OP_CLONERANGE2))
14435 msg = "ENOENT on clone suggests osd bug";
14436
14437 if (r == -ENOSPC)
14438 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
14439 // by partially applying transactions.
14440 msg = "ENOSPC from bluestore, misconfigured cluster";
14441
14442 if (r == -ENOTEMPTY) {
14443 msg = "ENOTEMPTY suggests garbage data in osd data dir";
14444 }
14445
14446 derr << __func__ << " error " << cpp_strerror(r)
14447 << " not handled on operation " << op->op
14448 << " (op " << pos << ", counting from 0)"
14449 << dendl;
14450 derr << msg << dendl;
14451 _dump_transaction<0>(cct, t);
14452 ceph_abort_msg("unexpected error");
14453 }
14454 }
14455 }
14456 }
14457
14458
14459
14460 // -----------------
14461 // write operations
14462
14463 int BlueStore::_touch(TransContext *txc,
14464 CollectionRef& c,
14465 OnodeRef &o)
14466 {
14467 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14468 int r = 0;
14469 _assign_nid(txc, o);
14470 txc->write_onode(o);
14471 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14472 return r;
14473 }
14474
14475 void BlueStore::_pad_zeros(
14476 bufferlist *bl, uint64_t *offset,
14477 uint64_t chunk_size)
14478 {
14479 auto length = bl->length();
14480 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
14481 << " chunk_size 0x" << chunk_size << std::dec << dendl;
14482 dout(40) << "before:\n";
14483 bl->hexdump(*_dout);
14484 *_dout << dendl;
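// Worked example (illustrative values): with chunk_size = 4096 and a
// 512-byte bufferlist at *offset = 0x1200, front_pad = 0x200, so the data is
// copied into a zero-filled 4 KiB chunk based at 0x1000 and padded out to the
// chunk's end; the caller gets back *offset = 0x1000 and a 4096-byte bl.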
14485 // front
14486 size_t front_pad = *offset % chunk_size;
14487 size_t back_pad = 0;
14488 size_t pad_count = 0;
14489 if (front_pad) {
14490 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
14491 bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
14492 z.zero(0, front_pad, false);
14493 pad_count += front_pad;
14494 bl->begin().copy(front_copy, z.c_str() + front_pad);
14495 if (front_copy + front_pad < chunk_size) {
14496 back_pad = chunk_size - (length + front_pad);
14497 z.zero(front_pad + length, back_pad, false);
14498 pad_count += back_pad;
14499 }
14500 bufferlist old, t;
14501 old.swap(*bl);
14502 t.substr_of(old, front_copy, length - front_copy);
14503 bl->append(z);
14504 bl->claim_append(t);
14505 *offset -= front_pad;
14506 length += pad_count;
14507 }
14508
14509 // back
14510 uint64_t end = *offset + length;
14511 unsigned back_copy = end % chunk_size;
14512 if (back_copy) {
14513 ceph_assert(back_pad == 0);
14514 back_pad = chunk_size - back_copy;
14515 ceph_assert(back_copy <= length);
14516 bufferptr tail(chunk_size);
14517 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
14518 tail.zero(back_copy, back_pad, false);
14519 bufferlist old;
14520 old.swap(*bl);
14521 bl->substr_of(old, 0, length - back_copy);
14522 bl->append(tail);
14523 length += back_pad;
14524 pad_count += back_pad;
14525 }
14526 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
14527 << back_pad << " on front/back, now 0x" << *offset << "~"
14528 << length << std::dec << dendl;
14529 dout(40) << "after:\n";
14530 bl->hexdump(*_dout);
14531 *_dout << dendl;
14532 if (pad_count)
14533 logger->inc(l_bluestore_write_pad_bytes, pad_count);
14534 ceph_assert(bl->length() == length);
14535 }
14536
14537 void BlueStore::_do_write_small(
14538 TransContext *txc,
14539 CollectionRef &c,
14540 OnodeRef o,
14541 uint64_t offset, uint64_t length,
14542 bufferlist::iterator& blp,
14543 WriteContext *wctx)
14544 {
14545 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
14546 << std::dec << dendl;
14547 ceph_assert(length < min_alloc_size);
14548
14549 uint64_t end_offs = offset + length;
14550
14551 logger->inc(l_bluestore_write_small);
14552 logger->inc(l_bluestore_write_small_bytes, length);
14553
14554 bufferlist bl;
14555 blp.copy(length, bl);
14556
14557 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
14558 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
14559 uint32_t alloc_len = min_alloc_size;
14560 auto offset0 = p2align<uint64_t>(offset, alloc_len);
14561
14562 bool any_change;
14563
14564 // search for a suitable extent in both the forward and reverse directions
14565 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
14566 // range, then check whether the blob can be reused via can_reuse_blob() or
14567 // whether a direct/deferred write applies (the latter for extents at or
14568 // above 'offset' only).
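// Rough sketch of the scan order (hedged, see the loop below): 'ep' starts at
// the extent covering or following 'offset' and walks forward, while
// 'prev_ep' starts just before it and walks backward; the do/while alternates
// between the two until both leave the +/- max_bsize window.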
14569 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
14570
14571 #ifdef HAVE_LIBZBD
14572 // On zoned devices, the first goal is to support non-overwrite workloads,
14573 // such as RGW, with large, aligned objects. Therefore, for user writes
14574 // _do_write_small should not trigger. OSDs, however, write and update a tiny
14575 // amount of metadata, such as OSD maps, to disk. For those cases, we
14576 // temporarily just pad them to min_alloc_size and write them to a new place
14577 // on every update.
14578 if (bdev->is_smr()) {
14579 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
14580 uint64_t b_off0 = b_off;
14581 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
14582
14583 // Zero detection -- small block
14584 if (!bl.is_zero()) {
14585 BlobRef b = c->new_blob();
14586 _pad_zeros(&bl, &b_off0, min_alloc_size);
14587 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
14588 } else { // if (bl.is_zero())
14589 dout(20) << __func__ << " skip small zero block " << std::hex
14590 << " (0x" << b_off0 << "~" << bl.length() << ")"
14591 << " (0x" << b_off << "~" << length << ")"
14592 << std::dec << dendl;
14593 logger->inc(l_bluestore_write_small_skipped);
14594 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14595 }
14596
14597 return;
14598 }
14599 #endif
14600
14601 // Look for an existing mutable blob we can use.
14602 auto begin = o->extent_map.extent_map.begin();
14603 auto end = o->extent_map.extent_map.end();
14604 auto ep = o->extent_map.seek_lextent(offset);
14605 if (ep != begin) {
14606 --ep;
14607 if (ep->blob_end() <= offset) {
14608 ++ep;
14609 }
14610 }
14611 auto prev_ep = end;
14612 if (ep != begin) {
14613 prev_ep = ep;
14614 --prev_ep;
14615 }
14616
14617 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
14618 // We don't want to have more blobs than the number of min-alloc units
14619 // that fit into two max-size blobs
14620 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
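// e.g., assuming max_blob_size = 64 KiB and min_alloc_size = 4 KiB (both are
// configuration dependent), blob_threshold = 64/4 * 2 + 1 = 33 inspected blobs.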
14621 bool above_blob_threshold = false;
14622
14623 inspected_blobs.reserve(blob_threshold);
14624
14625 uint64_t max_off = 0;
14626 auto start_ep = ep;
14627 auto end_ep = ep; // exclusively
14628 do {
14629 any_change = false;
14630
14631 if (ep != end && ep->logical_offset < offset + max_bsize) {
14632 BlobRef b = ep->blob;
14633 if (!above_blob_threshold) {
14634 inspected_blobs.insert(&b->get_blob());
14635 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
14636 }
14637 max_off = ep->logical_end();
14638 auto bstart = ep->blob_start();
14639
14640 dout(20) << __func__ << " considering " << *b
14641 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
14642 if (bstart >= end_offs) {
14643 dout(20) << __func__ << " ignoring distant " << *b << dendl;
14644 } else if (!b->get_blob().is_mutable()) {
14645 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
14646 } else if (ep->logical_offset % min_alloc_size !=
14647 ep->blob_offset % min_alloc_size) {
14648 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
14649 } else {
14650 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
14651 // can we pad our head/tail out with zeros?
14652 uint64_t head_pad, tail_pad;
14653 head_pad = p2phase(offset, chunk_size);
14654 tail_pad = p2nphase(end_offs, chunk_size);
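// e.g., assuming chunk_size = 4096, offset = 0x1100 and end_offs = 0x2f00:
// head_pad = 0x1100 % 0x1000 = 0x100 and tail_pad = 0x1000 - 0xf00 = 0x100,
// i.e. the write is widened to the enclosing chunk boundaries with zeros.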
14655 if (head_pad || tail_pad) {
14656 o->extent_map.fault_range(db, offset - head_pad,
14657 end_offs - offset + head_pad + tail_pad);
14658 }
14659 if (head_pad &&
14660 o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
14661 head_pad = 0;
14662 }
14663 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
14664 tail_pad = 0;
14665 }
14666
14667 uint64_t b_off = offset - head_pad - bstart;
14668 uint64_t b_len = length + head_pad + tail_pad;
14669
14670 // direct write into unused blocks of an existing mutable blob?
14671 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
14672 b->get_blob().get_ondisk_length() >= b_off + b_len &&
14673 b->get_blob().is_unused(b_off, b_len) &&
14674 b->get_blob().is_allocated(b_off, b_len)) {
14675 _apply_padding(head_pad, tail_pad, bl);
14676
14677 dout(20) << __func__ << " write to unused 0x" << std::hex
14678 << b_off << "~" << b_len
14679 << " pad 0x" << head_pad << " + 0x" << tail_pad
14680 << std::dec << " of mutable " << *b << dendl;
14681 _buffer_cache_write(txc, b, b_off, bl,
14682 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14683
14684 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14685 if (b_len < prefer_deferred_size) {
14686 dout(20) << __func__ << " deferring small 0x" << std::hex
14687 << b_len << std::dec << " unused write via deferred" << dendl;
14688 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
14689 op->op = bluestore_deferred_op_t::OP_WRITE;
14690 b->get_blob().map(
14691 b_off, b_len,
14692 [&](uint64_t offset, uint64_t length) {
14693 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14694 return 0;
14695 });
14696 op->data = bl;
14697 } else {
14698 b->get_blob().map_bl(
14699 b_off, bl,
14700 [&](uint64_t offset, bufferlist& t) {
14701 bdev->aio_write(offset, t,
14702 &txc->ioc, wctx->buffered);
14703 });
14704 }
14705 }
14706 b->dirty_blob().calc_csum(b_off, bl);
14707 dout(20) << __func__ << " lex old " << *ep << dendl;
14708 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
14709 b,
14710 &wctx->old_extents);
14711 b->dirty_blob().mark_used(le->blob_offset, le->length);
14712
14713 txc->statfs_delta.stored() += le->length;
14714 dout(20) << __func__ << " lex " << *le << dendl;
14715 logger->inc(l_bluestore_write_small_unused);
14716 return;
14717 }
14718 // read some data to fill out the chunk?
14719 uint64_t head_read = p2phase(b_off, chunk_size);
14720 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
14721 if ((head_read || tail_read) &&
14722 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
14723 head_read + tail_read < min_alloc_size) {
14724 b_off -= head_read;
14725 b_len += head_read + tail_read;
14726
14727 } else {
14728 head_read = tail_read = 0;
14729 }
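// Illustrative example (hypothetical values): with chunk_size = 0x1000,
// b_off = 0x1a00 and b_len = 0x400 we get head_read = 0xa00 and
// tail_read = 0x200, so b_off/b_len grow to cover the whole chunk
// 0x1000~0x1000 and the missing bytes are read back in below.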
14730
14731 // chunk-aligned deferred overwrite?
14732 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
14733 b_off % chunk_size == 0 &&
14734 b_len % chunk_size == 0 &&
14735 b->get_blob().is_allocated(b_off, b_len)) {
14736
14737 _apply_padding(head_pad, tail_pad, bl);
14738
14739 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
14740 << " and tail 0x" << tail_read << std::dec << dendl;
14741 if (head_read) {
14742 bufferlist head_bl;
14743 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
14744 head_bl, 0);
14745 ceph_assert(r >= 0 && r <= (int)head_read);
14746 size_t zlen = head_read - r;
14747 if (zlen) {
14748 head_bl.append_zero(zlen);
14749 logger->inc(l_bluestore_write_pad_bytes, zlen);
14750 }
14751 head_bl.claim_append(bl);
14752 bl.swap(head_bl);
14753 logger->inc(l_bluestore_write_penalty_read_ops);
14754 }
14755 if (tail_read) {
14756 bufferlist tail_bl;
14757 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
14758 tail_bl, 0);
14759 ceph_assert(r >= 0 && r <= (int)tail_read);
14760 size_t zlen = tail_read - r;
14761 if (zlen) {
14762 tail_bl.append_zero(zlen);
14763 logger->inc(l_bluestore_write_pad_bytes, zlen);
14764 }
14765 bl.claim_append(tail_bl);
14766 logger->inc(l_bluestore_write_penalty_read_ops);
14767 }
14768 logger->inc(l_bluestore_write_small_pre_read);
14769
14770 _buffer_cache_write(txc, b, b_off, bl,
14771 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14772
14773 b->dirty_blob().calc_csum(b_off, bl);
14774
14775 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14776 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
14777 op->op = bluestore_deferred_op_t::OP_WRITE;
14778 int r = b->get_blob().map(
14779 b_off, b_len,
14780 [&](uint64_t offset, uint64_t length) {
14781 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14782 return 0;
14783 });
14784 ceph_assert(r == 0);
14785 op->data = std::move(bl);
14786 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
14787 << b_len << std::dec << " of mutable " << *b
14788 << " at " << op->extents << dendl;
14789 }
14790
14791 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
14792 b, &wctx->old_extents);
14793 b->dirty_blob().mark_used(le->blob_offset, le->length);
14794 txc->statfs_delta.stored() += le->length;
14795 dout(20) << __func__ << " lex " << *le << dendl;
14796 return;
14797 }
14798 // try to reuse blob if we can
14799 if (b->can_reuse_blob(min_alloc_size,
14800 max_bsize,
14801 offset0 - bstart,
14802 &alloc_len)) {
14803 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
14804 // fit into the reused blob
14805 // Need to check for pending writes desiring to
14806 // reuse the same pextent. The rationale is that during GC two chunks
14807 // from garbage (possibly compressed) blobs can share logical space within
14808 // the same AU. That in turn might be caused by an unaligned len in
14809 // clone_range2. Hence the second write would fail in an attempt to reuse
14810 // the blob at do_alloc_write().
14811 if (!wctx->has_conflict(b,
14812 offset0,
14813 offset0 + alloc_len,
14814 min_alloc_size)) {
14815
14816 // we can't reuse pad_head/pad_tail since they might be truncated
14817 // due to existing extents
14818 uint64_t b_off = offset - bstart;
14819 uint64_t b_off0 = b_off;
14820 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
14821
14822 // Zero detection -- small block
14823 if (!bl.is_zero()) {
14824 _pad_zeros(&bl, &b_off0, chunk_size);
14825
14826 dout(20) << __func__ << " reuse blob " << *b << std::hex
14827 << " (0x" << b_off0 << "~" << bl.length() << ")"
14828 << " (0x" << b_off << "~" << length << ")"
14829 << std::dec << dendl;
14830
14831 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
14832 false, false);
14833 logger->inc(l_bluestore_write_small_unused);
14834 } else { // if (bl.is_zero())
14835 dout(20) << __func__ << " skip small zero block " << std::hex
14836 << " (0x" << b_off0 << "~" << bl.length() << ")"
14837 << " (0x" << b_off << "~" << length << ")"
14838 << std::dec << dendl;
14839 logger->inc(l_bluestore_write_small_skipped);
14840 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14841 }
14842
14843 return;
14844 }
14845 }
14846 }
14847 ++ep;
14848 end_ep = ep;
14849 any_change = true;
14850 } // if (ep != end && ep->logical_offset < offset + max_bsize)
14851
14852 // check extent for reuse in reverse order
14853 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
14854 BlobRef b = prev_ep->blob;
14855 if (!above_blob_threshold) {
14856 inspected_blobs.insert(&b->get_blob());
14857 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
14858 }
14859 start_ep = prev_ep;
14860 auto bstart = prev_ep->blob_start();
14861 dout(20) << __func__ << " considering " << *b
14862 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
14863 if (b->can_reuse_blob(min_alloc_size,
14864 max_bsize,
14865 offset0 - bstart,
14866 &alloc_len)) {
14867 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
14868 // fit into the reused blob
14869 // Need to check for pending writes desiring to
14870 // reuse the same pextent. The rationale is that during GC two chunks
14871 // from garbage (possibly compressed) blobs can share logical space within
14872 // the same AU. That in turn might be caused by an unaligned len in
14873 // clone_range2. Hence the second write would fail in an attempt to reuse
14874 // the blob at do_alloc_write().
14875 if (!wctx->has_conflict(b,
14876 offset0,
14877 offset0 + alloc_len,
14878 min_alloc_size)) {
14879
14880 uint64_t b_off = offset - bstart;
14881 uint64_t b_off0 = b_off;
14882 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
14883
14884 // Zero detection -- small block
14885 if (!bl.is_zero()) {
14886 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
14887 _pad_zeros(&bl, &b_off0, chunk_size);
14888
14889 dout(20) << __func__ << " reuse blob " << *b << std::hex
14890 << " (0x" << b_off0 << "~" << bl.length() << ")"
14891 << " (0x" << b_off << "~" << length << ")"
14892 << std::dec << dendl;
14893
14894 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
14895 false, false);
14896 logger->inc(l_bluestore_write_small_unused);
14897 } else { // if (bl.is_zero())
14898 dout(20) << __func__ << " skip small zero block " << std::hex
14899 << " (0x" << b_off0 << "~" << bl.length() << ")"
14900 << " (0x" << b_off << "~" << length << ")"
14901 << std::dec << dendl;
14902 logger->inc(l_bluestore_write_small_skipped);
14903 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14904 }
14905
14906 return;
14907 }
14908 }
14909 if (prev_ep != begin) {
14910 --prev_ep;
14911 any_change = true;
14912 } else {
14913 prev_ep = end; // to avoid useless first extent re-check
14914 }
14915 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
14916 } while (any_change);
14917
14918 if (above_blob_threshold) {
14919 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
14920 << " " << std::hex << min_off << "~" << max_off << std::dec
14921 << dendl;
14922 ceph_assert(start_ep != end_ep);
14923 for (auto ep = start_ep; ep != end_ep; ++ep) {
14924 dout(20) << __func__ << " inserting for GC "
14925 << std::hex << ep->logical_offset << "~" << ep->length
14926 << std::dec << dendl;
14927
14928 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
14929 }
14930 // insert the newly written extent into GC
14931 wctx->extents_to_gc.union_insert(offset, length);
14932 dout(20) << __func__ << " inserting (last) for GC "
14933 << std::hex << offset << "~" << length
14934 << std::dec << dendl;
14935 }
14936 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
14937 uint64_t b_off0 = b_off;
14938 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
14939
14940 // Zero detection -- small block
14941 if (!bl.is_zero()) {
14942 // new blob.
14943 BlobRef b = c->new_blob();
14944 _pad_zeros(&bl, &b_off0, block_size);
14945 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
14946 min_alloc_size != block_size, // use the 'unused' bitmap only when the
14947 // alloc granularity differs from the disk block size
14948 true);
14949 } else { // if (bl.is_zero())
14950 dout(20) << __func__ << " skip small zero block " << std::hex
14951 << " (0x" << b_off0 << "~" << bl.length() << ")"
14952 << " (0x" << b_off << "~" << length << ")"
14953 << std::dec << dendl;
14954 logger->inc(l_bluestore_write_small_skipped);
14955 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14956 }
14957
14958 return;
14959 }
14960
14961 bool BlueStore::has_null_fm()
14962 {
14963 return fm->is_null_manager();
14964 }
14965
14966 bool BlueStore::BigDeferredWriteContext::can_defer(
14967 BlueStore::extent_map_t::iterator ep,
14968 uint64_t prefer_deferred_size,
14969 uint64_t block_size,
14970 uint64_t offset,
14971 uint64_t l)
14972 {
14973 bool res = false;
14974 auto& blob = ep->blob->get_blob();
14975 if (offset >= ep->blob_start() &&
14976 blob.is_mutable()) {
14977 off = offset;
14978 b_off = offset - ep->blob_start();
14979 uint64_t chunk_size = blob.get_chunk_size(block_size);
14980 uint64_t ondisk = blob.get_ondisk_length();
14981 used = std::min(l, ondisk - b_off);
14982
14983 // will read some data to fill out the chunk?
14984 head_read = p2phase<uint64_t>(b_off, chunk_size);
14985 tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
14986 b_off -= head_read;
14987
14988 ceph_assert(b_off % chunk_size == 0);
14989 ceph_assert(blob_aligned_len() % chunk_size == 0);
14990
14991 res = blob_aligned_len() < prefer_deferred_size &&
14992 blob_aligned_len() <= ondisk &&
14993 blob.is_allocated(b_off, blob_aligned_len());
14994 if (res) {
14995 blob_ref = ep->blob;
14996 blob_start = ep->blob_start();
14997 }
14998 }
14999 return res;
15000 }
15001
15002 bool BlueStore::BigDeferredWriteContext::apply_defer()
15003 {
15004 int r = blob_ref->get_blob().map(
15005 b_off, blob_aligned_len(),
15006 [&](const bluestore_pextent_t& pext,
15007 uint64_t offset,
15008 uint64_t length) {
15009 // apply the deferred path only if the overwrite breaks blob continuity.
15010 // if it totally overlaps some pextent, fall back to a regular write
15011 if (pext.offset < offset ||
15012 pext.end() > offset + length) {
15013 res_extents.emplace_back(bluestore_pextent_t(offset, length));
15014 return 0;
15015 }
15016 return -1;
15017 });
15018 return r >= 0;
15019 }
15020
15021 void BlueStore::_do_write_big_apply_deferred(
15022 TransContext* txc,
15023 CollectionRef& c,
15024 OnodeRef o,
15025 BlueStore::BigDeferredWriteContext& dctx,
15026 bufferlist::iterator& blp,
15027 WriteContext* wctx)
15028 {
15029 bufferlist bl;
15030 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
15031 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
15032 if (dctx.head_read) {
15033 int r = _do_read(c.get(), o,
15034 dctx.off - dctx.head_read,
15035 dctx.head_read,
15036 bl,
15037 0);
15038 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
15039 size_t zlen = dctx.head_read - r;
15040 if (zlen) {
15041 bl.append_zero(zlen);
15042 logger->inc(l_bluestore_write_pad_bytes, zlen);
15043 }
15044 logger->inc(l_bluestore_write_penalty_read_ops);
15045 }
15046 blp.copy(dctx.used, bl);
15047
15048 if (dctx.tail_read) {
15049 bufferlist tail_bl;
15050 int r = _do_read(c.get(), o,
15051 dctx.off + dctx.used, dctx.tail_read,
15052 tail_bl, 0);
15053 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
15054 size_t zlen = dctx.tail_read - r;
15055 if (zlen) {
15056 tail_bl.append_zero(zlen);
15057 logger->inc(l_bluestore_write_pad_bytes, zlen);
15058 }
15059 bl.claim_append(tail_bl);
15060 logger->inc(l_bluestore_write_penalty_read_ops);
15061 }
15062 auto& b0 = dctx.blob_ref;
15063 _buffer_cache_write(txc, b0, dctx.b_off, bl,
15064 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15065
15066 b0->dirty_blob().calc_csum(dctx.b_off, bl);
15067
15068 Extent* le = o->extent_map.set_lextent(c, dctx.off,
15069 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
15070
15071 // in fact this is a no-op for big writes, but it is left here to maintain
15072 // uniformity and avoid it being missed after a future refactor.
15073 b0->dirty_blob().mark_used(le->blob_offset, le->length);
15074 txc->statfs_delta.stored() += le->length;
15075
15076 if (!g_conf()->bluestore_debug_omit_block_device_write) {
15077 bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
15078 op->op = bluestore_deferred_op_t::OP_WRITE;
15079 op->extents.swap(dctx.res_extents);
15080 op->data = std::move(bl);
15081 }
15082 }
15083
15084 void BlueStore::_do_write_big(
15085 TransContext *txc,
15086 CollectionRef &c,
15087 OnodeRef o,
15088 uint64_t offset, uint64_t length,
15089 bufferlist::iterator& blp,
15090 WriteContext *wctx)
15091 {
15092 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
15093 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
15094 << " compress " << (int)wctx->compress
15095 << dendl;
15096 logger->inc(l_bluestore_write_big);
15097 logger->inc(l_bluestore_write_big_bytes, length);
15098 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
15099 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
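// prefer_deferred_size may be changed at runtime (it is an atomic config
// value), so take one snapshot here and use it consistently for every
// decision made while processing this write.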
15100 while (length > 0) {
15101 bool new_blob = false;
15102 BlobRef b;
15103 uint32_t b_off = 0;
15104 uint32_t l = 0;
15105
15106 // attempt to reuse an existing blob
15107 if (!wctx->compress) {
15108 // enforce target blob alignment with max_bsize
15109 l = max_bsize - p2phase(offset, max_bsize);
15110 l = std::min(uint64_t(l), length);
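// Illustrative example (hypothetical values): with max_bsize = 0x10000 and
// offset = 0x25000, p2phase(offset, max_bsize) = 0x5000, so l starts at
// 0xb000, i.e. this chunk ends at the next max_bsize boundary (0x30000)
// before being capped by the remaining length.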
15111
15112 auto end = o->extent_map.extent_map.end();
15113
15114 dout(20) << __func__ << " may be defer: 0x" << std::hex
15115 << offset << "~" << l
15116 << std::dec << dendl;
15117
15118 if (prefer_deferred_size_snapshot &&
15119 l <= prefer_deferred_size_snapshot * 2) {
15120 // A single write that spans two adjacent existing blobs can result
15121 // in up to two deferred blocks of 'prefer_deferred_size'.
15122 // So we try to minimize the number of resulting blobs
15123 // and preserve the 2 existing blobs rather than inserting one more in between.
15124 // E.g. writing 0x10000~20000 over existing blobs
15125 // (0x0~20000 and 0x20000~20000) is better (from a subsequent-read
15126 // performance point of view) done as two deferred writes to the
15127 // existing blobs than as 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
15128
15129 // look for an existing mutable blob we can write into
15130 auto ep = o->extent_map.seek_lextent(offset);
15131 auto ep_next = end;
15132 BigDeferredWriteContext head_info, tail_info;
15133
15134 bool will_defer = ep != end ?
15135 head_info.can_defer(ep,
15136 prefer_deferred_size_snapshot,
15137 block_size,
15138 offset,
15139 l) :
15140 false;
15141 auto offset_next = offset + head_info.used;
15142 auto remaining = l - head_info.used;
15143 if (will_defer && remaining) {
15144 will_defer = false;
15145 if (remaining <= prefer_deferred_size_snapshot) {
15146 ep_next = o->extent_map.seek_lextent(offset_next);
15147 // check if we can defer remaining totally
15148 will_defer = ep_next == end ?
15149 false :
15150 tail_info.can_defer(ep_next,
15151 prefer_deferred_size_snapshot,
15152 block_size,
15153 offset_next,
15154 remaining);
15155 will_defer = will_defer && remaining == tail_info.used;
15156 }
15157 }
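// At this point will_defer holds only if the blob under the head can absorb
// the first head_info.used bytes and, when anything remains, the following
// blob can absorb the remainder completely.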
15158 if (will_defer) {
15159 dout(20) << __func__ << " " << *(head_info.blob_ref)
15160 << " deferring big " << std::hex
15161 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
15162 << std::dec << " write via deferred"
15163 << dendl;
15164 if (remaining) {
15165 dout(20) << __func__ << " " << *(tail_info.blob_ref)
15166 << " deferring big " << std::hex
15167 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
15168 << std::dec << " write via deferred"
15169 << dendl;
15170 }
15171
15172 will_defer = head_info.apply_defer();
15173 if (!will_defer) {
15174 dout(20) << __func__
15175 << " deferring big fell back, head isn't continuous"
15176 << dendl;
15177 } else if (remaining) {
15178 will_defer = tail_info.apply_defer();
15179 if (!will_defer) {
15180 dout(20) << __func__
15181 << " deferring big fell back, tail isn't continuous"
15182 << dendl;
15183 }
15184 }
15185 }
15186 if (will_defer) {
15187 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
15188 if (remaining) {
15189 _do_write_big_apply_deferred(txc, c, o, tail_info,
15190 blp, wctx);
15191 }
15192 dout(20) << __func__ << " defer big: 0x" << std::hex
15193 << offset << "~" << l
15194 << std::dec << dendl;
15195 offset += l;
15196 length -= l;
15197 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
15198 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
15199 continue;
15200 }
15201 }
15202 dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
15203
15204 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15205
15206 // seek again as punch_hole could invalidate ep
15207 auto ep = o->extent_map.seek_lextent(offset);
15208 auto begin = o->extent_map.extent_map.begin();
15209 auto prev_ep = end;
15210 if (ep != begin) {
15211 prev_ep = ep;
15212 --prev_ep;
15213 }
15214
15215 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
15216 // search for a suitable extent in both the forward and reverse directions
15217 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
15218 // range, then check whether the blob can be reused via can_reuse_blob().
15219 bool any_change;
15220 do {
15221 any_change = false;
15222 if (ep != end && ep->logical_offset < offset + max_bsize) {
15223 dout(20) << __func__ << " considering " << *ep
15224 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
15225
15226 if (offset >= ep->blob_start() &&
15227 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
15228 offset - ep->blob_start(),
15229 &l)) {
15230 b = ep->blob;
15231 b_off = offset - ep->blob_start();
15232 prev_ep = end; // to avoid check below
15233 dout(20) << __func__ << " reuse blob " << *b << std::hex
15234 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
15235 } else {
15236 ++ep;
15237 any_change = true;
15238 }
15239 }
15240
15241 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
15242 dout(20) << __func__ << " considering rev " << *prev_ep
15243 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
15244 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
15245 offset - prev_ep->blob_start(),
15246 &l)) {
15247 b = prev_ep->blob;
15248 b_off = offset - prev_ep->blob_start();
15249 dout(20) << __func__ << " reuse blob " << *b << std::hex
15250 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
15251 } else if (prev_ep != begin) {
15252 --prev_ep;
15253 any_change = true;
15254 } else {
15255 prev_ep = end; // to avoid useless first extent re-check
15256 }
15257 }
15258 } while (b == nullptr && any_change);
15259 } else {
15260 // try to use as long a chunk as permitted when compressing.
15261 l = std::min(max_bsize, length);
15262 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15263 } // if (!wctx->compress)
15264
15265 if (b == nullptr) {
15266 b = c->new_blob();
15267 b_off = 0;
15268 new_blob = true;
15269 }
15270 bufferlist t;
15271 blp.copy(l, t);
15272
15273 // Zero detection -- big block
15274 if (!t.is_zero()) {
15275 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
15276
15277 dout(20) << __func__ << " schedule write big: 0x"
15278 << std::hex << offset << "~" << l << std::dec
15279 << (new_blob ? " new " : " reuse ")
15280 << *b << dendl;
15281
15282 logger->inc(l_bluestore_write_big_blobs);
15283 } else { // if (!t.is_zero())
15284 dout(20) << __func__ << " skip big zero block " << std::hex
15285 << " (0x" << b_off << "~" << t.length() << ")"
15286 << " (0x" << b_off << "~" << l << ")"
15287 << std::dec << dendl;
15288 logger->inc(l_bluestore_write_big_skipped_blobs);
15289 logger->inc(l_bluestore_write_big_skipped_bytes, l);
15290 }
15291
15292 offset += l;
15293 length -= l;
15294 }
15295 }
15296
15297 int BlueStore::_do_alloc_write(
15298 TransContext *txc,
15299 CollectionRef coll,
15300 OnodeRef o,
15301 WriteContext *wctx)
15302 {
15303 dout(20) << __func__ << " txc " << txc
15304 << " " << wctx->writes.size() << " blobs"
15305 << dendl;
15306 if (wctx->writes.empty()) {
15307 return 0;
15308 }
15309
15310 CompressorRef c;
15311 double crr = 0;
15312 if (wctx->compress) {
15313 c = select_option(
15314 "compression_algorithm",
15315 compressor,
15316 [&]() {
15317 string val;
15318 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
15319 CompressorRef cp = compressor;
15320 if (!cp || cp->get_type_name() != val) {
15321 cp = Compressor::create(cct, val);
15322 if (!cp) {
15323 if (_set_compression_alert(false, val.c_str())) {
15324 derr << __func__ << " unable to initialize " << val.c_str()
15325 << " compressor" << dendl;
15326 }
15327 }
15328 }
15329 return boost::optional<CompressorRef>(cp);
15330 }
15331 return boost::optional<CompressorRef>();
15332 }
15333 );
15334
15335 crr = select_option(
15336 "compression_required_ratio",
15337 cct->_conf->bluestore_compression_required_ratio,
15338 [&]() {
15339 double val;
15340 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
15341 return boost::optional<double>(val);
15342 }
15343 return boost::optional<double>();
15344 }
15345 );
15346 }
15347
15348 // checksum
15349 int64_t csum = csum_type.load();
15350 csum = select_option(
15351 "csum_type",
15352 csum,
15353 [&]() {
15354 int64_t val;
15355 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
15356 return boost::optional<int64_t>(val);
15357 }
15358 return boost::optional<int64_t>();
15359 }
15360 );
15361
15362 // compress (as needed) and calc needed space
15363 uint64_t need = 0;
15364 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
15365 for (auto& wi : wctx->writes) {
15366 if (c && wi.blob_length > min_alloc_size) {
15367 auto start = mono_clock::now();
15368
15369 // compress
15370 ceph_assert(wi.b_off == 0);
15371 ceph_assert(wi.blob_length == wi.bl.length());
15372
15373 // FIXME: memory alignment here is bad
15374 bufferlist t;
15375 boost::optional<int32_t> compressor_message;
15376 int r = c->compress(wi.bl, t, compressor_message);
15377 uint64_t want_len_raw = wi.blob_length * crr;
15378 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
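// Illustrative example (hypothetical values): with blob_length = 0x10000,
// crr = 0.5 and min_alloc_size = 0x1000, want_len = 0x8000; the compressed
// copy is kept only if it rounds up to at most 0x8000 on disk, i.e. the
// blob must shrink to at most half of its allocation units.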
15379 bool rejected = false;
15380 uint64_t compressed_len = t.length();
15381 // do an approximate (fast) estimate of the resulting blob size
15382 // that doesn't take header overhead into account
15383 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
15384 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
15385 bluestore_compression_header_t chdr;
15386 chdr.type = c->get_type();
15387 chdr.length = t.length();
15388 chdr.compressor_message = compressor_message;
15389 encode(chdr, wi.compressed_bl);
15390 wi.compressed_bl.claim_append(t);
15391
15392 compressed_len = wi.compressed_bl.length();
15393 result_len = p2roundup(compressed_len, min_alloc_size);
15394 if (result_len <= want_len && result_len < wi.blob_length) {
15395 // Cool. We compressed at least as much as we were hoping to.
15396 // pad out to min_alloc_size
15397 wi.compressed_bl.append_zero(result_len - compressed_len);
15398 wi.compressed_len = compressed_len;
15399 wi.compressed = true;
15400 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
15401 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
15402 << " -> 0x" << compressed_len << " => 0x" << result_len
15403 << " with " << c->get_type()
15404 << std::dec << dendl;
15405 txc->statfs_delta.compressed() += compressed_len;
15406 txc->statfs_delta.compressed_original() += wi.blob_length;
15407 txc->statfs_delta.compressed_allocated() += result_len;
15408 logger->inc(l_bluestore_compress_success_count);
15409 need += result_len;
15410 } else {
15411 rejected = true;
15412 }
15413 } else if (r != 0) {
15414 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
15415 << " bytes compressed using " << c->get_type_name()
15416 << std::dec
15417 << " failed with errcode = " << r
15418 << ", leaving uncompressed"
15419 << dendl;
15420 logger->inc(l_bluestore_compress_rejected_count);
15421 need += wi.blob_length;
15422 } else {
15423 rejected = true;
15424 }
15425
15426 if (rejected) {
15427 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
15428 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
15429 << " with " << c->get_type()
15430 << ", which is more than required 0x" << want_len_raw
15431 << " -> 0x" << want_len
15432 << ", leaving uncompressed"
15433 << std::dec << dendl;
15434 logger->inc(l_bluestore_compress_rejected_count);
15435 need += wi.blob_length;
15436 }
15437 log_latency("compress@_do_alloc_write",
15438 l_bluestore_compress_lat,
15439 mono_clock::now() - start,
15440 cct->_conf->bluestore_log_op_age );
15441 } else {
15442 need += wi.blob_length;
15443 }
15444 }
15445 PExtentVector prealloc;
15446 prealloc.reserve(2 * wctx->writes.size());
15447 int64_t prealloc_left = 0;
15448 prealloc_left = alloc->allocate(
15449 need, min_alloc_size, need,
15450 0, &prealloc);
15451 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
15452 derr << __func__ << " failed to allocate 0x" << std::hex << need
15453 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
15454 << " min_alloc_size 0x" << min_alloc_size
15455 << " available 0x " << alloc->get_free()
15456 << std::dec << dendl;
15457 if (prealloc.size()) {
15458 alloc->release(prealloc);
15459 }
15460 return -ENOSPC;
15461 }
15462 _collect_allocation_stats(need, min_alloc_size, prealloc);
15463
15464 dout(20) << __func__ << " prealloc " << prealloc << dendl;
15465 auto prealloc_pos = prealloc.begin();
15466 ceph_assert(prealloc_pos != prealloc.end());
15467 uint64_t prealloc_pos_length = prealloc_pos->length;
15468
15469 for (auto& wi : wctx->writes) {
15470 bluestore_blob_t& dblob = wi.b->dirty_blob();
15471 uint64_t b_off = wi.b_off;
15472 bufferlist *l = &wi.bl;
15473 uint64_t final_length = wi.blob_length;
15474 uint64_t csum_length = wi.blob_length;
15475 if (wi.compressed) {
15476 final_length = wi.compressed_bl.length();
15477 csum_length = final_length;
15478 unsigned csum_order = ctz(csum_length);
15479 l = &wi.compressed_bl;
15480 dblob.set_compressed(wi.blob_length, wi.compressed_len);
15481 if (csum != Checksummer::CSUM_NONE) {
15482 dout(20) << __func__
15483 << " initialize csum setting for compressed blob " << *wi.b
15484 << " csum_type " << Checksummer::get_csum_type_string(csum)
15485 << " csum_order " << csum_order
15486 << " csum_length 0x" << std::hex << csum_length
15487 << " blob_length 0x" << wi.blob_length
15488 << " compressed_length 0x" << wi.compressed_len << std::dec
15489 << dendl;
15490 dblob.init_csum(csum, csum_order, csum_length);
15491 }
15492 } else if (wi.new_blob) {
15493 unsigned csum_order;
15494 // initialize newly created blob only
15495 ceph_assert(dblob.is_mutable());
15496 if (l->length() != wi.blob_length) {
15497 // hrm, maybe we could do better here, but let's not bother.
15498 dout(20) << __func__ << " forcing csum_order to block_size_order "
15499 << block_size_order << dendl;
15500 csum_order = block_size_order;
15501 } else {
15502 csum_order = std::min(wctx->csum_order, ctz(l->length()));
15503 }
15504 // try to align blob with max_blob_size to improve
15505 // its reuse ratio, e.g. in case of reverse write
15506 uint32_t suggested_boff =
15507 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
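// Illustrative example (hypothetical values): a right-to-left writer touching
// logical offset 0x6000 with max_bsize = 0x10000 gets suggested_boff = 0x6000,
// so later writes at lower offsets can land in the front of this same blob
// instead of forcing new blobs.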
15508 if ((suggested_boff % (1 << csum_order)) == 0 &&
15509 suggested_boff + final_length <= max_bsize &&
15510 suggested_boff > b_off) {
15511 dout(20) << __func__ << " forcing blob_offset to 0x"
15512 << std::hex << suggested_boff << std::dec << dendl;
15513 ceph_assert(suggested_boff >= b_off);
15514 csum_length += suggested_boff - b_off;
15515 b_off = suggested_boff;
15516 }
15517 if (csum != Checksummer::CSUM_NONE) {
15518 dout(20) << __func__
15519 << " initialize csum setting for new blob " << *wi.b
15520 << " csum_type " << Checksummer::get_csum_type_string(csum)
15521 << " csum_order " << csum_order
15522 << " csum_length 0x" << std::hex << csum_length << std::dec
15523 << dendl;
15524 dblob.init_csum(csum, csum_order, csum_length);
15525 }
15526 }
15527
15528 PExtentVector extents;
15529 int64_t left = final_length;
15530 bool has_chunk2defer = false;
15531 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
15532 while (left > 0) {
15533 ceph_assert(prealloc_left > 0);
15534 has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
15535 if (prealloc_pos->length <= left) {
15536 prealloc_left -= prealloc_pos->length;
15537 left -= prealloc_pos->length;
15538 txc->statfs_delta.allocated() += prealloc_pos->length;
15539 extents.push_back(*prealloc_pos);
15540 ++prealloc_pos;
15541 if (prealloc_pos != prealloc.end()) {
15542 prealloc_pos_length = prealloc_pos->length;
15543 }
15544 } else {
15545 extents.emplace_back(prealloc_pos->offset, left);
15546 prealloc_pos->offset += left;
15547 prealloc_pos->length -= left;
15548 prealloc_left -= left;
15549 txc->statfs_delta.allocated() += left;
15550 left = 0;
15551 break;
15552 }
15553 }
15554 for (auto& p : extents) {
15555 txc->allocated.insert(p.offset, p.length);
15556 }
15557 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
15558
15559 dout(20) << __func__ << " blob " << *wi.b << dendl;
15560 if (dblob.has_csum()) {
15561 dblob.calc_csum(b_off, *l);
15562 }
15563
15564 if (wi.mark_unused) {
15565 ceph_assert(!dblob.is_compressed());
15566 auto b_end = b_off + wi.bl.length();
15567 if (b_off) {
15568 dblob.add_unused(0, b_off);
15569 }
15570 uint64_t llen = dblob.get_logical_length();
15571 if (b_end < llen) {
15572 dblob.add_unused(b_end, llen - b_end);
15573 }
15574 }
15575
15576 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
15577 b_off + (wi.b_off0 - wi.b_off),
15578 wi.length0,
15579 wi.b,
15580 nullptr);
15581 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
15582 txc->statfs_delta.stored() += le->length;
15583 dout(20) << __func__ << " lex " << *le << dendl;
15584 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
15585 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15586
15587 // queue io
15588 if (!g_conf()->bluestore_debug_omit_block_device_write) {
15589 if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
15590 dout(20) << __func__ << " deferring 0x" << std::hex
15591 << l->length() << std::dec << " write via deferred" << dendl;
15592 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
15593 op->op = bluestore_deferred_op_t::OP_WRITE;
15594 int r = wi.b->get_blob().map(
15595 b_off, l->length(),
15596 [&](uint64_t offset, uint64_t length) {
15597 op->extents.emplace_back(bluestore_pextent_t(offset, length));
15598 return 0;
15599 });
15600 ceph_assert(r == 0);
15601 op->data = *l;
15602 } else {
15603 wi.b->get_blob().map_bl(
15604 b_off, *l,
15605 [&](uint64_t offset, bufferlist& t) {
15606 bdev->aio_write(offset, t, &txc->ioc, false);
15607 });
15608 logger->inc(l_bluestore_write_new);
15609 }
15610 }
15611 }
15612 ceph_assert(prealloc_pos == prealloc.end());
15613 ceph_assert(prealloc_left == 0);
15614 return 0;
15615 }
15616
15617 void BlueStore::_wctx_finish(
15618 TransContext *txc,
15619 CollectionRef& c,
15620 OnodeRef o,
15621 WriteContext *wctx,
15622 set<SharedBlob*> *maybe_unshared_blobs)
15623 {
15624 #ifdef HAVE_LIBZBD
15625 if (bdev->is_smr()) {
15626 for (auto& w : wctx->writes) {
15627 for (auto& e : w.b->get_blob().get_extents()) {
15628 if (!e.is_valid()) {
15629 continue;
15630 }
15631 uint32_t zone = e.offset / zone_size;
15632 if (!o->onode.zone_offset_refs.count(zone)) {
15633 uint64_t zoff = e.offset % zone_size;
15634 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
15635 << " offset 0x" << zoff << std::dec << dendl;
15636 txc->note_write_zone_offset(o, zone, zoff);
15637 }
15638 }
15639 }
15640 }
15641 set<uint32_t> zones_with_releases;
15642 #endif
15643
15644 auto oep = wctx->old_extents.begin();
15645 while (oep != wctx->old_extents.end()) {
15646 auto &lo = *oep;
15647 oep = wctx->old_extents.erase(oep);
15648 dout(20) << __func__ << " lex_old " << lo.e << dendl;
15649 BlobRef b = lo.e.blob;
15650 const bluestore_blob_t& blob = b->get_blob();
15651 if (blob.is_compressed()) {
15652 if (lo.blob_empty) {
15653 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
15654 }
15655 txc->statfs_delta.compressed_original() -= lo.e.length;
15656 }
15657 auto& r = lo.r;
15658 txc->statfs_delta.stored() -= lo.e.length;
15659 if (!r.empty()) {
15660 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
15661 if (blob.is_shared()) {
15662 PExtentVector final;
15663 c->load_shared_blob(b->shared_blob);
15664 bool unshare = false;
15665 bool* unshare_ptr =
15666 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
15667 for (auto e : r) {
15668 b->shared_blob->put_ref(
15669 e.offset, e.length, &final,
15670 unshare_ptr);
15671 #ifdef HAVE_LIBZBD
15672 // we also drop zone ref for shared blob extents
15673 if (bdev->is_smr() && e.is_valid()) {
15674 zones_with_releases.insert(e.offset / zone_size);
15675 }
15676 #endif
15677 }
15678 if (unshare) {
15679 ceph_assert(maybe_unshared_blobs);
15680 maybe_unshared_blobs->insert(b->shared_blob.get());
15681 }
15682 dout(20) << __func__ << " shared_blob release " << final
15683 << " from " << *b->shared_blob << dendl;
15684 txc->write_shared_blob(b->shared_blob);
15685 r.clear();
15686 r.swap(final);
15687 }
15688 }
15689 // we can't invalidate our logical extents as we drop them because
15690 // other lextents (either in our onode or others) may still
15691 // reference them. but we can throw out anything that is no
15692 // longer allocated. Note that this will leave behind edge bits
15693 // that are no longer referenced but not deallocated (until they
15694 // age out of the cache naturally).
15695 b->discard_unallocated(c.get());
15696 for (auto e : r) {
15697 dout(20) << __func__ << " release " << e << dendl;
15698 txc->released.insert(e.offset, e.length);
15699 txc->statfs_delta.allocated() -= e.length;
15700 if (blob.is_compressed()) {
15701 txc->statfs_delta.compressed_allocated() -= e.length;
15702 }
15703 #ifdef HAVE_LIBZBD
15704 if (bdev->is_smr() && e.is_valid()) {
15705 zones_with_releases.insert(e.offset / zone_size);
15706 }
15707 #endif
15708 }
15709
15710 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
15711 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
15712 << dendl;
15713 o->extent_map.spanning_blob_map.erase(b->id);
15714 }
15715 delete &lo;
15716 }
15717
15718 #ifdef HAVE_LIBZBD
15719 if (!zones_with_releases.empty()) {
15720 // we need to fault the entire extent range in here to determine if we've
15721 // dropped all refs to a zone.
15722 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
15723 for (auto& b : o->extent_map.extent_map) {
15724 for (auto& e : b.blob->get_blob().get_extents()) {
15725 if (e.is_valid()) {
15726 zones_with_releases.erase(e.offset / zone_size);
15727 }
15728 }
15729 }
15730 for (auto zone : zones_with_releases) {
15731 auto p = o->onode.zone_offset_refs.find(zone);
15732 if (p != o->onode.zone_offset_refs.end()) {
15733 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
15734 << " offset 0x" << p->second << std::dec << dendl;
15735 txc->note_release_zone_offset(o, zone, p->second);
15736 }
15737 }
15738 }
15739 #endif
15740 }
15741
15742 void BlueStore::_do_write_data(
15743 TransContext *txc,
15744 CollectionRef& c,
15745 OnodeRef o,
15746 uint64_t offset,
15747 uint64_t length,
15748 bufferlist& bl,
15749 WriteContext *wctx)
15750 {
15751 uint64_t end = offset + length;
15752 bufferlist::iterator p = bl.begin();
15753
15754 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
15755 (length != min_alloc_size)) {
15756 // we fall within the same block
15757 _do_write_small(txc, c, o, offset, length, p, wctx);
15758 } else {
15759 uint64_t head_offset, head_length;
15760 uint64_t middle_offset, middle_length;
15761 uint64_t tail_offset, tail_length;
15762
15763 head_offset = offset;
15764 head_length = p2nphase(offset, min_alloc_size);
15765
15766 tail_offset = p2align(end, min_alloc_size);
15767 tail_length = p2phase(end, min_alloc_size);
15768
15769 middle_offset = head_offset + head_length;
15770 middle_length = length - head_length - tail_length;
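// Illustrative example (hypothetical values): with min_alloc_size = 0x1000,
// offset = 0x1800 and length = 0x2000 (end = 0x3800) this yields
// head 0x1800~0x800, middle 0x2000~0x1000 and tail 0x3000~0x800; the head
// and tail take the small-write path, the middle the big-write path.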
15771
15772 if (head_length) {
15773 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
15774 }
15775
15776 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
15777
15778 if (tail_length) {
15779 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
15780 }
15781 }
15782 }
15783
15784 void BlueStore::_choose_write_options(
15785 CollectionRef& c,
15786 OnodeRef o,
15787 uint32_t fadvise_flags,
15788 WriteContext *wctx)
15789 {
15790 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
15791 dout(20) << __func__ << " will do buffered write" << dendl;
15792 wctx->buffered = true;
15793 } else if (cct->_conf->bluestore_default_buffered_write &&
15794 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
15795 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
15796 dout(20) << __func__ << " defaulting to buffered write" << dendl;
15797 wctx->buffered = true;
15798 }
15799
15800 // apply basic csum block size
15801 wctx->csum_order = block_size_order;
15802
15803 // compression parameters
15804 unsigned alloc_hints = o->onode.alloc_hint_flags;
15805 auto cm = select_option(
15806 "compression_mode",
15807 comp_mode.load(),
15808 [&]() {
15809 string val;
15810 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
15811 return boost::optional<Compressor::CompressionMode>(
15812 Compressor::get_comp_mode_type(val));
15813 }
15814 return boost::optional<Compressor::CompressionMode>();
15815 }
15816 );
15817
15818 wctx->compress = (cm != Compressor::COMP_NONE) &&
15819 ((cm == Compressor::COMP_FORCE) ||
15820 (cm == Compressor::COMP_AGGRESSIVE &&
15821 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
15822 (cm == Compressor::COMP_PASSIVE &&
15823 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
15824
15825 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
15826 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
15827 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
15828 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
15829 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
15830
15831 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
15832
15833 if (o->onode.expected_write_size) {
15834 wctx->csum_order = std::max(min_alloc_size_order,
15835 (uint8_t)ctz(o->onode.expected_write_size));
15836 } else {
15837 wctx->csum_order = min_alloc_size_order;
15838 }
15839
15840 if (wctx->compress) {
15841 wctx->target_blob_size = select_option(
15842 "compression_max_blob_size",
15843 comp_max_blob_size.load(),
15844 [&]() {
15845 int64_t val;
15846 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
15847 return boost::optional<uint64_t>((uint64_t)val);
15848 }
15849 return boost::optional<uint64_t>();
15850 }
15851 );
15852 }
15853 } else {
15854 if (wctx->compress) {
15855 wctx->target_blob_size = select_option(
15856 "compression_min_blob_size",
15857 comp_min_blob_size.load(),
15858 [&]() {
15859 int64_t val;
15860 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
15861 return boost::optional<uint64_t>((uint64_t)val);
15862 }
15863 return boost::optional<uint64_t>();
15864 }
15865 );
15866 }
15867 }
15868
15869 uint64_t max_bsize = max_blob_size.load();
15870 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
15871 wctx->target_blob_size = max_bsize;
15872 }
15873
15874 // set the min blob size floor at 2x the min_alloc_size, or else we
15875 // won't be able to allocate a smaller extent for the compressed
15876 // data.
15877 if (wctx->compress &&
15878 wctx->target_blob_size < min_alloc_size * 2) {
15879 wctx->target_blob_size = min_alloc_size * 2;
15880 }
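// Illustrative example (hypothetical values): with min_alloc_size = 0x10000,
// a compressed blob smaller than 0x20000 could never shrink to fewer
// allocation units, so compressing it would save nothing.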
15881
15882 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
15883 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
15884 << " compress=" << (int)wctx->compress
15885 << " buffered=" << (int)wctx->buffered
15886 << std::dec << dendl;
15887 }
15888
15889 int BlueStore::_do_gc(
15890 TransContext *txc,
15891 CollectionRef& c,
15892 OnodeRef o,
15893 const WriteContext& wctx,
15894 uint64_t *dirty_start,
15895 uint64_t *dirty_end)
15896 {
15897
15898 bool dirty_range_updated = false;
15899 WriteContext wctx_gc;
15900 wctx_gc.fork(wctx); // make a clone for garbage collection
15901
15902 auto & extents_to_collect = wctx.extents_to_gc;
15903 for (auto it = extents_to_collect.begin();
15904 it != extents_to_collect.end();
15905 ++it) {
15906 bufferlist bl;
15907 auto offset = (*it).first;
15908 auto length = (*it).second;
15909 dout(20) << __func__ << " processing " << std::hex
15910 << offset << "~" << length << std::dec
15911 << dendl;
15912 int r = _do_read(c.get(), o, offset, length, bl, 0);
15913 ceph_assert(r == (int)length);
15914
15915 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
15916 logger->inc(l_bluestore_gc_merged, length);
15917
15918 if (*dirty_start > offset) {
15919 *dirty_start = offset;
15920 dirty_range_updated = true;
15921 }
15922
15923 if (*dirty_end < offset + length) {
15924 *dirty_end = offset + length;
15925 dirty_range_updated = true;
15926 }
15927 }
15928 if (dirty_range_updated) {
15929 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
15930 }
15931
15932 dout(30) << __func__ << " alloc write" << dendl;
15933 int r = _do_alloc_write(txc, c, o, &wctx_gc);
15934 if (r < 0) {
15935 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
15936 << dendl;
15937 return r;
15938 }
15939
15940 _wctx_finish(txc, c, o, &wctx_gc);
15941 return 0;
15942 }
15943
15944 int BlueStore::_do_write(
15945 TransContext *txc,
15946 CollectionRef& c,
15947 OnodeRef o,
15948 uint64_t offset,
15949 uint64_t length,
15950 bufferlist& bl,
15951 uint32_t fadvise_flags)
15952 {
15953 int r = 0;
15954
15955 dout(20) << __func__
15956 << " " << o->oid
15957 << " 0x" << std::hex << offset << "~" << length
15958 << " - have 0x" << o->onode.size
15959 << " (" << std::dec << o->onode.size << ")"
15960 << " bytes" << std::hex
15961 << " fadvise_flags 0x" << fadvise_flags
15962 << " alloc_hint 0x" << o->onode.alloc_hint_flags
15963 << " expected_object_size " << o->onode.expected_object_size
15964 << " expected_write_size " << o->onode.expected_write_size
15965 << std::dec
15966 << dendl;
15967 _dump_onode<30>(cct, *o);
15968
15969 if (length == 0) {
15970 return 0;
15971 }
15972
15973 uint64_t end = offset + length;
15974
15975 GarbageCollector gc(c->store->cct);
15976 int64_t benefit = 0;
15977 auto dirty_start = offset;
15978 auto dirty_end = end;
15979
15980 WriteContext wctx;
15981 _choose_write_options(c, o, fadvise_flags, &wctx);
15982 o->extent_map.fault_range(db, offset, length);
15983 _do_write_data(txc, c, o, offset, length, bl, &wctx);
15984 r = _do_alloc_write(txc, c, o, &wctx);
15985 if (r < 0) {
15986 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
15987 << dendl;
15988 goto out;
15989 }
15990
15991 if (wctx.extents_to_gc.empty() ||
15992 wctx.extents_to_gc.range_start() > offset ||
15993 wctx.extents_to_gc.range_end() < offset + length) {
15994 benefit = gc.estimate(offset,
15995 length,
15996 o->extent_map,
15997 wctx.old_extents,
15998 min_alloc_size);
15999 }
16000
16001 // NB: _wctx_finish() will empty old_extents
16002 // so we must do gc estimation before that
16003 _wctx_finish(txc, c, o, &wctx);
16004 if (end > o->onode.size) {
16005 dout(20) << __func__ << " extending size to 0x" << std::hex << end
16006 << std::dec << dendl;
16007 o->onode.size = end;
16008 }
16009
16010 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
16011 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
16012 dout(20) << __func__
16013 << " perform garbage collection for compressed extents, "
16014 << "expected benefit = " << benefit << " AUs" << dendl;
16015 }
16016 if (!wctx.extents_to_gc.empty()) {
16017 dout(20) << __func__ << " perform garbage collection" << dendl;
16018
16019 r = _do_gc(txc, c, o,
16020 wctx,
16021 &dirty_start, &dirty_end);
16022 if (r < 0) {
16023 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
16024 << dendl;
16025 goto out;
16026 }
16027 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
16028 << "~" << dirty_end - dirty_start << std::dec << dendl;
16029 }
16030 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
16031 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
16032
16033 r = 0;
16034
16035 out:
16036 return r;
16037 }
16038
16039 int BlueStore::_write(TransContext *txc,
16040 CollectionRef& c,
16041 OnodeRef& o,
16042 uint64_t offset, size_t length,
16043 bufferlist& bl,
16044 uint32_t fadvise_flags)
16045 {
16046 dout(15) << __func__ << " " << c->cid << " " << o->oid
16047 << " 0x" << std::hex << offset << "~" << length << std::dec
16048 << dendl;
16049 int r = 0;
16050 if (offset + length >= OBJECT_MAX_SIZE) {
16051 r = -E2BIG;
16052 } else {
16053 _assign_nid(txc, o);
16054 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
16055 txc->write_onode(o);
16056 }
16057 dout(10) << __func__ << " " << c->cid << " " << o->oid
16058 << " 0x" << std::hex << offset << "~" << length << std::dec
16059 << " = " << r << dendl;
16060 return r;
16061 }
16062
16063 int BlueStore::_zero(TransContext *txc,
16064 CollectionRef& c,
16065 OnodeRef& o,
16066 uint64_t offset, size_t length)
16067 {
16068 dout(15) << __func__ << " " << c->cid << " " << o->oid
16069 << " 0x" << std::hex << offset << "~" << length << std::dec
16070 << dendl;
16071 int r = 0;
16072 if (offset + length >= OBJECT_MAX_SIZE) {
16073 r = -E2BIG;
16074 } else {
16075 _assign_nid(txc, o);
16076 r = _do_zero(txc, c, o, offset, length);
16077 }
16078 dout(10) << __func__ << " " << c->cid << " " << o->oid
16079 << " 0x" << std::hex << offset << "~" << length << std::dec
16080 << " = " << r << dendl;
16081 return r;
16082 }
16083
16084 int BlueStore::_do_zero(TransContext *txc,
16085 CollectionRef& c,
16086 OnodeRef& o,
16087 uint64_t offset, size_t length)
16088 {
16089 dout(15) << __func__ << " " << c->cid << " " << o->oid
16090 << " 0x" << std::hex << offset << "~" << length << std::dec
16091 << dendl;
16092 int r = 0;
16093
16094 _dump_onode<30>(cct, *o);
16095
16096 WriteContext wctx;
16097 o->extent_map.fault_range(db, offset, length);
16098 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
16099 o->extent_map.dirty_range(offset, length);
16100 _wctx_finish(txc, c, o, &wctx);
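// Note that zeroing writes no data: punching the hole is sufficient because
// reads of unallocated logical ranges return zeros.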
16101
16102 if (length > 0 && offset + length > o->onode.size) {
16103 o->onode.size = offset + length;
16104 dout(20) << __func__ << " extending size to " << offset + length
16105 << dendl;
16106 }
16107 txc->write_onode(o);
16108
16109 dout(10) << __func__ << " " << c->cid << " " << o->oid
16110 << " 0x" << std::hex << offset << "~" << length << std::dec
16111 << " = " << r << dendl;
16112 return r;
16113 }
16114
16115 void BlueStore::_do_truncate(
16116 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
16117 set<SharedBlob*> *maybe_unshared_blobs)
16118 {
16119 dout(15) << __func__ << " " << c->cid << " " << o->oid
16120 << " 0x" << std::hex << offset << std::dec << dendl;
16121
16122 _dump_onode<30>(cct, *o);
16123
16124 if (offset == o->onode.size)
16125 return;
16126
16127 WriteContext wctx;
16128 if (offset < o->onode.size) {
16129 uint64_t length = o->onode.size - offset;
16130 o->extent_map.fault_range(db, offset, length);
16131 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
16132 o->extent_map.dirty_range(offset, length);
16133
16134 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
16135
16136 // if we have shards past EOF, ask for a reshard
16137 if (!o->onode.extent_map_shards.empty() &&
16138 o->onode.extent_map_shards.back().offset >= offset) {
16139 dout(10) << __func__ << " request reshard past EOF" << dendl;
16140 if (offset) {
16141 o->extent_map.request_reshard(offset - 1, offset + length);
16142 } else {
16143 o->extent_map.request_reshard(0, length);
16144 }
16145 }
16146 }
16147
16148 o->onode.size = offset;
16149
16150 txc->write_onode(o);
16151 }
16152
16153 int BlueStore::_truncate(TransContext *txc,
16154 CollectionRef& c,
16155 OnodeRef& o,
16156 uint64_t offset)
16157 {
16158 dout(15) << __func__ << " " << c->cid << " " << o->oid
16159 << " 0x" << std::hex << offset << std::dec
16160 << dendl;
16161
16162 auto start_time = mono_clock::now();
16163 int r = 0;
16164 if (offset >= OBJECT_MAX_SIZE) {
16165 r = -E2BIG;
16166 } else {
16167 _do_truncate(txc, c, o, offset);
16168 }
16169 log_latency_fn(
16170 __func__,
16171 l_bluestore_truncate_lat,
16172 mono_clock::now() - start_time,
16173 cct->_conf->bluestore_log_op_age,
16174 [&](const ceph::timespan& lat) {
16175 ostringstream ostr;
16176 ostr << ", lat = " << timespan_str(lat)
16177 << " cid =" << c->cid
16178 << " oid =" << o->oid;
16179 return ostr.str();
16180 }
16181 );
16182 dout(10) << __func__ << " " << c->cid << " " << o->oid
16183 << " 0x" << std::hex << offset << std::dec
16184 << " = " << r << dendl;
16185 return r;
16186 }
16187
16188 int BlueStore::_do_remove(
16189 TransContext *txc,
16190 CollectionRef& c,
16191 OnodeRef o)
16192 {
16193 set<SharedBlob*> maybe_unshared_blobs;
16194 bool is_gen = !o->oid.is_no_gen();
16195 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
16196 if (o->onode.has_omap()) {
16197 o->flush();
16198 _do_omap_clear(txc, o);
16199 }
16200 o->exists = false;
16201 string key;
16202 for (auto &s : o->extent_map.shards) {
16203 dout(20) << __func__ << " removing shard 0x" << std::hex
16204 << s.shard_info->offset << std::dec << dendl;
16205 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
16206 [&](const string& final_key) {
16207 txc->t->rmkey(PREFIX_OBJ, final_key);
16208 }
16209 );
16210 }
16211 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
16212 txc->note_removed_object(o);
16213 o->extent_map.clear();
16214 o->onode = bluestore_onode_t();
16215 _debug_obj_on_delete(o->oid);
16216
16217 if (!is_gen || maybe_unshared_blobs.empty()) {
16218 return 0;
16219 }
16220
16221 // see if we can unshare blobs still referenced by the head
16222 dout(10) << __func__ << " gen and maybe_unshared_blobs "
16223 << maybe_unshared_blobs << dendl;
16224 ghobject_t nogen = o->oid;
16225 nogen.generation = ghobject_t::NO_GEN;
16226 OnodeRef h = c->get_onode(nogen, false);
16227
16228 if (!h || !h->exists) {
16229 return 0;
16230 }
16231
16232 dout(20) << __func__ << " checking for unshareable blobs on " << h
16233 << " " << h->oid << dendl;
16234 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
16235 for (auto& e : h->extent_map.extent_map) {
16236 const bluestore_blob_t& b = e.blob->get_blob();
16237 SharedBlob *sb = e.blob->shared_blob.get();
16238 if (b.is_shared() &&
16239 sb->loaded &&
16240 maybe_unshared_blobs.count(sb)) {
16241 if (b.is_compressed()) {
16242 expect[sb].get(0, b.get_ondisk_length());
16243 } else {
16244 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
16245 expect[sb].get(off, len);
16246 return 0;
16247 });
16248 }
16249 }
16250 }
16251
16252 vector<SharedBlob*> unshared_blobs;
16253 unshared_blobs.reserve(maybe_unshared_blobs.size());
16254 for (auto& p : expect) {
16255 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
16256 if (p.first->persistent->ref_map == p.second) {
16257 SharedBlob *sb = p.first;
16258 dout(20) << __func__ << " unsharing " << *sb << dendl;
16259 unshared_blobs.push_back(sb);
16260 txc->unshare_blob(sb);
16261 uint64_t sbid = c->make_blob_unshared(sb);
16262 string key;
16263 get_shared_blob_key(sbid, &key);
16264 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
16265 }
16266 }
16267
16268 if (unshared_blobs.empty()) {
16269 return 0;
16270 }
16271
16272 for (auto& e : h->extent_map.extent_map) {
16273 const bluestore_blob_t& b = e.blob->get_blob();
16274 SharedBlob *sb = e.blob->shared_blob.get();
16275 if (b.is_shared() &&
16276 std::find(unshared_blobs.begin(), unshared_blobs.end(),
16277 sb) != unshared_blobs.end()) {
16278 dout(20) << __func__ << " unsharing " << e << dendl;
16279 bluestore_blob_t& blob = e.blob->dirty_blob();
16280 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
16281 h->extent_map.dirty_range(e.logical_offset, 1);
16282 }
16283 }
16284 txc->write_onode(h);
16285
16286 return 0;
16287 }
16288
16289 int BlueStore::_remove(TransContext *txc,
16290 CollectionRef& c,
16291 OnodeRef &o)
16292 {
16293 dout(15) << __func__ << " " << c->cid << " " << o->oid
16294 << " onode " << o.get()
16295 << " txc "<< txc << dendl;
16296 auto start_time = mono_clock::now();
16297 int r = _do_remove(txc, c, o);
16298
16299 log_latency_fn(
16300 __func__,
16301 l_bluestore_remove_lat,
16302 mono_clock::now() - start_time,
16303 cct->_conf->bluestore_log_op_age,
16304 [&](const ceph::timespan& lat) {
16305 ostringstream ostr;
16306 ostr << ", lat = " << timespan_str(lat)
16307 << " cid =" << c->cid
16308 << " oid =" << o->oid;
16309 return ostr.str();
16310 }
16311 );
16312
16313 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16314 return r;
16315 }
16316
16317 int BlueStore::_setattr(TransContext *txc,
16318 CollectionRef& c,
16319 OnodeRef& o,
16320 const string& name,
16321 bufferptr& val)
16322 {
16323 dout(15) << __func__ << " " << c->cid << " " << o->oid
16324 << " " << name << " (" << val.length() << " bytes)"
16325 << dendl;
16326 int r = 0;
16327 if (val.is_partial()) {
16328 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
16329 val.length());
16330 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16331 } else {
16332 auto& b = o->onode.attrs[name.c_str()] = val;
16333 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16334 }
16335 txc->write_onode(o);
16336 dout(10) << __func__ << " " << c->cid << " " << o->oid
16337 << " " << name << " (" << val.length() << " bytes)"
16338 << " = " << r << dendl;
16339 return r;
16340 }
16341
16342 int BlueStore::_setattrs(TransContext *txc,
16343 CollectionRef& c,
16344 OnodeRef& o,
16345 const map<string,bufferptr>& aset)
16346 {
16347 dout(15) << __func__ << " " << c->cid << " " << o->oid
16348 << " " << aset.size() << " keys"
16349 << dendl;
16350 int r = 0;
16351 for (map<string,bufferptr>::const_iterator p = aset.begin();
16352 p != aset.end(); ++p) {
16353 if (p->second.is_partial()) {
16354 auto& b = o->onode.attrs[p->first.c_str()] =
16355 bufferptr(p->second.c_str(), p->second.length());
16356 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16357 } else {
16358 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
16359 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16360 }
16361 }
16362 txc->write_onode(o);
16363 dout(10) << __func__ << " " << c->cid << " " << o->oid
16364 << " " << aset.size() << " keys"
16365 << " = " << r << dendl;
16366 return r;
16367 }
16368
16369
16370 int BlueStore::_rmattr(TransContext *txc,
16371 CollectionRef& c,
16372 OnodeRef& o,
16373 const string& name)
16374 {
16375 dout(15) << __func__ << " " << c->cid << " " << o->oid
16376 << " " << name << dendl;
16377 int r = 0;
16378 auto it = o->onode.attrs.find(name.c_str());
16379 if (it == o->onode.attrs.end())
16380 goto out;
16381
16382 o->onode.attrs.erase(it);
16383 txc->write_onode(o);
16384
16385 out:
16386 dout(10) << __func__ << " " << c->cid << " " << o->oid
16387 << " " << name << " = " << r << dendl;
16388 return r;
16389 }
16390
16391 int BlueStore::_rmattrs(TransContext *txc,
16392 CollectionRef& c,
16393 OnodeRef& o)
16394 {
16395 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16396 int r = 0;
16397
16398 if (o->onode.attrs.empty())
16399 goto out;
16400
16401 o->onode.attrs.clear();
16402 txc->write_onode(o);
16403
16404 out:
16405 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16406 return r;
16407 }
16408
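// Remove all omap data for an onode: rm_range_keys() deletes the omap header
// and every user key up to the tail sentinel, the following rmkey() deletes
// the tail itself, and the omap flag is cleared on the in-memory onode.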
16409 void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
16410 {
16411 const string& omap_prefix = o->get_omap_prefix();
16412 string prefix, tail;
16413 o->get_omap_header(&prefix);
16414 o->get_omap_tail(&tail);
16415 txc->t->rm_range_keys(omap_prefix, prefix, tail);
16416 txc->t->rmkey(omap_prefix, tail);
16417 o->onode.clear_omap_flag();
16418 dout(20) << __func__ << " remove range start: "
16419 << pretty_binary_string(prefix) << " end: "
16420 << pretty_binary_string(tail) << dendl;
16421 }
16422
16423 int BlueStore::_omap_clear(TransContext *txc,
16424 CollectionRef& c,
16425 OnodeRef& o)
16426 {
16427 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16428 auto t0 = mono_clock::now();
16429
16430 int r = 0;
16431 if (o->onode.has_omap()) {
16432 o->flush();
16433 _do_omap_clear(txc, o);
16434 txc->write_onode(o);
16435 }
16436 logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);
16437
16438 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16439 return r;
16440 }
16441
16442 int BlueStore::_omap_setkeys(TransContext *txc,
16443 CollectionRef& c,
16444 OnodeRef& o,
16445 bufferlist &bl)
16446 {
16447 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16448 int r;
16449 auto p = bl.cbegin();
16450 __u32 num;
16451 if (!o->onode.has_omap()) {
16452 if (o->oid.is_pgmeta()) {
16453 o->onode.set_omap_flags_pgmeta();
16454 } else {
16455 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16456 }
16457 txc->write_onode(o);
16458
16459 const string& prefix = o->get_omap_prefix();
16460 string key_tail;
16461 bufferlist tail;
16462 o->get_omap_tail(&key_tail);
16463 txc->t->set(prefix, key_tail, tail);
16464 } else {
16465 txc->note_modified_object(o);
16466 }
16467 const string& prefix = o->get_omap_prefix();
16468 string final_key;
16469 o->get_omap_key(string(), &final_key);
16470 size_t base_key_len = final_key.size();
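  // bl encodes a __u32 count followed by that many (key, value) pairs; each
  // user key is appended to the per-object omap key prefix computed above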
16471 decode(num, p);
16472 while (num--) {
16473 string key;
16474 bufferlist value;
16475 decode(key, p);
16476 decode(value, p);
16477 final_key.resize(base_key_len); // keep prefix
16478 final_key += key;
16479 dout(20) << __func__ << " " << pretty_binary_string(final_key)
16480 << " <- " << key << dendl;
16481 txc->t->set(prefix, final_key, value);
16482 }
16483 r = 0;
16484 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16485 return r;
16486 }
16487
16488 int BlueStore::_omap_setheader(TransContext *txc,
16489 CollectionRef& c,
16490 OnodeRef &o,
16491 bufferlist& bl)
16492 {
16493 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16494 int r;
16495 string key;
16496 if (!o->onode.has_omap()) {
16497 if (o->oid.is_pgmeta()) {
16498 o->onode.set_omap_flags_pgmeta();
16499 } else {
16500 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16501 }
16502 txc->write_onode(o);
16503
16504 const string& prefix = o->get_omap_prefix();
16505 string key_tail;
16506 bufferlist tail;
16507 o->get_omap_tail(&key_tail);
16508 txc->t->set(prefix, key_tail, tail);
16509 } else {
16510 txc->note_modified_object(o);
16511 }
16512 const string& prefix = o->get_omap_prefix();
16513 o->get_omap_header(&key);
16514 txc->t->set(prefix, key, bl);
16515 r = 0;
16516 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16517 return r;
16518 }
16519
16520 int BlueStore::_omap_rmkeys(TransContext *txc,
16521 CollectionRef& c,
16522 OnodeRef& o,
16523 bufferlist& bl)
16524 {
16525 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16526 int r = 0;
16527 auto p = bl.cbegin();
16528 __u32 num;
16529 string final_key;
16530
16531 if (!o->onode.has_omap()) {
16532 goto out;
16533 }
16534 {
16535 const string& prefix = o->get_omap_prefix();
16536 o->get_omap_key(string(), &final_key);
16537 size_t base_key_len = final_key.size();
16538 decode(num, p);
16539 while (num--) {
16540 string key;
16541 decode(key, p);
16542 final_key.resize(base_key_len); // keep prefix
16543 final_key += key;
16544 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
16545 << " <- " << key << dendl;
16546 txc->t->rmkey(prefix, final_key);
16547 }
16548 }
16549 txc->note_modified_object(o);
16550
16551 out:
16552 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16553 return r;
16554 }
16555
16556 int BlueStore::_omap_rmkey_range(TransContext *txc,
16557 CollectionRef& c,
16558 OnodeRef& o,
16559 const string& first, const string& last)
16560 {
16561 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16562 string key_first, key_last;
16563 int r = 0;
16564 if (!o->onode.has_omap()) {
16565 goto out;
16566 }
16567 {
16568 const string& prefix = o->get_omap_prefix();
16569 o->flush();
16570 o->get_omap_key(first, &key_first);
16571 o->get_omap_key(last, &key_last);
16572 txc->t->rm_range_keys(prefix, key_first, key_last);
16573 dout(20) << __func__ << " remove range start: "
16574 << pretty_binary_string(key_first) << " end: "
16575 << pretty_binary_string(key_last) << dendl;
16576 }
16577 txc->note_modified_object(o);
16578
16579 out:
16580 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16581 return r;
16582 }
16583
16584 int BlueStore::_set_alloc_hint(
16585 TransContext *txc,
16586 CollectionRef& c,
16587 OnodeRef& o,
16588 uint64_t expected_object_size,
16589 uint64_t expected_write_size,
16590 uint32_t flags)
16591 {
16592 dout(15) << __func__ << " " << c->cid << " " << o->oid
16593 << " object_size " << expected_object_size
16594 << " write_size " << expected_write_size
16595 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16596 << dendl;
16597 int r = 0;
16598 o->onode.expected_object_size = expected_object_size;
16599 o->onode.expected_write_size = expected_write_size;
16600 o->onode.alloc_hint_flags = flags;
16601 txc->write_onode(o);
16602 dout(10) << __func__ << " " << c->cid << " " << o->oid
16603 << " object_size " << expected_object_size
16604 << " write_size " << expected_write_size
16605 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16606 << " = " << r << dendl;
16607 return r;
16608 }
16609
16610 int BlueStore::_clone(TransContext *txc,
16611 CollectionRef& c,
16612 OnodeRef& oldo,
16613 OnodeRef& newo)
16614 {
16615 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16616 << newo->oid << dendl;
16617 int r = 0;
16618 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
16619 derr << __func__ << " mismatched hash on " << oldo->oid
16620 << " and " << newo->oid << dendl;
16621 return -EINVAL;
16622 }
16623
16624 _assign_nid(txc, newo);
16625
16626 // clone data
16627 oldo->flush();
16628 _do_truncate(txc, c, newo, 0);
16629 if (cct->_conf->bluestore_clone_cow) {
16630 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
16631 } else {
16632 bufferlist bl;
16633 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
16634 if (r < 0)
16635 goto out;
16636 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
16637 if (r < 0)
16638 goto out;
16639 }
16640
16641 // clone attrs
16642 newo->onode.attrs = oldo->onode.attrs;
16643
16644 // clone omap
16645 if (newo->onode.has_omap()) {
16646 dout(20) << __func__ << " clearing old omap data" << dendl;
16647 newo->flush();
16648 _do_omap_clear(txc, newo);
16649 }
16650 if (oldo->onode.has_omap()) {
16651 dout(20) << __func__ << " copying omap data" << dendl;
16652 if (newo->oid.is_pgmeta()) {
16653 newo->onode.set_omap_flags_pgmeta();
16654 } else {
16655 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16656 }
16657 // check if prefix for omap key is exactly the same size for both objects
16658 // otherwise rewrite_omap_key will corrupt data
16659 ceph_assert(oldo->onode.flags == newo->onode.flags);
16660 const string& prefix = newo->get_omap_prefix();
16661 KeyValueDB::Iterator it = db->get_iterator(prefix);
16662 string head, tail;
16663 oldo->get_omap_header(&head);
16664 oldo->get_omap_tail(&tail);
16665 it->lower_bound(head);
16666 while (it->valid()) {
16667 if (it->key() >= tail) {
16668 dout(30) << __func__ << " reached tail" << dendl;
16669 break;
16670 } else {
16671 dout(30) << __func__ << " got header/data "
16672 << pretty_binary_string(it->key()) << dendl;
16673 string key;
16674 newo->rewrite_omap_key(it->key(), &key);
16675 txc->t->set(prefix, key, it->value());
16676 }
16677 it->next();
16678 }
16679 string new_tail;
16680 bufferlist new_tail_value;
16681 newo->get_omap_tail(&new_tail);
16682 txc->t->set(prefix, new_tail, new_tail_value);
16683 }
16684
16685 txc->write_onode(newo);
16686 r = 0;
16687
16688 out:
16689 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16690 << newo->oid << " = " << r << dendl;
16691 return r;
16692 }
16693
16694 int BlueStore::_do_clone_range(
16695 TransContext *txc,
16696 CollectionRef& c,
16697 OnodeRef& oldo,
16698 OnodeRef& newo,
16699 uint64_t srcoff,
16700 uint64_t length,
16701 uint64_t dstoff)
16702 {
16703 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16704 << newo->oid
16705 << " 0x" << std::hex << srcoff << "~" << length << " -> "
16706 << " 0x" << dstoff << "~" << length << std::dec << dendl;
16707 oldo->extent_map.fault_range(db, srcoff, length);
16708 newo->extent_map.fault_range(db, dstoff, length);
16709 _dump_onode<30>(cct, *oldo);
16710 _dump_onode<30>(cct, *newo);
16711
16712 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
16713
16714 #ifdef HAVE_LIBZBD
16715 if (bdev->is_smr()) {
16716 // duplicate the refs for the shared region.
16717 Extent dummy(dstoff);
16718 for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
16719 e != newo->extent_map.extent_map.end();
16720 ++e) {
16721 if (e->logical_offset >= dstoff + length) {
16722 break;
16723 }
16724 for (auto& ex : e->blob->get_blob().get_extents()) {
16725 // note that we may introduce a new extent reference that is
16726 // earlier than the first zone ref. we allow this since it is
16727 // a lot of work to avoid and has marginal impact on cleaning
16728 // performance.
16729 if (!ex.is_valid()) {
16730 continue;
16731 }
16732 uint32_t zone = ex.offset / zone_size;
16733 if (!newo->onode.zone_offset_refs.count(zone)) {
16734 uint64_t zoff = ex.offset % zone_size;
16735 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16736 << " offset 0x" << zoff << std::dec
16737 << " -> " << newo->oid << dendl;
16738 txc->note_write_zone_offset(newo, zone, zoff);
16739 }
16740 }
16741 }
16742 }
16743 #endif
16744
16745 _dump_onode<30>(cct, *oldo);
16746 _dump_onode<30>(cct, *newo);
16747 return 0;
16748 }
16749
16750 int BlueStore::_clone_range(TransContext *txc,
16751 CollectionRef& c,
16752 OnodeRef& oldo,
16753 OnodeRef& newo,
16754 uint64_t srcoff, uint64_t length, uint64_t dstoff)
16755 {
16756 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16757 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16758 << " to offset 0x" << dstoff << std::dec << dendl;
16759 int r = 0;
16760
16761 if (srcoff + length >= OBJECT_MAX_SIZE ||
16762 dstoff + length >= OBJECT_MAX_SIZE) {
16763 r = -E2BIG;
16764 goto out;
16765 }
16766 if (srcoff + length > oldo->onode.size) {
16767 r = -EINVAL;
16768 goto out;
16769 }
16770
16771 _assign_nid(txc, newo);
16772
16773 if (length > 0) {
16774 if (cct->_conf->bluestore_clone_cow) {
16775 _do_zero(txc, c, newo, dstoff, length);
16776 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
16777 } else {
16778 bufferlist bl;
16779 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
16780 if (r < 0)
16781 goto out;
16782 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
16783 if (r < 0)
16784 goto out;
16785 }
16786 }
16787
16788 txc->write_onode(newo);
16789 r = 0;
16790
16791 out:
16792 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16793 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16794 << " to offset 0x" << dstoff << std::dec
16795 << " = " << r << dendl;
16796 return r;
16797 }
16798
16799 int BlueStore::_rename(TransContext *txc,
16800 CollectionRef& c,
16801 OnodeRef& oldo,
16802 OnodeRef& newo,
16803 const ghobject_t& new_oid)
16804 {
16805 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16806 << new_oid << dendl;
16807 int r;
16808 ghobject_t old_oid = oldo->oid;
16809 mempool::bluestore_cache_meta::string new_okey;
16810
16811 if (newo) {
16812 if (newo->exists) {
16813 r = -EEXIST;
16814 goto out;
16815 }
16816 ceph_assert(txc->onodes.count(newo) == 0);
16817 }
16818
16819 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
16820
16821 // rewrite shards
16822 {
16823 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
16824 get_object_key(cct, new_oid, &new_okey);
16825 string key;
16826 for (auto &s : oldo->extent_map.shards) {
16827 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
16828 [&](const string& final_key) {
16829 txc->t->rmkey(PREFIX_OBJ, final_key);
16830 }
16831 );
16832 s.dirty = true;
16833 }
16834 }
16835
16836 newo = oldo;
16837 txc->write_onode(newo);
16838
16839 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
16840 // Onode in the old slot
16841 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
16842 r = 0;
16843
16844 // hold a ref to new Onode in old name position, to ensure we don't drop
16845 // it from the cache before this txc commits (or else someone may come along
16846 // and read newo's metadata via the old name).
16847 txc->note_modified_object(oldo);
16848
16849 #ifdef HAVE_LIBZBD
16850 if (bdev->is_smr()) {
16851 // adjust zone refs
16852 for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
16853 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
16854 << " offset 0x" << offset << std::dec
16855 << " -> " << oldo->oid << dendl;
16856 string key;
16857 get_zone_offset_object_key(zone, offset, oldo->oid, &key);
16858 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
16859
16860 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16861 << " offset 0x" << offset << std::dec
16862 << " -> " << newo->oid << dendl;
16863 get_zone_offset_object_key(zone, offset, newo->oid, &key);
16864 bufferlist v;
16865 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
16866 }
16867 }
16868 #endif
16869
16870 out:
16871 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
16872 << new_oid << " = " << r << dendl;
16873 return r;
16874 }
16875
16876 // collections
16877
16878 int BlueStore::_create_collection(
16879 TransContext *txc,
16880 const coll_t &cid,
16881 unsigned bits,
16882 CollectionRef *c)
16883 {
16884 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
16885 int r;
16886 bufferlist bl;
16887
16888 {
16889 std::unique_lock l(coll_lock);
16890 if (*c) {
16891 r = -EEXIST;
16892 goto out;
16893 }
16894 auto p = new_coll_map.find(cid);
16895 ceph_assert(p != new_coll_map.end());
16896 *c = p->second;
16897 (*c)->cnode.bits = bits;
16898 coll_map[cid] = *c;
16899 new_coll_map.erase(p);
16900 }
16901 encode((*c)->cnode, bl);
16902 txc->t->set(PREFIX_COLL, stringify(cid), bl);
16903 r = 0;
16904
16905 out:
16906 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
16907 return r;
16908 }
16909
16910 int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
16911 CollectionRef *c)
16912 {
16913 dout(15) << __func__ << " " << cid << dendl;
16914 int r;
16915
16916 (*c)->flush_all_but_last();
16917 {
16918 std::unique_lock l(coll_lock);
16919 if (!*c) {
16920 r = -ENOENT;
16921 goto out;
16922 }
16923 size_t nonexistent_count = 0;
16924 ceph_assert((*c)->exists);
16925 if ((*c)->onode_map.map_any([&](Onode* o) {
16926 if (o->exists) {
16927 dout(1) << __func__ << " " << o->oid << " " << o
16928 << " exists in onode_map" << dendl;
16929 return true;
16930 }
16931 ++nonexistent_count;
16932 return false;
16933 })) {
16934 r = -ENOTEMPTY;
16935 goto out;
16936 }
16937 vector<ghobject_t> ls;
16938 ghobject_t next;
16939 // Enumerate onodes in db, up to nonexistent_count + 1
16940 // then check if all of them are marked as non-existent.
16941 // Bypass the check if (next != ghobject_t::get_max())
16942 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
16943 nonexistent_count + 1, false, &ls, &next);
16944 if (r >= 0) {
16945 // If true, it means the collection has more objects than nonexistent_count,
16946 // so bypass the check.
16947 bool exists = (!next.is_max());
16948 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
16949 dout(10) << __func__ << " oid " << *it << dendl;
16950 auto onode = (*c)->onode_map.lookup(*it);
16951 exists = !onode || onode->exists;
16952 if (exists) {
16953 dout(1) << __func__ << " " << *it
16954 << " exists in db, "
16955 << (!onode ? "not present in ram" : "present in ram")
16956 << dendl;
16957 }
16958 }
16959 if (!exists) {
16960 _do_remove_collection(txc, c);
16961 r = 0;
16962 } else {
16963 dout(10) << __func__ << " " << cid
16964 << " is non-empty" << dendl;
16965 r = -ENOTEMPTY;
16966 }
16967 }
16968 }
16969 out:
16970 dout(10) << __func__ << " " << cid << " = " << r << dendl;
16971 return r;
16972 }
16973
16974 void BlueStore::_do_remove_collection(TransContext *txc,
16975 CollectionRef *c)
16976 {
16977 coll_map.erase((*c)->cid);
16978 txc->removed_collections.push_back(*c);
16979 (*c)->exists = false;
16980 _osr_register_zombie((*c)->osr.get());
16981 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
16982 c->reset();
16983 }
16984
16985 int BlueStore::_split_collection(TransContext *txc,
16986 CollectionRef& c,
16987 CollectionRef& d,
16988 unsigned bits, int rem)
16989 {
16990 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
16991 << " bits " << bits << dendl;
16992 std::unique_lock l(c->lock);
16993 std::unique_lock l2(d->lock);
16994 int r;
16995
16996 // flush all previous deferred writes on this sequencer. this is a bit
16997 // heavyweight, but we need to make sure all deferred writes complete
16998 // before we split as the new collection's sequencer may need to order
16999 // this after those writes, and we don't bother with the complexity of
17000 // moving those TransContexts over to the new osr.
17001 _osr_drain_preceding(txc);
17002
17003 // move any cached items (onodes and referenced shared blobs) that will
17004 // belong to the child collection post-split. leave everything else behind.
17005 // this may include things that don't strictly belong to the now-smaller
17006 // parent split, but the OSD will always send us a split for every new
17007 // child.
17008
17009 spg_t pgid, dest_pgid;
17010 bool is_pg = c->cid.is_pg(&pgid);
17011 ceph_assert(is_pg);
17012 is_pg = d->cid.is_pg(&dest_pgid);
17013 ceph_assert(is_pg);
17014
17015 // the destination should initially be empty.
17016 ceph_assert(d->onode_map.empty());
17017 ceph_assert(d->shared_blob_set.empty());
17018 ceph_assert(d->cnode.bits == bits);
17019
17020 c->split_cache(d.get());
17021
17022 // adjust bits. note that this will be redundant for all but the first
17023 // split call for this parent (first child).
17024 c->cnode.bits = bits;
17025 ceph_assert(d->cnode.bits == bits);
17026 r = 0;
17027
17028 bufferlist bl;
17029 encode(c->cnode, bl);
17030 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
17031
17032 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
17033 << " bits " << bits << " = " << r << dendl;
17034 return r;
17035 }
17036
17037 int BlueStore::_merge_collection(
17038 TransContext *txc,
17039 CollectionRef *c,
17040 CollectionRef& d,
17041 unsigned bits)
17042 {
17043 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
17044 << " bits " << bits << dendl;
17045 std::unique_lock l((*c)->lock);
17046 std::unique_lock l2(d->lock);
17047 int r;
17048
17049 coll_t cid = (*c)->cid;
17050
17051 // flush all previous deferred writes on the source collection to ensure
17052 // that all deferred writes complete before we merge as the target collection's
17053 // sequencer may need to order new ops after those writes.
17054
17055 _osr_drain((*c)->osr.get());
17056
17057 // move any cached items (onodes and referenced shared blobs) that will
17058 // belong to the merge target collection; the source collection itself is
17059 // removed further below.
17062
17063 spg_t pgid, dest_pgid;
17064 bool is_pg = cid.is_pg(&pgid);
17065 ceph_assert(is_pg);
17066 is_pg = d->cid.is_pg(&dest_pgid);
17067 ceph_assert(is_pg);
17068
17069 // adjust bits. note that this will be redundant for all but the first
17070 // merge call for the parent/target.
17071 d->cnode.bits = bits;
17072
17073 // behavior depends on the target's (d) bits, so do this after they are updated.
17074 (*c)->split_cache(d.get());
17075
17076 // remove source collection
17077 {
17078 std::unique_lock l3(coll_lock);
17079 _do_remove_collection(txc, c);
17080 }
17081
17082 r = 0;
17083
17084 bufferlist bl;
17085 encode(d->cnode, bl);
17086 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
17087
17088 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
17089 << " bits " << bits << " = " << r << dendl;
17090 return r;
17091 }
17092
17093 void BlueStore::log_latency(
17094 const char* name,
17095 int idx,
17096 const ceph::timespan& l,
17097 double lat_threshold,
17098 const char* info) const
17099 {
17100 logger->tinc(idx, l);
17101 if (lat_threshold > 0.0 &&
17102 l >= make_timespan(lat_threshold)) {
17103 dout(0) << __func__ << " slow operation observed for " << name
17104 << ", latency = " << l
17105 << info
17106 << dendl;
17107 }
17108 }
17109
17110 void BlueStore::log_latency_fn(
17111 const char* name,
17112 int idx,
17113 const ceph::timespan& l,
17114 double lat_threshold,
17115 std::function<string (const ceph::timespan& lat)> fn) const
17116 {
17117 logger->tinc(idx, l);
17118 if (lat_threshold > 0.0 &&
17119 l >= make_timespan(lat_threshold)) {
17120 dout(0) << __func__ << " slow operation observed for " << name
17121 << ", latency = " << l
17122 << fn(l)
17123 << dendl;
17124 }
17125 }
17126
17127 #if defined(WITH_LTTNG)
17128 void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
17129 KeyValueDB &db,
17130 TransContext &txc,
17131 mono_clock::time_point start_throttle_acquire)
17132 {
17133 pending_kv_ios += txc.ios;
17134 if (txc.deferred_txn) {
17135 pending_deferred_ios += txc.ios;
17136 }
17137
17138 uint64_t started = 0;
17139 uint64_t completed = 0;
17140 if (should_trace(&started, &completed)) {
17141 txc.tracing = true;
17142 uint64_t rocksdb_base_level,
17143 rocksdb_estimate_pending_compaction_bytes,
17144 rocksdb_cur_size_all_mem_tables,
17145 rocksdb_compaction_pending,
17146 rocksdb_mem_table_flush_pending,
17147 rocksdb_num_running_compactions,
17148 rocksdb_num_running_flushes,
17149 rocksdb_actual_delayed_write_rate;
17150 db.get_property(
17151 "rocksdb.base-level",
17152 &rocksdb_base_level);
17153 db.get_property(
17154 "rocksdb.estimate-pending-compaction-bytes",
17155 &rocksdb_estimate_pending_compaction_bytes);
17156 db.get_property(
17157 "rocksdb.cur-size-all-mem-tables",
17158 &rocksdb_cur_size_all_mem_tables);
17159 db.get_property(
17160 "rocksdb.compaction-pending",
17161 &rocksdb_compaction_pending);
17162 db.get_property(
17163 "rocksdb.mem-table-flush-pending",
17164 &rocksdb_mem_table_flush_pending);
17165 db.get_property(
17166 "rocksdb.num-running-compactions",
17167 &rocksdb_num_running_compactions);
17168 db.get_property(
17169 "rocksdb.num-running-flushes",
17170 &rocksdb_num_running_flushes);
17171 db.get_property(
17172 "rocksdb.actual-delayed-write-rate",
17173 &rocksdb_actual_delayed_write_rate);
17174
17175
17176 tracepoint(
17177 bluestore,
17178 transaction_initial_state,
17179 txc.osr->get_sequencer_id(),
17180 txc.seq,
17181 throttle_bytes.get_current(),
17182 throttle_deferred_bytes.get_current(),
17183 pending_kv_ios,
17184 pending_deferred_ios,
17185 started,
17186 completed,
17187 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
17188
17189 tracepoint(
17190 bluestore,
17191 transaction_initial_state_rocksdb,
17192 txc.osr->get_sequencer_id(),
17193 txc.seq,
17194 rocksdb_base_level,
17195 rocksdb_estimate_pending_compaction_bytes,
17196 rocksdb_cur_size_all_mem_tables,
17197 rocksdb_compaction_pending,
17198 rocksdb_mem_table_flush_pending,
17199 rocksdb_num_running_compactions,
17200 rocksdb_num_running_flushes,
17201 rocksdb_actual_delayed_write_rate);
17202 }
17203 }
17204 #endif
17205
17206 mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
17207 TransContext &txc, PerfCounters *logger, int state)
17208 {
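  // account the time spent since the previous state change (last_stamp) against
  // the perf counter for the given state, then advance the stamp below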
17209 mono_clock::time_point now = mono_clock::now();
17210 mono_clock::duration lat = now - txc.last_stamp;
17211 logger->tinc(state, lat);
17212 #if defined(WITH_LTTNG)
17213 if (txc.tracing &&
17214 state >= l_bluestore_state_prepare_lat &&
17215 state <= l_bluestore_state_done_lat) {
17216 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
17217 tracepoint(
17218 bluestore,
17219 transaction_state_duration,
17220 txc.osr->get_sequencer_id(),
17221 txc.seq,
17222 state,
17223 ceph::to_seconds<double>(lat));
17224 }
17225 #endif
17226 txc.last_stamp = now;
17227 return lat;
17228 }
17229
17230 bool BlueStore::BlueStoreThrottle::try_start_transaction(
17231 KeyValueDB &db,
17232 TransContext &txc,
17233 mono_clock::time_point start_throttle_acquire)
17234 {
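  // take the main bytes throttle unconditionally (this may block); for a
  // deferred transaction also try the deferred-bytes throttle without blocking.
  // if that fails, return false so the caller can complete the reservation
  // later via finish_start_transaction()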
17235 throttle_bytes.get(txc.cost);
17236
17237 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
17238 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17239 return true;
17240 } else {
17241 return false;
17242 }
17243 }
17244
17245 void BlueStore::BlueStoreThrottle::finish_start_transaction(
17246 KeyValueDB &db,
17247 TransContext &txc,
17248 mono_clock::time_point start_throttle_acquire)
17249 {
17250 ceph_assert(txc.deferred_txn);
17251 throttle_deferred_bytes.get(txc.cost);
17252 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17253 }
17254
17255 #if defined(WITH_LTTNG)
17256 void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
17257 {
17258 pending_kv_ios -= 1;
17259 ios_completed_since_last_traced++;
17260 if (txc.tracing) {
17261 tracepoint(
17262 bluestore,
17263 transaction_commit_latency,
17264 txc.osr->get_sequencer_id(),
17265 txc.seq,
17266 ceph::to_seconds<double>(mono_clock::now() - txc.start));
17267 }
17268 }
17269 #endif
17270
17271 #if defined(WITH_LTTNG)
17272 void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
17273 {
17274 if (txc.deferred_txn) {
17275 pending_deferred_ios -= 1;
17276 }
17277 if (txc.tracing) {
17278 mono_clock::time_point now = mono_clock::now();
17279 mono_clock::duration lat = now - txc.start;
17280 tracepoint(
17281 bluestore,
17282 transaction_total_duration,
17283 txc.osr->get_sequencer_id(),
17284 txc.seq,
17285 ceph::to_seconds<double>(lat));
17286 }
17287 }
17288 #endif
17289
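// pseudo key prefixes used only to bucket entries in the histogram below:
// onode vs. extent-shard records under PREFIX_OBJ, plus anything with an
// unrecognized prefix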
17290 const string prefix_onode = "o";
17291 const string prefix_onode_shard = "x";
17292 const string prefix_other = "Z";
17293 // Iterates through the db and collects the stats
17294 void BlueStore::generate_db_histogram(Formatter *f)
17295 {
17296 //globals
17297 uint64_t num_onodes = 0;
17298 uint64_t num_shards = 0;
17299 uint64_t num_super = 0;
17300 uint64_t num_coll = 0;
17301 uint64_t num_omap = 0;
17302 uint64_t num_pgmeta_omap = 0;
17303 uint64_t num_deferred = 0;
17304 uint64_t num_alloc = 0;
17305 uint64_t num_stat = 0;
17306 uint64_t num_others = 0;
17307 uint64_t num_shared_shards = 0;
17308 size_t max_key_size =0, max_value_size = 0;
17309 uint64_t total_key_size = 0, total_value_size = 0;
17310 size_t key_size = 0, value_size = 0;
17311 KeyValueHistogram hist;
17312
17313 auto start = coarse_mono_clock::now();
17314
17315 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
17316 iter->seek_to_first();
17317 while (iter->valid()) {
17318 dout(30) << __func__ << " Key: " << iter->key() << dendl;
17319 key_size = iter->key_size();
17320 value_size = iter->value_size();
17321 hist.value_hist[hist.get_value_slab(value_size)]++;
17322 max_key_size = std::max(max_key_size, key_size);
17323 max_value_size = std::max(max_value_size, value_size);
17324 total_key_size += key_size;
17325 total_value_size += value_size;
17326
17327 pair<string,string> key(iter->raw_key());
17328
17329 if (key.first == PREFIX_SUPER) {
17330 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
17331 num_super++;
17332 } else if (key.first == PREFIX_STAT) {
17333 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
17334 num_stat++;
17335 } else if (key.first == PREFIX_COLL) {
17336 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
17337 num_coll++;
17338 } else if (key.first == PREFIX_OBJ) {
17339 if (key.second.back() == ONODE_KEY_SUFFIX) {
17340 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
17341 num_onodes++;
17342 } else {
17343 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
17344 num_shards++;
17345 }
17346 } else if (key.first == PREFIX_OMAP) {
17347 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
17348 num_omap++;
17349 } else if (key.first == PREFIX_PERPOOL_OMAP) {
17350 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
17351 num_omap++;
17352 } else if (key.first == PREFIX_PERPG_OMAP) {
17353 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
17354 num_omap++;
17355 } else if (key.first == PREFIX_PGMETA_OMAP) {
17356 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
17357 num_pgmeta_omap++;
17358 } else if (key.first == PREFIX_DEFERRED) {
17359 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
17360 num_deferred++;
17361 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
17362 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
17363 num_alloc++;
17364 } else if (key.first == PREFIX_SHARED_BLOB) {
17365 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
17366 num_shared_shards++;
17367 } else {
17368 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
17369 num_others++;
17370 }
17371 iter->next();
17372 }
17373
17374 ceph::timespan duration = coarse_mono_clock::now() - start;
17375 f->open_object_section("rocksdb_key_value_stats");
17376 f->dump_unsigned("num_onodes", num_onodes);
17377 f->dump_unsigned("num_shards", num_shards);
17378 f->dump_unsigned("num_super", num_super);
17379 f->dump_unsigned("num_coll", num_coll);
17380 f->dump_unsigned("num_omap", num_omap);
17381 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
17382 f->dump_unsigned("num_deferred", num_deferred);
17383 f->dump_unsigned("num_alloc", num_alloc);
17384 f->dump_unsigned("num_stat", num_stat);
17385 f->dump_unsigned("num_shared_shards", num_shared_shards);
17386 f->dump_unsigned("num_others", num_others);
17387 f->dump_unsigned("max_key_size", max_key_size);
17388 f->dump_unsigned("max_value_size", max_value_size);
17389 f->dump_unsigned("total_key_size", total_key_size);
17390 f->dump_unsigned("total_value_size", total_value_size);
17391 f->close_section();
17392
17393 hist.dump(f);
17394
17395 dout(20) << __func__ << " finished in " << duration << dendl;
17396
17397 }
17398
17399 void BlueStore::_shutdown_cache()
17400 {
17401 dout(10) << __func__ << dendl;
17402 for (auto i : buffer_cache_shards) {
17403 i->flush();
17404 ceph_assert(i->empty());
17405 }
17406 for (auto& p : coll_map) {
17407 p.second->onode_map.clear();
17408 if (!p.second->shared_blob_set.empty()) {
17409 derr << __func__ << " stray shared blobs on " << p.first << dendl;
17410 p.second->shared_blob_set.dump<0>(cct);
17411 }
17412 ceph_assert(p.second->onode_map.empty());
17413 ceph_assert(p.second->shared_blob_set.empty());
17414 }
17415 coll_map.clear();
17416 for (auto i : onode_cache_shards) {
17417 ceph_assert(i->empty());
17418 }
17419 }
17420
17421 // For external callers.
17422 // We use a best-effort policy here (unlike _shutdown_cache() above), e.g.,
17423 // we don't care if there are still some pinned onodes/data in the cache
17424 // after this command is completed.
17425 int BlueStore::flush_cache(ostream *os)
17426 {
17427 dout(10) << __func__ << dendl;
17428 for (auto i : onode_cache_shards) {
17429 i->flush();
17430 }
17431 for (auto i : buffer_cache_shards) {
17432 i->flush();
17433 }
17434
17435 return 0;
17436 }
17437
17438 void BlueStore::_apply_padding(uint64_t head_pad,
17439 uint64_t tail_pad,
17440 bufferlist& padded)
17441 {
17442 if (head_pad) {
17443 padded.prepend_zero(head_pad);
17444 }
17445 if (tail_pad) {
17446 padded.append_zero(tail_pad);
17447 }
17448 if (head_pad || tail_pad) {
17449 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
17450 << " tail 0x" << tail_pad << std::dec << dendl;
17451 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
17452 }
17453 }
17454
17455 void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
17456 {
17457 // finalize extent_map shards
17458 o->extent_map.update(txn, false);
17459 if (o->extent_map.needs_reshard()) {
17460 o->extent_map.reshard(db, txn);
17461 o->extent_map.update(txn, true);
17462 if (o->extent_map.needs_reshard()) {
17463 dout(20) << __func__ << " warning: still wants reshard, check options?"
17464 << dendl;
17465 o->extent_map.clear_needs_reshard();
17466 }
17467 logger->inc(l_bluestore_onode_reshard);
17468 }
17469
17470 // bound encode
17471 size_t bound = 0;
17472 denc(o->onode, bound);
17473 o->extent_map.bound_encode_spanning_blobs(bound);
17474 if (o->onode.extent_map_shards.empty()) {
17475 denc(o->extent_map.inline_bl, bound);
17476 }
17477
17478 // encode
17479 bufferlist bl;
17480 unsigned onode_part, blob_part, extent_part;
17481 {
17482 auto p = bl.get_contiguous_appender(bound, true);
17483 denc(o->onode, p);
17484 onode_part = p.get_logical_offset();
17485 o->extent_map.encode_spanning_blobs(p);
17486 blob_part = p.get_logical_offset() - onode_part;
17487 if (o->onode.extent_map_shards.empty()) {
17488 denc(o->extent_map.inline_bl, p);
17489 }
17490 extent_part = p.get_logical_offset() - onode_part - blob_part;
17491 }
17492
17493 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
17494 << " (" << onode_part << " bytes onode + "
17495 << blob_part << " bytes spanning blobs + "
17496 << extent_part << " bytes inline extents)"
17497 << dendl;
17498
17499
17500 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
17501 }
17502
17503 void BlueStore::_log_alerts(osd_alert_list_t& alerts)
17504 {
17505 std::lock_guard l(qlock);
17506
17507 if (!spurious_read_errors_alert.empty() &&
17508 cct->_conf->bluestore_warn_on_spurious_read_errors) {
17509 alerts.emplace(
17510 "BLUESTORE_SPURIOUS_READ_ERRORS",
17511 spurious_read_errors_alert);
17512 }
17513 if (!disk_size_mismatch_alert.empty()) {
17514 alerts.emplace(
17515 "BLUESTORE_DISK_SIZE_MISMATCH",
17516 disk_size_mismatch_alert);
17517 }
17518 if (!legacy_statfs_alert.empty()) {
17519 alerts.emplace(
17520 "BLUESTORE_LEGACY_STATFS",
17521 legacy_statfs_alert);
17522 }
17523 if (!spillover_alert.empty() &&
17524 cct->_conf->bluestore_warn_on_bluefs_spillover) {
17525 alerts.emplace(
17526 "BLUEFS_SPILLOVER",
17527 spillover_alert);
17528 }
17529 if (!no_per_pg_omap_alert.empty()) {
17530 alerts.emplace(
17531 "BLUESTORE_NO_PER_PG_OMAP",
17532 no_per_pg_omap_alert);
17533 }
17534 if (!no_per_pool_omap_alert.empty()) {
17535 alerts.emplace(
17536 "BLUESTORE_NO_PER_POOL_OMAP",
17537 no_per_pool_omap_alert);
17538 }
17539 string s0(failed_cmode);
17540
17541 if (!failed_compressors.empty()) {
17542 if (!s0.empty()) {
17543 s0 += ", ";
17544 }
17545 s0 += "unable to load:";
17546 bool first = true;
17547 for (auto& s : failed_compressors) {
17548 if (first) {
17549 first = false;
17550 } else {
17551 s0 += ", ";
17552 }
17553 s0 += s;
17554 }
17555 alerts.emplace(
17556 "BLUESTORE_NO_COMPRESSION",
17557 s0);
17558 }
17559 }
17560
17561 void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
17562 const PExtentVector& extents)
17563 {
17564 alloc_stats_count++;
17565 alloc_stats_fragments += extents.size();
17566 alloc_stats_size += need;
17567
17568 for (auto& e : extents) {
17569 logger->hinc(l_bluestore_allocate_hist, e.length, need);
17570 }
17571 }
17572
17573 void BlueStore::_record_allocation_stats()
17574 {
17575 // we don't care about strict data consistency here;
17576 // the fields may be partially modified while the tuple is being built
17577 auto t0 = std::make_tuple(
17578 alloc_stats_count.exchange(0),
17579 alloc_stats_fragments.exchange(0),
17580 alloc_stats_size.exchange(0));
17581
17582 dout(0) << " allocation stats probe "
17583 << probe_count << ":"
17584 << " cnt: " << std::get<0>(t0)
17585 << " frags: " << std::get<1>(t0)
17586 << " size: " << std::get<2>(t0)
17587 << dendl;
17588
17589
17590 //
17591 // Keep the history for probes from the power-of-two sequence:
17592 // -1, -2, -4, -8, -16
17593 //
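  // history[0] always holds the most recent probe; history[i] is promoted from
  // history[i-1] once every 2^i probes, so entry i reflects stats from roughly
  // 2^i probes ago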
17594 size_t base = 1;
17595 for (auto& t : alloc_stats_history) {
17596 dout(0) << " probe -"
17597 << base + (probe_count % base) << ": "
17598 << std::get<0>(t)
17599 << ", " << std::get<1>(t)
17600 << ", " << std::get<2>(t)
17601 << dendl;
17602 base <<= 1;
17603 }
17604 dout(0) << "------------" << dendl;
17605
17606 ++ probe_count;
17607
17608 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
17609 if ((probe_count % (1 << i)) == 0) {
17610 alloc_stats_history[i] = alloc_stats_history[i - 1];
17611 }
17612 }
17613 alloc_stats_history[0].swap(t0);
17614 }
17615
17616 // ===========================================
17617 // BlueStoreRepairer
17618
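// Keep only the bloom filters for granularity-sized regions that intersect the
// given extents and are non-empty; all other filters are dropped. May only be
// called once (see was_filtered_out).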
17619 size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
17620 const interval_set<uint64_t>& extents)
17621 {
17622 ceph_assert(granularity); // initialized
17623 // can't call for the second time
17624 ceph_assert(!was_filtered_out);
17625 ceph_assert(collections_bfs.size() == objects_bfs.size());
17626
17627 uint64_t prev_pos = 0;
17628 uint64_t npos = collections_bfs.size();
17629
17630 bloom_vector collections_reduced;
17631 bloom_vector objects_reduced;
17632
17633 for (auto e : extents) {
17634 if (e.second == 0) {
17635 continue;
17636 }
17637 uint64_t pos = max(e.first / granularity, prev_pos);
17638 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
17639 while (pos != npos && pos < end_pos) {
17640 ceph_assert( collections_bfs[pos].element_count() ==
17641 objects_bfs[pos].element_count());
17642 if (collections_bfs[pos].element_count()) {
17643 collections_reduced.push_back(std::move(collections_bfs[pos]));
17644 objects_reduced.push_back(std::move(objects_bfs[pos]));
17645 }
17646 ++pos;
17647 }
17648 prev_pos = end_pos;
17649 }
17650 collections_reduced.swap(collections_bfs);
17651 objects_reduced.swap(objects_bfs);
17652 was_filtered_out = true;
17653 return collections_bfs.size();
17654 }
17655
17656 bool BlueStoreRepairer::remove_key(KeyValueDB *db,
17657 const string& prefix,
17658 const string& key)
17659 {
17660 std::lock_guard l(lock);
17661 if (!remove_key_txn) {
17662 remove_key_txn = db->get_transaction();
17663 }
17664 ++to_repair_cnt;
17665 remove_key_txn->rmkey(prefix, key);
17666
17667 return true;
17668 }
17669
17670 void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
17671 {
17672 std::lock_guard l(lock); // possibly redundant
17673 ceph_assert(fix_per_pool_omap_txn == nullptr);
17674 fix_per_pool_omap_txn = db->get_transaction();
17675 ++to_repair_cnt;
17676 bufferlist bl;
17677 bl.append(stringify(val));
17678 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
17679 }
17680
17681 bool BlueStoreRepairer::fix_shared_blob(
17682 KeyValueDB::Transaction txn,
17683 uint64_t sbid,
17684 bluestore_extent_ref_map_t* ref_map,
17685 size_t repaired)
17686 {
17687 string key;
17688 get_shared_blob_key(sbid, &key);
17689 if (ref_map) {
17690 bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
17691 bufferlist bl;
17692 encode(persistent, bl);
17693 txn->set(PREFIX_SHARED_BLOB, key, bl);
17694 } else {
17695 txn->rmkey(PREFIX_SHARED_BLOB, key);
17696 }
17697 to_repair_cnt += repaired;
17698 return true;
17699 }
17700
17701 bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
17702 const string& key,
17703 const store_statfs_t& new_statfs)
17704 {
17705 std::lock_guard l(lock);
17706 if (!fix_statfs_txn) {
17707 fix_statfs_txn = db->get_transaction();
17708 }
17709 BlueStore::volatile_statfs vstatfs;
17710 vstatfs = new_statfs;
17711 bufferlist bl;
17712 vstatfs.encode(bl);
17713 ++to_repair_cnt;
17714 fix_statfs_txn->set(PREFIX_STAT, key, bl);
17715 return true;
17716 }
17717
17718 bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
17719 FreelistManager* fm,
17720 uint64_t offset, uint64_t len)
17721 {
17722 std::lock_guard l(lock);
17723 ceph_assert(!fm->is_null_manager());
17724
17725 if (!fix_fm_leaked_txn) {
17726 fix_fm_leaked_txn = db->get_transaction();
17727 }
17728 ++to_repair_cnt;
17729 fm->release(offset, len, fix_fm_leaked_txn);
17730 return true;
17731 }
17732 bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
17733 FreelistManager* fm,
17734 uint64_t offset, uint64_t len)
17735 {
17736 std::lock_guard l(lock);
17737 ceph_assert(!fm->is_null_manager());
17738
17739 if (!fix_fm_false_free_txn) {
17740 fix_fm_false_free_txn = db->get_transaction();
17741 }
17742 ++to_repair_cnt;
17743 fm->allocate(offset, len, fix_fm_false_free_txn);
17744 return true;
17745 }
17746
17747 bool BlueStoreRepairer::fix_spanning_blobs(
17748 KeyValueDB* db,
17749 std::function<void(KeyValueDB::Transaction)> f)
17750 {
17751 std::lock_guard l(lock);
17752 if (!fix_onode_txn) {
17753 fix_onode_txn = db->get_transaction();
17754 }
17755 f(fix_onode_txn);
17756 ++to_repair_cnt;
17757 return true;
17758 }
17759
17760 bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
17761 {
17762 //NB: not for use in multithreading mode!!!
17763 if (misreferenced_extents.size()) {
17764 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
17765 ceph_assert(n > 0);
17766 if (!fix_misreferences_txn) {
17767 fix_misreferences_txn = db->get_transaction();
17768 }
17769 return true;
17770 }
17771 return false;
17772 }
17773
17774 unsigned BlueStoreRepairer::apply(KeyValueDB* db)
17775 {
17776 //NB: not for use in multithreading mode!!!
17777 if (fix_per_pool_omap_txn) {
17778 auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
17779 ceph_assert(ok);
17780 fix_per_pool_omap_txn = nullptr;
17781 }
17782 if (fix_fm_leaked_txn) {
17783 auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
17784 ceph_assert(ok);
17785 fix_fm_leaked_txn = nullptr;
17786 }
17787 if (fix_fm_false_free_txn) {
17788 auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
17789 ceph_assert(ok);
17790 fix_fm_false_free_txn = nullptr;
17791 }
17792 if (remove_key_txn) {
17793 auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
17794 ceph_assert(ok);
17795 remove_key_txn = nullptr;
17796 }
17797 if (fix_misreferences_txn) {
17798 auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
17799 ceph_assert(ok);
17800 fix_misreferences_txn = nullptr;
17801 }
17802 if (fix_onode_txn) {
17803 auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
17804 ceph_assert(ok);
17805 fix_onode_txn = nullptr;
17806 }
17807 if (fix_shared_blob_txn) {
17808 auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
17809 ceph_assert(ok);
17810 fix_shared_blob_txn = nullptr;
17811 }
17812 if (fix_statfs_txn) {
17813 auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
17814 ceph_assert(ok);
17815 fix_statfs_txn = nullptr;
17816 }
17817 if (need_compact) {
17818 db->compact();
17819 need_compact = false;
17820 }
17821 unsigned repaired = to_repair_cnt;
17822 to_repair_cnt = 0;
17823 return repaired;
17824 }
17825
17826 // =======================================================
17827 // RocksDBBlueFSVolumeSelector
17828
17829 uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
17830 ceph_assert(h != nullptr);
17831 uint64_t hint = reinterpret_cast<uint64_t>(h);
17832 uint8_t res;
17833 switch (hint) {
17834 case LEVEL_SLOW:
17835 res = BlueFS::BDEV_SLOW;
17836 if (db_avail4slow > 0) {
17837 // considering statically available db space vs.
17838 // - observed maximums on DB dev for DB/WAL/UNSORTED data
17839 // - observed maximum spillovers
17840 uint64_t max_db_use = 0; // max db usage we potentially observed
17841 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
17842 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
17843 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
17844 // this could go to the db device, hence include it in the estimation
17845 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
17846
17847 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
17848 uint64_t avail = min(
17849 db_avail4slow,
17850 max_db_use < db_total ? db_total - max_db_use : 0);
17851
17852 // considering current DB dev usage for SLOW data
17853 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
17854 res = BlueFS::BDEV_DB;
17855 }
17856 }
17857 break;
17858 case LEVEL_LOG:
17859 case LEVEL_WAL:
17860 res = BlueFS::BDEV_WAL;
17861 break;
17862 case LEVEL_DB:
17863 default:
17864 res = BlueFS::BDEV_DB;
17865 break;
17866 }
17867 return res;
17868 }
17869
17870 void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
17871 {
17872 auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
17873 res.emplace_back(base, db_size);
17874 auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
17875 if (slow_size == 0) {
17876 slow_size = db_size;
17877 }
17878 res.emplace_back(base + ".slow", slow_size);
17879 }
17880
17881 void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
17882 uint8_t res = LEVEL_DB;
17883 if (dirname.length() > 5) {
17884 // the "db.slow" and "db.wal" directory names are hard-coded to
17885 // match up with bluestore. the slow device is always the second
17886 // one (when a dedicated block.db device is present and used at
17887 // bdev 0). the wal device is always last.
17888 if (boost::algorithm::ends_with(dirname, ".slow")) {
17889 res = LEVEL_SLOW;
17890 }
17891 else if (boost::algorithm::ends_with(dirname, ".wal")) {
17892 res = LEVEL_WAL;
17893 }
17894 }
17895 return reinterpret_cast<void*>(res);
17896 }
17897
17898 void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
17899 auto max_x = per_level_per_dev_usage.get_max_x();
17900 auto max_y = per_level_per_dev_usage.get_max_y();
17901 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
17902 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
17903 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
17904 << ", db_avail:" << db_avail4slow << std::endl
17905 << "Usage matrix:" << std::endl;
17906 constexpr std::array<const char*, 8> names{ {
17907 "DEV/LEV",
17908 "WAL",
17909 "DB",
17910 "SLOW",
17911 "*",
17912 "*",
17913 "REAL",
17914 "FILES",
17915 } };
17916 const size_t width = 12;
17917 for (size_t i = 0; i < names.size(); ++i) {
17918 sout.setf(std::ios::left, std::ios::adjustfield);
17919 sout.width(width);
17920 sout << names[i];
17921 }
17922 sout << std::endl;
17923 for (size_t l = 0; l < max_y; l++) {
17924 sout.setf(std::ios::left, std::ios::adjustfield);
17925 sout.width(width);
17926 switch (l + LEVEL_FIRST) {
17927 case LEVEL_LOG:
17928 sout << "LOG"; break;
17929 case LEVEL_WAL:
17930 sout << "WAL"; break;
17931 case LEVEL_DB:
17932 sout << "DB"; break;
17933 case LEVEL_SLOW:
17934 sout << "SLOW"; break;
17935 case LEVEL_MAX:
17936 sout << "TOTALS"; break;
17937 }
17938 for (size_t d = 0; d < max_x; d++) {
17939 sout.setf(std::ios::left, std::ios::adjustfield);
17940 sout.width(width);
17941 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
17942 }
17943 sout.setf(std::ios::left, std::ios::adjustfield);
17944 sout.width(width);
17945 sout << stringify(per_level_files[l]) << std::endl;
17946 }
17947 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
17948 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
17949 sout << "MAXIMUMS:" << std::endl;
17950 for (size_t l = 0; l < max_y; l++) {
17951 sout.setf(std::ios::left, std::ios::adjustfield);
17952 sout.width(width);
17953 switch (l + LEVEL_FIRST) {
17954 case LEVEL_LOG:
17955 sout << "LOG"; break;
17956 case LEVEL_WAL:
17957 sout << "WAL"; break;
17958 case LEVEL_DB:
17959 sout << "DB"; break;
17960 case LEVEL_SLOW:
17961 sout << "SLOW"; break;
17962 case LEVEL_MAX:
17963 sout << "TOTALS"; break;
17964 }
17965 for (size_t d = 0; d < max_x - 1; d++) {
17966 sout.setf(std::ios::left, std::ios::adjustfield);
17967 sout.width(width);
17968 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
17969 }
17970 sout.setf(std::ios::left, std::ios::adjustfield);
17971 sout.width(width);
17972 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
17973 if (l < max_y - 1) {
17974 sout << std::endl;
17975 }
17976 }
17977 }
17978
17979 BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
17980 RocksDBBlueFSVolumeSelector* ns =
17981 new RocksDBBlueFSVolumeSelector(0, 0, 0,
17982 0, 0, 0,
17983 0, 0, false);
17984 return ns;
17985 }
17986
17987 bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
17988 RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
17989 ceph_assert(o);
17990 bool equal = true;
17991 for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
17992 for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {
17993 equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
17994 }
17995 }
17996 for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
17997 equal &= (per_level_files[t] == o->per_level_files[t]);
17998 }
17999 return equal;
18000 }
18001
18002 // =======================================================
18003
18004 //================================================================================================================
18005 // BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed.
18006 // This causes a delay in the write path and adds significant load to the CPU/Memory/Disk.
18007 // The reason for the RocksDB updates is that it allows Ceph to survive any failure without losing the allocation state.
18008 //
18009 // We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the allocator object
18010 // with all the OSD allocation state in a single step during umount().
18011 // This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
18012 // to losing allocation info in failure cases where we don't call umount.
18013 // We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, which is used in those failure cases.
18014 // When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from the flat file
18015 // where we store the allocation-map during umount().
18016 //================================================================================================================
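//
// Rough flow (an informal sketch of the mechanism described above):
//   graceful umount()    -> destage the in-memory allocator state into a flat
//                           allocator file on BlueFS (allocator_dir /
//                           allocator_file below)
//   next mount()         -> a valid allocator file is found, so read it back
//                           and skip recovery
//   crash / no umount()  -> the allocator file is missing or invalidated, so
//                           rebuild the allocation map by scanning the ONodes
//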
18017
18018 #undef dout_prefix
18019 #define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
18020
18021 static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
18022 static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
18023 static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
18024 static uint32_t s_serial = 0x01;
18025
18026 #if 1
18027 #define CEPHTOH_32 le32toh
18028 #define CEPHTOH_64 le64toh
18029 #define HTOCEPH_32 htole32
18030 #define HTOCEPH_64 htole64
18031 #else
18032 // help debug the encode/decode by forcing alien format
18033 #define CEPHTOH_32 be32toh
18034 #define CEPHTOH_64 be64toh
18035 #define HTOCEPH_32 htobe32
18036 #define HTOCEPH_64 htobe64
18037 #endif
18038
18039 // 48-byte header for the on-disk allocator image
18040 const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
18041 struct allocator_image_header {
18042 uint32_t format_version; // 0x00
18043 uint32_t valid_signature; // 0x04
18044 utime_t timestamp; // 0x08
18045 uint32_t serial; // 0x10
18046 uint32_t pad[0x7]; // 0x14
18047
18048 allocator_image_header() {
18049 memset((char*)this, 0, sizeof(allocator_image_header));
18050 }
18051
18052 // create header in CEPH format
18053 allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
18054 this->format_version = format_version;
18055 this->timestamp = timestamp;
18056 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18057 this->serial = serial;
18058 memset(this->pad, 0, sizeof(this->pad));
18059 }
18060
18061 friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
18062 out << "format_version = " << header.format_version << std::endl;
18063 out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18064 out << "timestamp = " << header.timestamp << std::endl;
18065 out << "serial = " << header.serial << std::endl;
18066 for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
18067 if (header.pad[i]) {
18068 out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
18069 }
18070 }
18071 return out;
18072 }
18073
18074 DENC(allocator_image_header, v, p) {
18075 denc(v.format_version, p);
18076 denc(v.valid_signature, p);
18077 denc(v.timestamp.tv.tv_sec, p);
18078 denc(v.timestamp.tv.tv_nsec, p);
18079 denc(v.serial, p);
18080 for (auto& pad: v.pad) {
18081 denc(pad, p);
18082 }
18083 }
18084
18085
18086 int verify(CephContext* cct, const std::string &path) {
18087 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18088 for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
18089 if (this->pad[i]) {
18090 derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
18091 return -1;
18092 }
18093 }
18094 return 0;
18095 }
18096 else {
18097 derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18098 return -1;
18099 }
18100 }
18101 };
18102 WRITE_CLASS_DENC(allocator_image_header)
18103
18104 // 56-byte trailer for the on-disk allocator image
18105 struct allocator_image_trailer {
18106 extent_t null_extent; // 0x00
18107
18108 uint32_t format_version; // 0x10
18109 uint32_t valid_signature; // 0x14
18110
18111 utime_t timestamp; // 0x18
18112
18113 uint32_t serial; // 0x20
18114 uint32_t pad; // 0x24
18115 uint64_t entries_count; // 0x28
18116 uint64_t allocation_size; // 0x30
18117
18118 // trailer is created in CEPH format
18119 allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
18120 memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
18121 this->format_version = format_version;
18122 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18123 this->timestamp = timestamp;
18124 this->serial = serial;
18125 this->pad = 0;
18126 this->entries_count = entries_count;
18127 this->allocation_size = allocation_size;
18128 }
18129
18130 allocator_image_trailer() {
18131 memset((char*)this, 0, sizeof(allocator_image_trailer));
18132 }
18133
18134 friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
18135 if (trailer.null_extent.offset || trailer.null_extent.length) {
18136 out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
18137 out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
18138 }
18139 out << "format_version = " << trailer.format_version << std::endl;
18140 out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18141 out << "timestamp = " << trailer.timestamp << std::endl;
18142 out << "serial = " << trailer.serial << std::endl;
18143 if (trailer.pad) {
18144 out << "trailer.pad= " << trailer.pad << std::endl;
18145 }
18146 out << "entries_count = " << trailer.entries_count << std::endl;
18147 out << "allocation_size = " << trailer.allocation_size << std::endl;
18148 return out;
18149 }
18150
18151 int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
18152 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18153
18154 // trailer must start with a null extent (both fields set to zero) [no need to convert formats for zero]
18155 if (null_extent.offset || null_extent.length) {
18156 derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
18157 return -1;
18158 }
18159
18160 if (serial != p_header->serial) {
18161 derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
18162 return -1;
18163 }
18164
18165 if (format_version != p_header->format_version) {
18166 derr << "Illegal trailer: header->format_version(" << p_header->format_version
18167 << ") != trailer->format_version(" << format_version << ")" << dendl;
18168 return -1;
18169 }
18170
18171 if (timestamp != p_header->timestamp) {
18172 derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
18173 << ") != trailer->timestamp(" << timestamp << ")" << dendl;
18174 return -1;
18175 }
18176
18177 if (this->entries_count != entries_count) {
18178 derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
18179 << this->entries_count << ")" << dendl;
18180 return -1;
18181 }
18182
18183 if (this->allocation_size != allocation_size) {
18184 derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
18185 << this->allocation_size << ")" << dendl;
18186 return -1;
18187 }
18188
18189 if (pad) {
18190 derr << "Illegal Trailer - pad="<< pad << dendl;
18191 return -1;
18192 }
18193
18194 // if we arrived here -> the trailer is valid
18195 return 0;
18196 } else {
18197 derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18198 return -1;
18199 }
18200 }
18201
18202 DENC(allocator_image_trailer, v, p) {
18203 denc(v.null_extent.offset, p);
18204 denc(v.null_extent.length, p);
18205 denc(v.format_version, p);
18206 denc(v.valid_signature, p);
18207 denc(v.timestamp.tv.tv_sec, p);
18208 denc(v.timestamp.tv.tv_nsec, p);
18209 denc(v.serial, p);
18210 denc(v.pad, p);
18211 denc(v.entries_count, p);
18212 denc(v.allocation_size, p);
18213 }
18214 };
18215 WRITE_CLASS_DENC(allocator_image_trailer)
18216
18217
18218 //-------------------------------------------------------------------------------------
18219 // invalidate the old allocation file if it exists so we go directly to recovery after a failure
18220 // we can safely ignore a non-existing file
18221 int BlueStore::invalidate_allocation_file_on_bluefs()
18222 {
18223 // mark that the allocation-file was invalidated and we should destage a new copy when closing the db
18224 need_to_destage_allocation_file = true;
18225 dout(10) << "need_to_destage_allocation_file was set" << dendl;
18226
18227 BlueFS::FileWriter *p_handle = nullptr;
18228 if (!bluefs->dir_exists(allocator_dir)) {
18229 dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
18230 // nothing to do -> return
18231 return 0;
18232 }
18233
18234 int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18235 if (ret != 0) {
18236 dout(5) << "allocator_file(" << allocator_file << ") doesn't exist" << dendl;
18237 // nothing to do -> return
18238 return 0;
18239 }
18240
18241
18242 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
18243 if (ret != 0) {
18244 derr << "Failed open_for_write with error-code " << ret << dendl;
18245 return -1;
18246 }
18247
18248 dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18249 ret = bluefs->truncate(p_handle, 0);
18250 if (ret != 0) {
18251 derr << "Failed truncate with error-code " << ret << dendl;
18252 bluefs->close_writer(p_handle);
18253 return -1;
18254 }
18255
18256 bluefs->fsync(p_handle);
18257 bluefs->close_writer(p_handle);
18258
18259 return 0;
18260 }
18261
18262 //-----------------------------------------------------------------------------------
18263 // load bluefs extents into bluefs_extents_vec
18264 int load_bluefs_extents(BlueFS *bluefs,
18265 bluefs_layout_t *bluefs_layout,
18266 CephContext* cct,
18267 const std::string &path,
18268 std::vector<extent_t> &bluefs_extents_vec,
18269 uint64_t min_alloc_size)
18270 {
18271 if (! bluefs) {
18272 dout(5) << "No BlueFS device found!!" << dendl;
18273 return 0;
18274 }
18275
18276 interval_set<uint64_t> bluefs_extents;
18277 int ret = bluefs->get_block_extents(bluefs_layout->shared_bdev, &bluefs_extents);
18278 if (ret < 0) {
18279 derr << "failed bluefs->get_block_extents()!!" << dendl;
18280 return ret;
18281 }
18282
18283 for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); itr++) {
18284 extent_t e = { .offset = itr.get_start(), .length = itr.get_len() };
18285 bluefs_extents_vec.push_back(e);
18286 }
18287
18288 dout(5) << "BlueFS extent_count=" << bluefs_extents_vec.size() << dendl;
18289 return 0;
18290 }
18291
18292 //-----------------------------------------------------------------------------------
18293 int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
18294 {
18295 *p_num_entries = 0;
18296 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18297 (*p_num_entries)++;
18298 };
18299 src_alloc->dump(count_entries);
18300
18301 dout(5) << "count num_entries=" << *p_num_entries << dendl;
18302
18303 // add 16K extra entries in case new allocations happened in the meantime
18304 (*p_num_entries) += 16*1024;
18305 unique_ptr<extent_t[]> arr;
18306 try {
18307 arr = make_unique<extent_t[]>(*p_num_entries);
18308 } catch (std::bad_alloc&) {
18309 derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
18310 return -1;
18311 }
18312
18313 uint64_t idx = 0;
18314 auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18315 if (extent_length > 0) {
18316 if (idx < *p_num_entries) {
18317 arr[idx] = {extent_offset, extent_length};
18318 }
18319 idx++;
18320 }
18321 else {
18322 derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
18323 }
18324 };
18325 src_alloc->dump(copy_entries);
18326
18327 dout(5) << "copy num_entries=" << idx << dendl;
18328 if (idx > *p_num_entries) {
18329 derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
18330 ceph_assert(idx <= *p_num_entries);
18331 }
18332
18333 *p_num_entries = idx;
18334
18335 for (idx = 0; idx < *p_num_entries; idx++) {
18336 const extent_t *p_extent = &arr[idx];
18337 dest_alloc->init_add_free(p_extent->offset, p_extent->length);
18338 }
18339
18340 return 0;
18341 }
18342
18343 //-----------------------------------------------------------------------------------
18344 static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
18345 {
18346 std::ptrdiff_t length = p_curr - buffer;
18347 p_handle->append(buffer, length);
18348
18349 crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
18350 uint32_t encoded_crc = HTOCEPH_32(crc);
18351 p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));
18352
18353 return crc;
18354 }
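// Editorial note: illustrative sketch only, not part of BlueStore. The crc written after
// each extent chunk is a *running* crc32c -- the value returned above seeds the next
// chunk's crc, so a reader has to carry the verified value forward from chunk to chunk.
// A minimal standalone example of that chaining (hypothetical helper; the caller supplies
// every chunk together with the crc value stored right after it on disk):
[[maybe_unused]] static bool example_verify_crc_chain(
  const std::vector<std::pair<ceph::bufferlist, uint32_t>>& chunks)
{
  uint32_t crc = -1;                    // same initial seed the writer uses
  for (const auto& [data, stored_crc] : chunks) {
    crc = data.crc32c(crc);             // accumulate over this chunk
    if (crc != stored_crc) {
      return false;                     // the stored value must match the running crc
    }
  }
  return true;
}
// __restore_allocator() below follows the same scheme when reloading the allocation file.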
18355
18356 const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
18357 // write the allocator to a flat bluefs file - 4K extents at a time
18358 //-----------------------------------------------------------------------------------
18359 int BlueStore::store_allocator(Allocator* src_allocator)
18360 {
18361 // when storing allocations to file we must be sure there are no background compactions
18362 // the easiest way to achieve that is to make sure the db is closed
18363 ceph_assert(db == nullptr);
18364 utime_t start_time = ceph_clock_now();
18365 int ret = 0;
18366
18367 // create the dir if it doesn't already exist
18368 if (!bluefs->dir_exists(allocator_dir) ) {
18369 ret = bluefs->mkdir(allocator_dir);
18370 if (ret != 0) {
18371 derr << "Failed mkdir with error-code " << ret << dendl;
18372 return -1;
18373 }
18374 }
18375 bluefs->compact_log();
18376 // reuse the previous file allocation if it exists
18377 ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18378 bool overwrite_file = (ret == 0);
18379 BlueFS::FileWriter *p_handle = nullptr;
18380 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
18381 if (ret != 0) {
18382 derr << __func__ << "Failed open_for_write with error-code " << ret << dendl;
18383 return -1;
18384 }
18385
18386 uint64_t file_size = p_handle->file->fnode.size;
18387 uint64_t allocated = p_handle->file->fnode.get_allocated();
18388 dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
18389
18390 bluefs->sync_metadata(false);
18391 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
18392 if (!allocator) {
18393 bluefs->close_writer(p_handle);
18394 return -1;
18395 }
18396
18397 // store all extents (except for the bluefs extents we removed) in a single flat file
18398 utime_t timestamp = ceph_clock_now();
18399 uint32_t crc = -1;
18400 {
18401 allocator_image_header header(timestamp, s_format_version, s_serial);
18402 bufferlist header_bl;
18403 encode(header, header_bl);
18404 crc = header_bl.crc32c(crc);
18405 encode(crc, header_bl);
18406 p_handle->append(header_bl);
18407 }
18408
18409 crc = -1; // reset crc
18410 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18411 extent_t *p_curr = buffer;
18412 const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
18413 uint64_t extent_count = 0;
18414 uint64_t allocation_size = 0;
18415 auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
18416 if (extent_length == 0) {
18417 derr << __func__ << "::zero-length extent at idx=" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
18418 ret = -1;
18419 return;
18420 }
18421 p_curr->offset = HTOCEPH_64(extent_offset);
18422 p_curr->length = HTOCEPH_64(extent_length);
18423 extent_count++;
18424 allocation_size += extent_length;
18425 p_curr++;
18426
18427 if (p_curr == p_end) {
18428 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18429 p_curr = buffer; // recycle the buffer
18430 }
18431 };
18432 allocator->dump(iterated_allocation);
18433 // if we got a null extent -> fail the operation
18434 if (ret != 0) {
18435 derr << "Illegal extent, fail store operation" << dendl;
18436 derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18437 bluefs->truncate(p_handle, 0);
18438 bluefs->close_writer(p_handle);
18439 return -1;
18440 }
18441
18442 // if we got any leftovers -> add crc and append to file
18443 if (p_curr > buffer) {
18444 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18445 }
18446
18447 {
18448 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18449 bufferlist trailer_bl;
18450 encode(trailer, trailer_bl);
18451 uint32_t crc = -1;
18452 crc = trailer_bl.crc32c(crc);
18453 encode(crc, trailer_bl);
18454 p_handle->append(trailer_bl);
18455 }
18456
18457 bluefs->fsync(p_handle);
18458 bluefs->truncate(p_handle, p_handle->pos);
18459 bluefs->fsync(p_handle);
18460
18461 utime_t duration = ceph_clock_now() - start_time;
18462 dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
18463 dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
18464
18465 bluefs->close_writer(p_handle);
18466 need_to_destage_allocation_file = false;
18467 return 0;
18468 }
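// Editorial note: illustrative sketch only, not part of BlueStore. The allocation file
// produced above is laid out as
//   [header + crc32] ([up to 4K extents] [crc32])* [trailer + crc32]
// The hypothetical helper below estimates the resulting file size for a given extent
// count; header_bytes/trailer_bytes stand for the encoded header/trailer sizes including
// their own crcs (cf. calc_allocator_image_header_size()/calc_allocator_image_trailer_size()
// further down).
static inline uint64_t example_allocator_image_size(uint64_t extent_count,
                                                    uint64_t header_bytes,
                                                    uint64_t trailer_bytes)
{
  const uint64_t extents_per_chunk = MAX_EXTENTS_IN_BUFFER;        // 4K extents per flush
  const uint64_t chunk_count =
    (extent_count + extents_per_chunk - 1) / extents_per_chunk;    // full + partial chunks
  return header_bytes + trailer_bytes +
         extent_count * sizeof(extent_t) +                         // 16 bytes per extent
         chunk_count * sizeof(uint32_t);                           // one crc32 per chunk
}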
18469
18470 //-----------------------------------------------------------------------------------
18471 Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
18472 // create allocator
18473 uint64_t alloc_size = min_alloc_size;
18474 Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
18475 zone_size, first_sequential_zone,
18476 "recovery");
18477 if (alloc) {
18478 return alloc;
18479 } else {
18480 derr << "Failed Allocator Creation" << dendl;
18481 return nullptr;
18482 }
18483 }
18484
18485 //-----------------------------------------------------------------------------------
18486 size_t calc_allocator_image_header_size()
18487 {
18488 utime_t timestamp = ceph_clock_now();
18489 allocator_image_header header(timestamp, s_format_version, s_serial);
18490 bufferlist header_bl;
18491 encode(header, header_bl);
18492 uint32_t crc = -1;
18493 crc = header_bl.crc32c(crc);
18494 encode(crc, header_bl);
18495
18496 return header_bl.length();
18497 }
18498
18499 //-----------------------------------------------------------------------------------
18500 int calc_allocator_image_trailer_size()
18501 {
18502 utime_t timestamp = ceph_clock_now();
18503 uint64_t extent_count = -1;
18504 uint64_t allocation_size = -1;
18505 uint32_t crc = -1;
18506 bufferlist trailer_bl;
18507 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18508
18509 encode(trailer, trailer_bl);
18510 crc = trailer_bl.crc32c(crc);
18511 encode(crc, trailer_bl);
18512 return trailer_bl.length();
18513 }
18514
18515 //-----------------------------------------------------------------------------------
18516 int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
18517 {
18518 utime_t start_time = ceph_clock_now();
18519 BlueFS::FileReader *p_temp_handle = nullptr;
18520 int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
18521 if (ret != 0) {
18522 derr << "Failed open_for_read with error-code " << ret << dendl;
18523 return -1;
18524 }
18525 unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
18526 uint64_t read_alloc_size = 0;
18527 uint64_t file_size = p_handle->file->fnode.size;
18528 dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
18529
18530 // make sure we were able to store a valid copy
18531 if (file_size == 0) {
18532 derr << "No Valid allocation info on disk (empty file)" << dendl;
18533 return -1;
18534 }
18535
18536 // first read the header
18537 size_t offset = 0;
18538 allocator_image_header header;
18539 int header_size = calc_allocator_image_header_size();
18540 {
18541 bufferlist header_bl,temp_bl;
18542 int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
18543 if (read_bytes != header_size) {
18544 derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
18545 return -1;
18546 }
18547
18548 offset += read_bytes;
18549
18550 header_bl.claim_append(temp_bl);
18551 auto p = header_bl.cbegin();
18552 decode(header, p);
18553 if (header.verify(cct, path) != 0 ) {
18554 derr << "header = \n" << header << dendl;
18555 return -1;
18556 }
18557
18558 uint32_t crc_calc = -1, crc;
18559 crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18560 decode(crc, p);
18561 if (crc != crc_calc) {
18562 derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18563 derr << "header = \n" << header << dendl;
18564 return -1;
18565 }
18566
18567 // increment the serial for the next store
18568 s_serial = header.serial + 1;
18569 }
18570
18571 // then read the payload (extents list) using a recycled buffer
18572 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18573 uint32_t crc = -1;
18574 int trailer_size = calc_allocator_image_trailer_size();
18575 uint64_t extent_count = 0;
18576 uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
18577 while (extents_bytes_left) {
18578 int req_bytes = std::min(extents_bytes_left, sizeof(buffer));
18579 int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
18580 if (read_bytes != req_bytes) {
18581 derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
18582 return -1;
18583 }
18584
18585 offset += read_bytes;
18586 extents_bytes_left -= read_bytes;
18587
18588 const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t);
18589 const extent_t *p_end = buffer + num_extent_in_buffer;
18590 for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
18591 uint64_t offset = CEPHTOH_64(p_ext->offset);
18592 uint64_t length = CEPHTOH_64(p_ext->length);
18593 read_alloc_size += length;
18594
18595 if (length > 0) {
18596 allocator->init_add_free(offset, length);
18597 extent_count ++;
18598 } else {
18599 derr << "extent with zero length at idx=" << extent_count << dendl;
18600 return -1;
18601 }
18602 }
18603
18604 uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
18605 read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
18606 if (read_bytes == sizeof(crc) ) {
18607 crc = CEPHTOH_32(crc);
18608 if (crc != calc_crc) {
18609 derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
18610 derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
18611 return -1;
18612 }
18613
18614 offset += read_bytes;
18615 if (extents_bytes_left) {
18616 extents_bytes_left -= read_bytes;
18617 }
18618 } else {
18619 derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
18620 return -1;
18621 }
18622
18623 }
18624
18625 // finally, read the trailer and verify it is in good shape and that we got all the extents
18626 {
18627 bufferlist trailer_bl,temp_bl;
18628 int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
18629 if (read_bytes != trailer_size) {
18630 derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
18631 return -1;
18632 }
18633 offset += read_bytes;
18634
18635 trailer_bl.claim_append(temp_bl);
18636 uint32_t crc_calc = -1;
18637 uint32_t crc;
18638 allocator_image_trailer trailer;
18639 auto p = trailer_bl.cbegin();
18640 decode(trailer, p);
18641 if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
18642 derr << "trailer=\n" << trailer << dendl;
18643 return -1;
18644 }
18645
18646 crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18647 decode(crc, p);
18648 if (crc != crc_calc) {
18649 derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18650 derr << "trailer=\n" << trailer << dendl;
18651 return -1;
18652 }
18653 }
18654
18655 utime_t duration = ceph_clock_now() - start_time;
18656 dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
18657 << read_alloc_size << ", file_size=" << file_size << dendl;
18658 dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
18659 *num = extent_count;
18660 *bytes = read_alloc_size;
18661 return 0;
18662 }
18663
18664 //-----------------------------------------------------------------------------------
18665 int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
18666 {
18667 utime_t start = ceph_clock_now();
18668 auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
18669 int ret = __restore_allocator(temp_allocator.get(), num, bytes);
18670 if (ret != 0) {
18671 return ret;
18672 }
18673
18674 uint64_t num_entries = 0;
18675 dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
18676 copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
18677 utime_t duration = ceph_clock_now() - start;
18678 dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
18679 return ret;
18680 }
18681
18682 //-------------------------------------------------------------------------
18683 void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
18684 {
18685 [[maybe_unused]] auto cct = onode->c->store->cct;
18686 auto path = onode->c->store->path;
18687 if (shard_id < shards.size()) {
18688 auto p = &shards[shard_id];
18689 if (!p->loaded) {
18690 dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18691 p->extents = decode_some(v);
18692 p->loaded = true;
18693 dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18694 ceph_assert(p->dirty == false);
18695 ceph_assert(v.length() == p->shard_info->bytes);
18696 }
18697 } else {
18698 derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
18699 ceph_assert(shard_id < shards.size());
18700 }
18701 }
18702
18703 //-----------------------------------------------------------------------------------
18704 void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
18705 {
18706 ceph_assert((offset & min_alloc_size_mask) == 0);
18707 ceph_assert((length & min_alloc_size_mask) == 0);
18708 sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
18709 }
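// Editorial note: illustrative example only, not part of BlueStore. The simple bitmap tracks
// whole min_alloc_size units, so byte offsets/lengths are converted to unit indices/counts by
// shifting. E.g. with a hypothetical min_alloc_size of 4 KiB (min_alloc_size_order == 12):
//   set_allocation_in_simple_bmap(sbmap, 0x10000, 0x2000)  ->  sbmap->set(0x10, 0x2)
// i.e. allocation units 16 and 17 are marked as used.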
18710
18711 //---------------------------------------------------------
18712 // Process all physical extents from a given Onode (including all its shards)
18713 void BlueStore::read_allocation_from_single_onode(
18714 SimpleBitmap* sbmap,
18715 BlueStore::OnodeRef& onode_ref,
18716 read_alloc_stats_t& stats)
18717 {
18718 // create a map holding all physical extents of this Onode to prevent duplicates from being added more than once
18719 std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
18720 unsigned blobs_count = 0;
18721 uint64_t pos = 0;
18722
18723 stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
18724 // first iterate over all logical-extents
18725 for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
18726 ceph_assert(l_extent.logical_offset >= pos);
18727
18728 pos = l_extent.logical_offset + l_extent.length;
18729 ceph_assert(l_extent.blob);
18730 const bluestore_blob_t& blob = l_extent.blob->get_blob();
18731 const PExtentVector& p_extent_vec = blob.get_extents();
18732 blobs_count++;
18733 if (blob.is_compressed()) {
18734 stats.compressed_blob_count++;
18735 }
18736
18737 if (blob.is_shared()) {
18738 stats.shared_blobs_count++;
18739 }
18740
18741 // process all physical extents in this blob
18742 for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
18743 auto offset = p_extent->offset;
18744 auto length = p_extent->length;
18745
18746 // Offset of -1 means that the extent was removed (it is only a placeholder) and can be safely skipped
18747 if (offset == (uint64_t)-1) {
18748 stats.skipped_illegal_extent++;
18749 continue;
18750 }
18751
18752 if (!blob.is_shared()) {
18753 // skip repeating extents
18754 auto lcl_itr = lcl_extnt_map.find(offset);
18755 // extents using shared blobs might have different lengths
18756 if (lcl_itr != lcl_extnt_map.end() ) {
18757 // repeated extents must have the same length!
18758 ceph_assert(lcl_extnt_map[offset] == length);
18759 stats.skipped_repeated_extent++;
18760 } else {
18761 lcl_extnt_map[offset] = length;
18762 set_allocation_in_simple_bmap(sbmap, offset, length);
18763 stats.extent_count++;
18764 }
18765 } else {
18766 // extents using shared blobs might have different lengths
18767 set_allocation_in_simple_bmap(sbmap, offset, length);
18768 stats.extent_count++;
18769 }
18770
18771 } // physical-extents loop
18772
18773 } // logical-extents loop
18774
18775 if (blobs_count < MAX_BLOBS_IN_ONODE) {
18776 stats.blobs_in_onode[blobs_count]++;
18777 } else {
18778 // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
18779 stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
18780 }
18781 }
18782
18783 //-------------------------------------------------------------------------
18784 int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
18785 {
18786 // finally add all space taken by user data
18787 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
18788 if (!it) {
18789 // TBD - find a better error code
18790 derr << "failed db->get_iterator(PREFIX_OBJ)" << dendl;
18791 return -1;
18792 }
18793
18794 CollectionRef collection_ref;
18795 spg_t pgid;
18796 BlueStore::OnodeRef onode_ref;
18797 bool has_open_onode = false;
18798 uint32_t shard_id = 0;
18799 uint64_t kv_count = 0;
18800 uint64_t count_interval = 1'000'000;
18801 // iterate over all ONodes stored in RocksDB
18802 for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
18803 // trace an event after every million processed objects (typically every 5-10 seconds)
18804 if (kv_count && (kv_count % count_interval == 0) ) {
18805 dout(5) << "processed objects count = " << kv_count << dendl;
18806 }
18807
18808 // Shards - Code
18809 // add the extents from the shards to the main Obj
18810 if (is_extent_shard_key(it->key())) {
18811 // shards must follow a valid main object
18812 if (has_open_onode) {
18813 // shards keys must start with the main object key
18814 if (it->key().find(onode_ref->key) == 0) {
18815 // shards count can't exceed declared shard-count in the main-object
18816 if (shard_id < onode_ref->extent_map.shards.size()) {
18817 onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id);
18818 stats.shard_count++;
18819 shard_id++;
18820 } else {
18821 derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18822 derr << "shard->key=" << pretty_binary_string(it->key()) << dendl;
18823 ceph_assert(shard_id < onode_ref->extent_map.shards.size());
18824 }
18825 } else {
18826 derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl;
18827 ceph_assert(it->key().find(onode_ref->key) == 0);
18828 }
18829 } else {
18830 derr << "error::shard without main objects for key=" << pretty_binary_string(it->key()) << dendl;
18831 ceph_assert(has_open_onode);
18832 }
18833
18834 } else {
18835 // Main Object Code
18836
18837 if (has_open_onode) {
18838 // make sure we got all shards of this object
18839 if (shard_id == onode_ref->extent_map.shards.size()) {
18840 // We completed an Onode Object -> pass it to be processed
18841 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18842 } else {
18843 derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18844 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18845 }
18846 } else {
18847 // We opened a new Object
18848 has_open_onode = true;
18849 }
18850
18851 // The main Obj is always first in RocksDB so we can start with shard_id set to zero
18852 shard_id = 0;
18853 stats.onode_count++;
18854 ghobject_t oid;
18855 int ret = get_key_object(it->key(), &oid);
18856 if (ret < 0) {
18857 derr << "bad object key " << pretty_binary_string(it->key()) << dendl;
18858 ceph_assert(ret == 0);
18859 continue;
18860 }
18861
18862 // fill collection_ref if it doesn't exist yet
18863 // We process all the objects in a given collection and then move to the next collection
18864 // This means we only search once for every given collection
18865 if (!collection_ref ||
18866 oid.shard_id != pgid.shard ||
18867 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
18868 !collection_ref->contains(oid)) {
18869 stats.collection_search++;
18870 collection_ref = nullptr;
18871
18872 for (auto& p : coll_map) {
18873 if (p.second->contains(oid)) {
18874 collection_ref = p.second;
18875 break;
18876 }
18877 }
18878
18879 if (!collection_ref) {
18880 derr << "stray object " << oid << " not owned by any collection" << dendl;
18881 ceph_assert(collection_ref);
18882 continue;
18883 }
18884
18885 collection_ref->cid.is_pg(&pgid);
18886 }
18887 onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
18888 }
18889 }
18890
18891 // process the last object
18892 if (has_open_onode) {
18893 // make sure we got all shards of this object
18894 if (shard_id == onode_ref->extent_map.shards.size()) {
18895 // We completed an Onode Object -> pass it to be processed
18896 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18897 } else {
18898 derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18899 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18900 }
18901 }
18902 dout(5) << "onode_count=" << stats.onode_count << " ,shard_count=" << stats.shard_count << dendl;
18903
18904 return 0;
18905 }
18906
18907 //---------------------------------------------------------
18908 int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
18909 {
18910 // first set space used by superblock
18911 auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
18912 set_allocation_in_simple_bmap(sbmap, 0, super_length);
18913 stats.extent_count++;
18914
18915 // then set all space taken by Objects
18916 int ret = read_allocation_from_onodes(sbmap, stats);
18917 if (ret < 0) {
18918 derr << "failed read_allocation_from_onodes()" << dendl;
18919 return ret;
18920 }
18921
18922 return 0;
18923 }
18924
18925 //-----------------------------------------------------------------------------------
18926 static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
18927 {
18928 int alloc_size_shift = ctz(alloc_size);
18929 uint64_t offset = 0;
18930 extent_t ext = sbmap->get_next_clr_extent(offset);
18931 while (ext.length != 0) {
18932 dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
18933 offset = ext.offset + ext.length;
18934 ext = sbmap->get_next_clr_extent(offset);
18935 }
18936 }
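// Editorial note: illustrative example only, not part of BlueStore. This is the inverse of
// set_allocation_in_simple_bmap(): clear (i.e. free) unit runs returned by the bitmap are
// shifted back up into byte extents. With a hypothetical alloc_size of 4 KiB (shift == 12),
// a clear run {offset: 0x10, length: 0x2} becomes init_add_free(0x10000, 0x2000).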
18937
18938 //---------------------------------------------------------
18939 int BlueStore::read_allocation_from_drive_on_startup()
18940 {
18941 int ret = 0;
18942
18943 ret = _open_collections();
18944 if (ret < 0) {
18945 return ret;
18946 }
18947 auto shutdown_cache = make_scope_guard([&] {
18948 _shutdown_cache();
18949 });
18950
18951 utime_t start = ceph_clock_now();
18952 read_alloc_stats_t stats = {};
18953 SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
18954 ret = reconstruct_allocations(&sbmap, stats);
18955 if (ret != 0) {
18956 return ret;
18957 }
18958
18959 copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
18960
18961 utime_t duration = ceph_clock_now() - start;
18962 dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
18963 return ret;
18964 }
18965
18966
18967
18968
18969 // Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
18970 // Not meant to be run by customers
18971 #ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
18972
18973 #include <stdlib.h>
18974 #include <algorithm>
18975 //---------------------------------------------------------
18976 int cmpfunc (const void * a, const void * b)
18977 {
18978 if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) {
18979 return 1;
18980 }
18981 else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) {
18982 return -1;
18983 }
18984 else {
18985 return 0;
18986 }
18987 }
18988
18989 // compare the allocator built from Onodes with the system allocator (CF-B)
18990 //---------------------------------------------------------
18991 int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
18992 {
18993 uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
18994 uint64_t extent_count = allocation_size/sizeof(extent_t);
18995 dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
18996
18997 unique_ptr<extent_t[]> arr1;
18998 unique_ptr<extent_t[]> arr2;
18999 try {
19000 arr1 = make_unique<extent_t[]>(extent_count);
19001 arr2 = make_unique<extent_t[]>(extent_count);
19002 } catch (std::bad_alloc&) {
19003 derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
19004 return -1;
19005 }
19006
19007 // copy the extents from the allocators into simple array and then compare them
19008 uint64_t size1 = 0, size2 = 0;
19009 uint64_t idx1 = 0, idx2 = 0;
19010 auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
19011 size1 += length;
19012 if (idx1 < extent_count) {
19013 arr1[idx1++] = {offset, length};
19014 }
19015 else if (idx1 == extent_count) {
19016 derr << "(1)compare_allocators:: spillover" << dendl;
19017 idx1 ++;
19018 }
19019
19020 };
19021
19022 auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
19023 size2 += length;
19024 if (idx2 < extent_count) {
19025 arr2[idx2++] = {offset, length};
19026 }
19027 else if (idx2 == extent_count) {
19028 derr << "(2)compare_allocators:: spillover" << dendl;
19029 idx2 ++;
19030 }
19031 };
19032
19033 alloc1->dump(iterated_mapper1);
19034 alloc2->dump(iterated_mapper2);
19035
19036 qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
19037 qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
19038
19039 if (idx1 == idx2) {
19040 idx1 = idx2 = std::min(idx1, extent_count);
19041 if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
19042 return 0;
19043 }
19044 derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
19045 for (uint64_t i = 0; i < idx1; i++) {
19046 if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) {
19047 derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
19048 derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
19049 return -1;
19050 }
19051 }
19052 return 0;
19053 } else {
19054 derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
19055 return -1;
19056 }
19057 }
19058
19059 //---------------------------------------------------------
19060 int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
19061 {
19062 // then add space used by bluefs to store rocksdb
19063 unsigned extent_count = 0;
19064 if (bluefs) {
19065 interval_set<uint64_t> bluefs_extents;
19066 int ret = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
19067 if (ret < 0) {
19068 return ret;
19069 }
19070 for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); extent_count++, itr++) {
19071 allocator->init_rm_free(itr.get_start(), itr.get_len());
19072 stats.extent_count++;
19073 }
19074 }
19075
19076 dout(5) << "bluefs extent_count=" << extent_count << dendl;
19077 return 0;
19078 }
19079
19080 //---------------------------------------------------------
19081 int BlueStore::read_allocation_from_drive_for_bluestore_tool()
19082 {
19083 dout(5) << __func__ << dendl;
19084 int ret = 0;
19085 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19086 ret = _open_db_and_around(true, false);
19087 if (ret < 0) {
19088 return ret;
19089 }
19090
19091 ret = _open_collections();
19092 if (ret < 0) {
19093 _close_db_and_around();
19094 return ret;
19095 }
19096
19097 utime_t duration;
19098 read_alloc_stats_t stats = {};
19099 utime_t start = ceph_clock_now();
19100
19101 auto shutdown_cache = make_scope_guard([&] {
19102 dout(1) << "Allocation Recovery was completed in " << duration
19103 << " seconds; insert_count=" << stats.insert_count
19104 << "; extent_count=" << stats.extent_count << dendl;
19105 _shutdown_cache();
19106 _close_db_and_around();
19107 });
19108
19109 {
19110 auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
19111 //reconstruct allocations into a temp simple-bitmap and copy into allocator
19112 {
19113 SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
19114 ret = reconstruct_allocations(&sbmap, stats);
19115 if (ret != 0) {
19116 return ret;
19117 }
19118 copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
19119 }
19120
19121 // add allocation space used by the bluefs itself
19122 ret = add_existing_bluefs_allocation(allocator.get(), stats);
19123 if (ret < 0) {
19124 return ret;
19125 }
19126
19127 duration = ceph_clock_now() - start;
19128 stats.insert_count = 0;
19129 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19130 stats.insert_count++;
19131 };
19132 allocator->dump(count_entries);
19133 ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
19134 if (ret == 0) {
19135 dout(5) << "Allocator drive - file integrity check OK" << dendl;
19136 } else {
19137 derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
19138 }
19139 }
19140
19141 dout(1) << stats << dendl;
19142 return ret;
19143 }
19144
19145 //---------------------------------------------------------
19146 Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
19147 {
19148 uint64_t bdev_size = bdev->get_size();
19149 Allocator* allocator = create_bitmap_allocator(bdev_size);
19150 if (allocator) {
19151 dout(5) << "bitmap-allocator=" << allocator << dendl;
19152 } else {
19153 derr << "****failed create_bitmap_allocator()" << dendl;
19154 return nullptr;
19155 }
19156
19157 uint64_t num_entries = 0;
19158 copy_allocator(src_allocator, allocator, &num_entries);
19159
19160 // BlueFS stores its internal allocation outside RocksDB (FM) so we should not destage those extents to the allocator-file
19161 // we are going to hide bluefs allocations during the allocator destage as they are stored elsewhere
19162 {
19163 std::vector<extent_t> bluefs_extents_vec;
19164 // load current bluefs internal allocation into a vector
19165 load_bluefs_extents(bluefs, &bluefs_layout, cct, path, bluefs_extents_vec, min_alloc_size);
19166 // then remove them from the shared allocator before dumping it to disk (bluefs stored them internally)
19167 for (auto itr = bluefs_extents_vec.begin(); itr != bluefs_extents_vec.end(); ++itr) {
19168 allocator->init_add_free(itr->offset, itr->length);
19169 }
19170 }
19171
19172 return allocator;
19173 }
19174
19175 //---------------------------------------------------------
19176 static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
19177 {
19178 dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
19179 KeyValueDB::Transaction t = db->get_transaction();
19180 t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
19181 db->submit_transaction_sync(t);
19182 }
19183
19184 //---------------------------------------------------------
19185 void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
19186 {
19187 unsigned max_txn = 1024;
19188 dout(5) << "max_transaction_submit=" << max_txn << dendl;
19189 uint64_t size = 0, idx = 0;
19190 KeyValueDB::Transaction txn = db->get_transaction();
19191 auto iterated_insert = [&](uint64_t offset, uint64_t length) {
19192 size += length;
19193 real_fm->release(offset, length, txn);
19194 if ((++idx % max_txn) == 0) {
19195 db->submit_transaction_sync(txn);
19196 txn = db->get_transaction();
19197 }
19198 };
19199 allocator->dump(iterated_insert);
19200 if (idx % max_txn != 0) {
19201 db->submit_transaction_sync(txn);
19202 }
19203 dout(5) << "size=" << size << ", num extents=" << idx << dendl;
19204 }
19205
19206 //---------------------------------------------------------
19207 Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
19208 {
19209 dout(5) << "real_fm->enumerate_next" << dendl;
19210 Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
19211 if (allocator2) {
19212 dout(5) << "bitmap-allocator=" << allocator2 << dendl;
19213 } else {
19214 return nullptr;
19215 }
19216
19217 uint64_t size2 = 0, idx2 = 0;
19218 real_fm->enumerate_reset();
19219 uint64_t offset, length;
19220 while (real_fm->enumerate_next(db, &offset, &length)) {
19221 allocator2->init_add_free(offset, length);
19222 ++idx2;
19223 size2 += length;
19224 }
19225 real_fm->enumerate_reset();
19226
19227 dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
19228 return allocator2;
19229 }
19230
19231 //---------------------------------------------------------
19232 // close the active fm and open it in a new mode like makefs()
19233 // but make sure to mark the full device space as allocated
19234 // later we will mark all extents from the allocator as free
19235 int BlueStore::reset_fm_for_restore()
19236 {
19237 dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
19238 fm->shutdown();
19239 delete fm;
19240 fm = nullptr;
19241 freelist_type = "bitmap";
19242 KeyValueDB::Transaction t = db->get_transaction();
19243 // call _open_fm() with fm_restore set to TRUE
19244 // this will mark the full device space as allocated (and not just the reserved space)
19245 _open_fm(t, true, true);
19246 if (fm == nullptr) {
19247 derr << "Failed _open_fm()" << dendl;
19248 return -1;
19249 }
19250 db->submit_transaction_sync(t);
19251 ceph_assert(!fm->is_null_manager());
19252 dout(5) << "fm was reactivated in full mode" << dendl;
19253 return 0;
19254 }
19255
19256
19257 //---------------------------------------------------------
19258 // create a temp allocator filled with allocation state from the fm
19259 // and compare it to the base allocator passed in
19260 int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
19261 {
19262 dout(5) << "verify that alloc content is identical to FM" << dendl;
19263 // initialize from freelist
19264 Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
19265 if (temp_allocator == nullptr) {
19266 return -1;
19267 }
19268
19269 uint64_t insert_count = 0;
19270 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19271 insert_count++;
19272 };
19273 temp_allocator->dump(count_entries);
19274 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19275 int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);
19276
19277 delete temp_allocator;
19278
19279 if (ret == 0) {
19280 dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
19281 return 0;
19282 } else {
19283 derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
19284 return -1;
19285 }
19286 }
19287
19288 //---------------------------------------------------------
19289 int BlueStore::db_cleanup(int ret)
19290 {
19291 _shutdown_cache();
19292 _close_db_and_around();
19293 return ret;
19294 }
19295
19296 //---------------------------------------------------------
19297 // convert back the system from null-allocator to using rocksdb to store allocation
19298 int BlueStore::push_allocation_to_rocksdb()
19299 {
19300 if (cct->_conf->bluestore_allocation_from_file) {
19301 derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
19302 derr << "please change the default to false in the ceph.conf file" << dendl;
19303 return -1;
19304 }
19305
19306 dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
19307 int ret = _open_db_and_around(false);
19308 if (ret < 0) {
19309 return ret;
19310 }
19311
19312 if (!fm->is_null_manager()) {
19313 derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
19314 return db_cleanup(0);
19315 }
19316
19317 // start by creating a clone copy of the shared-allocator
19318 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
19319 if (!allocator) {
19320 return db_cleanup(-1);
19321 }
19322
19323 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19324 clear_allocation_objects_from_rocksdb(db, cct, path);
19325
19326 // then open the fm in new mode with the full device marked as allocated
19327 if (reset_fm_for_restore() != 0) {
19328 return db_cleanup(-1);
19329 }
19330
19331 // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
19332 copy_allocator_content_to_fm(allocator.get(), fm);
19333
19334 // compare the allocator info with the info stored in the fm/rocksdb
19335 if (verify_rocksdb_allocations(allocator.get()) == 0) {
19336 // all is good -> we can commit to rocksdb allocator
19337 commit_to_real_manager();
19338 } else {
19339 return db_cleanup(-1);
19340 }
19341
19342 // can't be too paranoid :-)
19343 dout(5) << "Running full scale verification..." << dendl;
19344 // close db/fm/allocator and start fresh
19345 db_cleanup(0);
19346 dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
19347 ret = _open_db_and_around(true);
19348 if (ret < 0) {
19349 return db_cleanup(ret);
19350 }
19351 ceph_assert(!fm->is_null_manager());
19352 ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
19353
19354 return db_cleanup(ret);
19355 }
19356
19357 #endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19358
19359 //-------------------------------------------------------------------------------------
19360 static int commit_freelist_type(KeyValueDB *db, const std::string& freelist_type, CephContext *cct, const std::string &path)
19361 {
19362 // When freelist_type is set to "bitmap" we store allocation-info in RocksDB
19363 // When allocation-info is stored in a single file we set freelist_type to "null"
19364 // This directs the startup code to read allocation from the file and not from RocksDB
19365 KeyValueDB::Transaction t = db->get_transaction();
19366 if (t == nullptr) {
19367 derr << "db->get_transaction() failed!!!" << dendl;
19368 return -1;
19369 }
19370
19371 bufferlist bl;
19372 bl.append(freelist_type);
19373 t->set(PREFIX_SUPER, "freelist_type", bl);
19374
19375 int ret = db->submit_transaction_sync(t);
19376 if (ret != 0) {
19377 derr << "Failed db->submit_transaction_sync(t)" << dendl;
19378 }
19379 return ret;
19380 }
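// Editorial note: illustrative sketch only, not part of BlueStore; a rough idea of how the
// value stored above is consumed at startup, where the stored string selects which freelist
// manager is instantiated. The helper below is hypothetical (error handling trimmed):
[[maybe_unused]] static std::string example_read_freelist_type(KeyValueDB *db)
{
  bufferlist bl;
  if (db->get(PREFIX_SUPER, "freelist_type", &bl) == 0 && bl.length() > 0) {
    return std::string(bl.c_str(), bl.length());   // e.g. "bitmap" or "null"
  }
  return {};                                       // key missing / unreadable
}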
19381
19382 //-------------------------------------------------------------------------------------
19383 int BlueStore::commit_to_null_manager()
19384 {
19385 dout(5) << "Set FreelistManager to NULL FM..." << dendl;
19386 fm->set_null_manager();
19387 freelist_type = "null";
19388 #if 1
19389 return commit_freelist_type(db, freelist_type, cct, path);
19390 #else
19391 // should check how long this step takes on a big configuration as deletes are expensive
19392 if (commit_freelist_type(db, freelist_type, cct, path) == 0) {
19393 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19394 clear_allocation_objects_from_rocksdb(db, cct, path);
19395 }
19396 #endif
19397 }
19398
19399
19400 //-------------------------------------------------------------------------------------
19401 int BlueStore::commit_to_real_manager()
19402 {
19403 dout(5) << "Set FreelistManager to Real FM..." << dendl;
19404 ceph_assert(!fm->is_null_manager());
19405 freelist_type = "bitmap";
19406 int ret = commit_freelist_type(db, freelist_type, cct, path);
19407 if (ret == 0) {
19408 //remove the allocation_file
19409 invalidate_allocation_file_on_bluefs();
19410 ret = bluefs->unlink(allocator_dir, allocator_file);
19411 bluefs->sync_metadata(false);
19412 if (ret == 0) {
19413 dout(5) << "Remove Allocation File successfully" << dendl;
19414 }
19415 else {
19416 derr << "Remove Allocation File ret_code=" << ret << dendl;
19417 }
19418 }
19419
19420 return ret;
19421 }
19422
19423 //================================================================================================================
19424 //================================================================================================================