1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <bit>
16#include <unistd.h>
17#include <stdlib.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <fcntl.h>
21#include <algorithm>
22
23#include <boost/container/flat_set.hpp>
24#include <boost/algorithm/string.hpp>
25#include <boost/random/mersenne_twister.hpp>
26#include <boost/random/uniform_real.hpp>
27
28#include "include/cpp-btree/btree_set.h"
29
30#include "BlueStore.h"
31#include "bluestore_common.h"
32#include "simple_bitmap.h"
33#include "os/kv.h"
34#include "include/compat.h"
35#include "include/intarith.h"
36#include "include/stringify.h"
37#include "include/str_map.h"
38#include "include/util.h"
39#include "common/errno.h"
40#include "common/safe_io.h"
41#include "common/PriorityCache.h"
42#include "common/url_escape.h"
43#include "Allocator.h"
44#include "FreelistManager.h"
45#include "BlueFS.h"
46#include "BlueRocksEnv.h"
47#include "auth/Crypto.h"
48#include "common/EventTrace.h"
49#include "perfglue/heap_profiler.h"
50#include "common/blkdev.h"
51#include "common/numa.h"
52#include "common/pretty_binary.h"
53#include "kv/KeyValueHistogram.h"
54
55#ifdef HAVE_LIBZBD
56#include "ZonedAllocator.h"
57#include "ZonedFreelistManager.h"
58#endif
59
60#if defined(WITH_LTTNG)
61#define TRACEPOINT_DEFINE
62#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
63#include "tracing/bluestore.h"
64#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65#undef TRACEPOINT_DEFINE
66#else
67#define tracepoint(...)
68#endif
69
70#define dout_context cct
71#define dout_subsys ceph_subsys_bluestore
72
73using bid_t = decltype(BlueStore::Blob::id);
74
75// bluestore_cache_onode
76MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
77 bluestore_cache_onode);
78
79MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
80 bluestore_cache_buffer);
81MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
82 bluestore_extent);
83MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
84 bluestore_blob);
85MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
86 bluestore_shared_blob);
87
88// bluestore_txc
89MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
90 bluestore_txc);
91using std::byte;
92using std::deque;
93using std::min;
94using std::make_pair;
95using std::numeric_limits;
96using std::pair;
97using std::less;
98using std::list;
99using std::make_unique;
100using std::map;
101using std::max;
102using std::ostream;
103using std::ostringstream;
104using std::set;
105using std::string;
106using std::stringstream;
107using std::unique_ptr;
108using std::vector;
109
110using ceph::bufferlist;
111using ceph::bufferptr;
112using ceph::coarse_mono_clock;
113using ceph::decode;
114using ceph::encode;
115using ceph::Formatter;
116using ceph::JSONFormatter;
117using ceph::make_timespan;
118using ceph::mono_clock;
119using ceph::mono_time;
120using ceph::timespan_str;
121
122// kv store prefixes
123const string PREFIX_SUPER = "S"; // field -> value
124const string PREFIX_STAT = "T"; // field -> value(int64 array)
125const string PREFIX_COLL = "C"; // collection name -> cnode_t
126const string PREFIX_OBJ = "O"; // object name -> onode_t
127const string PREFIX_OMAP = "M"; // u64 + keyname -> value
128const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
129const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
130const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
131const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
132const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
133const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
134const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
135
136#ifdef HAVE_LIBZBD
137const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
138const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
139const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
140#endif
141
142const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
143
144// write a label in the first block. always use this size. note that
145// bluefs makes a matching assumption about the location of its
146// superblock (always the second block of the device).
147#define BDEV_LABEL_BLOCK_SIZE 4096
148
149// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
150#define SUPER_RESERVED 8192
151
152#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
153
154
155/*
156 * extent map blob encoding
157 *
158 * we use the low bits of the blobid field to indicate some common scenarios
159 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
160 */
161#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
162#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
163#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
164#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
165#define BLOBID_SHIFT_BITS 4
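// Illustrative sketch: assuming the packed value is laid out as
// (id << BLOBID_SHIFT_BITS) | flags, as used by ExtentMap::{encode,decode}_some(),
// the fields could be pulled apart like this (hypothetical helper, not used
// by the code below):
//
//   struct blobid_fields {
//     uint64_t id;       // spanning or local blob id
//     bool contiguous;   // extent starts at the end of the previous one
//     bool zero_offset;  // blob_offset is 0
//     bool same_length;  // length matches the previous extent
//     bool spanning;     // id refers to a spanning blob
//   };
//
//   inline blobid_fields unpack_blobid(uint64_t v) {
//     return blobid_fields{
//       v >> BLOBID_SHIFT_BITS,
//       (v & BLOBID_FLAG_CONTIGUOUS) != 0,
//       (v & BLOBID_FLAG_ZEROOFFSET) != 0,
//       (v & BLOBID_FLAG_SAMELENGTH) != 0,
//       (v & BLOBID_FLAG_SPANNING) != 0};
//   }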
166
167/*
168 * object name key structure
169 *
170 * encoded u8: shard + 2^7 (so that it sorts properly)
171 * encoded u64: poolid + 2^63 (so that it sorts properly)
172 * encoded u32: hash (bit reversed)
173 *
174 * escaped string: namespace
175 *
176 * escaped string: key or object name
177 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
178 * we are done. otherwise, we are followed by the object name.
179 * escaped string: object name (unless '=' above)
180 *
181 * encoded u64: snap
182 * encoded u64: generation
183 * 'o'
184 */
185#define ONODE_KEY_SUFFIX 'o'
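// Illustrative layout (hypothetical example; see _get_object_key() and
// _get_key_object() below for the authoritative encoding): for an object
// with no separate key, the encoded onode key looks roughly like
//
//   [shard+0x80 : 1][pool+2^63 : 8][bit-reversed hash : 4]
//   [escaped nspace]'!' [escaped name]'!' '=' [snap : 8][gen : 8] 'o'
//
// where '!' is the terminator appended by append_escaped() and the final
// 'o' is ONODE_KEY_SUFFIX.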
186
187/*
188 * extent shard key
189 *
190 * object prefix key
191 * u32
192 * 'x'
193 */
194#define EXTENT_SHARD_KEY_SUFFIX 'x'
195
196/*
197 * string encoding in the key
198 *
199 * The key string needs to lexicographically sort the same way that
200 * ghobject_t does. We do this by escaping anything <= to '#' with #
201 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
202 * hex digits.
203 *
204 * We use ! as a terminator for strings; this works because it is < #
205 * and will get escaped if it is present in the string.
206 *
207 * NOTE: There is a bug in this implementation: due to implicit
208 * character type conversion in comparison it may produce unexpected
209 * ordering. Unfortunately fixing the bug would mean invalidating the
210 * keys in existing deployments. Instead we do additional sorting
211 * where it is needed.
212 */
213template<typename S>
214static void append_escaped(const string &in, S *out)
215{
216 char hexbyte[in.length() * 3 + 1];
217 char* ptr = &hexbyte[0];
218 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
219 if (*i <= '#') { // bug: unexpected result for *i > 0x7f
220 *ptr++ = '#';
221 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
222 *ptr++ = "0123456789abcdef"[*i & 0x0f];
223 } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
224 *ptr++ = '~';
225 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
226 *ptr++ = "0123456789abcdef"[*i & 0x0f];
227 } else {
228 *ptr++ = *i;
229 }
230 }
231 *ptr++ = '!';
232 out->append(hexbyte, ptr - &hexbyte[0]);
233}
234
235inline unsigned h2i(char c)
236{
237 if ((c >= '0') && (c <= '9')) {
238 return c - 0x30;
239 } else if ((c >= 'a') && (c <= 'f')) {
240 return c - 'a' + 10;
241 } else if ((c >= 'A') && (c <= 'F')) {
242 return c - 'A' + 10;
243 } else {
244 return 256; // make it always larger than 255
245 }
246}
247
248static int decode_escaped(const char *p, string *out)
249{
250 char buff[256];
251 char* ptr = &buff[0];
252 char* max = &buff[252];
253 const char *orig_p = p;
254 while (*p && *p != '!') {
255 if (*p == '#' || *p == '~') {
256 unsigned hex = 0;
257 p++;
258 hex = h2i(*p++) << 4;
259 if (hex > 255) {
260 return -EINVAL;
261 }
262 hex |= h2i(*p++);
263 if (hex > 255) {
264 return -EINVAL;
265 }
266 *ptr++ = hex;
267 } else {
268 *ptr++ = *p++;
269 }
270 if (ptr > max) {
271 out->append(buff, ptr-buff);
272 ptr = &buff[0];
273 }
274 }
275 if (ptr != buff) {
276 out->append(buff, ptr-buff);
277 }
278 return p - orig_p;
279}
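// Illustrative round trip through the escaping above (hypothetical input):
// '#' (0x23) is <= '#', so it becomes "#23", and append_escaped() then
// appends the '!' terminator.
//
//   std::string enc, dec;
//   append_escaped(std::string("rbd#1"), &enc);  // enc == "rbd#231!"
//   int used = decode_escaped(enc.c_str(), &dec);
//   // dec == "rbd#1", used == 7 (bytes consumed up to, not including, '!')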
280
281template<typename T>
282static void _key_encode_shard(shard_id_t shard, T *key)
283{
284 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
285}
286
287static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
288{
289 pshard->id = (uint8_t)*key - (uint8_t)0x80;
290 return key + 1;
291}
292
293static void get_coll_range(const coll_t& cid, int bits,
294 ghobject_t *temp_start, ghobject_t *temp_end,
295 ghobject_t *start, ghobject_t *end, bool legacy)
296{
297 spg_t pgid;
298 constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
 299 // use different nspaces because we use different schemes when encoding
300 // keys for listing objects
301 const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
302 if (cid.is_pg(&pgid)) {
303 start->shard_id = pgid.shard;
304 *temp_start = *start;
305
306 start->hobj.pool = pgid.pool();
307 temp_start->hobj.pool = -2ll - pgid.pool();
308
309 *end = *start;
310 *temp_end = *temp_start;
311
312 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
313 start->hobj.set_bitwise_key_u32(reverse_hash);
314 temp_start->hobj.set_bitwise_key_u32(reverse_hash);
315
316 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
317 if (end_hash > MAX_HASH) {
318 // make sure end hobj is even greater than the maximum possible hobj
319 end->hobj.set_bitwise_key_u32(MAX_HASH);
320 temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
321 end->hobj.nspace = MAX_NSPACE;
322 } else {
323 end->hobj.set_bitwise_key_u32(end_hash);
324 temp_end->hobj.set_bitwise_key_u32(end_hash);
325 }
326 } else {
327 start->shard_id = shard_id_t::NO_SHARD;
328 start->hobj.pool = -1ull;
329
330 *end = *start;
331 start->hobj.set_bitwise_key_u32(0);
332 end->hobj.set_bitwise_key_u32(MAX_HASH);
333 end->hobj.nspace = MAX_NSPACE;
334 // no separate temp section
335 *temp_start = *end;
336 *temp_end = *end;
337 }
338
339 start->generation = 0;
340 end->generation = 0;
341 temp_start->generation = 0;
342 temp_end->generation = 0;
343}
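// Usage sketch (hypothetical 'cid' naming a PG collection, bits = 6):
//
//   ghobject_t temp_start, temp_end, start, end;
//   get_coll_range(cid, 6, &temp_start, &temp_end, &start, &end, false);
//   // [start, end) covers bit-reversed hashes
//   // [reverse_hash, reverse_hash + (1u << 26)), i.e. a 1/64th slice of
//   // the 32-bit hash space in the PG's pool, while [temp_start, temp_end)
//   // is the matching slice of the temp pool (-2 - pool).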
344
345static void get_shared_blob_key(uint64_t sbid, string *key)
346{
347 key->clear();
348 _key_encode_u64(sbid, key);
349}
350
351static int get_key_shared_blob(const string& key, uint64_t *sbid)
352{
353 const char *p = key.c_str();
354 if (key.length() < sizeof(uint64_t))
355 return -1;
356 _key_decode_u64(p, sbid);
357 return 0;
358}
359
360template<typename S>
361static void _key_encode_prefix(const ghobject_t& oid, S *key)
362{
363 _key_encode_shard(oid.shard_id, key);
364 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
365 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
366}
367
368static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
369{
370 p = _key_decode_shard(p, &oid->shard_id);
371
372 uint64_t pool;
373 p = _key_decode_u64(p, &pool);
374 oid->hobj.pool = pool - 0x8000000000000000ull;
375
376 unsigned hash;
377 p = _key_decode_u32(p, &hash);
378
379 oid->hobj.set_bitwise_key_u32(hash);
380
381 return p;
382}
383
384
385#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
386
387static int _get_key_object(const char *p, ghobject_t *oid)
388{
389 int r;
390
391 p = _key_decode_prefix(p, oid);
392
393 r = decode_escaped(p, &oid->hobj.nspace);
394 if (r < 0)
395 return -2;
396 p += r + 1;
397
398 string k;
399 r = decode_escaped(p, &k);
400 if (r < 0)
401 return -3;
402 p += r + 1;
403 if (*p == '=') {
404 // no key
405 ++p;
406 oid->hobj.oid.name = k;
407 } else if (*p == '<' || *p == '>') {
408 // key + name
409 ++p;
410 r = decode_escaped(p, &oid->hobj.oid.name);
411 if (r < 0)
412 return -5;
413 p += r + 1;
414 oid->hobj.set_key(k);
415 } else {
416 // malformed
417 return -6;
418 }
419
420 p = _key_decode_u64(p, &oid->hobj.snap.val);
421 p = _key_decode_u64(p, &oid->generation);
422
423 if (*p != ONODE_KEY_SUFFIX) {
424 return -7;
425 }
426 p++;
427 if (*p) {
428 // if we get something other than a null terminator here,
 429 // something went wrong.
430 return -8;
431 }
432
433 return 0;
434}
435
436template<typename S>
437static int get_key_object(const S& key, ghobject_t *oid)
438{
439 if (key.length() < ENCODED_KEY_PREFIX_LEN)
440 return -1;
441 if (key.length() == ENCODED_KEY_PREFIX_LEN)
442 return -2;
443 const char *p = key.c_str();
444 return _get_key_object(p, oid);
445}
446
447template<typename S>
448static void _get_object_key(const ghobject_t& oid, S *key)
449{
450 size_t max_len = ENCODED_KEY_PREFIX_LEN +
451 (oid.hobj.nspace.length() * 3 + 1) +
452 (oid.hobj.get_key().length() * 3 + 1) +
453 1 + // for '<', '=', or '>'
454 (oid.hobj.oid.name.length() * 3 + 1) +
455 8 + 8 + 1;
456 key->reserve(max_len);
457
458 _key_encode_prefix(oid, key);
459
460 append_escaped(oid.hobj.nspace, key);
461
462 if (oid.hobj.get_key().length()) {
463 // is a key... could be < = or >.
464 append_escaped(oid.hobj.get_key(), key);
465 // (ASCII chars < = and > sort in that order, yay)
466 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
467 if (r) {
468 key->append(r > 0 ? ">" : "<");
469 append_escaped(oid.hobj.oid.name, key);
470 } else {
471 // same as no key
472 key->append("=");
473 }
474 } else {
475 // no key
476 append_escaped(oid.hobj.oid.name, key);
477 key->append("=");
478 }
479
480 _key_encode_u64(oid.hobj.snap, key);
481 _key_encode_u64(oid.generation, key);
482
483 key->push_back(ONODE_KEY_SUFFIX);
484}
485
486template<typename S>
487static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
488{
489 key->clear();
490 _get_object_key(oid, key);
491
492 // sanity check
493 if (true) {
494 ghobject_t t;
495 int r = get_key_object(*key, &t);
496 if (r || t != oid) {
497 derr << " r " << r << dendl;
498 derr << "key " << pretty_binary_string(*key) << dendl;
499 derr << "oid " << oid << dendl;
500 derr << " t " << t << dendl;
501 ceph_assert(r == 0 && t == oid);
502 }
503 }
504}
505
506// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
507// char lets us quickly test whether it is a shard key without decoding any
508// of the prefix bytes.
509template<typename S>
510static void get_extent_shard_key(const S& onode_key, uint32_t offset,
511 string *key)
512{
513 key->clear();
514 key->reserve(onode_key.length() + 4 + 1);
515 key->append(onode_key.c_str(), onode_key.size());
516 _key_encode_u32(offset, key);
517 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
518}
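// Usage sketch (hypothetical 'onode_key' produced by get_object_key()):
// building the key for the extent map shard at logical offset 0x30000 and
// decoding it back with the helpers defined below.
//
//   std::string shard_key, okey;
//   uint32_t off;
//   get_extent_shard_key(onode_key, 0x30000, &shard_key);
//   ceph_assert(is_extent_shard_key(shard_key));     // trailing 'x'
//   get_key_extent_shard(shard_key, &okey, &off);    // okey == onode_key,
//                                                    // off == 0x30000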
519
520static void rewrite_extent_shard_key(uint32_t offset, string *key)
521{
522 ceph_assert(key->size() > sizeof(uint32_t) + 1);
523 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
524 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
525}
526
527template<typename S>
528static void generate_extent_shard_key_and_apply(
529 const S& onode_key,
530 uint32_t offset,
531 string *key,
532 std::function<void(const string& final_key)> apply)
533{
534 if (key->empty()) { // make full key
535 ceph_assert(!onode_key.empty());
536 get_extent_shard_key(onode_key, offset, key);
537 } else {
538 rewrite_extent_shard_key(offset, key);
539 }
540 apply(*key);
541}
542
543int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
544{
545 ceph_assert(key.size() > sizeof(uint32_t) + 1);
546 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
547 int okey_len = key.size() - sizeof(uint32_t) - 1;
548 *onode_key = key.substr(0, okey_len);
549 const char *p = key.data() + okey_len;
550 _key_decode_u32(p, offset);
551 return 0;
552}
553
554static bool is_extent_shard_key(const string& key)
555{
556 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
557}
558
559static void get_deferred_key(uint64_t seq, string *out)
560{
561 _key_encode_u64(seq, out);
562}
563
564static void get_pool_stat_key(int64_t pool_id, string *key)
565{
566 key->clear();
567 _key_encode_u64(pool_id, key);
568}
569
570static int get_key_pool_stat(const string& key, uint64_t* pool_id)
571{
572 const char *p = key.c_str();
573 if (key.length() < sizeof(uint64_t))
574 return -1;
575 _key_decode_u64(p, pool_id);
576 return 0;
577}
578
579#ifdef HAVE_LIBZBD
580static void get_zone_offset_object_key(
581 uint32_t zone,
582 uint64_t offset,
583 ghobject_t oid,
584 std::string *key)
585{
586 key->clear();
587 _key_encode_u32(zone, key);
588 _key_encode_u64(offset, key);
589 _get_object_key(oid, key);
590}
591
592static int get_key_zone_offset_object(
593 const string& key,
594 uint32_t *zone,
595 uint64_t *offset,
596 ghobject_t *oid)
597{
598 const char *p = key.c_str();
599 if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
600 return -1;
601 p = _key_decode_u32(p, zone);
602 p = _key_decode_u64(p, offset);
603 int r = _get_key_object(p, oid);
604 if (r < 0) {
605 return r;
606 }
607 return 0;
608}
609#endif
610
611template <int LogLevelV>
612void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
613{
614 uint64_t pos = 0;
615 for (auto& s : em.shards) {
616 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
617 << (s.loaded ? " (loaded)" : "")
618 << (s.dirty ? " (dirty)" : "")
619 << dendl;
620 }
621 for (auto& e : em.extent_map) {
622 dout(LogLevelV) << __func__ << " " << e << dendl;
623 ceph_assert(e.logical_offset >= pos);
624 pos = e.logical_offset + e.length;
625 const bluestore_blob_t& blob = e.blob->get_blob();
626 if (blob.has_csum()) {
627 vector<uint64_t> v;
628 unsigned n = blob.get_csum_count();
629 for (unsigned i = 0; i < n; ++i)
630 v.push_back(blob.get_csum_item(i));
631 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
632 << dendl;
633 }
634 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
635 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
636 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
637 << "~" << i.second->length << std::dec
638 << " " << *i.second << dendl;
639 }
640 }
641}
642
643template <int LogLevelV>
644void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
645{
646 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
647 return;
648 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
649 << " nid " << o.onode.nid
650 << " size 0x" << std::hex << o.onode.size
651 << " (" << std::dec << o.onode.size << ")"
652 << " expected_object_size " << o.onode.expected_object_size
653 << " expected_write_size " << o.onode.expected_write_size
654 << " in " << o.onode.extent_map_shards.size() << " shards"
655 << ", " << o.extent_map.spanning_blob_map.size()
656 << " spanning blobs"
657 << dendl;
658 for (auto& [zone, offset] : o.onode.zone_offset_refs) {
659 dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
660 << " offset 0x" << offset << std::dec << dendl;
661 }
662 for (auto p = o.onode.attrs.begin();
663 p != o.onode.attrs.end();
664 ++p) {
665 dout(LogLevelV) << __func__ << " attr " << p->first
666 << " len " << p->second.length() << dendl;
667 }
668 _dump_extent_map<LogLevelV>(cct, o.extent_map);
669}
670
671template <int LogLevelV>
672void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
673{
674 dout(LogLevelV) << __func__ << " transaction dump:\n";
675 JSONFormatter f(true);
676 f.open_object_section("transaction");
677 t->dump(&f);
678 f.close_section();
679 f.flush(*_dout);
680 *_dout << dendl;
681}
682
683// Buffer
684
685ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
686{
687 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
688 << b.offset << "~" << b.length << std::dec
689 << " " << BlueStore::Buffer::get_state_name(b.state);
690 if (b.flags)
691 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
692 return out << ")";
693}
694
695namespace {
696
697/*
698 * Due to a bug in key string encoding (see a comment for append_escaped)
699 * the KeyValueDB iterator does not lexicographically sort the same
 700 * way that ghobject_t does: objects with the same hash may come back in the wrong order.
 701 *
 702 * This is the iterator wrapper that fixes the key order.
703 */
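// To illustrate the ordering problem (assuming plain char is signed, as it
// is on x86): a name byte >= 0x80 compares as negative in append_escaped(),
// so it is '#'-escaped even though it is not <= '#' as an unsigned value,
// and the escaped keys then sort differently than ghobject_t's own
// comparison for objects sharing the same hash.
//
//   char c = '\xc3';            // e.g. the first byte of a UTF-8 name
//   bool escaped = (c <= '#');  // true: c == -61 when char is signed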
704
705class CollectionListIterator {
706public:
707 CollectionListIterator(const KeyValueDB::Iterator &it)
708 : m_it(it) {
709 }
710 virtual ~CollectionListIterator() {
711 }
712
713 virtual bool valid() const = 0;
714 virtual const ghobject_t &oid() const = 0;
715 virtual void lower_bound(const ghobject_t &oid) = 0;
716 virtual void upper_bound(const ghobject_t &oid) = 0;
717 virtual void next() = 0;
718
719 virtual int cmp(const ghobject_t &oid) const = 0;
720
721 bool is_ge(const ghobject_t &oid) const {
722 return cmp(oid) >= 0;
723 }
724
725 bool is_lt(const ghobject_t &oid) const {
726 return cmp(oid) < 0;
727 }
728
729protected:
730 KeyValueDB::Iterator m_it;
731};
732
733class SimpleCollectionListIterator : public CollectionListIterator {
734public:
735 SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
736 : CollectionListIterator(it), m_cct(cct) {
737 }
738
739 bool valid() const override {
740 return m_it->valid();
741 }
742
743 const ghobject_t &oid() const override {
744 ceph_assert(valid());
745
746 return m_oid;
747 }
748
749 void lower_bound(const ghobject_t &oid) override {
750 string key;
751 get_object_key(m_cct, oid, &key);
752
753 m_it->lower_bound(key);
754 get_oid();
755 }
756
757 void upper_bound(const ghobject_t &oid) override {
758 string key;
759 get_object_key(m_cct, oid, &key);
760
761 m_it->upper_bound(key);
762 get_oid();
763 }
764
765 void next() override {
766 ceph_assert(valid());
767
768 m_it->next();
769 get_oid();
770 }
771
772 int cmp(const ghobject_t &oid) const override {
773 ceph_assert(valid());
774
775 string key;
776 get_object_key(m_cct, oid, &key);
777
778 return m_it->key().compare(key);
779 }
780
781private:
782 CephContext *m_cct;
783 ghobject_t m_oid;
784
785 void get_oid() {
786 m_oid = ghobject_t();
787 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
788 m_it->next();
789 }
790 if (!valid()) {
791 return;
792 }
793
794 int r = get_key_object(m_it->key(), &m_oid);
795 ceph_assert(r == 0);
796 }
797};
798
799class SortedCollectionListIterator : public CollectionListIterator {
800public:
801 SortedCollectionListIterator(const KeyValueDB::Iterator &it)
802 : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
803 }
804
805 bool valid() const override {
806 return m_chunk_iter != m_chunk.end();
807 }
808
809 const ghobject_t &oid() const override {
810 ceph_assert(valid());
811
812 return m_chunk_iter->first;
813 }
814
815 void lower_bound(const ghobject_t &oid) override {
816 std::string key;
817 _key_encode_prefix(oid, &key);
818
819 m_it->lower_bound(key);
820 m_chunk_iter = m_chunk.end();
821 if (!get_next_chunk()) {
822 return;
823 }
824
825 if (this->oid().shard_id != oid.shard_id ||
826 this->oid().hobj.pool != oid.hobj.pool ||
827 this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
828 return;
829 }
830
831 m_chunk_iter = m_chunk.lower_bound(oid);
832 if (m_chunk_iter == m_chunk.end()) {
833 get_next_chunk();
834 }
835 }
836
837 void upper_bound(const ghobject_t &oid) override {
838 lower_bound(oid);
839
840 if (valid() && this->oid() == oid) {
841 next();
842 }
843 }
844
845 void next() override {
846 ceph_assert(valid());
847
848 m_chunk_iter++;
849 if (m_chunk_iter == m_chunk.end()) {
850 get_next_chunk();
851 }
852 }
853
854 int cmp(const ghobject_t &oid) const override {
855 ceph_assert(valid());
856
857 if (this->oid() < oid) {
858 return -1;
859 }
860 if (this->oid() > oid) {
861 return 1;
862 }
863 return 0;
864 }
865
866private:
867 std::map<ghobject_t, std::string> m_chunk;
868 std::map<ghobject_t, std::string>::iterator m_chunk_iter;
869
870 bool get_next_chunk() {
871 while (m_it->valid() && is_extent_shard_key(m_it->key())) {
872 m_it->next();
873 }
874
875 if (!m_it->valid()) {
876 return false;
877 }
878
879 ghobject_t oid;
880 int r = get_key_object(m_it->key(), &oid);
881 ceph_assert(r == 0);
882
883 m_chunk.clear();
884 while (true) {
885 m_chunk.insert({oid, m_it->key()});
886
887 do {
888 m_it->next();
889 } while (m_it->valid() && is_extent_shard_key(m_it->key()));
890
891 if (!m_it->valid()) {
892 break;
893 }
894
895 ghobject_t next;
896 r = get_key_object(m_it->key(), &next);
897 ceph_assert(r == 0);
898 if (next.shard_id != oid.shard_id ||
899 next.hobj.pool != oid.hobj.pool ||
900 next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
901 break;
902 }
903 oid = next;
904 }
905
906 m_chunk_iter = m_chunk.begin();
907 return true;
908 }
909};
910
911} // anonymous namespace
912
913// Garbage Collector
914
915void BlueStore::GarbageCollector::process_protrusive_extents(
916 const BlueStore::ExtentMap& extent_map,
917 uint64_t start_offset,
918 uint64_t end_offset,
919 uint64_t start_touch_offset,
920 uint64_t end_touch_offset,
921 uint64_t min_alloc_size)
922{
 923 ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
924
925 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
926 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
927
928 dout(30) << __func__ << " (hex): [" << std::hex
929 << lookup_start_offset << ", " << lookup_end_offset
930 << ")" << std::dec << dendl;
931
932 for (auto it = extent_map.seek_lextent(lookup_start_offset);
933 it != extent_map.extent_map.end() &&
934 it->logical_offset < lookup_end_offset;
935 ++it) {
936 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
937 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
938
939 dout(30) << __func__ << " " << *it
940 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
941 << dendl;
942
943 Blob* b = it->blob.get();
944
 945 if (it->logical_offset >= start_touch_offset &&
946 it->logical_end() <= end_touch_offset) {
947 // Process extents within the range affected by
948 // the current write request.
 949 // Need to take into account whether existing extents
950 // can be merged with them (uncompressed case)
951 if (!b->get_blob().is_compressed()) {
952 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
953 --blob_info_counted->expected_allocations; // don't need to allocate
954 // new AU for compressed
955 // data since another
956 // collocated uncompressed
957 // blob already exists
958 dout(30) << __func__ << " --expected:"
959 << alloc_unit_start << dendl;
960 }
961 used_alloc_unit = alloc_unit_end;
962 blob_info_counted = nullptr;
963 }
964 } else if (b->get_blob().is_compressed()) {
965
966 // additionally we take compressed blobs that were not impacted
967 // by the write into account too
968 BlobInfo& bi =
969 affected_blobs.emplace(
970 b, BlobInfo(b->get_referenced_bytes())).first->second;
971
972 int adjust =
973 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
974 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
975 dout(30) << __func__ << " expected_allocations="
976 << bi.expected_allocations << " end_au:"
977 << alloc_unit_end << dendl;
978
979 blob_info_counted = &bi;
980 used_alloc_unit = alloc_unit_end;
981
982 ceph_assert(it->length <= bi.referenced_bytes);
983 bi.referenced_bytes -= it->length;
984 dout(30) << __func__ << " affected_blob:" << *b
985 << " unref 0x" << std::hex << it->length
986 << " referenced = 0x" << bi.referenced_bytes
987 << std::dec << dendl;
 988 // NOTE: we can't move a specific blob to the resulting GC list here
 989 // when its reference counter == 0, since subsequent extents might
 990 // decrement its expected_allocations.
 991 // Hence we need to enumerate all the extents first.
992 if (!bi.collect_candidate) {
993 bi.first_lextent = it;
994 bi.collect_candidate = true;
995 }
996 bi.last_lextent = it;
997 } else {
998 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
999 // don't need to allocate new AU for compressed data since another
1000 // collocated uncompressed blob already exists
1001 --blob_info_counted->expected_allocations;
1002 dout(30) << __func__ << " --expected_allocations:"
1003 << alloc_unit_start << dendl;
1004 }
1005 used_alloc_unit = alloc_unit_end;
1006 blob_info_counted = nullptr;
1007 }
1008 }
1009
1010 for (auto b_it = affected_blobs.begin();
1011 b_it != affected_blobs.end();
1012 ++b_it) {
1013 Blob* b = b_it->first;
1014 BlobInfo& bi = b_it->second;
1015 if (bi.referenced_bytes == 0) {
1016 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
1017 int64_t blob_expected_for_release =
1018 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
1019
1020 dout(30) << __func__ << " " << *(b_it->first)
1021 << " expected4release=" << blob_expected_for_release
1022 << " expected_allocations=" << bi.expected_allocations
1023 << dendl;
1024 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
1025 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
1026 if (bi.collect_candidate) {
1027 auto it = bi.first_lextent;
1028 bool bExit = false;
1029 do {
1030 if (it->blob.get() == b) {
1031 extents_to_collect.insert(it->logical_offset, it->length);
1032 }
1033 bExit = it == bi.last_lextent;
1034 ++it;
1035 } while (!bExit);
1036 }
1037 expected_for_release += blob_expected_for_release;
1038 expected_allocations += bi.expected_allocations;
1039 }
1040 }
1041 }
1042}
1043
1044int64_t BlueStore::GarbageCollector::estimate(
1045 uint64_t start_offset,
1046 uint64_t length,
1047 const BlueStore::ExtentMap& extent_map,
1048 const BlueStore::old_extent_map_t& old_extents,
1049 uint64_t min_alloc_size)
1050{
1051
1052 affected_blobs.clear();
1053 extents_to_collect.clear();
1054 used_alloc_unit = boost::optional<uint64_t >();
1055 blob_info_counted = nullptr;
1056
1057 uint64_t gc_start_offset = start_offset;
1058 uint64_t gc_end_offset = start_offset + length;
1059
1060 uint64_t end_offset = start_offset + length;
1061
1062 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
1063 Blob* b = it->e.blob.get();
1064 if (b->get_blob().is_compressed()) {
1065
1066 // update gc_start_offset/gc_end_offset if needed
1067 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
1068 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
1069
1070 auto o = it->e.logical_offset;
1071 auto l = it->e.length;
1072
1073 uint64_t ref_bytes = b->get_referenced_bytes();
1074 // micro optimization to bypass blobs that have no more references
1075 if (ref_bytes != 0) {
1076 dout(30) << __func__ << " affected_blob:" << *b
1077 << " unref 0x" << std::hex << o << "~" << l
1078 << std::dec << dendl;
1079 affected_blobs.emplace(b, BlobInfo(ref_bytes));
1080 }
1081 }
1082 }
1083 dout(30) << __func__ << " gc range(hex): [" << std::hex
1084 << gc_start_offset << ", " << gc_end_offset
1085 << ")" << std::dec << dendl;
1086
 1087 // enumerate preceding extents to check if they reference affected blobs
1088 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
1089 process_protrusive_extents(extent_map,
1090 gc_start_offset,
1091 gc_end_offset,
1092 start_offset,
1093 end_offset,
1094 min_alloc_size);
1095 }
1096 return expected_for_release - expected_allocations;
1097}
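// Worked example (illustrative numbers): suppose a compressed blob occupies
// 64K on disk with min_alloc_size = 16K, its referenced_bytes drops to 0
// once the protrusive extents are enumerated, and rewriting those extents
// uncompressed is expected to need one new allocation unit. Then
// blob_expected_for_release = 64K / 16K = 4, expected_allocations = 1, and
// the per-blob benefit is 3; if that meets bluestore_gc_enable_blob_threshold,
// the blob's extents are queued in extents_to_collect and (with no other
// affected blobs) estimate() returns 3.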
1098
1099// LruOnodeCacheShard
1100struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
1101 typedef boost::intrusive::list<
1102 BlueStore::Onode,
1103 boost::intrusive::member_hook<
1104 BlueStore::Onode,
1105 boost::intrusive::list_member_hook<>,
1106 &BlueStore::Onode::lru_item> > list_t;
1107
1108 list_t lru;
1109
1110 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
1111
1112 void _add(BlueStore::Onode* o, int level) override
1113 {
1114 o->set_cached();
1115 if (o->pin_nref == 1) {
1116 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
1117 o->cache_age_bin = age_bins.front();
1118 *(o->cache_age_bin) += 1;
1119 }
1120 ++num; // we count both pinned and unpinned entries
1121 dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
1122 << num << dendl;
1123 }
1124 void _rm(BlueStore::Onode* o) override
1125 {
1126 o->clear_cached();
1127 if (o->lru_item.is_linked()) {
1128 *(o->cache_age_bin) -= 1;
1129 lru.erase(lru.iterator_to(*o));
1130 }
1131 ceph_assert(num);
1132 --num;
 1133 dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
1134 }
1135
1136 void maybe_unpin(BlueStore::Onode* o) override
1137 {
1138 OnodeCacheShard* ocs = this;
1139 ocs->lock.lock();
 1140 // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
1141 while (ocs != o->c->get_onode_cache()) {
1142 ocs->lock.unlock();
1143 ocs = o->c->get_onode_cache();
1144 ocs->lock.lock();
1145 }
1146 if (o->is_cached() && o->pin_nref == 1) {
1147 if(!o->lru_item.is_linked()) {
1148 if (o->exists) {
1149 lru.push_front(*o);
1150 o->cache_age_bin = age_bins.front();
1151 *(o->cache_age_bin) += 1;
1152 dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
1153 << dendl;
1154 } else {
1155 ceph_assert(num);
1156 --num;
1157 o->clear_cached();
1158 dout(20) << __func__ << " " << this << " " << o->oid << " removed"
1159 << dendl;
1160 // remove will also decrement nref
1161 o->c->onode_space._remove(o->oid);
1162 }
1163 } else if (o->exists) {
1164 // move onode within LRU
1165 lru.erase(lru.iterator_to(*o));
1166 lru.push_front(*o);
1167 if (o->cache_age_bin != age_bins.front()) {
1168 *(o->cache_age_bin) -= 1;
1169 o->cache_age_bin = age_bins.front();
1170 *(o->cache_age_bin) += 1;
1171 }
1172 dout(20) << __func__ << " " << this << " " << o->oid << " touched"
1173 << dendl;
1174 }
1175 }
1176 ocs->lock.unlock();
1177 }
1178
1179 void _trim_to(uint64_t new_size) override
1180 {
1181 if (new_size >= lru.size()) {
1182 return; // don't even try
1183 }
1184 uint64_t n = num - new_size; // note: we might get empty LRU
1185 // before n == 0 due to pinned
 1186 // entries, and hence be unable
 1187 // to reach the new_size target.
1188 while (n-- > 0 && lru.size() > 0) {
1189 BlueStore::Onode *o = &lru.back();
1190 lru.pop_back();
1191
1192 dout(20) << __func__ << " rm " << o->oid << " "
1193 << o->nref << " " << o->cached << dendl;
1194
1195 *(o->cache_age_bin) -= 1;
1196 if (o->pin_nref > 1) {
 1197 dout(20) << __func__ << " " << this << " " << o->oid << dendl;
1198 } else {
1199 ceph_assert(num);
1200 --num;
1201 o->clear_cached();
1202 o->c->onode_space._remove(o->oid);
1203 }
1204 }
1205 }
1206 void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
1207 {
1208 if (to == this) {
1209 return;
1210 }
1211 _rm(o);
1212 ceph_assert(o->nref > 1);
1213 to->_add(o, 0);
1214 }
1215 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
1216 {
1217 std::lock_guard l(lock);
1218 *onodes += num;
1219 *pinned_onodes += num - lru.size();
1220 }
1221};
1222
1223// OnodeCacheShard
1224BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1225 CephContext* cct,
1226 string type,
1227 PerfCounters *logger)
1228{
1229 BlueStore::OnodeCacheShard *c = nullptr;
1230 // Currently we only implement an LRU cache for onodes
1231 c = new LruOnodeCacheShard(cct);
1232 c->logger = logger;
1233 return c;
1234}
1235
1236// LruBufferCacheShard
1237struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
1238 typedef boost::intrusive::list<
1239 BlueStore::Buffer,
1240 boost::intrusive::member_hook<
1241 BlueStore::Buffer,
1242 boost::intrusive::list_member_hook<>,
1243 &BlueStore::Buffer::lru_item> > list_t;
1244 list_t lru;
1245
1246 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
1247
1248 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
1249 if (near) {
1250 auto q = lru.iterator_to(*near);
1251 lru.insert(q, *b);
1252 } else if (level > 0) {
1253 lru.push_front(*b);
1254 } else {
1255 lru.push_back(*b);
1256 }
1257 buffer_bytes += b->length;
1258 b->cache_age_bin = age_bins.front();
1259 *(b->cache_age_bin) += b->length;
1260 num = lru.size();
1261 }
1262 void _rm(BlueStore::Buffer *b) override {
1263 ceph_assert(buffer_bytes >= b->length);
1264 buffer_bytes -= b->length;
1265 assert(*(b->cache_age_bin) >= b->length);
1266 *(b->cache_age_bin) -= b->length;
1267 auto q = lru.iterator_to(*b);
1268 lru.erase(q);
1269 num = lru.size();
1270 }
1271 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
1272 src->_rm(b);
1273 _add(b, 0, nullptr);
1274 }
1275 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
1276 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1277 buffer_bytes += delta;
1278 assert(*(b->cache_age_bin) + delta >= 0);
1279 *(b->cache_age_bin) += delta;
1280 }
1281 void _touch(BlueStore::Buffer *b) override {
1282 auto p = lru.iterator_to(*b);
1283 lru.erase(p);
1284 lru.push_front(*b);
1285 *(b->cache_age_bin) -= b->length;
1286 b->cache_age_bin = age_bins.front();
1287 *(b->cache_age_bin) += b->length;
1288 num = lru.size();
1289 _audit("_touch_buffer end");
1290 }
1291
1292 void _trim_to(uint64_t max) override
1293 {
1294 while (buffer_bytes > max) {
1295 auto i = lru.rbegin();
1296 if (i == lru.rend()) {
1297 // stop if lru is now empty
1298 break;
1299 }
1300
1301 BlueStore::Buffer *b = &*i;
1302 ceph_assert(b->is_clean());
1303 dout(20) << __func__ << " rm " << *b << dendl;
1304 assert(*(b->cache_age_bin) >= b->length);
1305 *(b->cache_age_bin) -= b->length;
1306 b->space->_rm_buffer(this, b);
1307 }
1308 num = lru.size();
1309 }
1310
1311 void add_stats(uint64_t *extents,
1312 uint64_t *blobs,
1313 uint64_t *buffers,
1314 uint64_t *bytes) override {
1315 *extents += num_extents;
1316 *blobs += num_blobs;
1317 *buffers += num;
1318 *bytes += buffer_bytes;
1319 }
1320#ifdef DEBUG_CACHE
 1321 void _audit(const char *when) override
1322 {
1323 dout(10) << __func__ << " " << when << " start" << dendl;
1324 uint64_t s = 0;
1325 for (auto i = lru.begin(); i != lru.end(); ++i) {
1326 s += i->length;
1327 }
1328 if (s != buffer_bytes) {
1329 derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
1330 << dendl;
1331 for (auto i = lru.begin(); i != lru.end(); ++i) {
1332 derr << __func__ << " " << *i << dendl;
1333 }
1334 ceph_assert(s == buffer_bytes);
1335 }
1336 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1337 << " ok" << dendl;
1338 }
1339#endif
1340};
1341
1342// TwoQBufferCacheShard
1343
1344struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1345 typedef boost::intrusive::list<
1346 BlueStore::Buffer,
1347 boost::intrusive::member_hook<
1348 BlueStore::Buffer,
1349 boost::intrusive::list_member_hook<>,
1350 &BlueStore::Buffer::lru_item> > list_t;
1351 list_t hot; ///< "Am" hot buffers
1352 list_t warm_in; ///< "A1in" newly warm buffers
1353 list_t warm_out; ///< "A1out" empty buffers we've evicted
1354
1355 enum {
1356 BUFFER_NEW = 0,
1357 BUFFER_WARM_IN, ///< in warm_in
1358 BUFFER_WARM_OUT, ///< in warm_out
1359 BUFFER_HOT, ///< in hot
1360 BUFFER_TYPE_MAX
1361 };
1362
1363 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1364
1365public:
1366 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
1367
1368 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1369 {
1370 dout(20) << __func__ << " level " << level << " near " << near
1371 << " on " << *b
1372 << " which has cache_private " << b->cache_private << dendl;
1373 if (near) {
1374 b->cache_private = near->cache_private;
1375 switch (b->cache_private) {
1376 case BUFFER_WARM_IN:
1377 warm_in.insert(warm_in.iterator_to(*near), *b);
1378 break;
1379 case BUFFER_WARM_OUT:
1380 ceph_assert(b->is_empty());
1381 warm_out.insert(warm_out.iterator_to(*near), *b);
1382 break;
1383 case BUFFER_HOT:
1384 hot.insert(hot.iterator_to(*near), *b);
1385 break;
1386 default:
1387 ceph_abort_msg("bad cache_private");
1388 }
1389 } else if (b->cache_private == BUFFER_NEW) {
1390 b->cache_private = BUFFER_WARM_IN;
1391 if (level > 0) {
1392 warm_in.push_front(*b);
1393 } else {
1394 // take caller hint to start at the back of the warm queue
1395 warm_in.push_back(*b);
1396 }
1397 } else {
1398 // we got a hint from discard
1399 switch (b->cache_private) {
1400 case BUFFER_WARM_IN:
1401 // stay in warm_in. move to front, even though 2Q doesn't actually
1402 // do this.
1403 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1404 warm_in.push_front(*b);
1405 break;
1406 case BUFFER_WARM_OUT:
1407 b->cache_private = BUFFER_HOT;
1408 // move to hot. fall-thru
1409 case BUFFER_HOT:
1410 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1411 hot.push_front(*b);
1412 break;
1413 default:
1414 ceph_abort_msg("bad cache_private");
1415 }
1416 }
1417 b->cache_age_bin = age_bins.front();
1418 if (!b->is_empty()) {
1419 buffer_bytes += b->length;
1420 list_bytes[b->cache_private] += b->length;
1421 *(b->cache_age_bin) += b->length;
1422 }
1423 num = hot.size() + warm_in.size();
1424 }
1425
1426 void _rm(BlueStore::Buffer *b) override
1427 {
1428 dout(20) << __func__ << " " << *b << dendl;
1429 if (!b->is_empty()) {
1430 ceph_assert(buffer_bytes >= b->length);
1431 buffer_bytes -= b->length;
1432 ceph_assert(list_bytes[b->cache_private] >= b->length);
1433 list_bytes[b->cache_private] -= b->length;
1434 assert(*(b->cache_age_bin) >= b->length);
1435 *(b->cache_age_bin) -= b->length;
1436 }
1437 switch (b->cache_private) {
1438 case BUFFER_WARM_IN:
1439 warm_in.erase(warm_in.iterator_to(*b));
1440 break;
1441 case BUFFER_WARM_OUT:
1442 warm_out.erase(warm_out.iterator_to(*b));
1443 break;
1444 case BUFFER_HOT:
1445 hot.erase(hot.iterator_to(*b));
1446 break;
1447 default:
1448 ceph_abort_msg("bad cache_private");
1449 }
1450 num = hot.size() + warm_in.size();
1451 }
1452
1453 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1454 {
1455 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1456 src->_rm(b);
1457
1458 // preserve which list we're on (even if we can't preserve the order!)
1459 switch (b->cache_private) {
1460 case BUFFER_WARM_IN:
1461 ceph_assert(!b->is_empty());
1462 warm_in.push_back(*b);
1463 break;
1464 case BUFFER_WARM_OUT:
1465 ceph_assert(b->is_empty());
1466 warm_out.push_back(*b);
1467 break;
1468 case BUFFER_HOT:
1469 ceph_assert(!b->is_empty());
1470 hot.push_back(*b);
1471 break;
1472 default:
1473 ceph_abort_msg("bad cache_private");
1474 }
1475 if (!b->is_empty()) {
1476 buffer_bytes += b->length;
1477 list_bytes[b->cache_private] += b->length;
1478 *(b->cache_age_bin) += b->length;
1479 }
1480 num = hot.size() + warm_in.size();
1481 }
1482
1483 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1484 {
1485 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1486 if (!b->is_empty()) {
1487 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1488 buffer_bytes += delta;
1489 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1490 list_bytes[b->cache_private] += delta;
1491 assert(*(b->cache_age_bin) + delta >= 0);
1492 *(b->cache_age_bin) += delta;
1493 }
1494 }
1495
1496 void _touch(BlueStore::Buffer *b) override {
1497 switch (b->cache_private) {
1498 case BUFFER_WARM_IN:
1499 // do nothing (somewhat counter-intuitively!)
1500 break;
1501 case BUFFER_WARM_OUT:
1502 // move from warm_out to hot LRU
1503 ceph_abort_msg("this happens via discard hint");
1504 break;
1505 case BUFFER_HOT:
1506 // move to front of hot LRU
1507 hot.erase(hot.iterator_to(*b));
1508 hot.push_front(*b);
1509 break;
1510 }
1511 *(b->cache_age_bin) -= b->length;
1512 b->cache_age_bin = age_bins.front();
1513 *(b->cache_age_bin) += b->length;
1514 num = hot.size() + warm_in.size();
1515 _audit("_touch_buffer end");
1516 }
1517
1518 void _trim_to(uint64_t max) override
1519 {
1520 if (buffer_bytes > max) {
1521 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1522 uint64_t khot = max - kin;
1523
1524 // pre-calculate kout based on average buffer size too,
 1525 // which is typical (the warm_in and hot lists may change later)
1526 uint64_t kout = 0;
1527 uint64_t buffer_num = hot.size() + warm_in.size();
1528 if (buffer_num) {
1529 uint64_t avg_size = buffer_bytes / buffer_num;
1530 ceph_assert(avg_size);
1531 uint64_t calculated_num = max / avg_size;
1532 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1533 }
1534
1535 if (list_bytes[BUFFER_HOT] < khot) {
1536 // hot is small, give slack to warm_in
1537 kin += khot - list_bytes[BUFFER_HOT];
1538 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1539 // warm_in is small, give slack to hot
1540 khot += kin - list_bytes[BUFFER_WARM_IN];
1541 }
1542
1543 // adjust warm_in list
1544 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1545 uint64_t evicted = 0;
1546
1547 while (to_evict_bytes > 0) {
1548 auto p = warm_in.rbegin();
1549 if (p == warm_in.rend()) {
1550 // stop if warm_in list is now empty
1551 break;
1552 }
1553
1554 BlueStore::Buffer *b = &*p;
1555 ceph_assert(b->is_clean());
1556 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1557 ceph_assert(buffer_bytes >= b->length);
1558 buffer_bytes -= b->length;
1559 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1560 list_bytes[BUFFER_WARM_IN] -= b->length;
1561 assert(*(b->cache_age_bin) >= b->length);
1562 *(b->cache_age_bin) -= b->length;
1563 to_evict_bytes -= b->length;
1564 evicted += b->length;
1565 b->state = BlueStore::Buffer::STATE_EMPTY;
1566 b->data.clear();
1567 warm_in.erase(warm_in.iterator_to(*b));
1568 warm_out.push_front(*b);
1569 b->cache_private = BUFFER_WARM_OUT;
1570 }
1571
1572 if (evicted > 0) {
1573 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1574 << " from warm_in list, done evicting warm_in buffers"
1575 << dendl;
1576 }
1577
1578 // adjust hot list
1579 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1580 evicted = 0;
1581
1582 while (to_evict_bytes > 0) {
1583 auto p = hot.rbegin();
1584 if (p == hot.rend()) {
1585 // stop if hot list is now empty
1586 break;
1587 }
1588
1589 BlueStore::Buffer *b = &*p;
1590 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1591 ceph_assert(b->is_clean());
1592 // adjust evict size before buffer goes invalid
1593 to_evict_bytes -= b->length;
1594 evicted += b->length;
1595 b->space->_rm_buffer(this, b);
1596 }
1597
1598 if (evicted > 0) {
1599 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1600 << " from hot list, done evicting hot buffers"
1601 << dendl;
1602 }
1603
1604 // adjust warm out list too, if necessary
1605 int64_t n = warm_out.size() - kout;
1606 while (n-- > 0) {
1607 BlueStore::Buffer *b = &*warm_out.rbegin();
1608 ceph_assert(b->is_empty());
1609 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1610 b->space->_rm_buffer(this, b);
1611 }
1612 }
1613 num = hot.size() + warm_in.size();
1614 }
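  // Sizing sketch (illustrative numbers; the *_ratio values are config
  // options, not necessarily their defaults): with max = 64MB and
  // bluestore_2q_cache_kin_ratio = 0.5, warm_in is targeted at kin = 32MB
  // and hot at khot = 32MB. If the average buffer is 64KB, calculated_num
  // = 1024 and, with bluestore_2q_cache_kout_ratio = 0.5, up to kout = 512
  // empty warm_out entries are kept to remember recently evicted buffers.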
1615
1616 void add_stats(uint64_t *extents,
1617 uint64_t *blobs,
1618 uint64_t *buffers,
1619 uint64_t *bytes) override {
1620 *extents += num_extents;
1621 *blobs += num_blobs;
1622 *buffers += num;
1623 *bytes += buffer_bytes;
1624 }
1625
1626#ifdef DEBUG_CACHE
 1627 void _audit(const char *when) override
1628 {
1629 dout(10) << __func__ << " " << when << " start" << dendl;
1630 uint64_t s = 0;
1631 for (auto i = hot.begin(); i != hot.end(); ++i) {
1632 s += i->length;
1633 }
1634
1635 uint64_t hot_bytes = s;
1636 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1637 derr << __func__ << " hot_list_bytes "
1638 << list_bytes[BUFFER_HOT]
1639 << " != actual " << hot_bytes
1640 << dendl;
1641 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
1642 }
1643
1644 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1645 s += i->length;
1646 }
1647
1648 uint64_t warm_in_bytes = s - hot_bytes;
1649 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1650 derr << __func__ << " warm_in_list_bytes "
1651 << list_bytes[BUFFER_WARM_IN]
1652 << " != actual " << warm_in_bytes
1653 << dendl;
1654 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
1655 }
1656
1657 if (s != buffer_bytes) {
1658 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1659 << dendl;
1660 ceph_assert(s == buffer_bytes);
1661 }
1662
1663 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1664 << " ok" << dendl;
1665 }
1666#endif
1667};
1668
1669// BufferCacheShard
1670
1671BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1672 CephContext* cct,
1673 string type,
1674 PerfCounters *logger)
1675{
1676 BufferCacheShard *c = nullptr;
1677 if (type == "lru")
1678 c = new LruBufferCacheShard(cct);
1679 else if (type == "2q")
1680 c = new TwoQBufferCacheShard(cct);
1681 else
1682 ceph_abort_msg("unrecognized cache type");
1683 c->logger = logger;
1684 return c;
1685}
1686
1687// BufferSpace
1688
1689#undef dout_prefix
1690#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1691
1692void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
1693{
1694 // note: we already hold cache->lock
1695 ldout(cache->cct, 20) << __func__ << dendl;
1696 while (!buffer_map.empty()) {
1697 _rm_buffer(cache, buffer_map.begin());
1698 }
1699}
1700
1701int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
1702{
1703 // note: we already hold cache->lock
1704 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1705 << std::dec << dendl;
1706 int cache_private = 0;
1707 cache->_audit("discard start");
1708 auto i = _data_lower_bound(offset);
1709 uint32_t end = offset + length;
1710 while (i != buffer_map.end()) {
1711 Buffer *b = i->second.get();
1712 if (b->offset >= end) {
1713 break;
1714 }
1715 if (b->cache_private > cache_private) {
1716 cache_private = b->cache_private;
1717 }
1718 if (b->offset < offset) {
1719 int64_t front = offset - b->offset;
1720 if (b->end() > end) {
1721 // drop middle (split)
1722 uint32_t tail = b->end() - end;
1723 if (b->data.length()) {
1724 bufferlist bl;
1725 bl.substr_of(b->data, b->length - tail, tail);
1726 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1727 nb->maybe_rebuild();
1728 _add_buffer(cache, nb, 0, b);
1729 } else {
1730 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
1731 b->flags),
1732 0, b);
1733 }
1734 if (!b->is_writing()) {
1735 cache->_adjust_size(b, front - (int64_t)b->length);
1736 }
1737 b->truncate(front);
1738 b->maybe_rebuild();
1739 cache->_audit("discard end 1");
1740 break;
1741 } else {
1742 // drop tail
1743 if (!b->is_writing()) {
1744 cache->_adjust_size(b, front - (int64_t)b->length);
1745 }
1746 b->truncate(front);
1747 b->maybe_rebuild();
1748 ++i;
1749 continue;
1750 }
1751 }
1752 if (b->end() <= end) {
1753 // drop entire buffer
1754 _rm_buffer(cache, i++);
1755 continue;
1756 }
1757 // drop front
1758 uint32_t keep = b->end() - end;
1759 if (b->data.length()) {
1760 bufferlist bl;
1761 bl.substr_of(b->data, b->length - keep, keep);
1762 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
1763 nb->maybe_rebuild();
1764 _add_buffer(cache, nb, 0, b);
1765 } else {
1766 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
1767 b->flags),
1768 0, b);
1769 }
1770 _rm_buffer(cache, i);
1771 cache->_audit("discard end 2");
1772 break;
1773 }
1774 return cache_private;
1775}
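// Illustrative example of the cases above (hypothetical offsets, using the
// 0xoffset~length notation from the logs): with one cached buffer at
// 0x0~0x1000, _discard(cache, 0x400, 0x400) takes the "drop middle" path:
// the buffer is truncated to 0x0~0x400 and a new buffer is added at
// 0x800~0x800, leaving the discarded 0x400~0x400 range uncached.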
1776
1777void BlueStore::BufferSpace::read(
1778 BufferCacheShard* cache,
1779 uint32_t offset,
1780 uint32_t length,
1781 BlueStore::ready_regions_t& res,
1782 interval_set<uint32_t>& res_intervals,
1783 int flags)
1784{
1785 res.clear();
1786 res_intervals.clear();
1787 uint32_t want_bytes = length;
1788 uint32_t end = offset + length;
1789
1790 {
1791 std::lock_guard l(cache->lock);
1792 for (auto i = _data_lower_bound(offset);
1793 i != buffer_map.end() && offset < end && i->first < end;
1794 ++i) {
1795 Buffer *b = i->second.get();
1796 ceph_assert(b->end() > offset);
1797
1798 bool val = false;
1799 if (flags & BYPASS_CLEAN_CACHE)
1800 val = b->is_writing();
1801 else
1802 val = b->is_writing() || b->is_clean();
1803 if (val) {
1804 if (b->offset < offset) {
1805 uint32_t skip = offset - b->offset;
1806 uint32_t l = min(length, b->length - skip);
1807 res[offset].substr_of(b->data, skip, l);
1808 res_intervals.insert(offset, l);
1809 offset += l;
1810 length -= l;
1811 if (!b->is_writing()) {
1812 cache->_touch(b);
1813 }
1814 continue;
1815 }
1816 if (b->offset > offset) {
1817 uint32_t gap = b->offset - offset;
1818 if (length <= gap) {
1819 break;
1820 }
1821 offset += gap;
1822 length -= gap;
1823 }
1824 if (!b->is_writing()) {
1825 cache->_touch(b);
1826 }
1827 if (b->length > length) {
1828 res[offset].substr_of(b->data, 0, length);
1829 res_intervals.insert(offset, length);
1830 break;
1831 } else {
1832 res[offset].append(b->data);
1833 res_intervals.insert(offset, b->length);
1834 if (b->length == length)
1835 break;
1836 offset += b->length;
1837 length -= b->length;
1838 }
1839 }
1840 }
1841 }
1842
1843 uint64_t hit_bytes = res_intervals.size();
1844 ceph_assert(hit_bytes <= want_bytes);
1845 uint64_t miss_bytes = want_bytes - hit_bytes;
1846 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1847 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1848}
1849
1850void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
1851{
1852 auto i = writing.begin();
1853 while (i != writing.end()) {
1854 if (i->seq > seq) {
1855 break;
1856 }
1857 if (i->seq < seq) {
1858 ++i;
1859 continue;
1860 }
1861
1862 Buffer *b = &*i;
1863 ceph_assert(b->is_writing());
1864
1865 if (b->flags & Buffer::FLAG_NOCACHE) {
1866 writing.erase(i++);
1867 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1868 buffer_map.erase(b->offset);
1869 } else {
1870 b->state = Buffer::STATE_CLEAN;
1871 writing.erase(i++);
1872 b->maybe_rebuild();
1873 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
1874 cache->_add(b, 1, nullptr);
1875 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1876 }
1877 }
1878 cache->_trim();
1879 cache->_audit("finish_write end");
1880}
1881
1882void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
1883{
1884 std::lock_guard lk(cache->lock);
1885 if (buffer_map.empty())
1886 return;
1887
1888 auto p = --buffer_map.end();
1889 while (true) {
1890 if (p->second->end() <= pos)
1891 break;
1892
1893 if (p->second->offset < pos) {
1894 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1895 size_t left = pos - p->second->offset;
1896 size_t right = p->second->length - left;
1897 if (p->second->data.length()) {
1898 bufferlist bl;
1899 bl.substr_of(p->second->data, left, right);
1900 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1901 0, bl, p->second->flags),
1902 0, p->second.get());
1903 } else {
1904 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1905 0, right, p->second->flags),
1906 0, p->second.get());
1907 }
1908 cache->_adjust_size(p->second.get(), -right);
1909 p->second->truncate(left);
1910 break;
1911 }
1912
1913 ceph_assert(p->second->end() > pos);
1914 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1915 if (p->second->data.length()) {
1916 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1917 p->second->offset - pos, p->second->data, p->second->flags),
1918 0, p->second.get());
1919 } else {
1920 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1921 p->second->offset - pos, p->second->length, p->second->flags),
1922 0, p->second.get());
1923 }
1924 if (p == buffer_map.begin()) {
1925 _rm_buffer(cache, p);
1926 break;
1927 } else {
1928 _rm_buffer(cache, p--);
1929 }
1930 }
1931 ceph_assert(writing.empty());
1932 cache->_trim();
1933}
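// Illustrative example (hypothetical offsets): splitting at pos = 0x10000
// moves a buffer at 0x18000~0x1000 into 'r' rebased to 0x8000~0x1000,
// while a buffer at 0xf000~0x2000 straddling pos is truncated to
// 0xf000~0x1000 here and its upper 0x1000 bytes are added to 'r' at
// offset 0.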
1934
1935// OnodeSpace
1936
1937#undef dout_prefix
1938#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1939
1940BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid,
1941 OnodeRef& o)
1942{
1943 std::lock_guard l(cache->lock);
1944 // add entry or return existing one
1945 auto p = onode_map.emplace(oid, o);
1946 if (!p.second) {
1947 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1948 << " raced, returning existing " << p.first->second
1949 << dendl;
1950 return p.first->second;
1951 }
1952 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
1953 cache->_add(o.get(), 1);
1954 cache->_trim();
1955 return o;
1956}
1957
1958void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1959{
1960 ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
1961 onode_map.erase(oid);
1962}
1963
1964BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1965{
1966 ldout(cache->cct, 30) << __func__ << dendl;
1967 OnodeRef o;
1968
1969 {
1970 std::lock_guard l(cache->lock);
1971 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1972 if (p == onode_map.end()) {
1973 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1974 cache->logger->inc(l_bluestore_onode_misses);
1975 } else {
1976 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1977 << " " << p->second->nref
1978 << " " << p->second->cached
1979 << dendl;
1980 // This will pin the onode and implicitly touch the cache when the Onode
1981 // eventually becomes unpinned
1982 o = p->second;
1983
1984 cache->logger->inc(l_bluestore_onode_hits);
1985 }
1986 }
1987
1988 return o;
1989}
1990
1991void BlueStore::OnodeSpace::clear()
1992{
1993 std::lock_guard l(cache->lock);
1994 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
1995 for (auto &p : onode_map) {
1996 cache->_rm(p.second.get());
1997 }
1998 onode_map.clear();
1999}
2000
2001bool BlueStore::OnodeSpace::empty()
2002{
2003 std::lock_guard l(cache->lock);
2004 return onode_map.empty();
2005}
2006
2007void BlueStore::OnodeSpace::rename(
2008 OnodeRef& oldo,
2009 const ghobject_t& old_oid,
2010 const ghobject_t& new_oid,
2011 const mempool::bluestore_cache_meta::string& new_okey)
2012{
2013 std::lock_guard l(cache->lock);
2014 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
2015 << dendl;
2016 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
2017 po = onode_map.find(old_oid);
2018 pn = onode_map.find(new_oid);
2019 ceph_assert(po != pn);
2020
2021 ceph_assert(po != onode_map.end());
2022 if (pn != onode_map.end()) {
2023 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
2024 << dendl;
2025 cache->_rm(pn->second.get());
2026 onode_map.erase(pn);
2027 }
2028 OnodeRef o = po->second;
2029
2030 // install a non-existent onode at old location
2031 oldo.reset(new Onode(o->c, old_oid, o->key));
2032 po->second = oldo;
2033 cache->_add(oldo.get(), 1);
2034 // add at new position and fix oid, key.
2035 // This will pin 'o' and implicitly touch the cache
2036 // when it eventually becomes unpinned
2037 onode_map.insert(make_pair(new_oid, o));
2038
2039 o->oid = new_oid;
2040 o->key = new_okey;
2041 cache->_trim();
2042}
2043
2044bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
2045{
2046 std::lock_guard l(cache->lock);
2047 ldout(cache->cct, 20) << __func__ << dendl;
2048 for (auto& i : onode_map) {
2049 if (f(i.second.get())) {
2050 return true;
2051 }
2052 }
2053 return false;
2054}
2055
2056template <int LogLevelV = 30>
2057void BlueStore::OnodeSpace::dump(CephContext *cct)
2058{
2059 for (auto& i : onode_map) {
2060 ldout(cct, LogLevelV) << i.first << " : " << i.second
2061 << " " << i.second->nref
2062 << " " << i.second->cached
2063 << dendl;
2064 }
2065}
2066
2067// SharedBlob
2068
2069#undef dout_prefix
2070#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2071#undef dout_context
2072#define dout_context coll->store->cct
2073
2074void BlueStore::SharedBlob::dump(Formatter* f) const
2075{
2076 f->dump_bool("loaded", loaded);
2077 if (loaded) {
2078 persistent->dump(f);
2079 } else {
2080 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2081 }
2082}
2083
2084ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2085{
2086 out << "SharedBlob(" << &sb;
2087
2088 if (sb.loaded) {
2089 out << " loaded " << *sb.persistent;
2090 } else {
2091 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2092 }
2093 return out << ")";
2094}
2095
2096BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2097 : coll(_coll), sbid_unloaded(i)
2098{
2099 ceph_assert(sbid_unloaded > 0);
2100 if (get_cache()) {
2101 get_cache()->add_blob();
2102 }
2103}
2104
2105BlueStore::SharedBlob::~SharedBlob()
2106{
2107 if (loaded && persistent) {
2108 delete persistent;
2109 }
2110}
2111
2112void BlueStore::SharedBlob::put()
2113{
2114 if (--nref == 0) {
2115 dout(20) << __func__ << " " << this
2116 << " removing self from set " << get_parent()
2117 << dendl;
2118 again:
2119 auto coll_snap = coll;
2120 if (coll_snap) {
2121 std::lock_guard l(coll_snap->cache->lock);
2122 if (coll_snap != coll) {
2123 goto again;
2124 }
2125 if (!coll_snap->shared_blob_set.remove(this, true)) {
2126 // race with lookup
2127 return;
2128 }
2129 bc._clear(coll_snap->cache);
2130 coll_snap->cache->rm_blob();
2131 }
2132 delete this;
2133 }
2134}
2135
2136void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2137{
2138 ceph_assert(persistent);
2139 persistent->ref_map.get(offset, length);
2140}
2141
2142void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
2143 PExtentVector *r,
2144 bool *unshare)
2145{
2146 ceph_assert(persistent);
2147 persistent->ref_map.put(offset, length, r,
2148 unshare && !*unshare ? unshare : nullptr);
2149}
2150
2151void BlueStore::SharedBlob::finish_write(uint64_t seq)
2152{
2153 while (true) {
2154 BufferCacheShard *cache = coll->cache;
2155 std::lock_guard l(cache->lock);
2156 if (coll->cache != cache) {
2157 dout(20) << __func__
2158 << " raced with sb cache update, was " << cache
2159 << ", now " << coll->cache << ", retrying"
2160 << dendl;
2161 continue;
2162 }
2163 bc._finish_write(cache, seq);
2164 break;
2165 }
2166}
2167
2168// SharedBlobSet
2169
2170#undef dout_prefix
2171#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2172
2173template <int LogLevelV = 30>
2174void BlueStore::SharedBlobSet::dump(CephContext *cct)
2175{
2176 std::lock_guard l(lock);
2177 for (auto& i : sb_map) {
2178 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2179 }
2180}
2181
2182// Blob
2183
2184#undef dout_prefix
2185#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2186
2187void BlueStore::Blob::dump(Formatter* f) const
2188{
2189 if (is_spanning()) {
2190 f->dump_unsigned("spanning_id", id);
2191 }
2192 blob.dump(f);
2193 if (shared_blob) {
2194 f->dump_object("shared", *shared_blob);
2195 }
2196}
2197
2198ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2199{
2200 out << "Blob(" << &b;
2201 if (b.is_spanning()) {
2202 out << " spanning " << b.id;
2203 }
2204 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2205 if (b.shared_blob) {
2206 out << " " << *b.shared_blob;
2207 } else {
2208 out << " (shared_blob=NULL)";
2209 }
2210 out << ")";
2211 return out;
2212}
2213
2214void BlueStore::Blob::discard_unallocated(Collection *coll)
2215{
2216 if (get_blob().is_shared()) {
2217 return;
2218 }
2219 if (get_blob().is_compressed()) {
2220 bool discard = false;
2221 bool all_invalid = true;
2222 for (auto e : get_blob().get_extents()) {
2223 if (!e.is_valid()) {
2224 discard = true;
2225 } else {
2226 all_invalid = false;
2227 }
2228 }
2229 ceph_assert(discard == all_invalid); // for a compressed blob either all
2230 // or none of the pextents are invalid.
2231 if (discard) {
2232 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2233 get_blob().get_logical_length());
2234 }
2235 } else {
2236 size_t pos = 0;
2237 for (auto e : get_blob().get_extents()) {
2238 if (!e.is_valid()) {
2239 dout(20) << __func__ << " 0x" << std::hex << pos
2240 << "~" << e.length
2241 << std::dec << dendl;
2242 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2243 }
2244 pos += e.length;
2245 }
2246 if (get_blob().can_prune_tail()) {
2247 dirty_blob().prune_tail();
2248 used_in_blob.prune_tail(get_blob().get_ondisk_length());
2249 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2250 }
2251 }
2252}
2253
2254void BlueStore::Blob::get_ref(
2255 Collection *coll,
2256 uint32_t offset,
2257 uint32_t length)
2258{
2259 // The caller has to initialize the Blob's logical length before incrementing
2260 // references. Otherwise one can neither determine the required
2261 // number of counters for per-au tracking nor obtain min_release_size
2262 // for single-counter mode.
2263 ceph_assert(get_blob().get_logical_length() != 0);
2264 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2265 << std::dec << " " << *this << dendl;
2266
2267 if (used_in_blob.is_empty()) {
2268 uint32_t min_release_size =
2269 get_blob().get_release_size(coll->store->min_alloc_size);
2270 uint64_t l = get_blob().get_logical_length();
2271 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2272 << min_release_size << std::dec << dendl;
2273 used_in_blob.init(l, min_release_size);
2274 }
2275 used_in_blob.get(
2276 offset,
2277 length);
2278}
2279
2280bool BlueStore::Blob::put_ref(
2281 Collection *coll,
2282 uint32_t offset,
2283 uint32_t length,
2284 PExtentVector *r)
2285{
2286 PExtentVector logical;
2287
2288 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2289 << std::dec << " " << *this << dendl;
2290
2291 bool empty = used_in_blob.put(
2292 offset,
2293 length,
2294 &logical);
2295 r->clear();
2296 // nothing to release
2297 if (!empty && logical.empty()) {
2298 return false;
2299 }
2300
2301 bluestore_blob_t& b = dirty_blob();
2302 return b.release_extents(empty, logical, r);
2303}
2304
2305bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2306 uint32_t target_blob_size,
2307 uint32_t b_offset,
2308 uint32_t *length0) {
2309 ceph_assert(min_alloc_size);
2310 ceph_assert(target_blob_size);
2311 if (!get_blob().is_mutable()) {
2312 return false;
2313 }
2314
2315 uint32_t length = *length0;
2316 uint32_t end = b_offset + length;
2317
2318 // For the sake of simplicity we currently skip blob reuse if the data is
2319 // unaligned with the csum chunk. Later we can perform padding if needed.
2320 if (get_blob().has_csum() &&
2321 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2322 (end % get_blob().get_csum_chunk_size()) != 0)) {
2323 return false;
2324 }
2325
2326 auto blen = get_blob().get_logical_length();
2327 uint32_t new_blen = blen;
2328
2329 // make sure target_blob_size isn't less than current blob len
2330 target_blob_size = std::max(blen, target_blob_size);
2331
2332 if (b_offset >= blen) {
2333 // new data totally stands out of the existing blob
2334 new_blen = end;
2335 } else {
2336 // new data overlaps with the existing blob
2337 new_blen = std::max(blen, end);
2338
2339 uint32_t overlap = 0;
2340 if (new_blen > blen) {
2341 overlap = blen - b_offset;
2342 } else {
2343 overlap = length;
2344 }
2345
2346 if (!get_blob().is_unallocated(b_offset, overlap)) {
2347 // abort if any piece of the overlap has already been allocated
2348 return false;
2349 }
2350 }
2351
2352 if (new_blen > blen) {
2353 int64_t overflow = int64_t(new_blen) - target_blob_size;
2354 // Unable to shrink the provided length enough to fit into target_blob_size
2355 if (overflow >= length) {
2356 return false;
2357 }
2358
2359 // FIXME: in some cases we could reduce unused resolution
2360 if (get_blob().has_unused()) {
2361 return false;
2362 }
2363
2364 if (overflow > 0) {
2365 new_blen -= overflow;
2366 length -= overflow;
2367 *length0 = length;
2368 }
2369
2370 if (new_blen > blen) {
2371 dirty_blob().add_tail(new_blen);
2372 used_in_blob.add_tail(new_blen,
2373 get_blob().get_release_size(min_alloc_size));
2374 }
2375 }
2376 return true;
2377}
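// Worked example (illustrative): blen = 0x10000, target_blob_size = 0x20000,
// b_offset = 0x1c000, *length0 = 0x8000.  The write starts past the current
// blob end, so new_blen = end = 0x24000 and overflow = 0x24000 - 0x20000 =
// 0x4000.  Since overflow < length and the blob tracks no 'unused' regions,
// the request is trimmed: *length0 becomes 0x4000 and the blob tail grows to
// 0x20000; the caller has to place the remaining 0x4000 in another blob.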
2378
2379void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2380{
2381 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2382 << " start " << *this << dendl;
2383 ceph_assert(blob.can_split());
2384 ceph_assert(used_in_blob.can_split());
2385 bluestore_blob_t &lb = dirty_blob();
2386 bluestore_blob_t &rb = r->dirty_blob();
2387
2388 used_in_blob.split(
2389 blob_offset,
2390 &(r->used_in_blob));
2391
2392 lb.split(blob_offset, rb);
2393 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2394
2395 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2396 << " finish " << *this << dendl;
2397 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2398 << " and " << *r << dendl;
2399}
2400
2401#ifndef CACHE_BLOB_BL
2402void BlueStore::Blob::decode(
2403 bufferptr::const_iterator& p,
2404 uint64_t struct_v,
2405 uint64_t* sbid,
2406 bool include_ref_map,
2407 Collection *coll)
2408{
2409 denc(blob, p, struct_v);
2410 if (blob.is_shared()) {
2411 denc(*sbid, p);
2412 }
2413 if (include_ref_map) {
2414 if (struct_v > 1) {
2415 used_in_blob.decode(p);
2416 } else {
2417 used_in_blob.clear();
2418 bluestore_extent_ref_map_t legacy_ref_map;
2419 legacy_ref_map.decode(p);
2420 if (coll) {
2421 for (auto r : legacy_ref_map.ref_map) {
2422 get_ref(
2423 coll,
2424 r.first,
2425 r.second.refs * r.second.length);
2426 }
2427 }
2428 }
2429 }
2430}
2431#endif
2432
2433// Extent
2434
2435void BlueStore::Extent::dump(Formatter* f) const
2436{
2437 f->dump_unsigned("logical_offset", logical_offset);
2438 f->dump_unsigned("length", length);
2439 f->dump_unsigned("blob_offset", blob_offset);
2440 f->dump_object("blob", *blob);
2441}
2442
2443ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2444{
2445 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2446 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2447 << " " << *e.blob;
2448}
2449
2450// OldExtent
2451BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2452 uint32_t lo,
2453 uint32_t o,
2454 uint32_t l,
2455 BlobRef& b) {
2456 OldExtent* oe = new OldExtent(lo, o, l, b);
2457 b->put_ref(c.get(), o, l, &(oe->r));
2458 oe->blob_empty = !b->is_referenced();
2459 return oe;
2460}
2461
2462// ExtentMap
2463
2464#undef dout_prefix
2465#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2466#undef dout_context
2467#define dout_context onode->c->store->cct
2468
2469BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
2470 : onode(o),
2471 inline_bl(inline_shard_prealloc_size) {
2472}
2473
2474void BlueStore::ExtentMap::dump(Formatter* f) const
2475{
2476 f->open_array_section("extents");
2477
2478 for (auto& e : extent_map) {
2479 f->dump_object("extent", e);
2480 }
2481 f->close_section();
2482}
2483
2484void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2485 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2486 uint64_t& length, uint64_t& dstoff) {
2487
2488 auto cct = onode->c->store->cct;
2489 bool inject_21040 =
2490 cct->_conf->bluestore_debug_inject_bug21040;
2491 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2492 for (auto& e : oldo->extent_map.extent_map) {
2493 e.blob->last_encoded_id = -1;
2494 }
2495
2496 int n = 0;
2497 uint64_t end = srcoff + length;
2498 uint32_t dirty_range_begin = 0;
2499 uint32_t dirty_range_end = 0;
2500 bool src_dirty = false;
2501 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2502 ep != oldo->extent_map.extent_map.end();
2503 ++ep) {
2504 auto& e = *ep;
2505 if (e.logical_offset >= end) {
2506 break;
2507 }
2508 dout(20) << __func__ << " src " << e << dendl;
2509 BlobRef cb;
2510 bool blob_duped = true;
2511 if (e.blob->last_encoded_id >= 0) {
2512 cb = id_to_blob[e.blob->last_encoded_id];
2513 blob_duped = false;
2514 } else {
2515 // dup the blob
2516 const bluestore_blob_t& blob = e.blob->get_blob();
2517 // make sure it is shared
2518 if (!blob.is_shared()) {
2519 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2520 if (!inject_21040 && !src_dirty) {
2521 src_dirty = true;
2522 dirty_range_begin = e.logical_offset;
2523 } else if (inject_21040 &&
2524 dirty_range_begin == 0 && dirty_range_end == 0) {
2525 dirty_range_begin = e.logical_offset;
2526 }
2527 ceph_assert(e.logical_end() > 0);
2528 // -1 to exclude next potential shard
2529 dirty_range_end = e.logical_end() - 1;
2530 } else {
2531 c->load_shared_blob(e.blob->shared_blob);
2532 }
2533 cb = new Blob();
2534 e.blob->last_encoded_id = n;
2535 id_to_blob[n] = cb;
2536 e.blob->dup(*cb);
2537 // bump the extent refs on the copied blob's extents
2538 for (auto p : blob.get_extents()) {
2539 if (p.is_valid()) {
2540 e.blob->shared_blob->get_ref(p.offset, p.length);
2541 }
2542 }
2543 txc->write_shared_blob(e.blob->shared_blob);
2544 dout(20) << __func__ << " new " << *cb << dendl;
2545 }
2546
2547 int skip_front, skip_back;
2548 if (e.logical_offset < srcoff) {
2549 skip_front = srcoff - e.logical_offset;
2550 } else {
2551 skip_front = 0;
2552 }
2553 if (e.logical_end() > end) {
2554 skip_back = e.logical_end() - end;
2555 } else {
2556 skip_back = 0;
2557 }
2558
2559 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2560 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2561 newo->extent_map.extent_map.insert(*ne);
2562 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2563 // fixme: we may leave parts of new blob unreferenced that could
2564 // be freed (relative to the shared_blob).
2565 txc->statfs_delta.stored() += ne->length;
2566 if (e.blob->get_blob().is_compressed()) {
2567 txc->statfs_delta.compressed_original() += ne->length;
2568 if (blob_duped) {
2569 txc->statfs_delta.compressed() +=
2570 cb->get_blob().get_compressed_payload_length();
2571 }
2572 }
2573 dout(20) << __func__ << " dst " << *ne << dendl;
2574 ++n;
2575 }
2576 if ((!inject_21040 && src_dirty) ||
2577 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2578 oldo->extent_map.dirty_range(dirty_range_begin,
2579 dirty_range_end - dirty_range_begin);
2580 txc->write_onode(oldo);
2581 }
2582 txc->write_onode(newo);
2583
2584 if (dstoff + length > newo->onode.size) {
2585 newo->onode.size = dstoff + length;
2586 }
2587 newo->extent_map.dirty_range(dstoff, length);
2588}
2589void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2590 bool force)
2591{
2592 auto cct = onode->c->store->cct; //used by dout
2593 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2594 if (onode->onode.extent_map_shards.empty()) {
2595 if (inline_bl.length() == 0) {
2596 unsigned n;
2597 // we need to encode inline_bl to measure encoded length
2598 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
2599 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
2600 ceph_assert(!never_happen);
2601 size_t len = inline_bl.length();
2602 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2603 << " extents" << dendl;
2604 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2605 request_reshard(0, OBJECT_MAX_SIZE);
2606 return;
2607 }
2608 }
2609 // will persist in the onode key.
2610 } else {
2611 // pending shard update
2612 struct dirty_shard_t {
2613 Shard *shard;
2614 bufferlist bl;
2615 dirty_shard_t(Shard *s) : shard(s) {}
2616 };
2617 vector<dirty_shard_t> encoded_shards;
2618 // allocate slots for all shards in a single call instead of
2619 // doing multiple allocations - one per dirty shard
2620 encoded_shards.reserve(shards.size());
2621
2622 auto p = shards.begin();
2623 auto prev_p = p;
2624 while (p != shards.end()) {
2625 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2626 auto n = p;
2627 ++n;
2628 if (p->dirty) {
2629 uint32_t endoff;
2630 if (n == shards.end()) {
2631 endoff = OBJECT_MAX_SIZE;
2632 } else {
2633 endoff = n->shard_info->offset;
2634 }
2635 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2636 bufferlist& bl = encoded_shards.back().bl;
2637 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2638 bl, &p->extents)) {
2639 if (force) {
2640 derr << __func__ << " encode_some needs reshard" << dendl;
2641 ceph_assert(!force);
2642 }
2643 }
2644 size_t len = bl.length();
2645
2646 dout(20) << __func__ << " shard 0x" << std::hex
2647 << p->shard_info->offset << std::dec << " is " << len
2648 << " bytes (was " << p->shard_info->bytes << ") from "
2649 << p->extents << " extents" << dendl;
2650
2651 if (!force) {
2652 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2653 // we are big; reshard ourselves
2654 request_reshard(p->shard_info->offset, endoff);
2655 }
2656 // avoid resharding the trailing shard, even if it is small
2657 else if (n != shards.end() &&
2658 len < g_conf()->bluestore_extent_map_shard_min_size) {
2659 ceph_assert(endoff != OBJECT_MAX_SIZE);
2660 if (p == shards.begin()) {
2661 // we are the first shard, combine with next shard
2662 request_reshard(p->shard_info->offset, endoff + 1);
2663 } else {
2664 // combine either with the previous shard or the next,
2665 // whichever is smaller
2666 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2667 request_reshard(p->shard_info->offset, endoff + 1);
2668 } else {
2669 request_reshard(prev_p->shard_info->offset, endoff);
2670 }
2671 }
2672 }
2673 }
2674 }
2675 prev_p = p;
2676 p = n;
2677 }
2678 if (needs_reshard()) {
2679 return;
2680 }
2681
2682 // schedule DB update for dirty shards
2683 string key;
2684 for (auto& it : encoded_shards) {
2685 dout(20) << __func__ << " encoding key for shard 0x" << std::hex
2686 << it.shard->shard_info->offset << std::dec << dendl;
2687 it.shard->dirty = false;
2688 it.shard->shard_info->bytes = it.bl.length();
2689 generate_extent_shard_key_and_apply(
2690 onode->key,
2691 it.shard->shard_info->offset,
2692 &key,
2693 [&](const string& final_key) {
2694 t->set(PREFIX_OBJ, final_key, it.bl);
2695 }
2696 );
2697 }
2698 }
2699}
2700
2701bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2702{
2703 if (spanning_blob_map.empty())
2704 return 0;
2705 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2706 // bid is valid and available.
2707 if (bid >= 0)
2708 return bid;
2709 // Otherwise, find the next unused bid.
2710 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2711 const auto begin_bid = bid;
2712 do {
2713 if (!spanning_blob_map.count(bid))
2714 return bid;
2715 else {
2716 bid++;
2717 if (bid < 0) bid = 0;
2718 }
2719 } while (bid != begin_bid);
2720 auto cct = onode->c->store->cct; // used by dout
2721 _dump_onode<0>(cct, *onode);
2722 ceph_abort_msg("no available blob id");
2723}
2724
2725void BlueStore::ExtentMap::reshard(
2726 KeyValueDB *db,
2727 KeyValueDB::Transaction t)
2728{
2729 auto cct = onode->c->store->cct; // used by dout
2730
2731 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2732 << needs_reshard_end << ")" << std::dec
2733 << " of " << onode->onode.extent_map_shards.size()
2734 << " shards on " << onode->oid << dendl;
2735 for (auto& p : spanning_blob_map) {
2736 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2737 << dendl;
2738 }
2739 // determine shard index range
2740 unsigned si_begin = 0, si_end = 0;
2741 if (!shards.empty()) {
2742 while (si_begin + 1 < shards.size() &&
2743 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2744 ++si_begin;
2745 }
2746 needs_reshard_begin = shards[si_begin].shard_info->offset;
2747 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2748 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2749 needs_reshard_end = shards[si_end].shard_info->offset;
2750 break;
2751 }
2752 }
2753 if (si_end == shards.size()) {
2754 needs_reshard_end = OBJECT_MAX_SIZE;
2755 }
2756 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2757 << " over 0x[" << std::hex << needs_reshard_begin << ","
2758 << needs_reshard_end << ")" << std::dec << dendl;
2759 }
2760
2761 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2762
2763 // we may need to fault in a larger interval later: we must have all
2764 // referring extents for spanning blobs loaded in order to have
2765 // accurate use_tracker values.
2766 uint32_t spanning_scan_begin = needs_reshard_begin;
2767 uint32_t spanning_scan_end = needs_reshard_end;
2768
2769 // remove old keys
2770 string key;
2771 for (unsigned i = si_begin; i < si_end; ++i) {
2772 generate_extent_shard_key_and_apply(
2773 onode->key, shards[i].shard_info->offset, &key,
2774 [&](const string& final_key) {
2775 t->rmkey(PREFIX_OBJ, final_key);
2776 }
2777 );
2778 }
2779
2780 // calculate average extent size
2781 unsigned bytes = 0;
2782 unsigned extents = 0;
2783 if (onode->onode.extent_map_shards.empty()) {
2784 bytes = inline_bl.length();
2785 extents = extent_map.size();
2786 } else {
2787 for (unsigned i = si_begin; i < si_end; ++i) {
2788 bytes += shards[i].shard_info->bytes;
2789 extents += shards[i].extents;
2790 }
2791 }
2792 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2793 unsigned slop = target *
2794 cct->_conf->bluestore_extent_map_shard_target_size_slop;
2795 unsigned extent_avg = bytes / std::max(1u, extents);
2796 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2797 << ", slop " << slop << dendl;
2798
2799 // reshard
2800 unsigned estimate = 0;
2801 unsigned offset = needs_reshard_begin;
2802 vector<bluestore_onode_t::shard_info> new_shard_info;
2803 unsigned max_blob_end = 0;
2804 Extent dummy(needs_reshard_begin);
2805 for (auto e = extent_map.lower_bound(dummy);
2806 e != extent_map.end();
2807 ++e) {
2808 if (e->logical_offset >= needs_reshard_end) {
2809 break;
2810 }
2811 dout(30) << " extent " << *e << dendl;
2812
2813 // disfavor shard boundaries that span a blob
2814 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2815 if (estimate &&
2816 estimate + extent_avg > target + (would_span ? slop : 0)) {
2817 // new shard
2818 if (offset == needs_reshard_begin) {
2819 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2820 new_shard_info.back().offset = offset;
2821 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2822 << std::dec << dendl;
2823 }
2824 offset = e->logical_offset;
2825 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2826 new_shard_info.back().offset = offset;
2827 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2828 << std::dec << dendl;
2829 estimate = 0;
2830 }
2831 estimate += extent_avg;
2832 unsigned bs = e->blob_start();
2833 if (bs < spanning_scan_begin) {
2834 spanning_scan_begin = bs;
2835 }
2836 uint32_t be = e->blob_end();
2837 if (be > max_blob_end) {
2838 max_blob_end = be;
2839 }
2840 if (be > spanning_scan_end) {
2841 spanning_scan_end = be;
2842 }
2843 }
2844 if (new_shard_info.empty() && (si_begin > 0 ||
2845 si_end < shards.size())) {
2846 // we resharded a partial range; we must produce at least one output
2847 // shard
2848 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2849 new_shard_info.back().offset = needs_reshard_begin;
2850 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2851 << std::dec << " (singleton degenerate case)" << dendl;
2852 }
2853
2854 auto& sv = onode->onode.extent_map_shards;
2855 dout(20) << __func__ << " new " << new_shard_info << dendl;
2856 dout(20) << __func__ << " old " << sv << dendl;
2857 if (sv.empty()) {
2858 // no old shards to keep
2859 sv.swap(new_shard_info);
2860 init_shards(true, true);
2861 } else {
2862 // splice in new shards
2863 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2864 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2865 sv.insert(
2866 sv.begin() + si_begin,
2867 new_shard_info.begin(),
2868 new_shard_info.end());
2869 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
2870 si_end = si_begin + new_shard_info.size();
2871
2872 ceph_assert(sv.size() == shards.size());
2873
2874 // note that we need to update every shard_info of shards here,
2875 // as sv might have been totally re-allocated above
2876 for (unsigned i = 0; i < shards.size(); i++) {
2877 shards[i].shard_info = &sv[i];
2878 }
2879
2880 // mark newly added shards as dirty
2881 for (unsigned i = si_begin; i < si_end; ++i) {
2882 shards[i].loaded = true;
2883 shards[i].dirty = true;
2884 }
2885 }
2886 dout(20) << __func__ << " fin " << sv << dendl;
2887 inline_bl.clear();
2888
2889 if (sv.empty()) {
2890 // no more shards; unspan all previously spanning blobs
2891 auto p = spanning_blob_map.begin();
2892 while (p != spanning_blob_map.end()) {
2893 p->second->id = -1;
2894 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2895 p = spanning_blob_map.erase(p);
2896 }
2897 } else {
2898 // identify new spanning blobs
2899 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2900 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2901 if (spanning_scan_begin < needs_reshard_begin) {
2902 fault_range(db, spanning_scan_begin,
2903 needs_reshard_begin - spanning_scan_begin);
2904 }
2905 if (spanning_scan_end > needs_reshard_end) {
2906 fault_range(db, needs_reshard_end,
2907 spanning_scan_end - needs_reshard_end);
2908 }
2909 auto sp = sv.begin() + si_begin;
2910 auto esp = sv.end();
2911 unsigned shard_start = sp->offset;
2912 unsigned shard_end;
2913 ++sp;
2914 if (sp == esp) {
2915 shard_end = OBJECT_MAX_SIZE;
2916 } else {
2917 shard_end = sp->offset;
2918 }
2919 Extent dummy(needs_reshard_begin);
2920
2921 bool was_too_many_blobs_check = false;
2922 auto too_many_blobs_threshold =
2923 g_conf()->bluestore_debug_too_many_blobs_threshold;
2924 auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes;
2925 decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2926 decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2927
2928 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2929 if (e->logical_offset >= needs_reshard_end) {
2930 break;
2931 }
2932 dout(30) << " extent " << *e << dendl;
2933 while (e->logical_offset >= shard_end) {
2934 shard_start = shard_end;
2935 ceph_assert(sp != esp);
2936 ++sp;
2937 if (sp == esp) {
2938 shard_end = OBJECT_MAX_SIZE;
2939 } else {
2940 shard_end = sp->offset;
2941 }
2942 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2943 << " to 0x" << shard_end << std::dec << dendl;
2944 }
2945
2946 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2947 if (!e->blob->is_spanning()) {
2948 // We have two options: (1) split the blob into pieces at the
2949 // shard boundaries (and adjust extents accordingly), or (2)
2950 // mark it spanning. We prefer to cut the blob if we can. Note that
2951 // we may have to split it multiple times--potentially at every
2952 // shard boundary.
2953 bool must_span = false;
2954 BlobRef b = e->blob;
2955 if (b->can_split()) {
2956 uint32_t bstart = e->blob_start();
2957 uint32_t bend = e->blob_end();
2958 for (const auto& sh : shards) {
2959 if (bstart < sh.shard_info->offset &&
2960 bend > sh.shard_info->offset) {
2961 uint32_t blob_offset = sh.shard_info->offset - bstart;
2962 if (b->can_split_at(blob_offset)) {
2963 dout(20) << __func__ << " splitting blob, bstart 0x"
2964 << std::hex << bstart << " blob_offset 0x"
2965 << blob_offset << std::dec << " " << *b << dendl;
2966 b = split_blob(b, blob_offset, sh.shard_info->offset);
2967 // switch b to the new right-hand side, in case it
2968 // *also* has to get split.
2969 bstart += blob_offset;
2970 onode->c->store->logger->inc(l_bluestore_blob_split);
2971 } else {
2972 must_span = true;
2973 break;
2974 }
2975 }
2976 }
2977 } else {
2978 must_span = true;
2979 }
2980 if (must_span) {
2981 auto bid = allocate_spanning_blob_id();
2982 b->id = bid;
2983 spanning_blob_map[b->id] = b;
2984 dout(20) << __func__ << " adding spanning " << *b << dendl;
2985 if (!was_too_many_blobs_check &&
2986 too_many_blobs_threshold &&
2987 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2988
2989 was_too_many_blobs_check = true;
2990 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2991 if (dumped_onodes[i].first == onode->oid) {
2992 oid_slot = &dumped_onodes[i];
2993 break;
2994 }
2995 if (!oldest_slot || (oldest_slot &&
2996 dumped_onodes[i].second < oldest_slot->second)) {
2997 oldest_slot = &dumped_onodes[i];
2998 }
2999 }
3000 }
3001 }
3002 }
3003 } else {
3004 if (e->blob->is_spanning()) {
3005 spanning_blob_map.erase(e->blob->id);
3006 e->blob->id = -1;
3007 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
3008 }
3009 }
3010 }
3011 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
3012 (oid_slot &&
3013 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
3014 if (do_dump) {
3015 dout(0) << __func__
3016 << " spanning blob count exceeds threshold, "
3017 << spanning_blob_map.size() << " spanning blobs"
3018 << dendl;
3019 _dump_onode<0>(cct, *onode);
3020 if (oid_slot) {
3021 oid_slot->second = mono_clock::now();
3022 } else {
3023 ceph_assert(oldest_slot);
3024 oldest_slot->first = onode->oid;
3025 oldest_slot->second = mono_clock::now();
3026 }
3027 }
3028 }
3029
3030 clear_needs_reshard();
3031}
3032
3033bool BlueStore::ExtentMap::encode_some(
3034 uint32_t offset,
3035 uint32_t length,
3036 bufferlist& bl,
3037 unsigned *pn)
3038{
3039 Extent dummy(offset);
3040 auto start = extent_map.lower_bound(dummy);
3041 uint32_t end = offset + length;
3042
3043 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
3044 // serialization only. Hence there is no specific
3045 // handling at ExtentMap level.
3046
3047 unsigned n = 0;
3048 size_t bound = 0;
3049 bool must_reshard = false;
3050 for (auto p = start;
3051 p != extent_map.end() && p->logical_offset < end;
3052 ++p, ++n) {
3053 ceph_assert(p->logical_offset >= offset);
3054 p->blob->last_encoded_id = -1;
3055 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
3056 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3057 << std::dec << " hit new spanning blob " << *p << dendl;
3058 request_reshard(p->blob_start(), p->blob_end());
3059 must_reshard = true;
3060 }
3061 if (!must_reshard) {
3062 denc_varint(0, bound); // blobid
3063 denc_varint(0, bound); // logical_offset
3064 denc_varint(0, bound); // len
3065 denc_varint(0, bound); // blob_offset
3066
3067 p->blob->bound_encode(
3068 bound,
3069 struct_v,
3070 p->blob->shared_blob->get_sbid(),
3071 false);
3072 }
3073 }
3074 if (must_reshard) {
3075 return true;
3076 }
3077
3078 denc(struct_v, bound);
3079 denc_varint(0, bound); // number of extents
3080
3081 {
3082 auto app = bl.get_contiguous_appender(bound);
3083 denc(struct_v, app);
3084 denc_varint(n, app);
3085 if (pn) {
3086 *pn = n;
3087 }
3088
3089 n = 0;
3090 uint64_t pos = 0;
3091 uint64_t prev_len = 0;
3092 for (auto p = start;
3093 p != extent_map.end() && p->logical_offset < end;
3094 ++p, ++n) {
3095 unsigned blobid;
3096 bool include_blob = false;
3097 if (p->blob->is_spanning()) {
3098 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3099 blobid |= BLOBID_FLAG_SPANNING;
3100 } else if (p->blob->last_encoded_id < 0) {
3101 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3102 include_blob = true;
3103 blobid = 0; // the decoder will infer the id from n
3104 } else {
3105 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3106 }
3107 if (p->logical_offset == pos) {
3108 blobid |= BLOBID_FLAG_CONTIGUOUS;
3109 }
3110 if (p->blob_offset == 0) {
3111 blobid |= BLOBID_FLAG_ZEROOFFSET;
3112 }
3113 if (p->length == prev_len) {
3114 blobid |= BLOBID_FLAG_SAMELENGTH;
3115 } else {
3116 prev_len = p->length;
3117 }
3118 denc_varint(blobid, app);
3119 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3120 denc_varint_lowz(p->logical_offset - pos, app);
3121 }
3122 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3123 denc_varint_lowz(p->blob_offset, app);
3124 }
3125 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3126 denc_varint_lowz(p->length, app);
3127 }
3128 pos = p->logical_end();
3129 if (include_blob) {
3130 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3131 }
3132 }
3133 }
3134 /*derr << __func__ << bl << dendl;
3135 derr << __func__ << ":";
3136 bl.hexdump(*_dout);
3137 *_dout << dendl;
3138 */
3139 return false;
3140}
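// Encoding sketch (illustrative, assuming BLOBID_SHIFT_BITS == 4 and the flag
// bits CONTIGUOUS=0x1, ZEROOFFSET=0x2, SAMELENGTH=0x4, SPANNING=0x8): an
// extent that starts right after the previous one, has blob_offset 0, repeats
// the previous length and references spanning blob id 3 is encoded as the
// single varint
//
//   blobid = (3 << 4) | 0x8 | 0x1 | 0x2 | 0x4 = 0x3f
//
// with every optional per-extent field elided by its flag.  A non-spanning
// blob seen for the first time encodes blobid = 0 followed by the inline
// blob; later references to it encode (last_encoded_id << 4).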
3141
3142/////////////////// BlueStore::ExtentMap::DecoderExtent ///////////
3143void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
3144 Extent* le,
3145 __u8 struct_v,
3146 bptr_c_it_t& p,
3147 Collection* c)
3148{
3149 uint64_t blobid;
3150 denc_varint(blobid, p);
3151 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3152 uint64_t gap;
3153 denc_varint_lowz(gap, p);
3154 pos += gap;
3155 }
3156 le->logical_offset = pos;
3157 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3158 denc_varint_lowz(le->blob_offset, p);
3159 } else {
3160 le->blob_offset = 0;
3161 }
3162 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3163 denc_varint_lowz(prev_len, p);
3164 }
3165 le->length = prev_len;
3166 if (blobid & BLOBID_FLAG_SPANNING) {
3167 consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
3168 } else {
3169 blobid >>= BLOBID_SHIFT_BITS;
3170 if (blobid) {
3171 consume_blobid(le, false, blobid - 1);
3172 } else {
3173 Blob *b = new Blob();
3174 uint64_t sbid = 0;
3175 b->decode(p, struct_v, &sbid, false, c);
3176 consume_blob(le, extent_pos, sbid, b);
3177 }
3178 }
3179 pos += prev_len;
3180 ++extent_pos;
3181}
3182
3183unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
3184 const bufferlist& bl, Collection* c)
3185{
3186 __u8 struct_v;
3187 uint32_t num;
3188
3189 ceph_assert(bl.get_num_buffers() <= 1);
3190 auto p = bl.front().begin_deep();
3191 denc(struct_v, p);
3192 // Version 2 differs from v1 in blob's ref_map
3193 // serialization only. Hence there is no specific
3194 // handling at ExtentMap level below.
3195 ceph_assert(struct_v == 1 || struct_v == 2);
3196 denc_varint(num, p);
3197
3198 extent_pos = 0;
3199 while (!p.end()) {
3200 Extent* le = get_next_extent();
3201 decode_extent(le, struct_v, p, c);
3202 add_extent(le);
3203 }
3204 ceph_assert(extent_pos == num);
3205 return num;
3206}
3207
3208void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
3209 bptr_c_it_t& p, Collection* c)
3210{
3211 __u8 struct_v;
3212 denc(struct_v, p);
3213 // Version 2 differs from v1 in blob's ref_map
3214 // serialization only. Hence there is no specific
3215 // handling at ExtentMap level.
3216 ceph_assert(struct_v == 1 || struct_v == 2);
3217
3218 unsigned n;
3219 denc_varint(n, p);
3220 while (n--) {
3221 BlueStore::BlobRef b(new Blob());
3222 denc_varint(b->id, p);
3223 uint64_t sbid = 0;
3224 b->decode(p, struct_v, &sbid, true, c);
3225 consume_spanning_blob(sbid, b);
3226 }
3227}
3228
3229/////////////////// BlueStore::ExtentMap::DecoderExtentFull ///////////
3230void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
3231 BlueStore::Extent* le, bool spanning, uint64_t blobid) {
3232 ceph_assert(le);
3233 if (spanning) {
3234 le->assign_blob(extent_map.get_spanning_blob(blobid));
3235 } else {
3236 ceph_assert(blobid < blobs.size());
3237 le->assign_blob(blobs[blobid]);
3238 // we build ref_map dynamically for non-spanning blobs
3239 le->blob->get_ref(
3240 extent_map.onode->c,
3241 le->blob_offset,
3242 le->length);
3243 }
3244}
3245
3246void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
3247 BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
3248 ceph_assert(le);
3249 blobs.resize(extent_no + 1);
3250 blobs[extent_no] = b;
3251 extent_map.onode->c->open_shared_blob(sbid, b);
3252 le->assign_blob(b);
3253 le->blob->get_ref(
3254 extent_map.onode->c,
3255 le->blob_offset,
3256 le->length);
3257}
3258
3259void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
3260 uint64_t sbid, BlueStore::BlobRef b) {
3261 extent_map.spanning_blob_map[b->id] = b;
3262 extent_map.onode->c->open_shared_blob(sbid, b);
3263}
3264
3265BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
3266{
3267 return new Extent();
3268}
3269
3270void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
3271{
3272 extent_map.extent_map.insert(*le);
3273}
3274
3275unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3276{
3277 ExtentDecoderFull edecoder(*this);
3278 unsigned n = edecoder.decode_some(bl, onode->c);
3279 return n;
3280}
3281
3282void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3283{
3284 // Version 2 differs from v1 in blob's ref_map
3285 // serialization only. Hence there is no specific
3286 // handling at ExtentMap level.
3287 __u8 struct_v = 2;
3288
3289 denc(struct_v, p);
3290 denc_varint((uint32_t)0, p);
3291 size_t key_size = 0;
3292 denc_varint((uint32_t)0, key_size);
3293 p += spanning_blob_map.size() * key_size;
3294 for (const auto& i : spanning_blob_map) {
3295 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3296 }
3297}
3298
3299void BlueStore::ExtentMap::encode_spanning_blobs(
3300 bufferlist::contiguous_appender& p)
3301{
3302 // Version 2 differs from v1 in blob's ref_map
3303 // serialization only. Hence there is no specific
3304 // handling at ExtentMap level.
3305 __u8 struct_v = 2;
3306
3307 denc(struct_v, p);
3308 denc_varint(spanning_blob_map.size(), p);
3309 for (auto& i : spanning_blob_map) {
3310 denc_varint(i.second->id, p);
3311 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3312 }
3313}
3314
3315void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3316{
3317 shards.resize(onode->onode.extent_map_shards.size());
3318 unsigned i = 0;
3319 for (auto &s : onode->onode.extent_map_shards) {
3320 shards[i].shard_info = &s;
3321 shards[i].loaded = loaded;
3322 shards[i].dirty = dirty;
3323 ++i;
3324 }
3325}
3326
3327void BlueStore::ExtentMap::fault_range(
3328 KeyValueDB *db,
3329 uint32_t offset,
3330 uint32_t length)
3331{
3332 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3333 << std::dec << dendl;
3334 auto start = seek_shard(offset);
3335 auto last = seek_shard(offset + length);
3336
3337 if (start < 0)
3338 return;
3339
3340 ceph_assert(last >= start);
3341 string key;
3342 while (start <= last) {
3343 ceph_assert((size_t)start < shards.size());
3344 auto p = &shards[start];
3345 if (!p->loaded) {
3346 dout(30) << __func__ << " opening shard 0x" << std::hex
3347 << p->shard_info->offset << std::dec << dendl;
3348 bufferlist v;
3349 generate_extent_shard_key_and_apply(
3350 onode->key, p->shard_info->offset, &key,
3351 [&](const string& final_key) {
3352 int r = db->get(PREFIX_OBJ, final_key, &v);
3353 if (r < 0) {
3354 derr << __func__ << " missing shard 0x" << std::hex
3355 << p->shard_info->offset << std::dec << " for " << onode->oid
3356 << dendl;
3357 ceph_assert(r >= 0);
3358 }
3359 }
3360 );
3361 p->extents = decode_some(v);
3362 p->loaded = true;
3363 dout(20) << __func__ << " open shard 0x" << std::hex
3364 << p->shard_info->offset
3365 << " for range 0x" << offset << "~" << length << std::dec
3366 << " (" << v.length() << " bytes)" << dendl;
3367 ceph_assert(p->dirty == false);
3368 ceph_assert(v.length() == p->shard_info->bytes);
3369 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3370 } else {
3371 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3372 }
3373 ++start;
3374 }
3375}
3376
3377void BlueStore::ExtentMap::dirty_range(
3378 uint32_t offset,
3379 uint32_t length)
3380{
3381 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3382 << std::dec << dendl;
3383 if (shards.empty()) {
3384 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3385 inline_bl.clear();
3386 return;
3387 }
3388 auto start = seek_shard(offset);
3389 if (length == 0) {
3390 length = 1;
3391 }
3392 auto last = seek_shard(offset + length - 1);
3393 if (start < 0)
3394 return;
3395
3396 ceph_assert(last >= start);
3397 while (start <= last) {
3398 ceph_assert((size_t)start < shards.size());
3399 auto p = &shards[start];
3400 if (!p->loaded) {
3401 derr << __func__ << " on write 0x" << std::hex << offset
3402 << "~" << length << " shard 0x" << p->shard_info->offset
3403 << std::dec << " is not loaded, can't mark dirty" << dendl;
3404 ceph_abort_msg("can't mark unloaded shard dirty");
3405 }
3406 if (!p->dirty) {
3407 dout(20) << __func__ << " mark shard 0x" << std::hex
3408 << p->shard_info->offset << std::dec << " dirty" << dendl;
3409 p->dirty = true;
3410 }
3411 ++start;
3412 }
3413}
3414
3415BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3416 uint64_t offset)
3417{
3418 Extent dummy(offset);
3419 return extent_map.find(dummy);
3420}
3421
3422BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3423 uint64_t offset)
3424{
3425 Extent dummy(offset);
3426 auto fp = extent_map.lower_bound(dummy);
3427 if (fp != extent_map.begin()) {
3428 --fp;
3429 if (fp->logical_end() <= offset) {
3430 ++fp;
3431 }
3432 }
3433 return fp;
3434}
3435
3436BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3437 uint64_t offset) const
3438{
3439 Extent dummy(offset);
3440 auto fp = extent_map.lower_bound(dummy);
3441 if (fp != extent_map.begin()) {
3442 --fp;
3443 if (fp->logical_end() <= offset) {
3444 ++fp;
3445 }
3446 }
3447 return fp;
3448}
3449
3450bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3451{
3452 auto fp = seek_lextent(offset);
3453 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3454 return false;
3455 }
3456 return true;
3457}
3458
3459int BlueStore::ExtentMap::compress_extent_map(
3460 uint64_t offset,
3461 uint64_t length)
3462{
3463 if (extent_map.empty())
3464 return 0;
3465 int removed = 0;
3466 auto p = seek_lextent(offset);
3467 if (p != extent_map.begin()) {
3468 --p; // start to the left of offset
3469 }
3470 // the caller should have just written to this region
3471 ceph_assert(p != extent_map.end());
3472
3473 // identify the *next* shard
3474 auto pshard = shards.begin();
3475 while (pshard != shards.end() &&
3476 p->logical_offset >= pshard->shard_info->offset) {
3477 ++pshard;
3478 }
3479 uint64_t shard_end;
3480 if (pshard != shards.end()) {
3481 shard_end = pshard->shard_info->offset;
3482 } else {
3483 shard_end = OBJECT_MAX_SIZE;
3484 }
3485
3486 auto n = p;
3487 for (++n; n != extent_map.end(); p = n++) {
3488 if (n->logical_offset > offset + length) {
3489 break; // stop after end
3490 }
3491 while (n != extent_map.end() &&
3492 p->logical_end() == n->logical_offset &&
3493 p->blob == n->blob &&
3494 p->blob_offset + p->length == n->blob_offset &&
3495 n->logical_offset < shard_end) {
3496 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3497 << " next shard 0x" << shard_end << std::dec
3498 << " merging " << *p << " and " << *n << dendl;
3499 p->length += n->length;
3500 rm(n++);
3501 ++removed;
3502 }
3503 if (n == extent_map.end()) {
3504 break;
3505 }
3506 if (n->logical_offset >= shard_end) {
3507 ceph_assert(pshard != shards.end());
3508 ++pshard;
3509 if (pshard != shards.end()) {
3510 shard_end = pshard->shard_info->offset;
3511 } else {
3512 shard_end = OBJECT_MAX_SIZE;
3513 }
3514 }
3515 }
3516 if (removed) {
3517 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3518 }
3519 return removed;
3520}
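// Merge example (illustrative): extents 0x0~0x1000 and 0x1000~0x1000 that
// reference the same blob at blob_offset 0x0 and 0x1000 are both logically
// and physically contiguous, so they collapse into a single 0x0~0x2000
// extent, provided the second one does not begin at a shard boundary;
// merging across shard boundaries would move extents between shards.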
3521
3522void BlueStore::ExtentMap::punch_hole(
3523 CollectionRef &c,
3524 uint64_t offset,
3525 uint64_t length,
3526 old_extent_map_t *old_extents)
3527{
3528 auto p = seek_lextent(offset);
3529 uint64_t end = offset + length;
3530 while (p != extent_map.end()) {
3531 if (p->logical_offset >= end) {
3532 break;
3533 }
3534 if (p->logical_offset < offset) {
3535 if (p->logical_end() > end) {
3536 // split and deref middle
3537 uint64_t front = offset - p->logical_offset;
3538 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3539 length, p->blob);
3540 old_extents->push_back(*oe);
3541 add(end,
3542 p->blob_offset + front + length,
3543 p->length - front - length,
3544 p->blob);
3545 p->length = front;
3546 break;
3547 } else {
3548 // deref tail
3549 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3550 uint64_t keep = offset - p->logical_offset;
3551 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3552 p->length - keep, p->blob);
3553 old_extents->push_back(*oe);
3554 p->length = keep;
3555 ++p;
3556 continue;
3557 }
3558 }
3559 if (p->logical_offset + p->length <= end) {
3560 // deref whole lextent
3561 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3562 p->length, p->blob);
3563 old_extents->push_back(*oe);
3564 rm(p++);
3565 continue;
3566 }
3567 // deref head
3568 uint64_t keep = p->logical_end() - end;
3569 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3570 p->length - keep, p->blob);
3571 old_extents->push_back(*oe);
3572
3573 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3574 rm(p);
3575 break;
3576 }
3577}
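// Example (illustrative): with a single lextent 0x0~0x4000 at blob_offset 0,
// punching a hole over 0x1000~0x2000 takes the "split and deref middle" path:
// the original extent is shortened to 0x0~0x1000, a new extent for logical
// 0x3000~0x1000 (blob_offset 0x3000) is re-added, and an OldExtent covering
// the 0x1000~0x2000 middle is queued on old_extents for later deallocation.
// Holes that only clip the head or tail of an lextent take the simpler
// deref-head / deref-tail paths instead.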
3578
3579BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3580 CollectionRef &c,
3581 uint64_t logical_offset,
3582 uint64_t blob_offset, uint64_t length, BlobRef b,
3583 old_extent_map_t *old_extents)
3584{
3585 // We need a completely initialized Blob to increment its ref counters.
3586 ceph_assert(b->get_blob().get_logical_length() != 0);
3587
3588 // Do get_ref prior to punch_hole to prevent putting a reused blob into the
3589 // old_extents list if we overwrite the blob completely.
3590 // This might happen during a WAL overwrite.
3591 b->get_ref(onode->c, blob_offset, length);
3592
3593 if (old_extents) {
3594 punch_hole(c, logical_offset, length, old_extents);
3595 }
3596
3597 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3598 extent_map.insert(*le);
3599 if (spans_shard(logical_offset, length)) {
3600 request_reshard(logical_offset, logical_offset + length);
3601 }
3602 return le;
3603}
3604
3605BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3606 BlobRef lb,
3607 uint32_t blob_offset,
3608 uint32_t pos)
3609{
3610 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3611 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3612 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3613 << dendl;
3614 BlobRef rb = onode->c->new_blob();
3615 lb->split(onode->c, blob_offset, rb.get());
3616
3617 for (auto ep = seek_lextent(pos);
3618 ep != extent_map.end() && ep->logical_offset < end_pos;
3619 ++ep) {
3620 if (ep->blob != lb) {
3621 continue;
3622 }
3623 if (ep->logical_offset < pos) {
3624 // split extent
3625 size_t left = pos - ep->logical_offset;
3626 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3627 extent_map.insert(*ne);
3628 ep->length = left;
3629 dout(30) << __func__ << " split " << *ep << dendl;
3630 dout(30) << __func__ << " to " << *ne << dendl;
3631 } else {
3632 // switch blob
3633 ceph_assert(ep->blob_offset >= blob_offset);
3634
3635 ep->blob = rb;
3636 ep->blob_offset -= blob_offset;
3637 dout(30) << __func__ << " adjusted " << *ep << dendl;
3638 }
3639 }
3640 return rb;
3641}
3642
3643// Onode
3644
3645#undef dout_prefix
3646#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3647
3648const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
3649{
3650 if (bluestore_onode_t::is_pgmeta_omap(flags)) {
3651 return PREFIX_PGMETA_OMAP;
3652 }
3653 if (bluestore_onode_t::is_perpg_omap(flags)) {
3654 return PREFIX_PERPG_OMAP;
3655 }
3656 if (bluestore_onode_t::is_perpool_omap(flags)) {
3657 return PREFIX_PERPOOL_OMAP;
3658 }
3659 return PREFIX_OMAP;
3660}
3661
3662// '-' < '.' < '~'
3663void BlueStore::Onode::calc_omap_header(
3664 uint8_t flags,
3665 const Onode* o,
3666 std::string* out)
3667{
3668 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3669 if (bluestore_onode_t::is_perpg_omap(flags)) {
3670 _key_encode_u64(o->c->pool(), out);
3671 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3672 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3673 _key_encode_u64(o->c->pool(), out);
3674 }
3675 }
3676 _key_encode_u64(o->onode.nid, out);
3677 out->push_back('-');
3678}
3679
3680void BlueStore::Onode::calc_omap_key(uint8_t flags,
3681 const Onode* o,
3682 const std::string& key,
3683 std::string* out)
3684{
3685 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3686 if (bluestore_onode_t::is_perpg_omap(flags)) {
3687 _key_encode_u64(o->c->pool(), out);
3688 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3689 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3690 _key_encode_u64(o->c->pool(), out);
3691 }
3692 }
3693 _key_encode_u64(o->onode.nid, out);
3694 out->push_back('.');
3695 out->append(key);
3696}
3697
3698void BlueStore::Onode::calc_omap_tail(
3699 uint8_t flags,
3700 const Onode* o,
3701 std::string* out)
3702{
3703 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3704 if (bluestore_onode_t::is_perpg_omap(flags)) {
3705 _key_encode_u64(o->c->pool(), out);
3706 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3707 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3708 _key_encode_u64(o->c->pool(), out);
3709 }
3710 }
3711 _key_encode_u64(o->onode.nid, out);
3712 out->push_back('~');
3713}
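// Key layout sketch (illustrative): for a per-pg omap object the three
// helpers above produce keys of the form
//
//   header:  <pool:u64><hash:u32><nid:u64> '-'
//   entry:   <pool:u64><hash:u32><nid:u64> '.' <user key>
//   tail:    <pool:u64><hash:u32><nid:u64> '~'
//
// Because '-' < '.' < '~' in byte order, a single key range scan from the
// header key up to the tail key returns the header first, then every user
// key in sorted order, and stops before any other object's omap entries.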
3714
3715void BlueStore::Onode::get()
3716{
3717 ++nref;
3718 ++pin_nref;
3719}
3720void BlueStore::Onode::put()
3721{
3722 if (--pin_nref == 1) {
3723 c->get_onode_cache()->maybe_unpin(this);
3724 }
3725 if (--nref == 0) {
3726 delete this;
3727 }
3728}
3729
3730void BlueStore::Onode::decode_raw(
3731 BlueStore::Onode* on,
3732 const bufferlist& v,
3733 BlueStore::ExtentMap::ExtentDecoder& edecoder)
3734{
3735 on->exists = true;
3736 auto p = v.front().begin_deep();
3737 on->onode.decode(p);
3738
3739 // initialize extent_map
3740 edecoder.decode_spanning_blobs(p, on->c);
3741 if (on->onode.extent_map_shards.empty()) {
3742 denc(on->extent_map.inline_bl, p);
3743 edecoder.decode_some(on->extent_map.inline_bl, on->c);
3744 }
3745}
3746
3747BlueStore::Onode* BlueStore::Onode::create_decode(
3748 CollectionRef c,
3749 const ghobject_t& oid,
3750 const string& key,
3751 const bufferlist& v,
3752 bool allow_empty)
3753{
3754 ceph_assert(v.length() || allow_empty);
3755 Onode* on = new Onode(c.get(), oid, key);
3756
3757 if (v.length()) {
3758 ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
3759 decode_raw(on, v, edecoder);
3760
3761 for (auto& i : on->onode.attrs) {
3762 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3763 }
3764
3765 // initialize extent_map
3766 if (on->onode.extent_map_shards.empty()) {
3767 on->extent_map.inline_bl.reassign_to_mempool(
3768 mempool::mempool_bluestore_cache_data);
3769 } else {
3770 on->extent_map.init_shards(false, false);
3771 }
3772 }
3773 return on;
3774}
3775
3776void BlueStore::Onode::flush()
3777{
3778 if (flushing_count.load()) {
3779 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
3780 waiting_count++;
3781 std::unique_lock l(flush_lock);
3782 while (flushing_count.load()) {
3783 flush_cond.wait(l);
3784 }
3785 waiting_count--;
3786 }
3787 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3788}
3789
3790void BlueStore::Onode::dump(Formatter* f) const
3791{
3792 onode.dump(f);
3793 extent_map.dump(f);
3794}
3795
3796void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3797{
3798 if (!onode.is_pgmeta_omap()) {
3799 if (onode.is_perpg_omap()) {
3800 _key_encode_u64(c->pool(), out);
3801 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3802 } else if (onode.is_perpool_omap()) {
3803 _key_encode_u64(c->pool(), out);
3804 }
3805 }
3806 _key_encode_u64(onode.nid, out);
3807 out->append(old.c_str() + out->length(), old.size() - out->length());
3808}
3809
3810void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3811{
3812 size_t pos = sizeof(uint64_t) + 1;
3813 if (!onode.is_pgmeta_omap()) {
3814 if (onode.is_perpg_omap()) {
3815 pos += sizeof(uint64_t) + sizeof(uint32_t);
3816 } else if (onode.is_perpool_omap()) {
3817 pos += sizeof(uint64_t);
3818 }
3819 }
3820 *user_key = key.substr(pos);
3821}
3822
3823// =======================================================
3824// WriteContext
3825
3826/// Checks for writes to the same pextent within a blob
3827bool BlueStore::WriteContext::has_conflict(
3828 BlobRef b,
3829 uint64_t loffs,
3830 uint64_t loffs_end,
3831 uint64_t min_alloc_size)
3832{
3833 ceph_assert((loffs % min_alloc_size) == 0);
3834 ceph_assert((loffs_end % min_alloc_size) == 0);
3835 for (auto w : writes) {
3836 if (b == w.b) {
3837 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3838 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
3839 if ((loffs <= loffs2 && loffs_end > loffs2) ||
3840 (loffs >= loffs2 && loffs < loffs2_end)) {
3841 return true;
3842 }
3843 }
3844 }
3845 return false;
3846}
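// Worked example for the overlap test above (illustrative numbers,
// min_alloc_size = 0x10000): a queued write to the same blob at logical
// offset 0x12000 with length 0x3000 occupies the aligned range
// [0x10000, 0x20000).  A new write covering [0x10000, 0x20000) conflicts
// via the first clause (loffs <= loffs2 && loffs_end > loffs2), while one
// covering [0x20000, 0x30000) does not, since it starts at loffs2_end.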
3847
3848// =======================================================
3849
3850// DeferredBatch
3851#undef dout_prefix
3852#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3853#undef dout_context
3854#define dout_context cct
3855
3856void BlueStore::DeferredBatch::prepare_write(
3857 CephContext *cct,
3858 uint64_t seq, uint64_t offset, uint64_t length,
3859 bufferlist::const_iterator& blp)
3860{
3861 _discard(cct, offset, length);
3862 auto i = iomap.insert(make_pair(offset, deferred_io()));
3863 ceph_assert(i.second); // this should be a new insertion
3864 i.first->second.seq = seq;
3865 blp.copy(length, i.first->second.bl);
3866 i.first->second.bl.reassign_to_mempool(
3867 mempool::mempool_bluestore_writing_deferred);
3868 dout(20) << __func__ << " seq " << seq
3869 << " 0x" << std::hex << offset << "~" << length
3870 << " crc " << i.first->second.bl.crc32c(-1)
3871 << std::dec << dendl;
3872 seq_bytes[seq] += length;
3873#ifdef DEBUG_DEFERRED
3874 _audit(cct);
3875#endif
3876}
3877
3878void BlueStore::DeferredBatch::_discard(
3879 CephContext *cct, uint64_t offset, uint64_t length)
3880{
3881 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3882 << std::dec << dendl;
3883 auto p = iomap.lower_bound(offset);
3884 if (p != iomap.begin()) {
3885 --p;
3886 auto end = p->first + p->second.bl.length();
3887 if (end > offset) {
3888 bufferlist head;
3889 head.substr_of(p->second.bl, 0, offset - p->first);
3890 dout(20) << __func__ << " keep head " << p->second.seq
3891 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3892 << " -> 0x" << head.length() << std::dec << dendl;
3893 auto i = seq_bytes.find(p->second.seq);
3894 ceph_assert(i != seq_bytes.end());
3895 if (end > offset + length) {
3896 bufferlist tail;
3897 tail.substr_of(p->second.bl, offset + length - p->first,
3898 end - (offset + length));
3899 dout(20) << __func__ << " keep tail " << p->second.seq
3900 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3901 << " -> 0x" << tail.length() << std::dec << dendl;
3902 auto &n = iomap[offset + length];
3903 n.bl.swap(tail);
3904 n.seq = p->second.seq;
3905 i->second -= length;
3906 } else {
3907 i->second -= end - offset;
3908 }
3909 ceph_assert(i->second >= 0);
3910 p->second.bl.swap(head);
3911 }
3912 ++p;
3913 }
3914 while (p != iomap.end()) {
3915 if (p->first >= offset + length) {
3916 break;
3917 }
3918 auto i = seq_bytes.find(p->second.seq);
3919 ceph_assert(i != seq_bytes.end());
3920 auto end = p->first + p->second.bl.length();
3921 if (end > offset + length) {
3922 unsigned drop_front = offset + length - p->first;
3923 unsigned keep_tail = end - (offset + length);
3924 dout(20) << __func__ << " truncate front " << p->second.seq
3925 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3926 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3927 << " to 0x" << (offset + length) << "~" << keep_tail
3928 << std::dec << dendl;
3929 auto &s = iomap[offset + length];
3930 s.seq = p->second.seq;
3931 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3932 i->second -= drop_front;
3933 } else {
3934 dout(20) << __func__ << " drop " << p->second.seq
3935 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3936 << std::dec << dendl;
3937 i->second -= p->second.bl.length();
3938 }
3939 ceph_assert(i->second >= 0);
3940 p = iomap.erase(p);
3941 }
3942}
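// Worked example for _discard() above (illustrative numbers): with one
// queued entry at offset 0x0 of length 0x4000 and a discard of
// 0x1000~0x2000, the first branch keeps a 0x1000-byte head at 0x0,
// re-inserts a 0x1000-byte tail at 0x3000 under the same seq, and debits
// seq_bytes[seq] by the 0x2000 dropped bytes; the trailing loop then
// stops immediately because the re-inserted tail starts at offset+length.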
3943
3944void BlueStore::DeferredBatch::_audit(CephContext *cct)
3945{
3946 map<uint64_t,int> sb;
3947 for (auto p : seq_bytes) {
3948 sb[p.first] = 0; // make sure we have the same set of keys
3949 }
3950 uint64_t pos = 0;
3951 for (auto& p : iomap) {
3952 ceph_assert(p.first >= pos);
3953 sb[p.second.seq] += p.second.bl.length();
3954 pos = p.first + p.second.bl.length();
3955 }
3956 ceph_assert(sb == seq_bytes);
3957}
3958
3959
3960// Collection
3961
3962#undef dout_prefix
3963#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3964
3965BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3966 : CollectionImpl(store_->cct, cid),
3967 store(store_),
3968 cache(bc),
3969 exists(true),
3970 onode_space(oc),
3971 commit_queue(nullptr)
3972{
3973}
3974
3975bool BlueStore::Collection::flush_commit(Context *c)
3976{
3977 return osr->flush_commit(c);
3978}
3979
3980void BlueStore::Collection::flush()
3981{
3982 osr->flush();
3983}
3984
3985void BlueStore::Collection::flush_all_but_last()
3986{
3987 osr->flush_all_but_last();
3988}
3989
3990void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3991{
3992 ceph_assert(!b->shared_blob);
3993 const bluestore_blob_t& blob = b->get_blob();
3994 if (!blob.is_shared()) {
3995 b->shared_blob = new SharedBlob(this);
3996 return;
3997 }
3998
3999 b->shared_blob = shared_blob_set.lookup(sbid);
4000 if (b->shared_blob) {
4001 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4002 << std::dec << " had " << *b->shared_blob << dendl;
4003 } else {
4004 b->shared_blob = new SharedBlob(sbid, this);
4005 shared_blob_set.add(this, b->shared_blob.get());
4006 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4007 << std::dec << " opened " << *b->shared_blob
4008 << dendl;
4009 }
4010}
4011
4012void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
4013{
4014 if (!sb->is_loaded()) {
4015
4016 bufferlist v;
4017 string key;
4018 auto sbid = sb->get_sbid();
4019 get_shared_blob_key(sbid, &key);
4020 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
4021 if (r < 0) {
4022 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
4023 << std::dec << " not found at key "
4024 << pretty_binary_string(key) << dendl;
4025 ceph_abort_msg("uh oh, missing shared_blob");
4026 }
4027
4028 sb->loaded = true;
4029 sb->persistent = new bluestore_shared_blob_t(sbid);
4030 auto p = v.cbegin();
4031 decode(*(sb->persistent), p);
4032 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4033 << std::dec << " loaded shared_blob " << *sb << dendl;
4034 }
4035}
4036
4037void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
4038{
4039 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
4040 ceph_assert(!b->shared_blob->is_loaded());
4041
4042 // update blob
4043 bluestore_blob_t& blob = b->dirty_blob();
4044 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
4045
4046 // update shared blob
4047 b->shared_blob->loaded = true;
4048 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
4049 shared_blob_set.add(this, b->shared_blob.get());
4050 for (auto p : blob.get_extents()) {
4051 if (p.is_valid()) {
4052 b->shared_blob->get_ref(
4053 p.offset,
4054 p.length);
4055 }
4056 }
4057 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
4058}
4059
4060uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
4061{
4062 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
4063 ceph_assert(sb->is_loaded());
4064
4065 uint64_t sbid = sb->get_sbid();
4066 shared_blob_set.remove(sb);
4067 sb->loaded = false;
4068 delete sb->persistent;
4069 sb->sbid_unloaded = 0;
4070 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
4071 return sbid;
4072}
4073
4074BlueStore::OnodeRef BlueStore::Collection::get_onode(
4075 const ghobject_t& oid,
4076 bool create,
4077 bool is_createop)
4078{
4079 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
4080
4081 spg_t pgid;
4082 if (cid.is_pg(&pgid)) {
4083 if (!oid.match(cnode.bits, pgid.ps())) {
4084 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
4085 << pgid << " bits " << cnode.bits << dendl;
4086 ceph_abort();
4087 }
4088 }
4089
4090 OnodeRef o = onode_space.lookup(oid);
4091 if (o)
4092 return o;
4093
4094 string key;
4095 get_object_key(store->cct, oid, &key);
4096
4097 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
4098 << pretty_binary_string(key) << dendl;
4099
4100 bufferlist v;
4101 int r = -ENOENT;
4102 Onode *on;
4103 if (!is_createop) {
4104 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
4105 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4106 }
4107 if (v.length() == 0) {
4108 ceph_assert(r == -ENOENT);
4109 if (!create)
4110 return OnodeRef();
4111 } else {
4112 ceph_assert(r >= 0);
4113 }
4114
4115 // decode the stored onode if we found one; otherwise this is a brand-new, empty onode
4116 on = Onode::create_decode(this, oid, key, v, true);
4117 o.reset(on);
4118 return onode_space.add_onode(oid, o);
4119}
4120
4121void BlueStore::Collection::split_cache(
4122 Collection *dest)
4123{
4124 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4125
4126 auto *ocache = get_onode_cache();
4127 auto *ocache_dest = dest->get_onode_cache();
4128
4129 // lock cache shards
4130 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4131 std::lock_guard l(ocache->lock, std::adopt_lock);
4132 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4133 std::lock_guard l3(cache->lock, std::adopt_lock);
4134 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
4135
4136 int destbits = dest->cnode.bits;
4137 spg_t destpg;
4138 bool is_pg = dest->cid.is_pg(&destpg);
4139 ceph_assert(is_pg);
4140
4141 auto p = onode_space.onode_map.begin();
4142 while (p != onode_space.onode_map.end()) {
4143 OnodeRef o = p->second;
4144 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4145 // onode does not belong to this child
4146 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4147 << dendl;
4148 ++p;
4149 } else {
4150 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4151 << dendl;
4152
4153 // ensuring that nref is always >= 2 and hence onode is pinned
4154 OnodeRef o_pin = o;
4155
4156 p = onode_space.onode_map.erase(p);
4157 dest->onode_space.onode_map[o->oid] = o;
4158 if (o->cached) {
4159 get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
4160 }
4161 o->c = dest;
4162
4163 // move over shared blobs and buffers. cover shared blobs from
4164 // both extent map and spanning blob map (the full extent map
4165 // may not be faulted in)
4166 vector<SharedBlob*> sbvec;
4167 for (auto& e : o->extent_map.extent_map) {
4168 sbvec.push_back(e.blob->shared_blob.get());
4169 }
4170 for (auto& b : o->extent_map.spanning_blob_map) {
4171 sbvec.push_back(b.second->shared_blob.get());
4172 }
4173 for (auto sb : sbvec) {
4174 if (sb->coll == dest) {
4175 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4176 << dendl;
4177 continue;
4178 }
4179 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
4180 if (sb->get_sbid()) {
4181 ldout(store->cct, 20) << __func__
4182 << " moving registration " << *sb << dendl;
4183 shared_blob_set.remove(sb);
4184 dest->shared_blob_set.add(dest, sb);
4185 }
4186 sb->coll = dest;
4187 if (dest->cache != cache) {
4188 for (auto& i : sb->bc.buffer_map) {
4189 if (!i.second->is_writing()) {
4190 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4191 << dendl;
4192 dest->cache->_move(cache, i.second.get());
4193 }
4194 }
4195 }
4196 }
4197 }
4198 }
4199 dest->cache->_trim();
4200}
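// Note on split_cache() above: onodes whose hash matches the child
// collection's (bits, ps) are re-homed to dest together with their shared
// blobs; clean buffers migrate between buffer cache shards, while buffers
// still in the writing state are skipped (the is_writing() check) and
// settle with the originating transaction.  The temporary o_pin reference
// keeps each onode alive while it moves between onode_map instances.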
4201
4202// =======================================================
4203
4204// MempoolThread
4205
4206#undef dout_prefix
4207#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4208#undef dout_context
4209#define dout_context store->cct
4210
4211void *BlueStore::MempoolThread::entry()
4212{
4213 std::unique_lock l{lock};
4214
4215 uint32_t prev_config_change = store->config_changed.load();
4216 uint64_t base = store->osd_memory_base;
4217 double fragmentation = store->osd_memory_expected_fragmentation;
4218 uint64_t target = store->osd_memory_target;
4219 uint64_t min = store->osd_memory_cache_min;
4220 uint64_t max = min;
4221
4222 // When setting the maximum amount of memory to use for cache, first
4223 // assume some base amount of memory for the OSD and then fudge in
4224 // some overhead for fragmentation that scales with cache usage.
4225 uint64_t ltarget = (1.0 - fragmentation) * target;
4226 if (ltarget > base + min) {
4227 max = ltarget - base;
4228 }
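  // Illustrative arithmetic (example values, not defaults): with
  // osd_memory_target = 4096 MiB, osd_memory_expected_fragmentation = 0.15,
  // osd_memory_base = 768 MiB and osd_memory_cache_min = 128 MiB,
  // ltarget = 0.85 * 4096 MiB ~ 3482 MiB > base + min, so the cache
  // manager below is allowed up to max ~ 3482 - 768 = 2714 MiB.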
4229
4230 binned_kv_cache = store->db->get_priority_cache();
4231 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
4232 if (store->cache_autotune && binned_kv_cache != nullptr) {
4233 pcm = std::make_shared<PriorityCache::Manager>(
4234 store->cct, min, max, target, true, "bluestore-pricache");
4235 pcm->insert("kv", binned_kv_cache, true);
4236 pcm->insert("meta", meta_cache, true);
4237 pcm->insert("data", data_cache, true);
4238 if (binned_kv_onode_cache != nullptr) {
4239 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4240 }
4241 }
4242
4243 utime_t next_balance = ceph_clock_now();
4244 utime_t next_resize = ceph_clock_now();
4245 utime_t next_bin_rotation = ceph_clock_now();
4246 utime_t next_deferred_force_submit = ceph_clock_now();
4247 utime_t alloc_stats_dump_clock = ceph_clock_now();
4248
4249 bool interval_stats_trim = false;
4250 while (!stop) {
4251 // Update pcm cache settings if related configuration was changed
4252 uint32_t cur_config_change = store->config_changed.load();
4253 if (cur_config_change != prev_config_change) {
4254 _update_cache_settings();
4255 prev_config_change = cur_config_change;
4256 }
4257
4258 // define various intervals for background work
4259 double age_bin_interval = store->cache_age_bin_interval;
4260 double autotune_interval = store->cache_autotune_interval;
4261 double resize_interval = store->osd_memory_cache_resize_interval;
4262 double max_defer_interval = store->max_defer_interval;
4263 double alloc_stats_dump_interval =
4264 store->cct->_conf->bluestore_alloc_stats_dump_interval;
4265
4266 // alloc stats dump
4267 if (alloc_stats_dump_interval > 0 &&
4268 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4269 store->_record_allocation_stats();
4270 alloc_stats_dump_clock = ceph_clock_now();
4271 }
4272 // cache age binning
4273 if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
4274 if (binned_kv_cache != nullptr) {
4275 binned_kv_cache->import_bins(store->kv_bins);
4276 }
4277 if (binned_kv_onode_cache != nullptr) {
4278 binned_kv_onode_cache->import_bins(store->kv_onode_bins);
4279 }
4280 meta_cache->import_bins(store->meta_bins);
4281 data_cache->import_bins(store->data_bins);
4282
4283 if (pcm != nullptr) {
4284 pcm->shift_bins();
4285 }
4286 next_bin_rotation = ceph_clock_now();
4287 next_bin_rotation += age_bin_interval;
4288 }
4289 // cache balancing
4290 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
4291 if (binned_kv_cache != nullptr) {
4292 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4293 }
4294 if (binned_kv_onode_cache != nullptr) {
4295 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4296 }
4297 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4298 data_cache->set_cache_ratio(store->cache_data_ratio);
4299
4300 // Log stats at level 5 instead of 20 when a balance happens.
4301 interval_stats_trim = true;
4302
4303 if (pcm != nullptr) {
4304 pcm->balance();
4305 }
4306
4307 next_balance = ceph_clock_now();
4308 next_balance += autotune_interval;
4309 }
4310 // memory resizing (i.e. autotuning)
4311 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
4312 if (ceph_using_tcmalloc() && pcm != nullptr) {
4313 pcm->tune_memory();
4314 }
4315 next_resize = ceph_clock_now();
4316 next_resize += resize_interval;
4317 }
4318 // deferred force submit
4319 if (max_defer_interval > 0 &&
4320 next_deferred_force_submit < ceph_clock_now()) {
4321 if (store->get_deferred_last_submitted() + max_defer_interval <
4322 ceph_clock_now()) {
4323 store->deferred_try_submit();
4324 }
4325 next_deferred_force_submit = ceph_clock_now();
4326 next_deferred_force_submit += max_defer_interval/3;
4327 }
4328
4329 // Now resize the shards
4330 _resize_shards(interval_stats_trim);
4331 interval_stats_trim = false;
4332
4333 store->_update_logger();
4334 auto wait = ceph::make_timespan(
4335 store->cct->_conf->bluestore_cache_trim_interval);
4336 cond.wait_for(l, wait);
4337 }
4338 // do final dump
4339 store->_record_allocation_stats();
4340 stop = false;
4341 pcm = nullptr;
4342 return NULL;
4343}
4344
4345void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
4346{
4347 size_t onode_shards = store->onode_cache_shards.size();
4348 size_t buffer_shards = store->buffer_cache_shards.size();
4349 int64_t kv_used = store->db->get_cache_usage();
4350 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
4351 int64_t meta_used = meta_cache->_get_used_bytes();
4352 int64_t data_used = data_cache->_get_used_bytes();
4353
4354 uint64_t cache_size = store->cache_size;
4355 int64_t kv_alloc =
4356 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
4357 int64_t kv_onode_alloc =
4358 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
4359 int64_t meta_alloc =
4360 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
4361 int64_t data_alloc =
4362 static_cast<int64_t>(store->cache_data_ratio * cache_size);
4363
4364 if (pcm != nullptr && binned_kv_cache != nullptr) {
4365 cache_size = pcm->get_tuned_mem();
4366 kv_alloc = binned_kv_cache->get_committed_size();
4367 meta_alloc = meta_cache->get_committed_size();
4368 data_alloc = data_cache->get_committed_size();
4369 if (binned_kv_onode_cache != nullptr) {
4370 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4371 }
4372 }
4373
4374 if (interval_stats) {
4375 dout(5) << __func__ << " cache_size: " << cache_size
4376 << " kv_alloc: " << kv_alloc
4377 << " kv_used: " << kv_used
4378 << " kv_onode_alloc: " << kv_onode_alloc
4379 << " kv_onode_used: " << kv_onode_used
4380 << " meta_alloc: " << meta_alloc
4381 << " meta_used: " << meta_used
4382 << " data_alloc: " << data_alloc
4383 << " data_used: " << data_used << dendl;
4384 } else {
4385 dout(20) << __func__ << " cache_size: " << cache_size
4386 << " kv_alloc: " << kv_alloc
4387 << " kv_used: " << kv_used
4388 << " kv_onode_alloc: " << kv_onode_alloc
4389 << " kv_onode_used: " << kv_onode_used
4390 << " meta_alloc: " << meta_alloc
4391 << " meta_used: " << meta_used
4392 << " data_alloc: " << data_alloc
4393 << " data_used: " << data_used << dendl;
4394 }
4395
4396 uint64_t max_shard_onodes = static_cast<uint64_t>(
4397 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4398 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
4399
4400 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
4401 << " max_shard_buffer: " << max_shard_buffer << dendl;
4402
4403 for (auto i : store->onode_cache_shards) {
4404 i->set_max(max_shard_onodes);
4405 }
4406 for (auto i : store->buffer_cache_shards) {
4407 i->set_max(max_shard_buffer);
4408 }
4409}
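// Illustrative shard sizing (hypothetical numbers): with meta_alloc = 2 GiB,
// 32 onode cache shards and roughly 6 KiB accounted per onode,
// max_shard_onodes ~ (2 GiB / 32) / 6 KiB ~ 10922 onodes per shard; with
// data_alloc = 1 GiB and 32 buffer shards, max_shard_buffer is a flat
// 32 MiB per shard.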
4410
4411void BlueStore::MempoolThread::_update_cache_settings()
4412{
4413 // Nothing to do if pcm is not used.
4414 if (pcm == nullptr) {
4415 return;
4416 }
4417
4418 uint64_t target = store->osd_memory_target;
4419 uint64_t base = store->osd_memory_base;
4420 uint64_t min = store->osd_memory_cache_min;
4421 uint64_t max = min;
4422 double fragmentation = store->osd_memory_expected_fragmentation;
4423
4424 uint64_t ltarget = (1.0 - fragmentation) * target;
4425 if (ltarget > base + min) {
4426 max = ltarget - base;
4427 }
4428
4429 // set pcm cache levels
4430 pcm->set_target_memory(target);
4431 pcm->set_min_memory(min);
4432 pcm->set_max_memory(max);
4433
4434 dout(5) << __func__ << " updated pcm target: " << target
4435 << " pcm min: " << min
4436 << " pcm max: " << max
4437 << dendl;
4438}
4439
4440// =======================================================
4441
4442// OmapIteratorImpl
4443
4444#undef dout_prefix
4445#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4446
4447BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4448 PerfCounters* _logger, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it)
4449 : logger(_logger), c(c), o(o), it(it)
4450{
4451 logger->inc(l_bluestore_omap_iterator_count);
4452 std::shared_lock l(c->lock);
4453 if (o->onode.has_omap()) {
4454 o->get_omap_key(string(), &head);
4455 o->get_omap_tail(&tail);
4456 it->lower_bound(head);
4457 }
4458}
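// The constructor above bounds iteration to this object's omap rows: head
// is the object's omap prefix with an empty user key and tail the matching
// upper bound, so valid() only reports true while the raw key stays below
// tail.  A typical consumer goes through the ObjectStore interface; the
// get_omap_iterator() call below is assumed, it is not defined in this
// file, and process() is purely hypothetical:
//
//   auto it = store->get_omap_iterator(ch, oid);
//   for (it->seek_to_first(); it->valid(); it->next()) {
//     process(it->key(), it->value());
//   }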
4459BlueStore::OmapIteratorImpl::~OmapIteratorImpl()
4460{
4461 logger->dec(l_bluestore_omap_iterator_count);
4462}
4463
4464string BlueStore::OmapIteratorImpl::_stringify() const
4465{
4466 stringstream s;
4467 s << " omap_iterator(cid = " << c->cid
4468 <<", oid = " << o->oid << ")";
4469 return s.str();
4470}
4471
4472int BlueStore::OmapIteratorImpl::seek_to_first()
4473{
4474 std::shared_lock l(c->lock);
4475 auto start1 = mono_clock::now();
4476 if (o->onode.has_omap()) {
4477 it->lower_bound(head);
4478 } else {
4479 it = KeyValueDB::Iterator();
4480 }
4481 c->store->log_latency(
4482 __func__,
4483 l_bluestore_omap_seek_to_first_lat,
4484 mono_clock::now() - start1,
4485 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4486
4487 return 0;
4488}
4489
4490int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4491{
4492 std::shared_lock l(c->lock);
4493 auto start1 = mono_clock::now();
4494 if (o->onode.has_omap()) {
4495 string key;
4496 o->get_omap_key(after, &key);
4497 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4498 << pretty_binary_string(key) << dendl;
4499 it->upper_bound(key);
4500 } else {
4501 it = KeyValueDB::Iterator();
4502 }
4503 c->store->log_latency_fn(
4504 __func__,
4505 l_bluestore_omap_upper_bound_lat,
4506 mono_clock::now() - start1,
4507 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4508 [&] (const ceph::timespan& lat) {
4509 return ", after = " + after +
4510 _stringify();
4511 }
4512 );
4513 return 0;
4514}
4515
4516int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4517{
4518 std::shared_lock l(c->lock);
4519 auto start1 = mono_clock::now();
4520 if (o->onode.has_omap()) {
4521 string key;
4522 o->get_omap_key(to, &key);
4523 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4524 << pretty_binary_string(key) << dendl;
4525 it->lower_bound(key);
4526 } else {
4527 it = KeyValueDB::Iterator();
4528 }
4529 c->store->log_latency_fn(
4530 __func__,
4531 l_bluestore_omap_lower_bound_lat,
4532 mono_clock::now() - start1,
4533 c->store->cct->_conf->bluestore_log_omap_iterator_age,
4534 [&] (const ceph::timespan& lat) {
4535 return ", to = " + to +
4536 _stringify();
4537 }
4538 );
4539 return 0;
4540}
4541
4542bool BlueStore::OmapIteratorImpl::valid()
4543{
4544 std::shared_lock l(c->lock);
4545 bool r = o->onode.has_omap() && it && it->valid() &&
4546 it->raw_key().second < tail;
4547 if (it && it->valid()) {
4548 ldout(c->store->cct,20) << __func__ << " is at "
4549 << pretty_binary_string(it->raw_key().second)
4550 << dendl;
4551 }
4552 return r;
4553}
4554
4555int BlueStore::OmapIteratorImpl::next()
4556{
4557 int r = -1;
4558 std::shared_lock l(c->lock);
4559 auto start1 = mono_clock::now();
4560 if (o->onode.has_omap()) {
4561 it->next();
4562 r = 0;
4563 }
4564 c->store->log_latency(
4565 __func__,
4566 l_bluestore_omap_next_lat,
4567 mono_clock::now() - start1,
4568 c->store->cct->_conf->bluestore_log_omap_iterator_age);
4569
4570 return r;
4571}
4572
4573string BlueStore::OmapIteratorImpl::key()
4574{
4575 std::shared_lock l(c->lock);
4576 ceph_assert(it->valid());
4577 string db_key = it->raw_key().second;
4578 string user_key;
4579 o->decode_omap_key(db_key, &user_key);
4580
4581 return user_key;
4582}
4583
4584bufferlist BlueStore::OmapIteratorImpl::value()
4585{
4586 std::shared_lock l(c->lock);
4587 ceph_assert(it->valid());
4588 return it->value();
4589}
4590
4591
4592// =====================================
4593
4594#undef dout_prefix
4595#define dout_prefix *_dout << "bluestore(" << path << ") "
4596#undef dout_context
4597#define dout_context cct
4598
4599
4600static void aio_cb(void *priv, void *priv2)
4601{
4602 BlueStore *store = static_cast<BlueStore*>(priv);
4603 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4604 c->aio_finish(store);
4605}
4606
4607static void discard_cb(void *priv, void *priv2)
4608{
4609 BlueStore *store = static_cast<BlueStore*>(priv);
4610 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4611 store->handle_discard(*tmp);
4612}
4613
4614void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4615{
4616 dout(10) << __func__ << dendl;
4617 ceph_assert(alloc);
4618 alloc->release(to_release);
4619}
4620
4621BlueStore::BlueStore(CephContext *cct, const string& path)
4622 : BlueStore(cct, path, 0) {}
4623
4624BlueStore::BlueStore(CephContext *cct,
4625 const string& path,
4626 uint64_t _min_alloc_size)
4627 : ObjectStore(cct, path),
4628 throttle(cct),
4629 finisher(cct, "commit_finisher", "cfin"),
4630 kv_sync_thread(this),
4631 kv_finalize_thread(this),
4632#ifdef HAVE_LIBZBD
4633 zoned_cleaner_thread(this),
4634#endif
4635 min_alloc_size(_min_alloc_size),
4636 min_alloc_size_order(std::countr_zero(_min_alloc_size)),
4637 mempool_thread(this)
4638{
4639 _init_logger();
4640 cct->_conf.add_observer(this);
4641 set_cache_shards(1);
4642}
4643
4644BlueStore::~BlueStore()
4645{
4646 cct->_conf.remove_observer(this);
4647 _shutdown_logger();
4648 ceph_assert(!mounted);
4649 ceph_assert(db == NULL);
4650 ceph_assert(bluefs == NULL);
4651 ceph_assert(fsid_fd < 0);
4652 ceph_assert(path_fd < 0);
4653 for (auto i : onode_cache_shards) {
4654 delete i;
4655 }
4656 for (auto i : buffer_cache_shards) {
4657 delete i;
4658 }
4659 onode_cache_shards.clear();
4660 buffer_cache_shards.clear();
4661}
4662
4663const char **BlueStore::get_tracked_conf_keys() const
4664{
4665 static const char* KEYS[] = {
4666 "bluestore_csum_type",
4667 "bluestore_compression_mode",
4668 "bluestore_compression_algorithm",
4669 "bluestore_compression_min_blob_size",
4670 "bluestore_compression_min_blob_size_ssd",
4671 "bluestore_compression_min_blob_size_hdd",
4672 "bluestore_compression_max_blob_size",
4673 "bluestore_compression_max_blob_size_ssd",
4674 "bluestore_compression_max_blob_size_hdd",
4675 "bluestore_compression_required_ratio",
4676 "bluestore_max_alloc_size",
4677 "bluestore_prefer_deferred_size",
4678 "bluestore_prefer_deferred_size_hdd",
4679 "bluestore_prefer_deferred_size_ssd",
4680 "bluestore_deferred_batch_ops",
4681 "bluestore_deferred_batch_ops_hdd",
4682 "bluestore_deferred_batch_ops_ssd",
4683 "bluestore_throttle_bytes",
4684 "bluestore_throttle_deferred_bytes",
4685 "bluestore_throttle_cost_per_io_hdd",
4686 "bluestore_throttle_cost_per_io_ssd",
4687 "bluestore_throttle_cost_per_io",
4688 "bluestore_max_blob_size",
4689 "bluestore_max_blob_size_ssd",
4690 "bluestore_max_blob_size_hdd",
4691 "osd_memory_target",
4692 "osd_memory_target_cgroup_limit_ratio",
4693 "osd_memory_base",
4694 "osd_memory_cache_min",
4695 "osd_memory_expected_fragmentation",
4696 "bluestore_cache_autotune",
4697 "bluestore_cache_autotune_interval",
4698 "bluestore_cache_age_bin_interval",
4699 "bluestore_cache_kv_age_bins",
4700 "bluestore_cache_kv_onode_age_bins",
4701 "bluestore_cache_meta_age_bins",
4702 "bluestore_cache_data_age_bins",
4703 "bluestore_warn_on_legacy_statfs",
4704 "bluestore_warn_on_no_per_pool_omap",
4705 "bluestore_warn_on_no_per_pg_omap",
4706 "bluestore_max_defer_interval",
4707 NULL
4708 };
4709 return KEYS;
4710}
4711
4712void BlueStore::handle_conf_change(const ConfigProxy& conf,
4713 const std::set<std::string> &changed)
4714{
4715 if (changed.count("bluestore_warn_on_legacy_statfs")) {
4716 _check_legacy_statfs_alert();
4717 }
4718 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4719 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4720 _check_no_per_pg_or_pool_omap_alert();
4721 }
4722
4723 if (changed.count("bluestore_csum_type")) {
4724 _set_csum();
4725 }
4726 if (changed.count("bluestore_compression_mode") ||
4727 changed.count("bluestore_compression_algorithm") ||
4728 changed.count("bluestore_compression_min_blob_size") ||
4729 changed.count("bluestore_compression_max_blob_size")) {
4730 if (bdev) {
4731 _set_compression();
4732 }
4733 }
4734 if (changed.count("bluestore_max_blob_size") ||
4735 changed.count("bluestore_max_blob_size_ssd") ||
4736 changed.count("bluestore_max_blob_size_hdd")) {
4737 if (bdev) {
4738 // only after startup
4739 _set_blob_size();
4740 }
4741 }
4742 if (changed.count("bluestore_prefer_deferred_size") ||
4743 changed.count("bluestore_prefer_deferred_size_hdd") ||
4744 changed.count("bluestore_prefer_deferred_size_ssd") ||
4745 changed.count("bluestore_max_alloc_size") ||
4746 changed.count("bluestore_deferred_batch_ops") ||
4747 changed.count("bluestore_deferred_batch_ops_hdd") ||
4748 changed.count("bluestore_deferred_batch_ops_ssd")) {
4749 if (bdev) {
4750 // only after startup
4751 _set_alloc_sizes();
4752 }
4753 }
4754 if (changed.count("bluestore_throttle_cost_per_io") ||
4755 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4756 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4757 if (bdev) {
4758 _set_throttle_params();
4759 }
4760 }
4761 if (changed.count("bluestore_throttle_bytes") ||
4762 changed.count("bluestore_throttle_deferred_bytes") ||
4763 changed.count("bluestore_throttle_trace_rate")) {
4764 throttle.reset_throttle(conf);
4765 }
4766 if (changed.count("bluestore_max_defer_interval")) {
4767 if (bdev) {
4768 _set_max_defer_interval();
4769 }
4770 }
4771 if (changed.count("osd_memory_target") ||
4772 changed.count("osd_memory_base") ||
4773 changed.count("osd_memory_cache_min") ||
4774 changed.count("osd_memory_expected_fragmentation")) {
4775 _update_osd_memory_options();
4776 }
4777}
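// The handlers above take effect at runtime; for example, changing one of
// the tracked keys with something like
//   ceph config set osd bluestore_csum_type crc32c
// is picked up here without restarting the OSD, although several values
// are only re-derived once a block device is open (the bdev checks).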
4778
4779void BlueStore::_set_compression()
4780{
4781 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4782 if (m) {
4783 _clear_compression_alert();
4784 comp_mode = *m;
4785 } else {
4786 derr << __func__ << " unrecognized value '"
4787 << cct->_conf->bluestore_compression_mode
4788 << "' for bluestore_compression_mode, reverting to 'none'"
4789 << dendl;
4790 comp_mode = Compressor::COMP_NONE;
4791 string s("unknown mode: ");
4792 s += cct->_conf->bluestore_compression_mode;
4793 _set_compression_alert(true, s.c_str());
4794 }
4795
4796 compressor = nullptr;
4797
4798 if (cct->_conf->bluestore_compression_min_blob_size) {
4799 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
4800 } else {
4801 ceph_assert(bdev);
4802 if (_use_rotational_settings()) {
4803 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4804 } else {
4805 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4806 }
4807 }
4808
4809 if (cct->_conf->bluestore_compression_max_blob_size) {
4810 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4811 } else {
4812 ceph_assert(bdev);
4813 if (_use_rotational_settings()) {
4814 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4815 } else {
4816 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4817 }
4818 }
4819
4820 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4821 if (!alg_name.empty()) {
4822 compressor = Compressor::create(cct, alg_name);
4823 if (!compressor) {
4824 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4825 << dendl;
4826 _set_compression_alert(false, alg_name.c_str());
4827 }
4828 }
4829
4830 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4831 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
4832 << " min_blob " << comp_min_blob_size
4833 << " max_blob " << comp_max_blob_size
4834 << dendl;
4835}
4836
4837void BlueStore::_set_csum()
4838{
4839 csum_type = Checksummer::CSUM_NONE;
4840 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4841 if (t > Checksummer::CSUM_NONE)
4842 csum_type = t;
4843
4844 dout(10) << __func__ << " csum_type "
4845 << Checksummer::get_csum_type_string(csum_type)
4846 << dendl;
4847}
4848
4849void BlueStore::_set_throttle_params()
4850{
4851 if (cct->_conf->bluestore_throttle_cost_per_io) {
4852 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4853 } else {
4854 ceph_assert(bdev);
4855 if (_use_rotational_settings()) {
4856 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4857 } else {
4858 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4859 }
4860 }
4861
4862 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4863 << dendl;
4864}
4865void BlueStore::_set_blob_size()
4866{
4867 if (cct->_conf->bluestore_max_blob_size) {
4868 max_blob_size = cct->_conf->bluestore_max_blob_size;
4869 } else {
4870 ceph_assert(bdev);
4871 if (_use_rotational_settings()) {
4872 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4873 } else {
4874 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4875 }
4876 }
4877 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4878 << std::dec << dendl;
4879}
4880
4881void BlueStore::_update_osd_memory_options()
4882{
4883 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4884 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4885 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4886 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4887 config_changed++;
4888 dout(10) << __func__
4889 << " osd_memory_target " << osd_memory_target
4890 << " osd_memory_base " << osd_memory_base
4891 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4892 << " osd_memory_cache_min " << osd_memory_cache_min
4893 << dendl;
4894}
4895
4896int BlueStore::_set_cache_sizes()
4897{
4898 ceph_assert(bdev);
4899 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
4900 cache_autotune_interval =
4901 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4902 cache_age_bin_interval =
4903 cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
4904 auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
4905 {
4906 std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
4907 std::istringstream interval_stream(intervals_str);
4908 std::copy(
4909 std::istream_iterator<uint64_t>(interval_stream),
4910 std::istream_iterator<uint64_t>(),
4911 std::back_inserter(*intervals));
4912 };
4913 _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
4914 _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
4915 _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
4916 _set_bin("bluestore_cache_age_bins_data", &data_bins);
4917
4918 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4919 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4920 osd_memory_expected_fragmentation =
4921 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4922 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4923 osd_memory_cache_resize_interval =
4924 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
4925
4926 if (cct->_conf->bluestore_cache_size) {
4927 cache_size = cct->_conf->bluestore_cache_size;
4928 } else {
4929 // choose global cache size based on backend type
4930 if (_use_rotational_settings()) {
4931 cache_size = cct->_conf->bluestore_cache_size_hdd;
4932 } else {
4933 cache_size = cct->_conf->bluestore_cache_size_ssd;
4934 }
4935 }
4936
4937 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
4938 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
4939 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4940 << ") must be in range [0,1.0]" << dendl;
4941 return -EINVAL;
4942 }
4943
4944 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
4945 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
4946 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
4947 << ") must be in range [0,1.0]" << dendl;
4948 return -EINVAL;
4949 }
4950
4951 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4952 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4953 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4954 << ") must be in range [0,1.0]" << dendl;
4955 return -EINVAL;
4956 }
4957
4958 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
4959 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
4960 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4961 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4962 << dendl;
4963 return -EINVAL;
4964 }
4965
4966 cache_data_ratio = (double)1.0 -
4967 (double)cache_meta_ratio -
4968 (double)cache_kv_ratio -
4969 (double)cache_kv_onode_ratio;
4970 if (cache_data_ratio < 0) {
4971 // deal with floating point imprecision
4972 cache_data_ratio = 0;
4973 }
4974
4975 dout(1) << __func__ << " cache_size " << cache_size
4976 << " meta " << cache_meta_ratio
4977 << " kv " << cache_kv_ratio
4978 << " data " << cache_data_ratio
4979 << dendl;
4980 return 0;
4981}
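// Illustrative ratio math (example values, not necessarily the defaults):
// with cache_meta_ratio = 0.45, cache_kv_ratio = 0.45 and
// cache_kv_onode_ratio = 0.04, the remainder
// cache_data_ratio = 1.0 - 0.45 - 0.45 - 0.04 = 0.06 of cache_size is left
// for the data (buffer) cache; a slightly negative remainder caused by
// floating point rounding is clamped to 0 above.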
4982
4983int BlueStore::write_meta(const std::string& key, const std::string& value)
4984{
4985 bluestore_bdev_label_t label;
4986 string p = path + "/block";
4987 int r = _read_bdev_label(cct, p, &label);
4988 if (r < 0) {
4989 return ObjectStore::write_meta(key, value);
4990 }
4991 label.meta[key] = value;
4992 r = _write_bdev_label(cct, p, label);
4993 ceph_assert(r == 0);
4994 return ObjectStore::write_meta(key, value);
4995}
4996
4997int BlueStore::read_meta(const std::string& key, std::string *value)
4998{
4999 bluestore_bdev_label_t label;
5000 string p = path + "/block";
5001 int r = _read_bdev_label(cct, p, &label);
5002 if (r < 0) {
5003 return ObjectStore::read_meta(key, value);
5004 }
5005 auto i = label.meta.find(key);
5006 if (i == label.meta.end()) {
5007 return ObjectStore::read_meta(key, value);
5008 }
5009 *value = i->second;
5010 return 0;
5011}
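// write_meta()/read_meta() above keep the key/value pairs in two places:
// the block device label (so the values can also be recovered straight
// from the raw device) and the regular ObjectStore meta files.  Reads
// prefer the label and fall back to ObjectStore::read_meta() when the
// label is missing or does not contain the key.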
5012
5013void BlueStore::_init_logger()
5014{
5015 PerfCountersBuilder b(cct, "bluestore",
5016 l_bluestore_first, l_bluestore_last);
5017
5018 // space utilization stats
5019 //****************************************
5020 b.add_u64(l_bluestore_allocated, "allocated",
5021 "Sum for allocated bytes",
5022 "al_b",
5023 PerfCountersBuilder::PRIO_CRITICAL,
5024 unit_t(UNIT_BYTES));
5025 b.add_u64(l_bluestore_stored, "stored",
5026 "Sum for stored bytes",
5027 "st_b",
5028 PerfCountersBuilder::PRIO_CRITICAL,
5029 unit_t(UNIT_BYTES));
5030 b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
5031 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
5032 b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
5033 "allocation unit size in bytes",
5034 "au_b",
5035 PerfCountersBuilder::PRIO_CRITICAL,
5036 unit_t(UNIT_BYTES));
5037 //****************************************
5038
5039 // Update op processing state latencies
5040 //****************************************
5041 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
5042 "Average prepare state latency",
5043 "sprl", PerfCountersBuilder::PRIO_USEFUL);
5044 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
5045 "Average aio_wait state latency",
5046 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
5047 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
5048 "Average io_done state latency",
5049 "sidl", PerfCountersBuilder::PRIO_USEFUL);
5050 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
5051 "Average kv_queued state latency",
5052 "skql", PerfCountersBuilder::PRIO_USEFUL);
5053 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
5054 "Average kv_commiting state latency",
5055 "skcl", PerfCountersBuilder::PRIO_USEFUL);
5056 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
5057 "Average kv_done state latency",
5058 "skdl", PerfCountersBuilder::PRIO_USEFUL);
5059 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
5060 "Average finishing state latency",
5061 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
5062 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
5063 "Average done state latency",
5064 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
5065 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
5066 "Average deferred_queued state latency",
5067 "sdql", PerfCountersBuilder::PRIO_USEFUL);
5068 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
5069 "Average aio_wait state latency",
5070 "sdal", PerfCountersBuilder::PRIO_USEFUL);
5071 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
5072 "Average cleanup state latency",
5073 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
5074 //****************************************
5075
5076 // Update Transaction stats
5077 //****************************************
5078 b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
5079 "Average submit throttle latency",
5080 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
5081 b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
5082 "Average submit latency",
5083 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
5084 b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
5085 "Average commit latency",
5086 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
5087 b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
5088 //****************************************
5089
5090 // Read op stats
5091 //****************************************
5092 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
5093 "Average read onode metadata latency",
5094 "roml", PerfCountersBuilder::PRIO_USEFUL);
5095 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
5096 "Average read I/O waiting latency",
5097 "rwal", PerfCountersBuilder::PRIO_USEFUL);
5098 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
5099 "Average checksum latency",
5100 "csml", PerfCountersBuilder::PRIO_USEFUL);
5101 b.add_u64_counter(l_bluestore_read_eio, "read_eio",
5102 "Read EIO errors propagated to high level callers");
5103 b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
5104 "Read operations that required at least one retry due to failed checksum validation",
5105 "rd_r", PerfCountersBuilder::PRIO_USEFUL);
5106 b.add_time_avg(l_bluestore_read_lat, "read_lat",
5107 "Average read latency",
5108 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
5109 //****************************************
5110
5111 // kv_thread latencies
5112 //****************************************
5113 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
5114 "Average kv_thread flush latency",
5115 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
5116 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
5117 "Average kv_thread commit latency",
5118 "kcol", PerfCountersBuilder::PRIO_USEFUL);
5119 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
5120 "Average kv_sync thread latency",
5121 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
5122 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
5123 "Average kv_finalize thread latency",
5124 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
5125 //****************************************
5126
5127 // write op stats
5128 //****************************************
5129 b.add_u64_counter(l_bluestore_write_big, "write_big",
5130 "Large aligned writes into fresh blobs");
5131 b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
5132 "Large aligned writes into fresh blobs (bytes)",
5133 NULL,
5134 PerfCountersBuilder::PRIO_DEBUGONLY,
5135 unit_t(UNIT_BYTES));
5136 b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
5137 "Large aligned writes into fresh blobs (blobs)");
5138 b.add_u64_counter(l_bluestore_write_big_deferred,
5139 "write_big_deferred",
5140 "Big overwrites using deferred");
5141
5142 b.add_u64_counter(l_bluestore_write_small, "write_small",
5143 "Small writes into existing or sparse small blobs");
5144 b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
5145 "Small writes into existing or sparse small blobs (bytes)",
5146 NULL,
5147 PerfCountersBuilder::PRIO_DEBUGONLY,
5148 unit_t(UNIT_BYTES));
5149 b.add_u64_counter(l_bluestore_write_small_unused,
5150 "write_small_unused",
5151 "Small writes into unused portion of existing blob");
5152 b.add_u64_counter(l_bluestore_write_small_pre_read,
5153 "write_small_pre_read",
5154 "Small writes that required we read some data (possibly "
5155 "cached) to fill out the block");
5156
5157 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
5158 "Sum for write-op padded bytes",
5159 NULL,
5160 PerfCountersBuilder::PRIO_DEBUGONLY,
5161 unit_t(UNIT_BYTES));
5162 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
5163 "Sum for write penalty read ops");
5164 b.add_u64_counter(l_bluestore_write_new, "write_new",
5165 "Write into new blob");
5166
5167 b.add_u64_counter(l_bluestore_issued_deferred_writes,
5168 "issued_deferred_writes",
5169 "Total deferred writes issued");
5170 b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
5171 "issued_deferred_write_bytes",
5172 "Total bytes in issued deferred writes",
5173 NULL,
5174 PerfCountersBuilder::PRIO_DEBUGONLY,
5175 unit_t(UNIT_BYTES));
5176 b.add_u64_counter(l_bluestore_submitted_deferred_writes,
5177 "submitted_deferred_writes",
5178 "Total deferred writes submitted to disk");
5179 b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
5180 "submitted_deferred_write_bytes",
5181 "Total bytes submitted to disk by deferred writes",
5182 NULL,
5183 PerfCountersBuilder::PRIO_DEBUGONLY,
5184 unit_t(UNIT_BYTES));
5185
5186 b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
5187 "write_big_skipped_blobs",
5188 "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
5189 b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
5190 "write_big_skipped_bytes",
5191 "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
5192 b.add_u64_counter(l_bluestore_write_small_skipped,
5193 "write_small_skipped",
5194 "Small writes into existing or sparse small blobs skipped due to zero detection");
5195 b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
5196 "write_small_skipped_bytes",
5197 "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
5198 //****************************************
5199
5200 // compression stats
5201 //****************************************
5202 b.add_u64(l_bluestore_compressed, "compressed",
5203 "Sum for stored compressed bytes",
5204 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5205 b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
5206 "Sum for bytes allocated for compressed data",
5207 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5208 b.add_u64(l_bluestore_compressed_original, "compressed_original",
5209 "Sum for original bytes that were compressed",
5210 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5211 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
5212 "Average compress latency",
5213 "_cpl", PerfCountersBuilder::PRIO_USEFUL);
5214 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
5215 "Average decompress latency",
5216 "dcpl", PerfCountersBuilder::PRIO_USEFUL);
5217 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
5218 "Sum for beneficial compress ops");
5219 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
5220 "Sum for compress ops rejected due to low net gain of space");
5221 //****************************************
5222
5223 // onode cache stats
5224 //****************************************
5225 b.add_u64(l_bluestore_onodes, "onodes",
5226 "Number of onodes in cache");
5227 b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
5228 "Number of pinned onodes in cache");
5229 b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
5230 "Count of onode cache lookup hits",
5231 "o_ht", PerfCountersBuilder::PRIO_USEFUL);
5232 b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
5233 "Count of onode cache lookup misses",
5234 "o_ms", PerfCountersBuilder::PRIO_USEFUL);
5235 b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
5236 "Count of onode shard cache lookups hits");
5237 b.add_u64_counter(l_bluestore_onode_shard_misses,
5238 "onode_shard_misses",
5239 "Count of onode shard cache lookups misses");
5240 b.add_u64(l_bluestore_extents, "onode_extents",
5241 "Number of extents in cache");
5242 b.add_u64(l_bluestore_blobs, "onode_blobs",
5243 "Number of blobs in cache");
5244 //****************************************
5245
5246 // buffer cache stats
5247 //****************************************
5248 b.add_u64(l_bluestore_buffers, "buffers",
5249 "Number of buffers in cache");
5250 b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
5251 "Number of buffer bytes in cache",
5252 NULL,
5253 PerfCountersBuilder::PRIO_DEBUGONLY,
5254 unit_t(UNIT_BYTES));
5255 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
5256 "Sum for bytes of read hit in the cache",
5257 NULL,
5258 PerfCountersBuilder::PRIO_DEBUGONLY,
5259 unit_t(UNIT_BYTES));
5260 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
5261 "Sum for bytes of read missed in the cache",
5262 NULL,
5263 PerfCountersBuilder::PRIO_DEBUGONLY,
5264 unit_t(UNIT_BYTES));
5265 //****************************************
5266
5267 // internal stats
5268 //****************************************
5269 b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
5270 "Onode extent map reshard events");
5271 b.add_u64_counter(l_bluestore_blob_split, "blob_split",
5272 "Sum for blob splitting due to resharding");
5273 b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
5274 "Sum for extents that have been removed due to compression");
5275 b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
5276 "Sum for extents that have been merged due to garbage "
5277 "collection");
5278 //****************************************
5279 // misc
5280 //****************************************
5281 b.add_u64_counter(l_bluestore_omap_iterator_count, "omap_iterator_count",
5282 "Open omap iterators count");
5283 b.add_u64_counter(l_bluestore_omap_rmkeys_count, "omap_rmkeys_count",
5284 "amount of omap keys removed via rmkeys");
5285 b.add_u64_counter(l_bluestore_omap_rmkey_ranges_count, "omap_rmkey_range_count",
5286 "amount of omap key ranges removed via rmkeys");
5287 //****************************************
5288 // other client ops latencies
5289 //****************************************
5290 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
5291 "Average omap iterator seek_to_first call latency",
5292 "osfl", PerfCountersBuilder::PRIO_USEFUL);
5293 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
5294 "Average omap iterator upper_bound call latency",
5295 "oubl", PerfCountersBuilder::PRIO_USEFUL);
5296 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
5297 "Average omap iterator lower_bound call latency",
5298 "olbl", PerfCountersBuilder::PRIO_USEFUL);
5299 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
5300 "Average omap iterator next call latency",
5301 "onxl", PerfCountersBuilder::PRIO_USEFUL);
5302 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
5303 "Average omap get_keys call latency",
5304 "ogkl", PerfCountersBuilder::PRIO_USEFUL);
5305 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
5306 "Average omap get_values call latency",
5307 "ogvl", PerfCountersBuilder::PRIO_USEFUL);
5308 b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
5309 "Average omap clear call latency");
5310 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
5311 "Average collection listing latency",
5312 "cl_l", PerfCountersBuilder::PRIO_USEFUL);
5313 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
5314 "Average removal latency",
5315 "rm_l", PerfCountersBuilder::PRIO_USEFUL);
5316 b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
5317 "Average truncate latency",
5318 "tr_l", PerfCountersBuilder::PRIO_USEFUL);
5319 //****************************************
5320
5321 // Given (resulting) size axis configuration for the allocation histogram, values are in bytes
5322 PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
5323 "Given size (bytes)",
5324 PerfHistogramCommon::SCALE_LOG2, ///< Given size in logarithmic scale
5325 0, ///< Start at 0
5326 4096, ///< Quantization unit
5327 13, ///< Enough to cover 4+M requests
5328 };
5329 // Requested size axis configuration for the allocation histogram, values are in bytes
5330 PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
5331 "Request size (bytes)",
5332 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
5333 0, ///< Start at 0
5334 4096, ///< Quantization unit
5335 13, ///< Enough to cover 4+M requests
5336 };
5337 b.add_u64_counter_histogram(
5338 l_bluestore_allocate_hist, "allocate_histogram",
5339 alloc_hist_x_axis_config, alloc_hist_y_axis_config,
5340 "Histogram of requested block allocations vs. given ones");
5341
5342 logger = b.create_perf_counters();
5343 cct->get_perfcounters_collection()->add(logger);
5344}
5345
5346int BlueStore::_reload_logger()
5347{
5348 struct store_statfs_t store_statfs;
5349 int r = statfs(&store_statfs);
5350 if (r >= 0) {
5351 logger->set(l_bluestore_allocated, store_statfs.allocated);
5352 logger->set(l_bluestore_stored, store_statfs.data_stored);
5353 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5354 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5355 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
5356 }
5357 return r;
5358}
5359
5360void BlueStore::_shutdown_logger()
5361{
5362 cct->get_perfcounters_collection()->remove(logger);
5363 delete logger;
5364}
5365
5366int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5367 uuid_d *fsid)
5368{
5369 bluestore_bdev_label_t label;
5370 int r = _read_bdev_label(cct, path, &label);
5371 if (r < 0)
5372 return r;
5373 *fsid = label.osd_uuid;
5374 return 0;
5375}
5376
5377int BlueStore::_open_path()
5378{
5379 // sanity check(s)
5380 ceph_assert(path_fd < 0);
5381 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
5382 if (path_fd < 0) {
5383 int r = -errno;
5384 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5385 << dendl;
5386 return r;
5387 }
5388 return 0;
5389}
5390
5391void BlueStore::_close_path()
5392{
5393 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5394 path_fd = -1;
5395}
5396
5397int BlueStore::_write_bdev_label(CephContext *cct,
5398 const string &path, bluestore_bdev_label_t label)
5399{
5400 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5401 bufferlist bl;
5402 encode(label, bl);
5403 uint32_t crc = bl.crc32c(-1);
5404 encode(crc, bl);
5405 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
5406 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5407 z.zero();
5408 bl.append(std::move(z));
5409
5410 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT));
5411 if (fd < 0) {
5412 fd = -errno;
5413 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5414 << dendl;
5415 return fd;
5416 }
5417 bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX);
5418 int r = bl.write_fd(fd);
5419 if (r < 0) {
5420 derr << __func__ << " failed to write to " << path
5421 << ": " << cpp_strerror(r) << dendl;
5422 goto out;
5423 }
5424 r = ::fsync(fd);
5425 if (r < 0) {
5426 derr << __func__ << " failed to fsync " << path
5427 << ": " << cpp_strerror(r) << dendl;
5428 }
5429out:
5430 VOID_TEMP_FAILURE_RETRY(::close(fd));
5431 return r;
5432}
5433
5434int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
5435 bluestore_bdev_label_t *label)
5436{
5437 dout(10) << __func__ << dendl;
5438 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5439 if (fd < 0) {
5440 fd = -errno;
5441 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5442 << dendl;
5443 return fd;
5444 }
5445 bufferlist bl;
5446 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5447 VOID_TEMP_FAILURE_RETRY(::close(fd));
5448 if (r < 0) {
5449 derr << __func__ << " failed to read from " << path
5450 << ": " << cpp_strerror(r) << dendl;
5451 return r;
5452 }
5453
5454 uint32_t crc, expected_crc;
5455 auto p = bl.cbegin();
5456 try {
5457 decode(*label, p);
5458 bufferlist t;
5459 t.substr_of(bl, 0, p.get_off());
5460 crc = t.crc32c(-1);
5461 decode(expected_crc, p);
5462 }
5463 catch (ceph::buffer::error& e) {
5464 derr << __func__ << " unable to decode label at offset " << p.get_off()
5465 << ": " << e.what()
5466 << dendl;
5467 return -ENOENT;
5468 }
5469 if (crc != expected_crc) {
5470 derr << __func__ << " bad crc on label, expected " << expected_crc
5471 << " != actual " << crc << dendl;
5472 return -EIO;
5473 }
5474 dout(10) << __func__ << " got " << *label << dendl;
5475 return 0;
5476}
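
// For reference (derived from the two helpers above): the first
// BDEV_LABEL_BLOCK_SIZE bytes of a labeled device hold
//
//   [ encoded bluestore_bdev_label_t | u32 crc32c (seed -1) of the encoded label | zero padding ]
//
// _read_bdev_label() recomputes the crc over exactly the decoded span
// (p.get_off() bytes) and compares it with the stored value before trusting
// the label contents.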
5477
5478int BlueStore::_check_or_set_bdev_label(
5479 string path, uint64_t size, string desc, bool create)
5480{
5481 bluestore_bdev_label_t label;
5482 if (create) {
5483 label.osd_uuid = fsid;
5484 label.size = size;
5485 label.btime = ceph_clock_now();
5486 label.description = desc;
5487 int r = _write_bdev_label(cct, path, label);
5488 if (r < 0)
5489 return r;
5490 } else {
5491 int r = _read_bdev_label(cct, path, &label);
5492 if (r < 0)
5493 return r;
5494 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5495 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5496 << " and fsid " << fsid << " check bypassed" << dendl;
5497 } else if (label.osd_uuid != fsid) {
5498 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5499 << " does not match our fsid " << fsid << dendl;
5500 return -EIO;
5501 }
5502 }
5503 return 0;
5504}
5505
5506void BlueStore::_set_alloc_sizes(void)
5507{
5508 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5509
5510#ifdef HAVE_LIBZBD
5511 ceph_assert(bdev);
5512 if (bdev->is_smr()) {
5513 prefer_deferred_size = 0;
5514 } else
5515#endif
5516 if (cct->_conf->bluestore_prefer_deferred_size) {
5517 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5518 } else {
5519 if (_use_rotational_settings()) {
5520 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5521 } else {
5522 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5523 }
5524 }
5525
5526 if (cct->_conf->bluestore_deferred_batch_ops) {
5527 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5528 } else {
5529 if (_use_rotational_settings()) {
5530 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5531 } else {
5532 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5533 }
5534 }
5535
5536 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5537 << std::dec << " order " << (int)min_alloc_size_order
5538 << " max_alloc_size 0x" << std::hex << max_alloc_size
5539 << " prefer_deferred_size 0x" << prefer_deferred_size
5540 << std::dec
5541 << " deferred_batch_ops " << deferred_batch_ops
5542 << dendl;
5543}
5544
5545int BlueStore::_open_bdev(bool create)
5546{
5547 ceph_assert(bdev == NULL);
5548 string p = path + "/block";
5549 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5550 int r = bdev->open(p);
5551 if (r < 0)
5552 goto fail;
5553
5554 if (create && cct->_conf->bdev_enable_discard) {
5555 interval_set<uint64_t> whole_device;
5556 whole_device.insert(0, bdev->get_size());
5557 bdev->try_discard(whole_device, false);
5558 }
5559
5560 if (bdev->supported_bdev_label()) {
5561 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5562 if (r < 0)
5563 goto fail_close;
5564 }
5565
5566 // initialize global block parameters
5567 block_size = bdev->get_block_size();
5568 block_mask = ~(block_size - 1);
5569 block_size_order = std::countr_zero(block_size);
5570 ceph_assert(block_size == 1u << block_size_order);
5571 _set_max_defer_interval();
5572 // and set cache_size based on device type
5573 r = _set_cache_sizes();
5574 if (r < 0) {
5575 goto fail_close;
5576 }
5577 // get block dev optimal io size
5578 optimal_io_size = bdev->get_optimal_io_size();
5579
5580 return 0;
5581
5582 fail_close:
5583 bdev->close();
5584 fail:
5585 delete bdev;
5586 bdev = NULL;
5587 return r;
5588}
5589
5590void BlueStore::_validate_bdev()
5591{
5592 ceph_assert(bdev);
5593 uint64_t dev_size = bdev->get_size();
5594 ceph_assert(dev_size > _get_ondisk_reserved());
5595}
5596
5597void BlueStore::_close_bdev()
5598{
5599 ceph_assert(bdev);
5600 bdev->close();
5601 delete bdev;
5602 bdev = NULL;
5603}
5604
5605int BlueStore::_open_fm(KeyValueDB::Transaction t,
5606 bool read_only,
5607 bool db_avail,
5608 bool fm_restore)
5609{
5610 int r;
5611
5612 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
5613 ceph_assert(fm == NULL);
5614 // fm_restore means we are transitioning from null-fm to bitmap-fm
5615 ceph_assert(!fm_restore || (freelist_type != "null"));
5616 // fm restore must pass in a valid transaction
5617 ceph_assert(!fm_restore || (t != nullptr));
5618
5619 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
5620 bool can_have_null_fm = !is_db_rotational() &&
5621 !read_only &&
5622 db_avail &&
5623 cct->_conf->bluestore_allocation_from_file &&
5624 !bdev->is_smr();
5625
5626 // When allocation-info is stored in a single file we set freelist_type to "null"
5627 if (can_have_null_fm) {
5628 freelist_type = "null";
5629 need_to_destage_allocation_file = true;
5630 }
5631 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5632 ceph_assert(fm);
5633 if (t) {
5634 // create mode. initialize freespace
5635 dout(20) << __func__ << " initializing freespace" << dendl;
5636 {
5637 bufferlist bl;
5638 bl.append(freelist_type);
5639 t->set(PREFIX_SUPER, "freelist_type", bl);
5640 }
5641 // being able to allocate in units less than bdev block size
5642 // seems to be a bad idea.
5643 ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
5644
5645 uint64_t alloc_size = min_alloc_size;
5646 if (bdev->is_smr() && freelist_type != "zoned") {
5647 derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
5648 << dendl;
5649 return -EINVAL;
5650 }
5651 if (!bdev->is_smr() && freelist_type == "zoned") {
5652 derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
5653 << dendl;
5654 return -EINVAL;
5655 }
5656
5657 fm->create(bdev->get_size(), alloc_size,
5658 zone_size, first_sequential_zone,
5659 t);
5660
5661 // allocate superblock reserved space. note that we do not mark
5662 // bluefs space as allocated in the freelist; we instead rely on
5663 // bluefs doing that itself.
5664 auto reserved = _get_ondisk_reserved();
5665 if (fm_restore) {
5666 // we need to allocate the full space in restore case
5667 // as later we will add free-space marked in the allocator file
5668 fm->allocate(0, bdev->get_size(), t);
5669 } else {
5673 fm->allocate(0, reserved, t);
5674 }
5675 // debug code - not needed for NULL FM
5676 if (cct->_conf->bluestore_debug_prefill > 0) {
5677 uint64_t end = bdev->get_size() - reserved;
5678 dout(1) << __func__ << " pre-fragmenting freespace, using "
5679 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5680 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5681 uint64_t start = p2roundup(reserved, min_alloc_size);
5682 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5683 float r = cct->_conf->bluestore_debug_prefill;
5684 r /= 1.0 - r;
5685 bool stop = false;
5686
5687 while (!stop && start < end) {
5688 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5689 if (start + l > end) {
5690 l = end - start;
5691 l = p2align(l, min_alloc_size);
5692 }
5693 ceph_assert(start + l <= end);
5694
5695 uint64_t u = 1 + (uint64_t)(r * (double)l);
5696 u = p2roundup(u, min_alloc_size);
5697 if (start + l + u > end) {
5698 u = end - (start + l);
5699 // trim to align so we don't overflow again
5700 u = p2align(u, min_alloc_size);
5701 stop = true;
5702 }
5703 ceph_assert(start + l + u <= end);
5704
5705 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5706 << " use 0x" << u << std::dec << dendl;
5707
5708 if (u == 0) {
5709 // break if u has been trimmed to nothing
5710 break;
5711 }
5712
5713 fm->allocate(start + l, u, t);
5714 start += l + u;
5715 }
5716 }
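// Worked example for the pre-fill math above (illustrative numbers):
// with bluestore_debug_prefill = 0.2 the loop uses
// r = 0.2 / (1 - 0.2) = 0.25, so for each free extent of length l about
// u = 0.25 * l is marked allocated, keeping used / (used + free) close
// to the requested 20%.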
5717 r = _write_out_fm_meta(0);
5718 ceph_assert(r == 0);
5719 } else {
5720 if (can_have_null_fm) {
5721 commit_to_null_manager();
5722 }
5723 r = fm->init(db, read_only,
5724 [&](const std::string& key, std::string* result) {
5725 return read_meta(key, result);
5726 });
5727 if (r < 0) {
5728 derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
5729 delete fm;
5730 fm = NULL;
5731 return r;
5732 }
5733 }
5734 // if the space size tracked by the freelist manager is higher than the
5735 // actual device size, one can hit out-of-space allocations which will
5736 // result in data loss and/or assertions.
5737 // Probably the user altered the device size somehow.
5738 // The only fix for now is to redeploy the OSD.
5739 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5740 ostringstream ss;
5741 ss << "slow device size mismatch detected, "
5742 << " fm size(" << fm->get_size()
5743 << ") > slow device size(" << bdev->get_size()
5744 << "), Please stop using this OSD as it might cause data loss.";
5745 _set_disk_size_mismatch_alert(ss.str());
5746 }
5747 return 0;
5748}
5749
5750void BlueStore::_close_fm()
5751{
5752 dout(10) << __func__ << dendl;
5753 ceph_assert(fm);
5754 fm->shutdown();
5755 delete fm;
5756 fm = NULL;
5757}
5758
5759int BlueStore::_write_out_fm_meta(uint64_t target_size)
5760{
5761 int r = 0;
5762 string p = path + "/block";
5763
5764 std::vector<std::pair<string, string>> fm_meta;
5765 fm->get_meta(target_size, &fm_meta);
5766
5767 for (auto& m : fm_meta) {
5768 r = write_meta(m.first, m.second);
5769 ceph_assert(r == 0);
5770 }
5771 return r;
5772}
5773
5774int BlueStore::_create_alloc()
5775{
5776 ceph_assert(alloc == NULL);
5777 ceph_assert(shared_alloc.a == NULL);
5778 ceph_assert(bdev->get_size());
5779
5780 uint64_t alloc_size = min_alloc_size;
5781
5782 std::string allocator_type = cct->_conf->bluestore_allocator;
5783
5784#ifdef HAVE_LIBZBD
5785 if (freelist_type == "zoned") {
5786 allocator_type = "zoned";
5787 }
5788#endif
5789
5790 alloc = Allocator::create(
5791 cct, allocator_type,
5792 bdev->get_size(),
5793 alloc_size,
5794 zone_size,
5795 first_sequential_zone,
5796 "block");
5797 if (!alloc) {
5798 lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
5799 << dendl;
5800 return -EINVAL;
5801 }
5802
5803#ifdef HAVE_LIBZBD
5804 if (freelist_type == "zoned") {
5805 Allocator *a = Allocator::create(
5806 cct, cct->_conf->bluestore_allocator,
5807 bdev->get_conventional_region_size(),
5808 alloc_size,
5809 0, 0,
5810 "zoned_block");
5811 if (!a) {
5812 lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
5813 << " allocator" << dendl;
5814 delete alloc;
5815 return -EINVAL;
5816 }
5817 shared_alloc.set(a, alloc_size);
5818 } else
5819#endif
5820 {
5821 // BlueFS will share the same allocator
5822 shared_alloc.set(alloc, alloc_size);
5823 }
5824
5825 return 0;
5826}
5827
5828int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
5829{
5830 int r = _create_alloc();
5831 if (r < 0) {
5832 return r;
5833 }
5834 ceph_assert(alloc != NULL);
5835
5836#ifdef HAVE_LIBZBD
5837 if (bdev->is_smr()) {
5838 auto a = dynamic_cast<ZonedAllocator*>(alloc);
5839 ceph_assert(a);
5840 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5841 ceph_assert(f);
5842 vector<uint64_t> wp = bdev->get_zones();
5843 vector<zone_state_t> zones = f->get_zone_states(db);
5844 ceph_assert(wp.size() == zones.size());
5845
5846 // reconcile zone state
5847 auto num_zones = bdev->get_size() / zone_size;
5848 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
5849 ceph_assert(wp[i] >= i * zone_size);
5850 ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
5851 uint64_t p = wp[i] - i * zone_size;
5852 if (zones[i].write_pointer > p) {
5853 derr << __func__ << " zone 0x" << std::hex << i
5854 << " bluestore write pointer 0x" << zones[i].write_pointer
5855 << " > device write pointer 0x" << p
5856 << std::dec << " -- VERY SUSPICIOUS!" << dendl;
5857 } else if (zones[i].write_pointer < p) {
5858 // this is "normal" in that it can happen after any crash (if we have a
5859 // write in flight but did not manage to commit the transaction)
5860 auto delta = p - zones[i].write_pointer;
5861 dout(1) << __func__ << " zone 0x" << std::hex << i
5862 << " device write pointer 0x" << p
5863 << " > bluestore pointer 0x" << zones[i].write_pointer
5864 << ", advancing 0x" << delta << std::dec << dendl;
5865 (*zone_adjustments)[zones[i].write_pointer] = delta;
5866 zones[i].num_dead_bytes += delta;
5867 zones[i].write_pointer = p;
5868 }
5869 }
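// Worked example for the reconciliation above (illustrative numbers): with
// zone_size = 256 MiB, if the device reports its write pointer 4 MiB into
// zone i (p = 4 MiB) while bluestore recorded write_pointer = 3 MiB, the
// 1 MiB gap is treated as dead bytes, remembered in *zone_adjustments for
// _post_init_alloc(), and the in-memory pointer is advanced to match the
// device.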
5870
5871 // start with conventional zone "free" (bluefs may adjust this when it starts up)
5872 auto reserved = _get_ondisk_reserved();
5873 // for now we require a conventional zone
5874 ceph_assert(bdev->get_conventional_region_size());
5875 ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
5876 shared_alloc.a->init_add_free(
5877 reserved,
5878 p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
5879
5880 // init sequential zone based on the device's write pointers
5881 a->init_from_zone_pointers(std::move(zones));
5882 dout(1) << __func__
5883 << " loaded zone pointers: "
5884 << std::hex
5885 << ", allocator type " << alloc->get_type()
5886 << ", capacity 0x" << alloc->get_capacity()
5887 << ", block size 0x" << alloc->get_block_size()
5888 << ", free 0x" << alloc->get_free()
5889 << ", fragmentation " << alloc->get_fragmentation()
5890 << std::dec << dendl;
5891
5892 return 0;
5893 }
5894#endif
5895
5896 uint64_t num = 0, bytes = 0;
5897 utime_t start_time = ceph_clock_now();
5898 if (!fm->is_null_manager()) {
5899 // This is the original path - loading the allocation map from RocksDB and feeding it into the allocator
5900 dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
5901 // initialize from freelist
5902 fm->enumerate_reset();
5903 uint64_t offset, length;
5904 while (fm->enumerate_next(db, &offset, &length)) {
5905 alloc->init_add_free(offset, length);
5906 ++num;
5907 bytes += length;
5908 }
5909 fm->enumerate_reset();
5910
5911 utime_t duration = ceph_clock_now() - start_time;
5912 dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
5913 alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
5914 } else {
5915 // This is the new path - reading the allocation map from a flat bluefs file and feeding it into the allocator
5916
5917 if (!cct->_conf->bluestore_allocation_from_file) {
5918 derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
5919 derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
5920 return -ENOTSUP; // Operation not supported
5921 }
5922 if (restore_allocator(alloc, &num, &bytes) == 0) {
5923 dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
5924 } else {
5925 // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
5926 dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
5927 // if failed must recover from on-disk ONode internal state
5928 if (read_allocation_from_drive_on_startup() != 0) {
5929 derr << __func__ << "::NCB::Failed Recovery" << dendl;
5930 derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
5931 derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
5932 return -ENOTRECOVERABLE;
5933 }
5934 }
5935 }
5936 dout(1) << __func__
5937 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5938 << std::hex
5939 << ", allocator type " << alloc->get_type()
5940 << ", capacity 0x" << alloc->get_capacity()
5941 << ", block size 0x" << alloc->get_block_size()
5942 << ", free 0x" << alloc->get_free()
5943 << ", fragmentation " << alloc->get_fragmentation()
5944 << std::dec << dendl;
5945
5946 return 0;
5947}
5948
5949void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
5950{
5951 int r = 0;
5952#ifdef HAVE_LIBZBD
5953 if (bdev->is_smr()) {
5954 if (zone_adjustments.empty()) {
5955 return;
5956 }
5957 dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
5958 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5959 ceph_assert(f);
5960 KeyValueDB::Transaction t = db->get_transaction();
5961 for (auto& i : zone_adjustments) {
5962 // allocate AND release since this gap is now dead space
5963 // note that the offset is imprecise, but we only need it to select the zone
5964 f->allocate(i.first, i.second, t);
5965 f->release(i.first, i.second, t);
5966 }
5967 r = db->submit_transaction_sync(t);
5968 } else
5969#endif
5970 if (fm->is_null_manager()) {
5971 // Now that we have loaded the allocation map we need to invalidate the file, as new allocations won't be reflected in it.
5972 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount().
5973 // This means that we must not use the existing file after a failure (unplanned shutdown) and must resort
5974 // to recovery from RocksDB::ONodes instead.
5975 r = invalidate_allocation_file_on_bluefs();
5976 }
5977 ceph_assert(r >= 0);
5978}
5979
5980void BlueStore::_close_alloc()
5981{
5982 ceph_assert(bdev);
5983 bdev->discard_drain();
5984
5985 ceph_assert(alloc);
5986 alloc->shutdown();
5987 delete alloc;
5988
5989 ceph_assert(shared_alloc.a);
5990 if (alloc != shared_alloc.a) {
5991 shared_alloc.a->shutdown();
5992 delete shared_alloc.a;
5993 }
5994
5995 shared_alloc.reset();
5996 alloc = nullptr;
5997}
5998
5999int BlueStore::_open_fsid(bool create)
6000{
6001 ceph_assert(fsid_fd < 0);
6002 int flags = O_RDWR|O_CLOEXEC;
6003 if (create)
6004 flags |= O_CREAT;
6005 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
6006 if (fsid_fd < 0) {
6007 int err = -errno;
6008 derr << __func__ << " " << cpp_strerror(err) << dendl;
6009 return err;
6010 }
6011 return 0;
6012}
6013
6014int BlueStore::_read_fsid(uuid_d *uuid)
6015{
6016 char fsid_str[40];
6017 memset(fsid_str, 0, sizeof(fsid_str));
6018 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
6019 if (ret < 0) {
6020 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
6021 return ret;
6022 }
6023 if (ret > 36)
6024 fsid_str[36] = 0;
6025 else
6026 fsid_str[ret] = 0;
6027 if (!uuid->parse(fsid_str)) {
6028 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
6029 return -EINVAL;
6030 }
6031 return 0;
6032}
6033
6034int BlueStore::_write_fsid()
6035{
6036 int r = ::ftruncate(fsid_fd, 0);
6037 if (r < 0) {
6038 r = -errno;
6039 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
6040 return r;
6041 }
6042 string str = stringify(fsid) + "\n";
6043 r = safe_write(fsid_fd, str.c_str(), str.length());
6044 if (r < 0) {
6045 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
6046 return r;
6047 }
6048 r = ::fsync(fsid_fd);
6049 if (r < 0) {
6050 r = -errno;
6051 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
6052 return r;
6053 }
6054 return 0;
6055}
6056
6057void BlueStore::_close_fsid()
6058{
6059 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
6060 fsid_fd = -1;
6061}
6062
6063int BlueStore::_lock_fsid()
6064{
6065 struct flock l;
6066 memset(&l, 0, sizeof(l));
6067 l.l_type = F_WRLCK;
6068 l.l_whence = SEEK_SET;
6069 int r = ::fcntl(fsid_fd, F_SETLK, &l);
6070 if (r < 0) {
6071 int err = errno;
6072 derr << __func__ << " failed to lock " << path << "/fsid"
6073 << " (is another ceph-osd still running?)"
6074 << cpp_strerror(err) << dendl;
6075 return -err;
6076 }
6077 return 0;
6078}
6079
6080bool BlueStore::is_rotational()
6081{
6082 if (bdev) {
6083 return bdev->is_rotational();
6084 }
6085
6086 bool rotational = true;
6087 int r = _open_path();
6088 if (r < 0)
6089 goto out;
6090 r = _open_fsid(false);
6091 if (r < 0)
6092 goto out_path;
6093 r = _read_fsid(&fsid);
6094 if (r < 0)
6095 goto out_fsid;
6096 r = _lock_fsid();
6097 if (r < 0)
6098 goto out_fsid;
6099 r = _open_bdev(false);
6100 if (r < 0)
6101 goto out_fsid;
6102 rotational = bdev->is_rotational();
6103 _close_bdev();
6104 out_fsid:
6105 _close_fsid();
6106 out_path:
6107 _close_path();
6108 out:
6109 return rotational;
6110}
6111
6112bool BlueStore::is_journal_rotational()
6113{
6114 if (!bluefs) {
6115 dout(5) << __func__ << " bluefs disabled, default to store media type"
6116 << dendl;
6117 return is_rotational();
6118 }
6119 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
6120 return bluefs->wal_is_rotational();
6121}
6122
6123bool BlueStore::is_db_rotational()
6124{
6125 if (!bluefs) {
6126 dout(5) << __func__ << " bluefs disabled, default to store media type"
6127 << dendl;
6128 return is_rotational();
6129 }
6130 dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
6131 return bluefs->db_is_rotational();
6132}
6133
6134bool BlueStore::_use_rotational_settings()
6135{
6136 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
6137 return true;
6138 }
6139 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
6140 return false;
6141 }
6142 return bdev->is_rotational();
6143}
6144
6145bool BlueStore::is_statfs_recoverable() const
6146{
6147 // abuse fm for now
6148 return has_null_manager();
6149}
6150
6151bool BlueStore::test_mount_in_use()
6152{
6153 // most error conditions mean the mount is not in use (e.g., because
6154 // it doesn't exist). only if we fail to lock do we conclude it is
6155 // in use.
6156 bool ret = false;
6157 int r = _open_path();
6158 if (r < 0)
6159 return false;
6160 r = _open_fsid(false);
6161 if (r < 0)
6162 goto out_path;
6163 r = _lock_fsid();
6164 if (r < 0)
6165 ret = true; // if we can't lock, it is in use
6166 _close_fsid();
6167 out_path:
6168 _close_path();
6169 return ret;
6170}
6171
6172int BlueStore::_minimal_open_bluefs(bool create)
6173{
6174 int r;
6175 bluefs = new BlueFS(cct);
6176
6177 string bfn;
6178 struct stat st;
6179
6180 bfn = path + "/block.db";
6181 if (::stat(bfn.c_str(), &st) == 0) {
6182 r = bluefs->add_block_device(
6183 BlueFS::BDEV_DB, bfn,
6184 create && cct->_conf->bdev_enable_discard,
6185 SUPER_RESERVED);
6186 if (r < 0) {
6187 derr << __func__ << " add block device(" << bfn << ") returned: "
6188 << cpp_strerror(r) << dendl;
6189 goto free_bluefs;
6190 }
6191
6192 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
6193 r = _check_or_set_bdev_label(
6194 bfn,
6195 bluefs->get_block_device_size(BlueFS::BDEV_DB),
6196 "bluefs db", create);
6197 if (r < 0) {
6198 derr << __func__
6199 << " check block device(" << bfn << ") label returned: "
6200 << cpp_strerror(r) << dendl;
6201 goto free_bluefs;
6202 }
6203 }
6204 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6205 bluefs_layout.dedicated_db = true;
6206 } else {
6207 r = -errno;
6208 if (::lstat(bfn.c_str(), &st) == -1) {
6209 r = 0;
6210 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6211 } else {
6212 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6213 << cpp_strerror(r) << dendl;
6214 goto free_bluefs;
6215 }
6216 }
6217
6218 // shared device
6219 bfn = path + "/block";
6220 // never trim here
6221 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
6222 0, // no need to provide valid 'reserved' for shared dev
6223 &shared_alloc);
6224 if (r < 0) {
6225 derr << __func__ << " add block device(" << bfn << ") returned: "
6226 << cpp_strerror(r) << dendl;
6227 goto free_bluefs;
6228 }
6229
6230 bfn = path + "/block.wal";
6231 if (::stat(bfn.c_str(), &st) == 0) {
6232 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
6233 create && cct->_conf->bdev_enable_discard,
6234 BDEV_LABEL_BLOCK_SIZE);
6235 if (r < 0) {
6236 derr << __func__ << " add block device(" << bfn << ") returned: "
6237 << cpp_strerror(r) << dendl;
6238 goto free_bluefs;
6239 }
6240
6241 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
6242 r = _check_or_set_bdev_label(
6243 bfn,
6244 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
6245 "bluefs wal", create);
6246 if (r < 0) {
6247 derr << __func__ << " check block device(" << bfn
6248 << ") label returned: " << cpp_strerror(r) << dendl;
6249 goto free_bluefs;
6250 }
6251 }
6252
6253 bluefs_layout.dedicated_wal = true;
6254 } else {
6255 r = 0;
6256 if (::lstat(bfn.c_str(), &st) != -1) {
6257 r = -errno;
6258 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6259 << cpp_strerror(r) << dendl;
6260 goto free_bluefs;
6261 }
6262 }
6263 return 0;
6264
6265free_bluefs:
6266 ceph_assert(bluefs);
6267 delete bluefs;
6268 bluefs = NULL;
6269 return r;
6270}
6271
6272int BlueStore::_open_bluefs(bool create, bool read_only)
6273{
6274 int r = _minimal_open_bluefs(create);
6275 if (r < 0) {
6276 return r;
6277 }
6278 BlueFSVolumeSelector* vselector = nullptr;
6279 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ||
6280 cct->_conf->bluestore_volume_selection_policy == "use_some_extra_enforced" ||
6281 cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
6282
6283 string options = cct->_conf->bluestore_rocksdb_options;
6284 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6285 if (!options_annex.empty()) {
6286 if (!options.empty() &&
6287 *options.rbegin() != ',') {
6288 options += ',';
6289 }
6290 options += options_annex;
6291 }
6292
6293 rocksdb::Options rocks_opts;
6294 r = RocksDBStore::ParseOptionsFromStringStatic(
6295 cct,
6296 options,
6297 rocks_opts,
6298 nullptr);
6299 if (r < 0) {
6300 return r;
6301 }
6302 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
6303 vselector = new FitToFastVolumeSelector(
6304 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6305 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6306 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
6307 } else {
6308 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
6309 vselector =
6310 new RocksDBBlueFSVolumeSelector(
6311 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6312 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6313 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
6314 1024 * 1024 * 1024, //FIXME: set expected l0 size here
6315 rocks_opts.max_bytes_for_level_base,
6316 rocks_opts.max_bytes_for_level_multiplier,
6317 reserved_factor,
6318 cct->_conf->bluestore_volume_selection_reserved,
6319 cct->_conf->bluestore_volume_selection_policy.find("use_some_extra")
6320 == 0);
6321 }
6322 }
6323 if (create) {
6324 bluefs->mkfs(fsid, bluefs_layout);
6325 }
6326 bluefs->set_volume_selector(vselector);
6327 r = bluefs->mount();
6328 if (r < 0) {
6329 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
6330 }
6331 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
6332 return r;
6333}
6334
6335void BlueStore::_close_bluefs()
6336{
6337 bluefs->umount(db_was_opened_read_only);
6338 _minimal_close_bluefs();
6339}
6340
6341void BlueStore::_minimal_close_bluefs()
6342{
6343 delete bluefs;
6344 bluefs = NULL;
6345}
6346
6347int BlueStore::_is_bluefs(bool create, bool* ret)
6348{
6349 if (create) {
6350 *ret = cct->_conf->bluestore_bluefs;
6351 } else {
6352 string s;
6353 int r = read_meta("bluefs", &s);
6354 if (r < 0) {
6355 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
6356 return -EIO;
6357 }
6358 if (s == "1") {
6359 *ret = true;
6360 } else if (s == "0") {
6361 *ret = false;
6362 } else {
6363 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
6364 << dendl;
6365 return -EIO;
6366 }
6367 }
6368 return 0;
6369}
6370
6371/*
6372* opens both the DB and the dependent super_meta, FreelistManager and allocator
6373* in the proper order
6374*/
6375int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
6376{
6377 dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
6378 {
6379 string type;
6380 int r = read_meta("type", &type);
6381 if (r < 0) {
6382 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6383 << dendl;
6384 return r;
6385 }
6386
6387 if (type != "bluestore") {
6388 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6389 return -EIO;
6390 }
6391 }
6392
6393 // SMR devices may require a freelist adjustment, but that can only happen after
6394 // the db is read-write. we'll stash pending changes here.
6395 std::map<uint64_t, uint64_t> zone_adjustments;
6396
6397 int r = _open_path();
6398 if (r < 0)
6399 return r;
6400 r = _open_fsid(false);
6401 if (r < 0)
6402 goto out_path;
6403
6404 r = _read_fsid(&fsid);
6405 if (r < 0)
6406 goto out_fsid;
6407
6408 r = _lock_fsid();
6409 if (r < 0)
6410 goto out_fsid;
6411
6412 r = _open_bdev(false);
6413 if (r < 0)
6414 goto out_fsid;
6415
6416 // GBH: can probably skip the open_db step in read-only mode when operating in NULL-FM mode
6417 // (might still need to open it if we failed to restore from file)
6418
6419 // open in read-only first to read FM list and init allocator
6420 // as they might be needed for some BlueFS procedures
6421 r = _open_db(false, false, true);
6422 if (r < 0)
6423 goto out_bdev;
6424
6425 r = _open_super_meta();
6426 if (r < 0) {
6427 goto out_db;
6428 }
6429
6430 r = _open_fm(nullptr, true, false);
6431 if (r < 0)
6432 goto out_db;
6433
6434 r = _init_alloc(&zone_adjustments);
6435 if (r < 0)
6436 goto out_fm;
6437
6438 // Re-open in the proper mode(s).
6439
6440 // Can't simply bypass the second open in read-only mode, as we need to
6441 // load the allocated extents from bluefs into the allocator.
6442 // And now it's time to do that.
6443 //
6444 _close_db();
6445 r = _open_db(false, to_repair, read_only);
6446 if (r < 0) {
6447 goto out_alloc;
6448 }
6449
6450 if (!read_only) {
6451 _post_init_alloc(zone_adjustments);
6452 }
6453
6454 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
6455 // we can't change bluestore allocations, so there is no need to invalidate the allocation file
6456 if (fm->is_null_manager() && !read_only && !to_repair) {
6457 // Now that we have loaded the allocation map we need to invalidate the file, as new allocations won't be reflected in it.
6458 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount().
6459 // This means that we must not use the existing file after a failure (unplanned shutdown) and must resort
6460 // to recovery from RocksDB::ONodes instead.
6461 r = invalidate_allocation_file_on_bluefs();
6462 if (r != 0) {
6463 derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
6464 goto out_alloc;
6465 }
6466 }
6467
6468 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
6469 if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
6470#ifdef HAVE_LIBZBD
6471 && !bdev->is_smr()
6472#endif
6473 ) {
6474 dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
6475 commit_to_null_manager();
6476 need_to_destage_allocation_file = true;
6477 dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
6478 }
6479
6480 return 0;
6481
6482out_alloc:
6483 _close_alloc();
6484out_fm:
6485 _close_fm();
6486 out_db:
6487 _close_db();
6488 out_bdev:
6489 _close_bdev();
6490 out_fsid:
6491 _close_fsid();
6492 out_path:
6493 _close_path();
6494 return r;
6495}
6496
6497void BlueStore::_close_db_and_around()
6498{
6499 if (db) {
6500 _close_db();
6501 }
6502 _close_around_db();
6503}
6504
6505void BlueStore::_close_around_db()
6506{
6507 if (bluefs) {
6508 _close_bluefs();
6509 }
6510 _close_fm();
6511 _close_alloc();
6512 _close_bdev();
6513 _close_fsid();
6514 _close_path();
6515}
6516
6517int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
6518{
6519 _kv_only = true;
6520 int r = _open_db_and_around(false, to_repair);
6521 if (r == 0) {
6522 *pdb = db;
6523 } else {
6524 *pdb = nullptr;
6525 }
6526 return r;
6527}
6528
6529int BlueStore::close_db_environment()
6530{
6531 if (db) {
6532 delete db;
6533 db = nullptr;
6534 }
6535 _close_around_db();
6536 return 0;
6537}
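
// A minimal usage sketch (hypothetical offline caller, not part of this
// file): open just the KV environment, inspect it, and tear it down again
// via the pair of helpers above.
#if 0
static int example_probe_kv(BlueStore& store)
{
  KeyValueDB* kvdb = nullptr;
  int r = store.open_db_environment(&kvdb, /*to_repair=*/false);
  if (r < 0) {
    return r;
  }
  // ... read-only inspection of kvdb would go here ...
  return store.close_db_environment();
}
#endif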
6538
6539/* gets access to bluefs supporting RocksDB */
6540BlueFS* BlueStore::get_bluefs() {
6541 return bluefs;
6542}
6543
6544int BlueStore::_prepare_db_environment(bool create, bool read_only,
6545 std::string* _fn, std::string* _kv_backend)
6546{
6547 int r;
6548 ceph_assert(!db);
6549 std::string& fn=*_fn;
6550 std::string& kv_backend=*_kv_backend;
6551 fn = path + "/db";
6552 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
6553
6554 if (create) {
6555 kv_backend = cct->_conf->bluestore_kvbackend;
6556 } else {
6557 r = read_meta("kv_backend", &kv_backend);
6558 if (r < 0) {
6559 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
6560 return -EIO;
6561 }
6562 }
6563 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
6564
6565 bool do_bluefs;
6566 r = _is_bluefs(create, &do_bluefs);
6567 if (r < 0) {
6568 return r;
6569 }
6570 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
6571
6572 map<string,string> kv_options;
6573 // force separate wal dir for all new deployments.
6574 kv_options["separate_wal_dir"] = "1";
6575 rocksdb::Env *env = NULL;
6576 if (do_bluefs) {
6577 dout(10) << __func__ << " initializing bluefs" << dendl;
6578 if (kv_backend != "rocksdb") {
6579 derr << " backend must be rocksdb to use bluefs" << dendl;
6580 return -EINVAL;
6581 }
6582
6583 r = _open_bluefs(create, read_only);
6584 if (r < 0) {
6585 return r;
6586 }
6587
6588 if (cct->_conf->bluestore_bluefs_env_mirror) {
6589 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6590 rocksdb::Env* b = rocksdb::Env::Default();
6591 if (create) {
6592 string cmd = "rm -rf " + path + "/db " +
6593 path + "/db.slow " +
6594 path + "/db.wal";
6595 int r = system(cmd.c_str());
6596 (void)r;
6597 }
6598 env = new rocksdb::EnvMirror(b, a, false, true);
6599 } else {
6600 env = new BlueRocksEnv(bluefs);
6601
6602 // simplify the dir names, too, as "seen" by rocksdb
6603 fn = "db";
6604 }
6605 BlueFSVolumeSelector::paths paths;
6606 bluefs->get_vselector_paths(fn, paths);
6607
6608 {
6609 ostringstream db_paths;
6610 bool first = true;
6611 for (auto& p : paths) {
6612 if (!first) {
6613 db_paths << " ";
6614 }
6615 first = false;
6616 db_paths << p.first << "," << p.second;
6617
6618 }
6619 kv_options["db_paths"] = db_paths.str();
6620 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6621 }
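// For illustration (sizes are made up): with a dedicated DB device the
// resulting option typically looks like
//   db_paths = "db,64424509440 db.slow,1099511627776"
// i.e. space-separated "<path>,<bytes>" pairs as built above.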
6622
6623 if (create) {
6624 for (auto& p : paths) {
6625 env->CreateDir(p.first);
6626 }
6627 // Selectors don't provide a wal path so far, hence create it explicitly
6628 env->CreateDir(fn + ".wal");
6629 } else {
6630 std::vector<std::string> res;
6631 // check for dir presence
6632 auto r = env->GetChildren(fn+".wal", &res);
6633 if (r.IsNotFound()) {
6634 kv_options.erase("separate_wal_dir");
6635 }
6636 }
6637 } else {
6638 string walfn = path + "/db.wal";
6639
6640 if (create) {
6641 int r = ::mkdir(fn.c_str(), 0755);
6642 if (r < 0)
6643 r = -errno;
6644 if (r < 0 && r != -EEXIST) {
6645 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6646 << dendl;
6647 return r;
6648 }
6649
6650 // wal_dir, too!
6651 r = ::mkdir(walfn.c_str(), 0755);
6652 if (r < 0)
6653 r = -errno;
6654 if (r < 0 && r != -EEXIST) {
6655 derr << __func__ << " failed to create " << walfn
6656 << ": " << cpp_strerror(r)
6657 << dendl;
6658 return r;
6659 }
6660 } else {
6661 struct stat st;
6662 r = ::stat(walfn.c_str(), &st);
6663 if (r < 0 && errno == ENOENT) {
6664 kv_options.erase("separate_wal_dir");
6665 }
6666 }
6667 }
6668
6669
6670 db = KeyValueDB::create(cct,
6671 kv_backend,
6672 fn,
6673 kv_options,
6674 static_cast<void*>(env));
6675 if (!db) {
6676 derr << __func__ << " error creating db" << dendl;
6677 if (bluefs) {
6678 _close_bluefs();
6679 }
6680 // delete env manually here since we can't depend on db to do this
6681 // in this case
6682 delete env;
6683 env = NULL;
6684 return -EIO;
6685 }
6686
6687 FreelistManager::setup_merge_operators(db, freelist_type);
6688 db->set_merge_operator(PREFIX_STAT, merge_op);
6689 db->set_cache_size(cache_kv_ratio * cache_size);
6690 return 0;
6691}
6692
6693int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6694{
6695 int r;
6696 ceph_assert(!(create && read_only));
6697 string options;
6698 string options_annex;
6699 stringstream err;
6700 string kv_dir_fn;
6701 string kv_backend;
6702 std::string sharding_def;
6703 // prevent write attempts to BlueFS in case we failed before BlueFS was opened
6704 db_was_opened_read_only = true;
6705 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6706 if (r < 0) {
6707 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6708 return -EIO;
6709 }
6710 // if reached here then BlueFS is already opened
6711 db_was_opened_read_only = read_only;
6712 dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
6713 if (kv_backend == "rocksdb") {
6714 options = cct->_conf->bluestore_rocksdb_options;
6715 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6716 if (!options_annex.empty()) {
6717 if (!options.empty() &&
6718 *options.rbegin() != ',') {
6719 options += ',';
6720 }
6721 options += options_annex;
6722 }
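// For example (hypothetical value): setting
//   bluestore_rocksdb_options_annex = "max_background_jobs=4"
// appends ",max_background_jobs=4" to the base options, so rocksdb parses a
// single comma-separated options string.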
6723
6724 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6725 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
6726 }
6727 }
6728
6729 db->init(options);
6730 if (to_repair_db)
6731 return 0;
6732 if (create) {
6733 r = db->create_and_open(err, sharding_def);
6734 } else {
6735 // we pass in cf list here, but it is only used if the db already has
6736 // column families created.
6737 r = read_only ?
6738 db->open_read_only(err, sharding_def) :
6739 db->open(err, sharding_def);
6740 }
6741 if (r) {
6742 derr << __func__ << " error opening db: " << err.str() << dendl;
6743 _close_db();
6744 return -EIO;
6745 }
6746 dout(1) << __func__ << " opened " << kv_backend
6747 << " path " << kv_dir_fn << " options " << options << dendl;
6748 return 0;
6749}
6750
6751void BlueStore::_close_db()
6752{
6753 dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
6754 << " fm=" << fm
6755 << " destage_alloc_file=" << need_to_destage_allocation_file
6756 << " per_pool=" << per_pool_stat_collection
6757 << " pool stats=" << osd_pools.size()
6758 << dendl;
6759 bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
6760 if (do_destage && is_statfs_recoverable()) {
6761 auto t = db->get_transaction();
6762 store_statfs_t s;
6763 if (per_pool_stat_collection) {
6764 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6765 uint64_t pool_id;
6766 for (it->upper_bound(string()); it->valid(); it->next()) {
6767 int r = get_key_pool_stat(it->key(), &pool_id);
6768 if (r >= 0) {
6769 dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl;
6770 } else {
6771 derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl;
6772 }
6773 t->rmkey(PREFIX_STAT, it->key());
6774 }
6775
6776 std::lock_guard l(vstatfs_lock);
6777 for(auto &p : osd_pools) {
6778 string key;
6779 get_pool_stat_key(p.first, &key);
6780 bufferlist bl;
6781 if (!p.second.is_empty()) {
6782 p.second.encode(bl);
6783 p.second.publish(&s);
6784 t->set(PREFIX_STAT, key, bl);
6785 dout(10) << __func__ << " persisting: "
6786 << p.first << "->" << s
6787 << dendl;
6788 }
6789 }
6790 } else {
6791 bufferlist bl;
6792 {
6793 std::lock_guard l(vstatfs_lock);
6794 vstatfs.encode(bl);
6795 vstatfs.publish(&s);
6796 }
6797 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
6798 dout(10) << __func__ << " persisting: " << s << dendl;
6799 }
6800 int r = db->submit_transaction_sync(t);
6801 dout(10) << __func__ << " statfs persisted." << dendl;
6802 ceph_assert(r >= 0);
6803 }
6804 ceph_assert(db);
6805 delete db;
6806 db = nullptr;
6807
6808 if (do_destage && fm && fm->is_null_manager()) {
6809 int ret = store_allocator(alloc);
6810 if (ret != 0) {
6811 derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
6812 }
6813 }
6814
6815 if (bluefs) {
6816 _close_bluefs();
6817 }
6818}
6819
6820void BlueStore::_dump_alloc_on_failure()
6821{
6822 auto dump_interval =
6823 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6824 if (dump_interval > 0 &&
6825 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6826 shared_alloc.a->dump();
6827 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6828 next_dump_on_bluefs_alloc_failure += dump_interval;
6829 }
6830}
6831
6832int BlueStore::_open_collections()
6833{
6834 if (!coll_map.empty()) {
6835 // could be opened from another path
6836 dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
6837 return 0;
6838 }
6839
6840 dout(10) << __func__ << dendl;
6841 collections_had_errors = false;
6842 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6843 size_t load_cnt = 0;
6844 for (it->upper_bound(string());
6845 it->valid();
6846 it->next()) {
6847 coll_t cid;
6848 if (cid.parse(it->key())) {
6849 auto c = ceph::make_ref<Collection>(
6850 this,
6851 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6852 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6853 cid);
6854 bufferlist bl = it->value();
6855 auto p = bl.cbegin();
6856 try {
6857 decode(c->cnode, p);
6858 } catch (ceph::buffer::error& e) {
6859 derr << __func__ << " failed to decode cnode, key:"
6860 << pretty_binary_string(it->key()) << dendl;
6861 return -EIO;
6862 }
6863 dout(20) << __func__ << " opened " << cid << " " << c
6864 << " " << c->cnode << dendl;
6865 _osr_attach(c.get());
6866 coll_map[cid] = c;
6867 load_cnt++;
6868 } else {
6869 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6870 collections_had_errors = true;
6871 }
6872 }
6873 dout(10) << __func__ << " collections loaded: " << load_cnt
6874 << dendl;
6875 return 0;
6876}
6877
6878void BlueStore::_fsck_collections(int64_t* errors)
6879{
6880 if (collections_had_errors) {
6881 dout(10) << __func__ << dendl;
6882 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
6883 for (it->upper_bound(string());
6884 it->valid();
6885 it->next()) {
6886 coll_t cid;
6887 if (!cid.parse(it->key())) {
6888 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6889 if (errors) {
6890 (*errors)++;
6891 }
6892 }
6893 }
6894 }
6895}
6896
6897void BlueStore::_set_per_pool_omap()
6898{
6899 per_pool_omap = OMAP_BULK;
6900 bufferlist bl;
6901 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6902 if (bl.length()) {
6903 auto s = bl.to_str();
6904 if (s == stringify(OMAP_PER_POOL)) {
6905 per_pool_omap = OMAP_PER_POOL;
6906 } else if (s == stringify(OMAP_PER_PG)) {
6907 per_pool_omap = OMAP_PER_PG;
6908 } else {
6909 ceph_assert(s == stringify(OMAP_BULK));
6910 }
6911 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
6912 } else {
6913 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6914 }
6915 _check_no_per_pg_or_pool_omap_alert();
6916}
6917
6918void BlueStore::_open_statfs()
6919{
6920 osd_pools.clear();
6921 vstatfs.reset();
6922
6923 bufferlist bl;
6924 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6925 if (r >= 0) {
6926 per_pool_stat_collection = false;
6927 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6928 auto it = bl.cbegin();
6929 vstatfs.decode(it);
6930 dout(10) << __func__ << " store_statfs is found" << dendl;
6931 } else {
6932 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6933 }
6934 _check_legacy_statfs_alert();
6935 } else {
6936 per_pool_stat_collection = true;
6937 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6938 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6939 for (it->upper_bound(string());
6940 it->valid();
6941 it->next()) {
6942
6943 uint64_t pool_id;
6944 int r = get_key_pool_stat(it->key(), &pool_id);
6945 ceph_assert(r == 0);
6946
6947 bufferlist bl;
6948 bl = it->value();
6949 auto p = bl.cbegin();
6950 auto& st = osd_pools[pool_id];
6951 try {
6952 st.decode(p);
6953 vstatfs += st;
6954
6955 dout(10) << __func__ << " pool " << std::hex << pool_id
6956 << " statfs(hex) " << st
6957 << std::dec << dendl;
6958 } catch (ceph::buffer::error& e) {
6959 derr << __func__ << " failed to decode pool stats, key:"
6960 << pretty_binary_string(it->key()) << dendl;
6961 }
6962 }
6963 }
6964 dout(10) << __func__ << " statfs " << std::hex
6965 << vstatfs << std::dec << dendl;
6966
6967}
6968
6969int BlueStore::_setup_block_symlink_or_file(
6970 string name,
6971 string epath,
6972 uint64_t size,
6973 bool create)
6974{
6975 dout(20) << __func__ << " name " << name << " path " << epath
6976 << " size " << size << " create=" << (int)create << dendl;
6977 int r = 0;
6978 int flags = O_RDWR|O_CLOEXEC;
6979 if (create)
6980 flags |= O_CREAT;
6981 if (epath.length()) {
6982 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6983 if (r < 0) {
6984 r = -errno;
6985 derr << __func__ << " failed to create " << name << " symlink to "
6986 << epath << ": " << cpp_strerror(r) << dendl;
6987 return r;
6988 }
6989
6990 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6991 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6992 if (fd < 0) {
6993 r = -errno;
6994 derr << __func__ << " failed to open " << epath << " file: "
6995 << cpp_strerror(r) << dendl;
6996 return r;
6997 }
6998 // write the Transport ID of the NVMe device
6999 // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0"
7000 // where "0000:02:00.0" is the selector of a PCI device, see
7001 // the first column of "lspci -mm -n -D"
7002 // a transport id for tcp looks like: "trtype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420"
7003 string trid = epath.substr(strlen(SPDK_PREFIX));
7004 r = ::write(fd, trid.c_str(), trid.size());
7005 ceph_assert(r == static_cast<int>(trid.size()));
7006 dout(1) << __func__ << " created " << name << " symlink to "
7007 << epath << dendl;
7008 VOID_TEMP_FAILURE_RETRY(::close(fd));
7009 }
7010 }
7011 if (size) {
7012 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
7013 if (fd >= 0) {
7014 // block file is present
7015 struct stat st;
7016 int r = ::fstat(fd, &st);
7017 if (r == 0 &&
7018 S_ISREG(st.st_mode) && // if it is a regular file
7019 st.st_size == 0) { // and is 0 bytes
7020 r = ::ftruncate(fd, size);
7021 if (r < 0) {
7022 r = -errno;
7023 derr << __func__ << " failed to resize " << name << " file to "
7024 << size << ": " << cpp_strerror(r) << dendl;
7025 VOID_TEMP_FAILURE_RETRY(::close(fd));
7026 return r;
7027 }
7028
7029 if (cct->_conf->bluestore_block_preallocate_file) {
7030 r = ::ceph_posix_fallocate(fd, 0, size);
7031 if (r > 0) {
7032 derr << __func__ << " failed to preallocate " << name << " file to "
7033 << size << ": " << cpp_strerror(r) << dendl;
7034 VOID_TEMP_FAILURE_RETRY(::close(fd));
7035 return -r;
7036 }
7037 }
7038 dout(1) << __func__ << " resized " << name << " file to "
7039 << byte_u_t(size) << dendl;
7040 }
7041 VOID_TEMP_FAILURE_RETRY(::close(fd));
7042 } else {
7043 int r = -errno;
7044 if (r != -ENOENT) {
7045 derr << __func__ << " failed to open " << name << " file: "
7046 << cpp_strerror(r) << dendl;
7047 return r;
7048 }
7049 }
7050 }
7051 return 0;
7052}
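
// For reference (illustrative device paths, not produced verbatim by this
// code): after mkfs an OSD data directory typically contains the symlinks
// created by this helper, e.g.
//
//   block     -> /dev/vg0/osd-block-<uuid>
//   block.db  -> /dev/vg0/osd-db-<uuid>     (only with a dedicated DB device)
//   block.wal -> /dev/vg0/osd-wal-<uuid>    (only with a dedicated WAL device)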
7053
7054int BlueStore::mkfs()
7055{
7056 dout(1) << __func__ << " path " << path << dendl;
7057 int r;
7058 uuid_d old_fsid;
7059 uint64_t reserved;
7060 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7061 derr << __func__ << " osd_max_object_size "
7062 << cct->_conf->osd_max_object_size << " > bluestore max "
7063 << OBJECT_MAX_SIZE << dendl;
7064 return -EINVAL;
7065 }
7066
7067 {
7068 string done;
7069 r = read_meta("mkfs_done", &done);
7070 if (r == 0) {
7071 dout(1) << __func__ << " already created" << dendl;
7072 if (cct->_conf->bluestore_fsck_on_mkfs) {
7073 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7074 if (r < 0) {
7075 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
7076 << dendl;
7077 return r;
7078 }
7079 if (r > 0) {
7080 derr << __func__ << " fsck found " << r << " errors" << dendl;
7081 r = -EIO;
7082 }
7083 }
7084 return r; // idempotent
7085 }
7086 }
7087
7088 {
7089 string type;
7090 r = read_meta("type", &type);
7091 if (r == 0) {
7092 if (type != "bluestore") {
7093 derr << __func__ << " expected bluestore, but type is " << type << dendl;
7094 return -EIO;
7095 }
7096 } else {
7097 r = write_meta("type", "bluestore");
7098 if (r < 0)
7099 return r;
7100 }
7101 }
7102
7103 r = _open_path();
7104 if (r < 0)
7105 return r;
7106
7107 r = _open_fsid(true);
7108 if (r < 0)
7109 goto out_path_fd;
7110
7111 r = _lock_fsid();
7112 if (r < 0)
7113 goto out_close_fsid;
7114
7115 r = _read_fsid(&old_fsid);
7116 if (r < 0 || old_fsid.is_zero()) {
7117 if (fsid.is_zero()) {
7118 fsid.generate_random();
7119 dout(1) << __func__ << " generated fsid " << fsid << dendl;
7120 } else {
7121 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
7122 }
7123 // we'll write it later.
7124 } else {
7125 if (!fsid.is_zero() && fsid != old_fsid) {
7126 derr << __func__ << " on-disk fsid " << old_fsid
7127 << " != provided " << fsid << dendl;
7128 r = -EINVAL;
7129 goto out_close_fsid;
7130 }
7131 fsid = old_fsid;
7132 }
7133
7134 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
7135 cct->_conf->bluestore_block_size,
7136 cct->_conf->bluestore_block_create);
7137 if (r < 0)
7138 goto out_close_fsid;
7139 if (cct->_conf->bluestore_bluefs) {
7140 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
7141 cct->_conf->bluestore_block_wal_size,
7142 cct->_conf->bluestore_block_wal_create);
7143 if (r < 0)
7144 goto out_close_fsid;
7145 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
7146 cct->_conf->bluestore_block_db_size,
7147 cct->_conf->bluestore_block_db_create);
7148 if (r < 0)
7149 goto out_close_fsid;
7150 }
7151
7152 r = _open_bdev(true);
7153 if (r < 0)
7154 goto out_close_fsid;
7155
7156 // choose freelist manager
7157#ifdef HAVE_LIBZBD
7158 if (bdev->is_smr()) {
7159 freelist_type = "zoned";
7160 zone_size = bdev->get_zone_size();
7161 first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
7162 bdev->reset_all_zones();
7163 } else
7164#endif
7165 {
7166 freelist_type = "bitmap";
7167 }
7168 dout(10) << " freelist_type " << freelist_type << dendl;
7169
7170 // choose min_alloc_size
7171 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7172 << " block_size: 0x" << block_size << std::dec << dendl;
7173 if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
7174 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7175 << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
7176 min_alloc_size = optimal_io_size;
7177 }
7178 else if (cct->_conf->bluestore_min_alloc_size) {
7179 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
7180 } else {
7181 ceph_assert(bdev);
7182 if (_use_rotational_settings()) {
7183 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
7184 } else {
7185 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
7186 }
7187 }
7188 _validate_bdev();
7189
7190 // make sure min_alloc_size is a power of 2.
7191 if (!std::has_single_bit(min_alloc_size)) {
7192 derr << __func__ << " min_alloc_size 0x"
7193 << std::hex << min_alloc_size << std::dec
7194 << " is not power of 2 aligned!"
7195 << dendl;
7196 r = -EINVAL;
7197 goto out_close_bdev;
7198 }
7199
7200 // make sure min_alloc_size is a multiple of (and thus >=) the block size
7201 if (min_alloc_size % block_size != 0) {
7202 derr << __func__ << " min_alloc_size 0x"
7203 << std::hex << min_alloc_size
7204 << " is less or not aligned with block_size: 0x"
7205 << block_size << std::dec << dendl;
7206 r = -EINVAL;
7207 goto out_close_bdev;
7208 }
7209
7210 r = _create_alloc();
7211 if (r < 0) {
7212 goto out_close_bdev;
7213 }
7214
7215 reserved = _get_ondisk_reserved();
7216 alloc->init_add_free(reserved,
7217 p2align(bdev->get_size(), min_alloc_size) - reserved);
7218#ifdef HAVE_LIBZBD
7219 if (bdev->is_smr() && alloc != shared_alloc.a) {
7220 shared_alloc.a->init_add_free(reserved,
7221 p2align(bdev->get_conventional_region_size(),
7222 min_alloc_size) - reserved);
7223 }
7224#endif
7225
7226 r = _open_db(true);
7227 if (r < 0)
7228 goto out_close_alloc;
7229
7230 {
7231 KeyValueDB::Transaction t = db->get_transaction();
7232 r = _open_fm(t, false, true);
7233 if (r < 0)
7234 goto out_close_db;
7235 {
7236 bufferlist bl;
7237 encode((uint64_t)0, bl);
7238 t->set(PREFIX_SUPER, "nid_max", bl);
7239 t->set(PREFIX_SUPER, "blobid_max", bl);
7240 }
7241
7242 {
7243 bufferlist bl;
7244 encode((uint64_t)min_alloc_size, bl);
7245 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7246 }
7247 {
7248 bufferlist bl;
7249 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
7250 bl.append(stringify(OMAP_BULK));
7251 } else {
7252 bl.append(stringify(OMAP_PER_PG));
7253 }
7254 t->set(PREFIX_SUPER, "per_pool_omap", bl);
7255 }
7256
7257#ifdef HAVE_LIBZBD
7258 if (bdev->is_smr()) {
7259 {
7260 bufferlist bl;
7261 encode((uint64_t)zone_size, bl);
7262 t->set(PREFIX_SUPER, "zone_size", bl);
7263 }
7264 {
7265 bufferlist bl;
7266 encode((uint64_t)first_sequential_zone, bl);
7267 t->set(PREFIX_SUPER, "first_sequential_zone", bl);
7268 }
7269 }
7270#endif
7271
7272 ondisk_format = latest_ondisk_format;
7273 _prepare_ondisk_format_super(t);
7274 db->submit_transaction_sync(t);
7275 }
7276
7277 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
7278 if (r < 0)
7279 goto out_close_fm;
7280
7281 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7282 if (r < 0)
7283 goto out_close_fm;
7284
7285 if (fsid != old_fsid) {
7286 r = _write_fsid();
7287 if (r < 0) {
7288 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
7289 goto out_close_fm;
7290 }
7291 }
7292
7293 out_close_fm:
7294 _close_fm();
7295 out_close_db:
7296 _close_db();
7297 out_close_alloc:
7298 _close_alloc();
7299 out_close_bdev:
7300 _close_bdev();
7301 out_close_fsid:
7302 _close_fsid();
7303 out_path_fd:
7304 _close_path();
7305
7306 if (r == 0 &&
7307 cct->_conf->bluestore_fsck_on_mkfs) {
7308 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7309 if (rc < 0)
7310 return rc;
7311 if (rc > 0) {
7312 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7313 r = -EIO;
7314 }
7315 }
7316
7317 if (r == 0) {
7318 // indicate success by writing the 'mkfs_done' file
7319 r = write_meta("mkfs_done", "yes");
7320 }
7321
7322 if (r < 0) {
7323 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7324 } else {
7325 dout(0) << __func__ << " success" << dendl;
7326 }
7327 return r;
7328}
7329
7330int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
7331{
7332 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7333 int r;
7334 ceph_assert(path_fd < 0);
7335
7336 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7337
7338 if (!cct->_conf->bluestore_bluefs) {
7339 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7340 return -EIO;
7341 }
7342 dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
7343 r = _open_db_and_around(true);
7344 if (r < 0) {
7345 return r;
7346 }
7347
7348 if (id == BlueFS::BDEV_NEWWAL) {
7349 string p = path + "/block.wal";
7350 r = _setup_block_symlink_or_file("block.wal", dev_path,
7351 cct->_conf->bluestore_block_wal_size,
7352 true);
7353 ceph_assert(r == 0);
7354
7355 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
7356 cct->_conf->bdev_enable_discard,
7357 BDEV_LABEL_BLOCK_SIZE);
7358 ceph_assert(r == 0);
7359
7360 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7361 r = _check_or_set_bdev_label(
7362 p,
7363 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7364 "bluefs wal",
7365 true);
7366 ceph_assert(r == 0);
7367 }
7368
7369 bluefs_layout.dedicated_wal = true;
7370 } else if (id == BlueFS::BDEV_NEWDB) {
7371 string p = path + "/block.db";
7372 r = _setup_block_symlink_or_file("block.db", dev_path,
7373 cct->_conf->bluestore_block_db_size,
7374 true);
7375 ceph_assert(r == 0);
7376
7377 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
7378 cct->_conf->bdev_enable_discard,
7379 SUPER_RESERVED);
7380 ceph_assert(r == 0);
7381
7382 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7383 r = _check_or_set_bdev_label(
7384 p,
7385 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7386 "bluefs db",
7387 true);
7388 ceph_assert(r == 0);
7389 }
7390 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7391 bluefs_layout.dedicated_db = true;
7392 }
7393 bluefs->umount();
7394 bluefs->mount();
7395
7396 r = bluefs->prepare_new_device(id, bluefs_layout);
7397 ceph_assert(r == 0);
7398
7399 if (r < 0) {
7400 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7401 } else {
7402 dout(0) << __func__ << " success" << dendl;
7403 }
7404
7405 _close_db_and_around();
7406 return r;
7407}
7408
7409int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
7410 int id)
7411{
7412 dout(10) << __func__ << " id:" << id << dendl;
7413 ceph_assert(path_fd < 0);
7414
7415 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
7416
7417 if (!cct->_conf->bluestore_bluefs) {
7418 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7419 return -EIO;
7420 }
7421
7422 int r = _open_db_and_around(true);
7423 if (r < 0) {
7424 return r;
7425 }
7426 auto close_db = make_scope_guard([&] {
7427 _close_db_and_around();
7428 });
7429 uint64_t used_space = 0;
7430 for(auto src_id : devs_source) {
7431 used_space += bluefs->get_used(src_id);
7432 }
7433 uint64_t target_free = bluefs->get_free(id);
7434 if (target_free < used_space) {
7435 derr << __func__
7436 << " can't migrate, free space at target: " << target_free
7437 << " is less than required space: " << used_space
7438 << dendl;
7439 return -ENOSPC;
7440 }
7441 if (devs_source.count(BlueFS::BDEV_DB)) {
7442 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7443 bluefs_layout.dedicated_db = false;
7444 }
7445 if (devs_source.count(BlueFS::BDEV_WAL)) {
7446 bluefs_layout.dedicated_wal = false;
7447 }
7448 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
7449 if (r < 0) {
7450 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7451 return r;
7452 }
7453
7454 if (devs_source.count(BlueFS::BDEV_DB)) {
7455 r = unlink(string(path + "/block.db").c_str());
7456 ceph_assert(r == 0);
7457 }
7458 if (devs_source.count(BlueFS::BDEV_WAL)) {
7459 r = unlink(string(path + "/block.wal").c_str());
7460 ceph_assert(r == 0);
7461 }
7462 return r;
7463}
7464
7465int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7466 int id,
7467 const string& dev_path)
7468{
7469 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7470 ceph_assert(path_fd < 0);
7471
7472 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7473
7474 if (!cct->_conf->bluestore_bluefs) {
7475 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7476 return -EIO;
7477 }
7478
7479 int r = _open_db_and_around(true);
7480 if (r < 0) {
7481 return r;
7482 }
7483 auto close_db = make_scope_guard([&] {
7484 _close_db_and_around();
7485 });
7486
7487 string link_db;
7488 string link_wal;
7489 if (devs_source.count(BlueFS::BDEV_DB) &&
7490 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
7491 link_db = path + "/block.db";
7492 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7493 bluefs_layout.dedicated_db = false;
7494 }
7495 if (devs_source.count(BlueFS::BDEV_WAL)) {
7496 link_wal = path + "/block.wal";
7497 bluefs_layout.dedicated_wal = false;
7498 }
7499
7500 size_t target_size = 0;
7501 string target_name;
7502 if (id == BlueFS::BDEV_NEWWAL) {
7503 target_name = "block.wal";
7504 target_size = cct->_conf->bluestore_block_wal_size;
7505 bluefs_layout.dedicated_wal = true;
7506
7507 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
7508 cct->_conf->bdev_enable_discard,
7509 BDEV_LABEL_BLOCK_SIZE);
7510 ceph_assert(r == 0);
7511
7512 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7513 r = _check_or_set_bdev_label(
7514 dev_path,
7515 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7516 "bluefs wal",
7517 true);
7518 ceph_assert(r == 0);
7519 }
7520 } else if (id == BlueFS::BDEV_NEWDB) {
7521 target_name = "block.db";
7522 target_size = cct->_conf->bluestore_block_db_size;
7523 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7524 bluefs_layout.dedicated_db = true;
7525
7526 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
7527 cct->_conf->bdev_enable_discard,
7528 SUPER_RESERVED);
7529 ceph_assert(r == 0);
7530
7531 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7532 r = _check_or_set_bdev_label(
7533 dev_path,
7534 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7535 "bluefs db",
7536 true);
7537 ceph_assert(r == 0);
7538 }
7539 }
7540
7541 bluefs->umount();
7542 bluefs->mount();
7543
7544 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
7545
7546 if (r < 0) {
7547 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7548 return r;
7549 }
7550
7551 if (!link_db.empty()) {
7552 r = unlink(link_db.c_str());
7553 ceph_assert(r == 0);
7554 }
7555 if (!link_wal.empty()) {
7556 r = unlink(link_wal.c_str());
7557 ceph_assert(r == 0);
7558 }
7559 r = _setup_block_symlink_or_file(
7560 target_name,
7561 dev_path,
7562 target_size,
7563 true);
7564 ceph_assert(r == 0);
7565 dout(0) << __func__ << " success" << dendl;
7566
7567 return r;
7568}
7569
7570string BlueStore::get_device_path(unsigned id)
7571{
7572 string res;
7573 if (id < BlueFS::MAX_BDEV) {
7574 switch (id) {
7575 case BlueFS::BDEV_WAL:
7576 res = path + "/block.wal";
7577 break;
7578 case BlueFS::BDEV_DB:
7579 if (id == bluefs_layout.shared_bdev) {
7580 res = path + "/block";
7581 } else {
7582 res = path + "/block.db";
7583 }
7584 break;
7585 case BlueFS::BDEV_SLOW:
7586 res = path + "/block";
7587 break;
7588 }
7589 }
7590 return res;
7591}
7592
7593int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
7594{
7595 bluestore_bdev_label_t label;
7596 int r = _read_bdev_label(cct, path, &label);
7597 if (r < 0) {
7598 derr << "unable to read label for " << path << ": "
7599 << cpp_strerror(r) << dendl;
7600 } else {
7601 label.size = size;
7602 r = _write_bdev_label(cct, path, label);
7603 if (r < 0) {
7604 derr << "unable to write label for " << path << ": "
7605 << cpp_strerror(r) << dendl;
7606 }
7607 }
7608 return r;
7609}
7610
7611int BlueStore::expand_devices(ostream& out)
7612{
7613 int r = _open_db_and_around(true);
7614 ceph_assert(r == 0);
7615 bluefs->dump_block_extents(out);
7616 out << "Expanding DB/WAL..." << std::endl;
7617 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
7618 if (devid == bluefs_layout.shared_bdev ) {
7619 continue;
7620 }
7621 uint64_t size = bluefs->get_block_device_size(devid);
7622 if (size == 0) {
7623 // no bdev
7624 continue;
7625 }
7626
    out << devid
        << " : expanding to 0x" << std::hex << size << std::dec << std::endl;
7629 string p = get_device_path(devid);
7630 const char* path = p.c_str();
7631 if (path == nullptr) {
7632 derr << devid
7633 <<": can't find device path " << dendl;
7634 continue;
7635 }
7636 if (bluefs->bdev_support_label(devid)) {
7637 if (_set_bdev_label_size(p, size) >= 0) {
7638 out << devid
7639 << " : size label updated to " << size
7640 << std::endl;
7641 }
7642 }
7643 }
7644 uint64_t size0 = fm->get_size();
7645 uint64_t size = bdev->get_size();
7646 if (size0 < size) {
    out << bluefs_layout.shared_bdev
        << " : expanding from 0x" << std::hex
        << size0 << " to 0x" << size << std::dec << std::endl;
7650 _write_out_fm_meta(size);
7651 if (bdev->supported_bdev_label()) {
7652 if (_set_bdev_label_size(path, size) >= 0) {
7653 out << bluefs_layout.shared_bdev
7654 << " : size label updated to " << size
7655 << std::endl;
7656 }
7657 }
7658
7659 if (fm && fm->is_null_manager()) {
7660 // we grow the allocation range, must reflect it in the allocation file
7661 alloc->init_add_free(size0, size - size0);
7662 need_to_destage_allocation_file = true;
7663 }
7664 _close_db_and_around();
7665
7666 // mount in read/write to sync expansion changes
7667 r = _mount();
7668 ceph_assert(r == 0);
7669 umount();
7670 } else {
7671 _close_db_and_around();
7672 }
7673 return r;
7674}
7675
7676int BlueStore::dump_bluefs_sizes(ostream& out)
7677{
7678 int r = _open_db_and_around(true);
7679 ceph_assert(r == 0);
7680 bluefs->dump_block_extents(out);
7681 _close_db_and_around();
7682 return r;
7683}
7684
7685void BlueStore::set_cache_shards(unsigned num)
7686{
7687 dout(10) << __func__ << " " << num << dendl;
7688 size_t oold = onode_cache_shards.size();
7689 size_t bold = buffer_cache_shards.size();
7690 ceph_assert(num >= oold && num >= bold);
7691 onode_cache_shards.resize(num);
7692 buffer_cache_shards.resize(num);
7693 for (unsigned i = oold; i < num; ++i) {
7694 onode_cache_shards[i] =
7695 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7696 logger);
7697 }
7698 for (unsigned i = bold; i < num; ++i) {
7699 buffer_cache_shards[i] =
7700 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7701 logger);
7702 }
7703}
7704
7705//---------------------------------------------
7706bool BlueStore::has_null_manager() const
7707{
7708 return (fm && fm->is_null_manager());
7709}
7710
7711int BlueStore::_mount()
7712{
  dout(5) << __func__ << "::NCB:: path " << path << dendl;
7714
7715 _kv_only = false;
7716 if (cct->_conf->bluestore_fsck_on_mount) {
7717 dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
7718 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7719 if (rc < 0)
7720 return rc;
7721 if (rc > 0) {
7722 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7723 return -EIO;
7724 }
7725 }
7726
7727 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7728 derr << __func__ << " osd_max_object_size "
7729 << cct->_conf->osd_max_object_size << " > bluestore max "
7730 << OBJECT_MAX_SIZE << dendl;
7731 return -EINVAL;
7732 }
7733
7734 dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
7735 int r = _open_db_and_around(false);
7736 if (r < 0) {
7737 return r;
7738 }
7739 auto close_db = make_scope_guard([&] {
7740 if (!mounted) {
7741 _close_db_and_around();
7742 }
7743 });
7744
7745 r = _upgrade_super();
7746 if (r < 0) {
7747 return r;
7748 }
7749
7750 // The recovery process for allocation-map needs to open collection early
7751 r = _open_collections();
7752 if (r < 0) {
7753 return r;
7754 }
7755 auto shutdown_cache = make_scope_guard([&] {
7756 if (!mounted) {
7757 _shutdown_cache();
7758 }
7759 });
7760
7761 r = _reload_logger();
7762 if (r < 0) {
7763 return r;
7764 }
7765
7766 _kv_start();
7767 auto stop_kv = make_scope_guard([&] {
7768 if (!mounted) {
7769 _kv_stop();
7770 }
7771 });
7772
7773 r = _deferred_replay();
7774 if (r < 0) {
7775 return r;
7776 }
7777
7778#ifdef HAVE_LIBZBD
7779 if (bdev->is_smr()) {
7780 _zoned_cleaner_start();
7781 }
7782#endif
7783
7784 mempool_thread.init();
7785
7786 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
7787 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7788
7789 auto was_per_pool_omap = per_pool_omap;
7790
7791 dout(1) << __func__ << " quick-fix on mount" << dendl;
7792 _fsck_on_open(FSCK_SHALLOW, true);
7793
7794 //set again as hopefully it has been fixed
7795 if (was_per_pool_omap != OMAP_PER_PG) {
7796 _set_per_pool_omap();
7797 }
7798 }
7799
7800 mounted = true;
7801 return 0;
7802}
7803
7804int BlueStore::umount()
7805{
7806 ceph_assert(_kv_only || mounted);
7807 _osr_drain_all();
7808
7809 mounted = false;
7810
7811 ceph_assert(alloc);
7812
7813 if (!_kv_only) {
7814 mempool_thread.shutdown();
7815#ifdef HAVE_LIBZBD
7816 if (bdev->is_smr()) {
7817 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7818 _zoned_cleaner_stop();
7819 }
7820#endif
7821 dout(20) << __func__ << " stopping kv thread" << dendl;
7822 _kv_stop();
7823 // skip cache cleanup step on fast shutdown
7824 if (likely(!m_fast_shutdown)) {
7825 _shutdown_cache();
7826 }
7827 dout(20) << __func__ << " closing" << dendl;
7828 }
7829 _close_db_and_around();
7830 // disable fsck on fast-shutdown
7831 if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
7832 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7833 if (rc < 0)
7834 return rc;
7835 if (rc > 0) {
7836 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7837 return -EIO;
7838 }
7839 }
7840 return 0;
7841}
7842
7843int BlueStore::cold_open()
7844{
7845 return _open_db_and_around(true);
7846}
7847
7848int BlueStore::cold_close()
7849{
7850 _close_db_and_around();
7851 return 0;
7852}
7853
// derr wrapper that caps enormous output to avoid log flooding.
// For now it is of limited use, applied only where such output is expected.
7856#define fsck_derr(err_cnt, threshold) \
7857 if (err_cnt <= threshold) { \
7858 bool need_skip_print = err_cnt == threshold; \
7859 derr
7860
7861#define fsck_dendl \
7862 dendl; \
7863 if (need_skip_print) \
7864 derr << "more error lines skipped..." << dendl; \
7865 }
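
// Illustrative usage (as seen further below, e.g. in _fsck_check_object_omap):
// fsck_derr opens an if-block and begins a derr statement; fsck_dendl
// terminates it, prints a one-time "more error lines skipped..." notice when
// the threshold is reached, and closes the block:
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: " << o->oid << " ..." << fsck_dendl;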
7866
7867int _fsck_sum_extents(
7868 const PExtentVector& extents,
7869 bool compressed,
7870 store_statfs_t& expected_statfs)
7871{
7872 for (auto e : extents) {
7873 if (!e.is_valid())
7874 continue;
7875 expected_statfs.allocated += e.length;
7876 if (compressed) {
7877 expected_statfs.data_compressed_allocated += e.length;
7878 }
7879 }
7880 return 0;
7881}
7882
7883int BlueStore::_fsck_check_extents(
7884 std::string_view ctx_descr,
7885 const PExtentVector& extents,
7886 bool compressed,
7887 mempool_dynamic_bitset &used_blocks,
7888 uint64_t granularity,
7889 BlueStoreRepairer* repairer,
7890 store_statfs_t& expected_statfs,
7891 FSCKDepth depth)
7892{
7893 dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
7894 int errors = 0;
7895 for (auto e : extents) {
7896 if (!e.is_valid())
7897 continue;
7898 expected_statfs.allocated += e.length;
7899 if (compressed) {
7900 expected_statfs.data_compressed_allocated += e.length;
7901 }
7902 if (depth != FSCK_SHALLOW) {
7903 bool already = false;
7904 apply_for_bitset_range(
7905 e.offset, e.length, granularity, used_blocks,
7906 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7907 if (bs.test(pos)) {
7908 if (repairer) {
7909 repairer->note_misreference(
7910 pos * min_alloc_size, min_alloc_size, !already);
7911 }
7912 if (!already) {
7913 derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
7914 << " or a subset is already allocated (misreferenced)" << dendl;
7915 ++errors;
7916 already = true;
7917 }
7918 }
7919 else
7920 bs.set(pos);
7921 });
7922
7923 if (e.end() > bdev->get_size()) {
7924 derr << "fsck error: " << ctx_descr << ", extent " << e
7925 << " past end of block device" << dendl;
7926 ++errors;
7927 }
7928 }
7929 }
7930 return errors;
7931}
7932
7933void BlueStore::_fsck_check_statfs(
7934 const store_statfs_t& expected_statfs,
7935 const per_pool_statfs& expected_pool_statfs,
7936 int64_t& errors,
7937 int64_t& warnings,
7938 BlueStoreRepairer* repairer)
7939{
7940 string key;
7941 store_statfs_t actual_statfs;
7942 store_statfs_t s;
7943 {
7944 // make a copy
7945 per_pool_statfs my_expected_pool_statfs(expected_pool_statfs);
7946 auto op = osd_pools.begin();
7947 while (op != osd_pools.end()) {
7948 get_pool_stat_key(op->first, &key);
7949 op->second.publish(&s);
7950 auto it_expected = my_expected_pool_statfs.find(op->first);
7951 if (it_expected == my_expected_pool_statfs.end()) {
7952 auto op0 = op++;
7953 if (op0->second.is_empty()) {
        // a missing statfs record is fine when the stats are empty
7955 continue;
7956 }
7957 derr << __func__ << "::fsck error: " << std::hex
7958 << "pool " << op0->first << " has got no statfs to match against: "
7959 << s
7960 << std::dec << dendl;
7961 ++errors;
7962 if (repairer) {
7963 osd_pools.erase(op0);
7964 repairer->remove_key(db, PREFIX_STAT, key);
7965 }
7966 } else {
7967 if (!(s == it_expected->second)) {
7968 derr << "fsck error: actual " << s
7969 << " != expected " << it_expected->second
7970 << " for pool "
7971 << std::hex << op->first << std::dec << dendl;
7972 ++errors;
7973 if (repairer) {
          // repair in-memory, in the hope that it gets flushed properly on shutdown
7975 s = it_expected->second;
7976 op->second = it_expected->second;
7977 repairer->fix_statfs(db, key, it_expected->second);
7978 }
7979 }
7980 actual_statfs.add(s);
7981 my_expected_pool_statfs.erase(it_expected);
7982 ++op;
7983 }
7984 }
7985 // check stats that lack matching entities in osd_pools
7986 for (auto &p : my_expected_pool_statfs) {
7987 if (p.second.is_zero()) {
      // a missing statfs record is fine when the stats are empty
7989 continue;
7990 }
7991 get_pool_stat_key(p.first, &key);
7992 derr << __func__ << "::fsck error: " << std::hex
7993 << "pool " << p.first << " has got no actual statfs: "
7994 << std::dec << p.second
7995 << dendl;
7996 ++errors;
7997 if (repairer) {
7998 osd_pools[p.first] = p.second;
7999 repairer->fix_statfs(db, key, p.second);
8000 actual_statfs.add(p.second);
8001 }
8002 }
8003 }
8004 // process global statfs
8005 if (repairer) {
8006 if (!per_pool_stat_collection) {
8007 // by virtue of running this method, we correct the top-level
8008 // error of having global stats
8009 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
8010 per_pool_stat_collection = true;
8011 }
8012 vstatfs = actual_statfs;
8013 dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl;
8014 } else if (!per_pool_stat_collection) {
8015 // check global stats only if fscking (not repairing) w/o per-pool stats
8016 vstatfs.publish(&s);
8017 if (!(s == expected_statfs)) {
8018 derr << "fsck error: actual " << s
8019 << " != expected " << expected_statfs << dendl;
8020 ++errors;
8021 }
8022 }
8023}
8024
8025void BlueStore::_fsck_repair_shared_blobs(
8026 BlueStoreRepairer& repairer,
8027 shared_blob_2hash_tracker_t& sb_ref_counts,
8028 sb_info_space_efficient_map_t& sb_info)
8029{
8030 auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
8031 dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
8032 << sb_ref_mismatches << dendl;
  if (!sb_ref_mismatches) // not expected to happen here, bail out just in case
8034 return;
8035
8036
8037 auto foreach_shared_blob = [&](std::function<
8038 void (coll_t,
8039 ghobject_t,
8040 uint64_t,
8041 const bluestore_blob_t&)> cb) {
8042 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8043 if (it) {
8044 CollectionRef c;
8045 spg_t pgid;
8046 for (it->lower_bound(string()); it->valid(); it->next()) {
8047 dout(30) << __func__ << " key "
8048 << pretty_binary_string(it->key())
8049 << dendl;
8050 if (is_extent_shard_key(it->key())) {
8051 continue;
8052 }
8053
8054 ghobject_t oid;
8055 int r = get_key_object(it->key(), &oid);
8056 if (r < 0) {
8057 continue;
8058 }
8059
8060 if (!c ||
8061 oid.shard_id != pgid.shard ||
8062 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8063 !c->contains(oid)) {
8064 c = nullptr;
8065 for (auto& p : coll_map) {
8066 if (p.second->contains(oid)) {
8067 c = p.second;
8068 break;
8069 }
8070 }
8071 if (!c) {
8072 continue;
8073 }
8074 }
8075 dout(20) << __func__
8076 << " inspecting shared blob refs for col:" << c->cid
8077 << " obj:" << oid
8078 << dendl;
8079
8080 OnodeRef o;
8081 o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
8082 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8083
8084 _dump_onode<30>(cct, *o);
8085
8086 mempool::bluestore_fsck::set<BlobRef> passed_sbs;
8087 for (auto& e : o->extent_map.extent_map) {
8088 auto& b = e.blob->get_blob();
8089 if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
8090 auto sbid = e.blob->shared_blob->get_sbid();
8091 cb(c->cid, oid, sbid, b);
8092 passed_sbs.emplace(e.blob);
8093 }
8094 } // for ... extent_map
8095 } // for ... it->valid
8096 } //if (it(PREFIX_OBJ))
  }; // end of foreach_shared_blob lambda
8098
8099 mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
8100
8101 // first iteration over objects to identify all the broken sbids
8102 foreach_shared_blob( [&](coll_t cid,
8103 ghobject_t oid,
8104 uint64_t sbid,
8105 const bluestore_blob_t& b) {
8106 auto it = refs_map.lower_bound(sbid);
8107 if(it != refs_map.end() && it->first == sbid) {
8108 return;
8109 }
8110 for (auto& p : b.get_extents()) {
8111 if (p.is_valid() &&
8112 !sb_ref_counts.test_all_zero_range(sbid,
8113 p.offset,
8114 p.length)) {
8115 refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
8116 dout(20) << __func__
8117 << " broken shared blob found for col:" << cid
8118 << " obj:" << oid
8119 << " sbid 0x " << std::hex << sbid << std::dec
8120 << dendl;
8121 break;
8122 }
8123 }
8124 });
8125
8126 // second iteration over objects to build new ref map for the broken sbids
8127 foreach_shared_blob( [&](coll_t cid,
8128 ghobject_t oid,
8129 uint64_t sbid,
8130 const bluestore_blob_t& b) {
8131 auto it = refs_map.find(sbid);
8132 if(it == refs_map.end()) {
8133 return;
8134 }
8135 for (auto& p : b.get_extents()) {
8136 if (p.is_valid()) {
8137 it->second.get(p.offset, p.length);
8138 break;
8139 }
8140 }
8141 });
8142
8143 // update shared blob records
8144 auto ref_it = refs_map.begin();
8145 while (ref_it != refs_map.end()) {
8146 size_t cnt = 0;
8147 const size_t max_transactions = 4096;
8148 KeyValueDB::Transaction txn = db->get_transaction();
8149 for (cnt = 0;
8150 cnt < max_transactions && ref_it != refs_map.end();
8151 ref_it++) {
8152 auto sbid = ref_it->first;
8153 dout(20) << __func__ << " repaired shared_blob 0x"
8154 << std::hex << sbid << std::dec
8155 << ref_it->second << dendl;
8156 repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
8157 cnt++;
8158 }
8159 if (cnt) {
8160 db->submit_transaction_sync(txn);
8161 cnt = 0;
8162 }
8163 }
8164 // remove stray shared blob records
8165 size_t cnt = 0;
8166 const size_t max_transactions = 4096;
8167 KeyValueDB::Transaction txn = db->get_transaction();
8168 sb_info.foreach_stray([&](const sb_info_t& sbi) {
8169 auto sbid = sbi.get_sbid();
8170 dout(20) << __func__ << " removing stray shared_blob 0x"
8171 << std::hex << sbid << std::dec
8172 << dendl;
8173 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
8174 cnt++;
    if (cnt >= max_transactions) {
      db->submit_transaction_sync(txn);
      txn = db->get_transaction();
      cnt = 0;
    }
  });
8180 if (cnt > 0) {
8181 db->submit_transaction_sync(txn);
8182 }
8183
  // report the number of repairs as the previously determined error estimate,
  // not the actual number of updated shared blobs
8186 repairer.inc_repaired(sb_ref_mismatches);
8187}
8188
8189BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
8190 BlueStore::FSCKDepth depth,
8191 int64_t pool_id,
8192 BlueStore::CollectionRef c,
8193 const ghobject_t& oid,
8194 const string& key,
8195 const bufferlist& value,
8196 mempool::bluestore_fsck::list<string>* expecting_shards,
8197 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
8198 const BlueStore::FSCK_ObjectCtx& ctx)
8199{
8200 auto& errors = ctx.errors;
8201 auto& num_objects = ctx.num_objects;
8202 auto& num_extents = ctx.num_extents;
8203 auto& num_blobs = ctx.num_blobs;
8204 auto& num_sharded_objects = ctx.num_sharded_objects;
8205 auto& num_spanning_blobs = ctx.num_spanning_blobs;
8206 auto used_blocks = ctx.used_blocks;
8207 auto sb_info_lock = ctx.sb_info_lock;
8208 auto& sb_info = ctx.sb_info;
8209 auto& sb_ref_counts = ctx.sb_ref_counts;
8210 auto repairer = ctx.repairer;
8211
8212 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
8213 &ctx.expected_pool_statfs[pool_id] :
8214 &ctx.expected_store_statfs;
8215
8216 map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
8217
8218 dout(10) << __func__ << " " << oid << dendl;
8219 OnodeRef o;
8220 o.reset(Onode::create_decode(c, oid, key, value));
8221 ++num_objects;
8222
8223 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
8224
8225 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8226 _dump_onode<30>(cct, *o);
8227 // shards
8228 if (!o->extent_map.shards.empty()) {
8229 ++num_sharded_objects;
8230 if (depth != FSCK_SHALLOW) {
8231 ceph_assert(expecting_shards);
8232 for (auto& s : o->extent_map.shards) {
8233 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
8234 expecting_shards->push_back(string());
8235 get_extent_shard_key(o->key, s.shard_info->offset,
8236 &expecting_shards->back());
8237 if (s.shard_info->offset >= o->onode.size) {
8238 derr << "fsck error: " << oid << " shard 0x" << std::hex
8239 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
8240 << std::dec << dendl;
8241 ++errors;
8242 }
8243 }
8244 }
8245 }
8246
8247 // lextents
8248 uint64_t pos = 0;
8249 mempool::bluestore_fsck::map<BlobRef,
8250 bluestore_blob_use_tracker_t> ref_map;
8251 for (auto& l : o->extent_map.extent_map) {
8252 dout(20) << __func__ << " " << l << dendl;
8253 if (l.logical_offset < pos) {
8254 derr << "fsck error: " << oid << " lextent at 0x"
8255 << std::hex << l.logical_offset
8256 << " overlaps with the previous, which ends at 0x" << pos
8257 << std::dec << dendl;
8258 ++errors;
8259 }
8260 if (depth != FSCK_SHALLOW &&
8261 o->extent_map.spans_shard(l.logical_offset, l.length)) {
8262 derr << "fsck error: " << oid << " lextent at 0x"
8263 << std::hex << l.logical_offset << "~" << l.length
8264 << " spans a shard boundary"
8265 << std::dec << dendl;
8266 ++errors;
8267 }
8268 pos = l.logical_offset + l.length;
8269 res_statfs->data_stored += l.length;
8270 ceph_assert(l.blob);
8271 const bluestore_blob_t& blob = l.blob->get_blob();
8272
8273#ifdef HAVE_LIBZBD
8274 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8275 for (auto& e : blob.get_extents()) {
8276 if (e.is_valid()) {
8277 uint32_t zone = e.offset / zone_size;
8278 uint64_t offset = e.offset % zone_size;
8279 auto p = zone_first_offsets.find(zone);
8280 if (p == zone_first_offsets.end() || p->second > offset) {
          // FIXME: use iterator for guided insert?
8282 zone_first_offsets[zone] = offset;
8283 }
8284 }
8285 }
8286 }
8287#endif
8288
8289 auto& ref = ref_map[l.blob];
8290 if (ref.is_empty()) {
8291 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t logical_len = blob.get_logical_length();
      ref.init(logical_len, min_release_size);
8294 }
8295 ref.get(
8296 l.blob_offset,
8297 l.length);
8298 ++num_extents;
8299 if (depth != FSCK_SHALLOW &&
8300 blob.has_unused()) {
8301 ceph_assert(referenced);
8302 auto p = referenced->find(l.blob);
8303 bluestore_blob_t::unused_t* pu;
8304 if (p == referenced->end()) {
8305 pu = &(*referenced)[l.blob];
8306 }
8307 else {
8308 pu = &p->second;
8309 }
8310 uint64_t blob_len = blob.get_logical_length();
8311 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
8312 ceph_assert(l.blob_offset + l.length <= blob_len);
8313 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
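      // e.g. (illustration only): for a 0x10000-byte blob with a 16-bit
      // unused mask, chunk_size is 0x1000; an lextent at blob_offset 0x3000
      // with length 0x2000 marks bits 3 and 4 as referenced below.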
8314 uint64_t start = l.blob_offset / chunk_size;
8315 uint64_t end =
8316 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
8317 for (auto i = start; i < end; ++i) {
8318 (*pu) |= (1u << i);
8319 }
8320 }
8321 } //for (auto& l : o->extent_map.extent_map)
8322
8323 for (auto& i : ref_map) {
8324 ++num_blobs;
8325 const bluestore_blob_t& blob = i.first->get_blob();
8326 bool equal =
8327 depth == FSCK_SHALLOW ? true :
8328 i.first->get_blob_use_tracker().equal(i.second);
8329 if (!equal) {
8330 derr << "fsck error: " << oid << " blob " << *i.first
8331 << " doesn't match expected ref_map " << i.second << dendl;
8332 ++errors;
8333 }
8334 if (blob.is_compressed()) {
8335 res_statfs->data_compressed += blob.get_compressed_payload_length();
8336 res_statfs->data_compressed_original +=
8337 i.first->get_referenced_bytes();
8338 }
8339 if (depth != FSCK_SHALLOW && repairer) {
8340 for (auto e : blob.get_extents()) {
8341 if (!e.is_valid())
8342 continue;
8343 repairer->set_space_used(e.offset, e.length, c->cid, oid);
8344 }
8345 }
8346 if (blob.is_shared()) {
8347 if (i.first->shared_blob->get_sbid() > blobid_max) {
8348 derr << "fsck error: " << oid << " blob " << blob
8349 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
8350 << blobid_max << dendl;
8351 ++errors;
8352 } else if (i.first->shared_blob->get_sbid() == 0) {
8353 derr << "fsck error: " << oid << " blob " << blob
8354 << " marked as shared but has uninitialized sbid"
8355 << dendl;
8356 ++errors;
8357 }
8358 // the below lock is optional and provided in multithreading mode only
8359 if (sb_info_lock) {
8360 sb_info_lock->lock();
8361 }
8362 auto sbid = i.first->shared_blob->get_sbid();
8363 sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
8364 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
8365 sbi.pool_id == oid.hobj.get_logical_pool());
8366 sbi.pool_id = oid.hobj.get_logical_pool();
8367 bool compressed = blob.is_compressed();
8368 for (auto e : blob.get_extents()) {
8369 if (e.is_valid()) {
8370 if (compressed) {
8371 ceph_assert(sbi.allocated_chunks <= 0);
8372 sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
8373 } else {
8374 ceph_assert(sbi.allocated_chunks >= 0);
8375 sbi.allocated_chunks += (e.length >> min_alloc_size_order);
8376 }
8377 sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
8378 }
8379 }
8380 if (sb_info_lock) {
8381 sb_info_lock->unlock();
8382 }
8383 } else if (depth != FSCK_SHALLOW) {
8384 ceph_assert(used_blocks);
8385 string ctx_descr = " oid " + stringify(oid);
8386 errors += _fsck_check_extents(ctx_descr,
8387 blob.get_extents(),
8388 blob.is_compressed(),
8389 *used_blocks,
8390 fm->get_alloc_size(),
8391 repairer,
8392 *res_statfs,
8393 depth);
8394 } else {
8395 errors += _fsck_sum_extents(
8396 blob.get_extents(),
8397 blob.is_compressed(),
8398 *res_statfs);
8399 }
8400 } // for (auto& i : ref_map)
8401
8402 {
8403 auto &sbm = o->extent_map.spanning_blob_map;
8404 size_t broken = 0;
8405 BlobRef first_broken;
8406 for (auto it = sbm.begin(); it != sbm.end();) {
8407 auto it1 = it++;
8408 if (ref_map.count(it1->second) == 0) {
8409 if (!broken) {
8410 first_broken = it1->second;
8411 ++errors;
8412 derr << "fsck error:" << " stray spanning blob found:" << it1->first
8413 << dendl;
8414 }
8415 broken++;
8416 if (repairer) {
8417 sbm.erase(it1);
8418 }
8419 }
8420 }
8421
8422#ifdef HAVE_LIBZBD
8423 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8424 for (auto& [zone, first_offset] : zone_first_offsets) {
8425 auto p = (*ctx.zone_refs)[zone].find(oid);
8426 if (p != (*ctx.zone_refs)[zone].end()) {
8427 if (first_offset < p->second) {
8428 dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
8429 << " offset 0x" << p->second
8430 << " but first offset is 0x" << first_offset
8431 << "; this can happen due to clone_range"
8432 << dendl;
8433 } else {
8434 dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
8435 << " <= first offset 0x" << first_offset
8436 << std::dec << dendl;
8437 }
8438 (*ctx.zone_refs)[zone].erase(p);
8439 } else {
8440 derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
8441 << " but there is no zone ref" << std::dec << dendl;
8442 // FIXME: add repair
8443 ++errors;
8444 }
8445 }
8446 }
8447#endif
8448
8449 if (broken) {
8450 derr << "fsck error: " << oid << " - " << broken
8451 << " zombie spanning blob(s) found, the first one: "
8452 << *first_broken << dendl;
8453 if(repairer) {
8454 repairer->fix_spanning_blobs(
8455 db,
8456 [&](KeyValueDB::Transaction txn) {
8457 _record_onode(o, txn);
8458 });
8459 }
8460 }
8461 }
8462
8463 if (o->onode.has_omap()) {
8464 _fsck_check_object_omap(depth, o, ctx);
8465 }
8466
8467 return o;
8468}
8469
8470#include "common/WorkQueue.h"
8471
8472class ShallowFSCKThreadPool : public ThreadPool
8473{
8474public:
8475 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
8476 ThreadPool(cct_, nm, tn, n) {
8477 }
8478 void worker(ThreadPool::WorkThread* wt) override {
8479 int next_wq = 0;
8480 while (!_stop) {
8481 next_wq %= work_queues.size();
8482 WorkQueue_ *wq = work_queues[next_wq++];
8483
8484 void* item = wq->_void_dequeue();
8485 if (item) {
8486 processing++;
8487 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
8488 wq->_void_process(item, tp_handle);
8489 processing--;
8490 }
8491 }
8492 }
8493 template <size_t BatchLen>
8494 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
8495 {
8496 struct Entry {
8497 int64_t pool_id;
8498 BlueStore::CollectionRef c;
8499 ghobject_t oid;
8500 string key;
8501 bufferlist value;
8502 };
8503 struct Batch {
8504 std::atomic<size_t> running = { 0 };
8505 size_t entry_count = 0;
8506 std::array<Entry, BatchLen> entries;
8507
8508 int64_t errors = 0;
8509 int64_t warnings = 0;
8510 uint64_t num_objects = 0;
8511 uint64_t num_extents = 0;
8512 uint64_t num_blobs = 0;
8513 uint64_t num_sharded_objects = 0;
8514 uint64_t num_spanning_blobs = 0;
8515 store_statfs_t expected_store_statfs;
8516 BlueStore::per_pool_statfs expected_pool_statfs;
8517 };
8518
8519 size_t batchCount;
8520 BlueStore* store = nullptr;
8521
8522 ceph::mutex* sb_info_lock = nullptr;
8523 sb_info_space_efficient_map_t* sb_info = nullptr;
8524 shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
8525 BlueStoreRepairer* repairer = nullptr;
8526
8527 Batch* batches = nullptr;
8528 size_t last_batch_pos = 0;
8529 bool batch_acquired = false;
8530
8531 FSCKWorkQueue(std::string n,
8532 size_t _batchCount,
8533 BlueStore* _store,
8534 ceph::mutex* _sb_info_lock,
8535 sb_info_space_efficient_map_t& _sb_info,
8536 shared_blob_2hash_tracker_t& _sb_ref_counts,
8537 BlueStoreRepairer* _repairer) :
8538 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
8539 batchCount(_batchCount),
8540 store(_store),
8541 sb_info_lock(_sb_info_lock),
8542 sb_info(&_sb_info),
8543 sb_ref_counts(&_sb_ref_counts),
8544 repairer(_repairer)
8545 {
8546 batches = new Batch[batchCount];
8547 }
8548 ~FSCKWorkQueue() {
8549 delete[] batches;
8550 }
8551
8552 /// Remove all work items from the queue.
8553 void _clear() override {
8554 //do nothing
8555 }
8556 /// Check whether there is anything to do.
8557 bool _empty() override {
8558 ceph_assert(false);
8559 }
8560
8561 /// Get the next work item to process.
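    /// Implementation note: starting from a random batch, try to take
    /// exclusive ownership by bumping the batch's 'running' counter
    /// (fetch_add(1) == 0 means we own it). A non-empty owned batch is
    /// returned for processing; otherwise the counter is dropped and the
    /// scan continues. Returns nullptr when no queued work is found.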
8562 void* _void_dequeue() override {
8563 size_t pos = rand() % batchCount;
8564 size_t pos0 = pos;
8565 do {
8566 auto& batch = batches[pos];
8567 if (batch.running.fetch_add(1) == 0) {
8568 if (batch.entry_count) {
8569 return &batch;
8570 }
8571 }
8572 batch.running--;
8573 pos++;
8574 pos %= batchCount;
8575 } while (pos != pos0);
8576 return nullptr;
8577 }
8578 /** @brief Process the work item.
8579 * This function will be called several times in parallel
8580 * and must therefore be thread-safe. */
8581 void _void_process(void* item, TPHandle& handle) override {
8582 Batch* batch = (Batch*)item;
8583
8584 BlueStore::FSCK_ObjectCtx ctx(
8585 batch->errors,
8586 batch->warnings,
8587 batch->num_objects,
8588 batch->num_extents,
8589 batch->num_blobs,
8590 batch->num_sharded_objects,
8591 batch->num_spanning_blobs,
8592 nullptr, // used_blocks
8593 nullptr, //used_omap_head
8594 nullptr,
8595 sb_info_lock,
8596 *sb_info,
8597 *sb_ref_counts,
8598 batch->expected_store_statfs,
8599 batch->expected_pool_statfs,
8600 repairer);
8601
8602 for (size_t i = 0; i < batch->entry_count; i++) {
8603 auto& entry = batch->entries[i];
8604
8605 store->fsck_check_objects_shallow(
8606 BlueStore::FSCK_SHALLOW,
8607 entry.pool_id,
8608 entry.c,
8609 entry.oid,
8610 entry.key,
8611 entry.value,
8612 nullptr, // expecting_shards - this will need a protection if passed
8613 nullptr, // referenced
8614 ctx);
8615 }
8616 batch->entry_count = 0;
8617 batch->running--;
8618 }
8619 /** @brief Synchronously finish processing a work item.
8620 * This function is called after _void_process with the global thread pool lock held,
8621 * so at most one copy will execute simultaneously for a given thread pool.
8622 * It can be used for non-thread-safe finalization. */
8623 void _void_process_finish(void*) override {
8624 ceph_assert(false);
8625 }
8626
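    /// Queue one object entry for shallow fsck. A batch is acquired the same
    /// way _void_dequeue does (fetch_add on 'running') and kept across calls
    /// via batch_acquired/last_batch_pos; once it holds BatchLen entries it
    /// is released for a worker thread to pick up. Returns false when no
    /// batch could be acquired, in which case the caller processes the
    /// object itself.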
8627 bool queue(
8628 int64_t pool_id,
8629 BlueStore::CollectionRef c,
8630 const ghobject_t& oid,
8631 const string& key,
8632 const bufferlist& value) {
8633 bool res = false;
8634 size_t pos0 = last_batch_pos;
8635 if (!batch_acquired) {
8636 do {
8637 auto& batch = batches[last_batch_pos];
8638 if (batch.running.fetch_add(1) == 0) {
8639 if (batch.entry_count < BatchLen) {
8640 batch_acquired = true;
8641 break;
8642 }
8643 }
8644 batch.running.fetch_sub(1);
8645 last_batch_pos++;
8646 last_batch_pos %= batchCount;
8647 } while (last_batch_pos != pos0);
8648 }
8649 if (batch_acquired) {
8650 auto& batch = batches[last_batch_pos];
8651 ceph_assert(batch.running);
8652 ceph_assert(batch.entry_count < BatchLen);
8653
8654 auto& entry = batch.entries[batch.entry_count];
8655 entry.pool_id = pool_id;
8656 entry.c = c;
8657 entry.oid = oid;
8658 entry.key = key;
8659 entry.value = value;
8660
8661 ++batch.entry_count;
8662 if (batch.entry_count == BatchLen) {
8663 batch_acquired = false;
8664 batch.running.fetch_sub(1);
8665 last_batch_pos++;
8666 last_batch_pos %= batchCount;
8667 }
8668 res = true;
8669 }
8670 return res;
8671 }
8672
8673 void finalize(ThreadPool& tp,
8674 BlueStore::FSCK_ObjectCtx& ctx) {
8675 if (batch_acquired) {
8676 auto& batch = batches[last_batch_pos];
8677 ceph_assert(batch.running);
8678 batch.running.fetch_sub(1);
8679 }
8680 tp.stop();
8681
8682 for (size_t i = 0; i < batchCount; i++) {
8683 auto& batch = batches[i];
8684
8685 //process leftovers if any
8686 if (batch.entry_count) {
8687 TPHandle tp_handle(store->cct,
8688 nullptr,
8689 timeout_interval,
8690 suicide_interval);
8691 ceph_assert(batch.running == 0);
8692
8693 batch.running++; // just to be on-par with the regular call
8694 _void_process(&batch, tp_handle);
8695 }
8696 ceph_assert(batch.entry_count == 0);
8697
8698 ctx.errors += batch.errors;
8699 ctx.warnings += batch.warnings;
8700 ctx.num_objects += batch.num_objects;
8701 ctx.num_extents += batch.num_extents;
8702 ctx.num_blobs += batch.num_blobs;
8703 ctx.num_sharded_objects += batch.num_sharded_objects;
8704 ctx.num_spanning_blobs += batch.num_spanning_blobs;
8705
8706 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8707
8708 for (auto it = batch.expected_pool_statfs.begin();
8709 it != batch.expected_pool_statfs.end();
8710 it++) {
8711 ctx.expected_pool_statfs[it->first].add(it->second);
8712 }
8713 }
8714 }
8715 };
8716};
8717
8718void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8719 OnodeRef& o,
8720 const BlueStore::FSCK_ObjectCtx& ctx)
8721{
8722 auto& errors = ctx.errors;
8723 auto& warnings = ctx.warnings;
8724 auto repairer = ctx.repairer;
8725
8726 ceph_assert(o->onode.has_omap());
8727 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
8728 if (per_pool_omap == OMAP_PER_POOL) {
8729 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8730 << "fsck error: " << o->oid
8731 << " has omap that is not per-pool or pgmeta"
8732 << fsck_dendl;
8733 ++errors;
8734 } else {
8735 const char* w;
8736 int64_t num;
8737 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8738 ++errors;
8739 num = errors;
8740 w = "error";
8741 } else {
8742 ++warnings;
8743 num = warnings;
8744 w = "warning";
8745 }
8746 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8747 << "fsck " << w << ": " << o->oid
8748 << " has omap that is not per-pool or pgmeta"
8749 << fsck_dendl;
8750 }
8751 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
8752 if (per_pool_omap == OMAP_PER_PG) {
8753 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8754 << "fsck error: " << o->oid
8755 << " has omap that is not per-pg or pgmeta"
8756 << fsck_dendl;
8757 ++errors;
8758 } else {
8759 const char* w;
8760 int64_t num;
8761 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
8762 ++errors;
8763 num = errors;
8764 w = "error";
8765 } else {
8766 ++warnings;
8767 num = warnings;
8768 w = "warning";
8769 }
8770 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8771 << "fsck " << w << ": " << o->oid
8772 << " has omap that is not per-pg or pgmeta"
8773 << fsck_dendl;
8774 }
8775 }
8776 if (repairer &&
8777 !o->onode.is_perpg_omap() &&
8778 !o->onode.is_pgmeta_omap()) {
8779 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
8780 bufferlist header;
8781 map<string, bufferlist> kv;
8782 {
8783 KeyValueDB::Transaction txn = db->get_transaction();
8784 uint64_t txn_cost = 0;
8785 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
8786 uint8_t new_flags = o->onode.flags |
8787 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8788 bluestore_onode_t::FLAG_PERPG_OMAP;
8789 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
8790
8791 KeyValueDB::Iterator it = db->get_iterator(prefix);
8792 string head, tail;
8793 o->get_omap_header(&head);
8794 o->get_omap_tail(&tail);
8795 it->lower_bound(head);
8796 // head
8797 if (it->valid() && it->key() == head) {
8798 dout(30) << __func__ << " got header" << dendl;
8799 header = it->value();
8800 if (header.length()) {
8801 string new_head;
8802 Onode::calc_omap_header(new_flags, o.get(), &new_head);
8803 txn->set(new_omap_prefix, new_head, header);
8804 txn_cost += new_head.length() + header.length();
8805 }
8806 it->next();
8807 }
8808 // tail
8809 {
8810 string new_tail;
8811 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
8812 bufferlist empty;
8813 txn->set(new_omap_prefix, new_tail, empty);
        txn_cost += new_tail.length() + empty.length();
8815 }
8816 // values
8817 string final_key;
8818 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
8819 size_t base_key_len = final_key.size();
8820 while (it->valid() && it->key() < tail) {
8821 string user_key;
8822 o->decode_omap_key(it->key(), &user_key);
8823 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
8824 << " -> " << user_key << dendl;
8825
8826 final_key.resize(base_key_len);
8827 final_key += user_key;
8828 auto v = it->value();
8829 txn->set(new_omap_prefix, final_key, v);
8830 txn_cost += final_key.length() + v.length();
8831
8832 // submit a portion if cost exceeds 16MB
8833 if (txn_cost >= 16 * (1 << 20) ) {
8834 db->submit_transaction_sync(txn);
8835 txn = db->get_transaction();
8836 txn_cost = 0;
8837 }
8838 it->next();
8839 }
8840 if (txn_cost > 0) {
8841 db->submit_transaction_sync(txn);
8842 }
8843 }
8844 // finalize: remove legacy data
8845 {
8846 KeyValueDB::Transaction txn = db->get_transaction();
8847 // remove old keys
8848 const string& old_omap_prefix = o->get_omap_prefix();
8849 string old_head, old_tail;
8850 o->get_omap_header(&old_head);
8851 o->get_omap_tail(&old_tail);
8852 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8853 txn->rmkey(old_omap_prefix, old_tail);
8854 // set flag
8855 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
8856 _record_onode(o, txn);
8857 db->submit_transaction_sync(txn);
8858 repairer->inc_repaired();
8859 repairer->request_compaction();
8860 }
8861 }
8862}
8863
8864void BlueStore::_fsck_check_objects(
8865 FSCKDepth depth,
8866 BlueStore::FSCK_ObjectCtx& ctx)
8867{
8868 auto& errors = ctx.errors;
8869 auto sb_info_lock = ctx.sb_info_lock;
8870 auto& sb_info = ctx.sb_info;
8871 auto& sb_ref_counts = ctx.sb_ref_counts;
8872 auto repairer = ctx.repairer;
8873
8874 uint64_t_btree_t used_nids;
8875
8876 size_t processed_myself = 0;
8877
8878 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8879 mempool::bluestore_fsck::list<string> expecting_shards;
8880 if (it) {
8881 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8882 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8883 std::unique_ptr<WQ> wq(
8884 new WQ(
8885 "FSCKWorkQueue",
8886 (thread_count ? : 1) * 32,
8887 this,
8888 sb_info_lock,
8889 sb_info,
8890 sb_ref_counts,
8891 repairer));
8892
8893 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8894
8895 thread_pool.add_work_queue(wq.get());
8896 if (depth == FSCK_SHALLOW && thread_count > 0) {
8897 //not the best place but let's check anyway
8898 ceph_assert(sb_info_lock);
8899 thread_pool.start();
8900 }
8901
    // fill global if not overridden below
8903 CollectionRef c;
8904 int64_t pool_id = -1;
8905 spg_t pgid;
8906 for (it->lower_bound(string()); it->valid(); it->next()) {
8907 dout(30) << __func__ << " key "
8908 << pretty_binary_string(it->key()) << dendl;
8909 if (is_extent_shard_key(it->key())) {
8910 if (depth == FSCK_SHALLOW) {
8911 continue;
8912 }
8913 while (!expecting_shards.empty() &&
8914 expecting_shards.front() < it->key()) {
8915 derr << "fsck error: missing shard key "
8916 << pretty_binary_string(expecting_shards.front())
8917 << dendl;
8918 ++errors;
8919 expecting_shards.pop_front();
8920 }
8921 if (!expecting_shards.empty() &&
8922 expecting_shards.front() == it->key()) {
8923 // all good
8924 expecting_shards.pop_front();
8925 continue;
8926 }
8927
8928 uint32_t offset;
8929 string okey;
8930 get_key_extent_shard(it->key(), &okey, &offset);
8931 derr << "fsck error: stray shard 0x" << std::hex << offset
8932 << std::dec << dendl;
8933 if (expecting_shards.empty()) {
8934 derr << "fsck error: " << pretty_binary_string(it->key())
8935 << " is unexpected" << dendl;
8936 ++errors;
8937 continue;
8938 }
8939 while (expecting_shards.front() > it->key()) {
8940 derr << "fsck error: saw " << pretty_binary_string(it->key())
8941 << dendl;
8942 derr << "fsck error: exp "
8943 << pretty_binary_string(expecting_shards.front()) << dendl;
8944 ++errors;
8945 expecting_shards.pop_front();
8946 if (expecting_shards.empty()) {
8947 break;
8948 }
8949 }
8950 continue;
8951 }
8952
8953 ghobject_t oid;
8954 int r = get_key_object(it->key(), &oid);
8955 if (r < 0) {
8956 derr << "fsck error: bad object key "
8957 << pretty_binary_string(it->key()) << dendl;
8958 ++errors;
8959 continue;
8960 }
8961 if (!c ||
8962 oid.shard_id != pgid.shard ||
8963 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8964 !c->contains(oid)) {
8965 c = nullptr;
8966 for (auto& p : coll_map) {
8967 if (p.second->contains(oid)) {
8968 c = p.second;
8969 break;
8970 }
8971 }
8972 if (!c) {
8973 derr << "fsck error: stray object " << oid
8974 << " not owned by any collection" << dendl;
8975 ++errors;
8976 continue;
8977 }
8978 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8979 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8980 << dendl;
8981 }
8982
8983 if (depth != FSCK_SHALLOW &&
8984 !expecting_shards.empty()) {
8985 for (auto& k : expecting_shards) {
8986 derr << "fsck error: missing shard key "
8987 << pretty_binary_string(k) << dendl;
8988 }
8989 ++errors;
8990 expecting_shards.clear();
8991 }
8992
8993 bool queued = false;
8994 if (depth == FSCK_SHALLOW && thread_count > 0) {
8995 queued = wq->queue(
8996 pool_id,
8997 c,
8998 oid,
8999 it->key(),
9000 it->value());
9001 }
9002 OnodeRef o;
9003 map<BlobRef, bluestore_blob_t::unused_t> referenced;
9004
9005 if (!queued) {
9006 ++processed_myself;
9007 o = fsck_check_objects_shallow(
9008 depth,
9009 pool_id,
9010 c,
9011 oid,
9012 it->key(),
9013 it->value(),
9014 &expecting_shards,
9015 &referenced,
9016 ctx);
9017 }
9018
9019 if (depth != FSCK_SHALLOW) {
9020 ceph_assert(o != nullptr);
9021 if (o->onode.nid) {
9022 if (o->onode.nid > nid_max) {
9023 derr << "fsck error: " << oid << " nid " << o->onode.nid
9024 << " > nid_max " << nid_max << dendl;
9025 ++errors;
9026 }
9027 if (used_nids.count(o->onode.nid)) {
9028 derr << "fsck error: " << oid << " nid " << o->onode.nid
9029 << " already in use" << dendl;
9030 ++errors;
9031 continue; // go for next object
9032 }
9033 used_nids.insert(o->onode.nid);
9034 }
9035 for (auto& i : referenced) {
9036 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
9037 << std::dec << " for " << *i.first << dendl;
9038 const bluestore_blob_t& blob = i.first->get_blob();
9039 if (i.second & blob.unused) {
9040 derr << "fsck error: " << oid << " blob claims unused 0x"
9041 << std::hex << blob.unused
9042 << " but extents reference 0x" << i.second << std::dec
9043 << " on blob " << *i.first << dendl;
9044 ++errors;
9045 }
9046 if (blob.has_csum()) {
9047 uint64_t blob_len = blob.get_logical_length();
9048 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
9049 unsigned csum_count = blob.get_csum_count();
9050 unsigned csum_chunk_size = blob.get_csum_chunk_size();
9051 for (unsigned p = 0; p < csum_count; ++p) {
9052 unsigned pos = p * csum_chunk_size;
9053 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
9054 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
9055 unsigned mask = 1u << firstbit;
9056 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
9057 mask |= 1u << b;
9058 }
9059 if ((blob.unused & mask) == mask) {
9060 // this csum chunk region is marked unused
9061 if (blob.get_csum_item(p) != 0) {
9062 derr << "fsck error: " << oid
9063 << " blob claims csum chunk 0x" << std::hex << pos
9064 << "~" << csum_chunk_size
9065 << " is unused (mask 0x" << mask << " of unused 0x"
9066 << blob.unused << ") but csum is non-zero 0x"
9067 << blob.get_csum_item(p) << std::dec << " on blob "
9068 << *i.first << dendl;
9069 ++errors;
9070 }
9071 }
9072 }
9073 }
9074 }
9075 // omap
9076 if (o->onode.has_omap()) {
9077 ceph_assert(ctx.used_omap_head);
9078 if (ctx.used_omap_head->count(o->onode.nid)) {
9079 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
9080 << " already in use" << dendl;
9081 ++errors;
9082 } else {
9083 ctx.used_omap_head->insert(o->onode.nid);
9084 }
9085 } // if (o->onode.has_omap())
9086 if (depth == FSCK_DEEP) {
9087 bufferlist bl;
9088 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
9089 uint64_t offset = 0;
9090 do {
9091 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
9092 int r = _do_read(c.get(), o, offset, l, bl,
9093 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
9094 if (r < 0) {
9095 ++errors;
9096 derr << "fsck error: " << oid << std::hex
9097 << " error during read: "
9098 << " " << offset << "~" << l
9099 << " " << cpp_strerror(r) << std::dec
9100 << dendl;
9101 break;
9102 }
9103 offset += l;
9104 } while (offset < o->onode.size);
9105 } // deep
9106 } //if (depth != FSCK_SHALLOW)
9107 } // for (it->lower_bound(string()); it->valid(); it->next())
9108 if (depth == FSCK_SHALLOW && thread_count > 0) {
9109 wq->finalize(thread_pool, ctx);
9110 if (processed_myself) {
      // maybe it needs more threads?
      dout(0) << __func__ << " partial offload"
              << ", done myself " << processed_myself
              << " of " << ctx.num_objects
              << " objects, threads " << thread_count
9116 << dendl;
9117 }
9118 }
9119 } // if (it)
9120}
9121/**
9122An overview for currently implemented repair logics
9123performed in fsck in two stages: detection(+preparation) and commit.
9124Detection stage (in processing order):
9125 (Issue -> Repair action to schedule)
9126 - Detect undecodable keys for Shared Blobs -> Remove
9127 - Detect undecodable records for Shared Blobs -> Remove
9128 (might trigger missed Shared Blob detection below)
9129 - Detect stray records for Shared Blobs -> Remove
9130 - Detect misreferenced pextents -> Fix
9131 Prepare Bloom-like filter to track cid/oid -> pextent
9132 Prepare list of extents that are improperly referenced
9133 Enumerate Onode records that might use 'misreferenced' pextents
9134 (Bloom-like filter applied to reduce computation)
 For each questionable Onode enumerate all blobs and identify broken ones
9136 (i.e. blobs having 'misreferences')
9137 Rewrite each broken blob data by allocating another extents and
9138 copying data there
9139 If blob is shared - unshare it and mark corresponding Shared Blob
9140 for removal
9141 Release previously allocated space
9142 Update Extent Map
9143 - Detect missed Shared Blobs -> Recreate
9144 - Detect undecodable deferred transaction -> Remove
9145 - Detect Freelist Manager's 'false free' entries -> Mark as used
9146 - Detect Freelist Manager's leaked entries -> Mark as free
 - Detect statfs inconsistency -> Update
9148 Commit stage (separate DB commit per each step):
9149 - Apply leaked FM entries fix
9150 - Apply 'false free' FM entries fix
9151 - Apply 'Remove' actions
9152 - Apply fix for misreference pextents
9153 - Apply Shared Blob recreate
 (can be merged with the step above if misreferences were detected)
9155 - Apply StatFS update
9156*/
9157int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
9158{
9159 dout(5) << __func__
9160 << (repair ? " repair" : " check")
9161 << (depth == FSCK_DEEP ? " (deep)" :
9162 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9163 << dendl;
9164
  // in deep mode we need R/W access to be able to replay deferred ops
9166 const bool read_only = !(repair || depth == FSCK_DEEP);
9167 int r = _open_db_and_around(read_only);
9168 if (r < 0) {
9169 return r;
9170 }
9171 auto close_db = make_scope_guard([&] {
9172 _close_db_and_around();
9173 });
9174
9175 if (!read_only) {
9176 r = _upgrade_super();
9177 if (r < 0) {
9178 return r;
9179 }
9180 }
9181
9182 // NullFreelistManager needs to open collection early
9183 r = _open_collections();
9184 if (r < 0) {
9185 return r;
9186 }
9187
9188 mempool_thread.init();
9189 auto stop_mempool = make_scope_guard([&] {
9190 mempool_thread.shutdown();
9191 _shutdown_cache();
9192 });
  // we need finisher and kv_{sync,finalize}_thread *just* for replay,
  // so enable them in repair or deep mode only
9195 if (!read_only) {
9196 _kv_start();
9197 r = _deferred_replay();
9198 _kv_stop();
9199 }
9200
9201 if (r < 0) {
9202 return r;
9203 }
9204 return _fsck_on_open(depth, repair);
9205}
9206
9207int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
9208{
9209 uint64_t sb_hash_size = uint64_t(
9210 cct->_conf.get_val<Option::size_t>("osd_memory_target") *
9211 cct->_conf.get_val<double>(
9212 "bluestore_fsck_shared_blob_tracker_size"));
9213
9214 dout(1) << __func__
9215 << " <<<START>>>"
9216 << (repair ? " repair" : " check")
9217 << (depth == FSCK_DEEP ? " (deep)" :
9218 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9219 << " start sb_tracker_hash_size:" << sb_hash_size
9220 << dendl;
9221 int64_t errors = 0;
9222 int64_t warnings = 0;
9223 unsigned repaired = 0;
9224
9225 uint64_t_btree_t used_omap_head;
9226 uint64_t_btree_t used_sbids;
9227
9228 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
9229 KeyValueDB::Iterator it;
9230 store_statfs_t expected_store_statfs;
9231 per_pool_statfs expected_pool_statfs;
9232
9233 sb_info_space_efficient_map_t sb_info;
9234 shared_blob_2hash_tracker_t sb_ref_counts(
9235 sb_hash_size,
9236 min_alloc_size);
9237 size_t sb_ref_mismatches = 0;
9238
9239 /// map of oid -> (first_)offset for each zone
9240 std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
9241
9242 uint64_t num_objects = 0;
9243 uint64_t num_extents = 0;
9244 uint64_t num_blobs = 0;
9245 uint64_t num_spanning_blobs = 0;
9246 uint64_t num_shared_blobs = 0;
9247 uint64_t num_sharded_objects = 0;
9248 BlueStoreRepairer repairer;
9249
9250 auto alloc_size = fm->get_alloc_size();
9251
9252 utime_t start = ceph_clock_now();
9253
9254 _fsck_collections(&errors);
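  // one bit per allocation unit; extents owned by BlueFS on the shared
  // device and the SUPER_RESERVED region are pre-marked as used below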
9255 used_blocks.resize(fm->get_alloc_units());
9256
9257 if (bluefs) {
9258 interval_set<uint64_t> bluefs_extents;
9259
9260 bluefs->foreach_block_extents(
9261 bluefs_layout.shared_bdev,
9262 [&](uint64_t start, uint32_t len) {
9263 apply_for_bitset_range(start, len, alloc_size, used_blocks,
9264 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
9265 ceph_assert(pos < bs.size());
9266 bs.set(pos);
9267 }
9268 );
9269 }
9270 );
9271 }
9272
9273 bluefs_used_blocks = used_blocks;
9274
9275 apply_for_bitset_range(
9276 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
9277 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9278 bs.set(pos);
9279 }
9280 );
9281
9282
9283 if (repair) {
9284 repairer.init_space_usage_tracker(
9285 bdev->get_size(),
9286 min_alloc_size);
9287 }
9288
9289 if (bluefs) {
9290 int r = bluefs->fsck();
9291 if (r < 0) {
9292 return r;
9293 }
9294 if (r > 0)
9295 errors += r;
9296 }
9297
9298 if (!per_pool_stat_collection) {
9299 const char *w;
9300 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
9301 w = "error";
9302 ++errors;
9303 } else {
9304 w = "warning";
9305 ++warnings;
9306 }
9307 derr << "fsck " << w << ": store not yet converted to per-pool stats"
9308 << dendl;
9309 }
9310 if (per_pool_omap != OMAP_PER_PG) {
9311 const char *w;
9312 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
9313 w = "error";
9314 ++errors;
9315 } else {
9316 w = "warning";
9317 ++warnings;
9318 }
9319 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9320 << dendl;
9321 }
9322
9323 if (g_conf()->bluestore_debug_fsck_abort) {
9324 dout(1) << __func__ << " debug abort" << dendl;
9325 goto out_scan;
9326 }
9327
9328#ifdef HAVE_LIBZBD
9329 if (bdev->is_smr()) {
9330 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9331 ceph_assert(a);
9332 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
9333 ceph_assert(f);
9334 vector<uint64_t> wp = bdev->get_zones();
9335 vector<zone_state_t> zones = f->get_zone_states(db);
9336 ceph_assert(wp.size() == zones.size());
9337 auto num_zones = bdev->get_size() / zone_size;
9338 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
9339 uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
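      // wp[i] is an absolute device offset; the line above normalizes it to
      // a zone-relative value: a write pointer sitting exactly at the start
      // of the next zone means the zone is full (p = zone_size), otherwise
      // take wp modulo zone_size.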
9340 if (zones[i].write_pointer > p &&
9341 zones[i].num_dead_bytes < zones[i].write_pointer) {
9342 derr << "fsck error: zone 0x" << std::hex << i
9343 << " bluestore write pointer 0x" << zones[i].write_pointer
9344 << " > device write pointer 0x" << p
9345 << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
9346 << std::dec << dendl;
9347 ++errors;
9348 }
9349 }
9350
9351 if (depth != FSCK_SHALLOW) {
9352 // load zone refs
9353 zone_refs.resize(bdev->get_size() / zone_size);
9354 it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
9355 if (it) {
9356 for (it->lower_bound(string());
9357 it->valid();
9358 it->next()) {
9359 uint32_t zone = 0;
9360 uint64_t offset = 0;
9361 ghobject_t oid;
9362 string key = it->key();
9363 int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
9364 if (r < 0) {
9365 derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
9366 << dendl;
9367 if (repair) {
9368 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9369 }
9370 ++errors;
9371 continue;
9372 }
9373 dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
9374 << " -> " << std::dec << oid << dendl;
9375 if (zone_refs[zone].count(oid)) {
9376 derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
9377 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
9378 if (repair) {
9379 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9380 }
9381 ++errors;
9382 continue;
9383 }
9384 zone_refs[zone][oid] = offset;
9385 }
9386 }
9387 }
9388 }
9389#endif
9390
9391 dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
9392 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9393 if (it) {
9394 for (it->lower_bound(string()); it->valid(); it->next()) {
9395 string key = it->key();
9396 uint64_t sbid;
9397 if (get_key_shared_blob(key, &sbid) < 0) {
9398 // Failed to parse the key.
9399 // This will be handled at the second stage
9400 continue;
9401 }
9402 bluestore_shared_blob_t shared_blob(sbid);
9403 bufferlist bl = it->value();
9404 auto blp = bl.cbegin();
9405 try {
9406 decode(shared_blob, blp);
9407 }
9408 catch (ceph::buffer::error& e) {
9409 // this will be handled at the second stage
9410 continue;
9411 }
9412 dout(20) << __func__ << " " << shared_blob << dendl;
9413 auto& sbi = sb_info.add_maybe_stray(sbid);
9414
9415 // primarily to silence the 'unused' warning
9416 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
9417
9418 for (auto& r : shared_blob.ref_map.ref_map) {
9419 sb_ref_counts.inc_range(
9420 sbid,
9421 r.first,
9422 r.second.length,
9423 -r.second.refs);
9424 }
9425 }
9426 } // if (it) //checking shared_blobs (phase1)
9427
9428 // walk PREFIX_OBJ
9429 {
9430 dout(1) << __func__ << " walking object keyspace" << dendl;
9431 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
9432 BlueStore::FSCK_ObjectCtx ctx(
9433 errors,
9434 warnings,
9435 num_objects,
9436 num_extents,
9437 num_blobs,
9438 num_sharded_objects,
9439 num_spanning_blobs,
9440 &used_blocks,
9441 &used_omap_head,
9442 &zone_refs,
9443 // no need for the lock below when in non-shallow mode, as
9444 // there is no multithreading in that case
9445 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
9446 sb_info,
9447 sb_ref_counts,
9448 expected_store_statfs,
9449 expected_pool_statfs,
9450 repair ? &repairer : nullptr);
9451
9452 _fsck_check_objects(depth, ctx);
9453 }
9454
9455#ifdef HAVE_LIBZBD
9456 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
9457 dout(1) << __func__ << " checking for leaked zone refs" << dendl;
9458 for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
9459 for (auto& [oid, offset] : zone_refs[zone]) {
9460 derr << "fsck error: stray zone ref 0x" << std::hex << zone
9461 << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
9462 // FIXME: add repair
9463 ++errors;
9464 }
9465 }
9466 }
9467#endif
9468
9469 sb_ref_mismatches = sb_ref_counts.count_non_zero();
9470 if (sb_ref_mismatches != 0) {
9471 derr << "fsck error:" << "*" << sb_ref_mismatches
9472 << " shared blob references aren't matching, at least "
9473 << sb_ref_mismatches << " found" << dendl;
9474 errors += sb_ref_mismatches;
9475 }
9476
9477 if (depth != FSCK_SHALLOW && repair) {
9478 _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
9479 }
9480 dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
9481 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9482 if (it) {
9483 // FIXME minor: perhaps simplify for shallow mode?
9484 // fill global if not overridden below
9485 auto expected_statfs = &expected_store_statfs;
9486 for (it->lower_bound(string()); it->valid(); it->next()) {
9487 string key = it->key();
9488 uint64_t sbid;
9489 if (get_key_shared_blob(key, &sbid)) {
9490 derr << "fsck error: bad key '" << key
9491 << "' in shared blob namespace" << dendl;
9492 if (repair) {
9493 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9494 }
9495 ++errors;
9496 continue;
9497 }
9498 auto p = sb_info.find(sbid);
9499 if (p == sb_info.end()) {
9500 if (sb_ref_mismatches > 0) {
9501 // highly likely this has been already reported before, ignoring...
9502 dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
9503 << std::hex << sbid << std::dec << dendl;
9504 } else {
9505 derr << "fsck error: found stray shared blob data for sbid 0x"
9506 << std::hex << sbid << std::dec << dendl;
9507 ++errors;
9508 if (repair) {
9509 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9510 }
9511 }
9512 } else {
9513 ++num_shared_blobs;
9514 sb_info_t& sbi = *p;
9515 bluestore_shared_blob_t shared_blob(sbid);
9516 bufferlist bl = it->value();
9517 auto blp = bl.cbegin();
9518 try {
9519 decode(shared_blob, blp);
9520 }
9521 catch (ceph::buffer::error& e) {
9522 ++errors;
9523
9524 derr << "fsck error: failed to decode Shared Blob"
9525 << pretty_binary_string(key) << dendl;
9526 if (repair) {
9527 dout(20) << __func__ << " undecodable Shared Blob, key:'"
9528 << pretty_binary_string(key)
9529 << "', removing" << dendl;
9530 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9531 }
9532 continue;
9533 }
9534 dout(20) << __func__ << " " << shared_blob << dendl;
9535 PExtentVector extents;
9536 for (auto& r : shared_blob.ref_map.ref_map) {
9537 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
9538 }
9539 if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
9540 (per_pool_stat_collection || repair)) {
9541 expected_statfs = &expected_pool_statfs[sbi.pool_id];
9542 }
9543 std::stringstream ss;
9544 ss << "sbid 0x" << std::hex << sbid << std::dec;
9545 errors += _fsck_check_extents(ss.str(),
9546 extents,
9547 sbi.allocated_chunks < 0,
9548 used_blocks,
9549 fm->get_alloc_size(),
9550 repair ? &repairer : nullptr,
9551 *expected_statfs,
9552 depth);
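        // note: sb_info_t::allocated_chunks uses a sign convention - a
        // negative value marks the shared blob's space as compressed, which
        // is why 'allocated_chunks < 0' doubles as the compressed flag in
        // the call above and in the misreference repair path further down.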
9553 }
9554 }
9555 } // if (it) /* checking shared_blobs (phase 2)*/
9556
9557 if (repair && repairer.preprocess_misreference(db)) {
9558
9559 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
9560 auto& misref_extents = repairer.get_misreferences();
9561 interval_set<uint64_t> to_release;
9562 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
9563 if (it) {
9564 // fill global if not overridden below
9565 auto expected_statfs = &expected_store_statfs;
9566
9567 CollectionRef c;
9568 spg_t pgid;
9569 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
9570 bool bypass_rest = false;
9571 for (it->lower_bound(string()); it->valid() && !bypass_rest;
9572 it->next()) {
9573 dout(30) << __func__ << " key "
9574 << pretty_binary_string(it->key()) << dendl;
9575 if (is_extent_shard_key(it->key())) {
9576 continue;
9577 }
9578
9579 ghobject_t oid;
9580 int r = get_key_object(it->key(), &oid);
9581 if (r < 0 || !repairer.is_used(oid)) {
9582 continue;
9583 }
9584
9585 if (!c ||
9586 oid.shard_id != pgid.shard ||
9587 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
9588 !c->contains(oid)) {
9589 c = nullptr;
9590 for (auto& p : coll_map) {
9591 if (p.second->contains(oid)) {
9592 c = p.second;
9593 break;
9594 }
9595 }
9596 if (!c) {
9597 continue;
9598 }
9599 if (per_pool_stat_collection || repair) {
9600 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
9601 expected_statfs = &expected_pool_statfs[pool_id];
9602 }
9603 }
9604 if (!repairer.is_used(c->cid)) {
9605 continue;
9606 }
9607
9608 dout(20) << __func__ << " check misreference for col:" << c->cid
9609 << " obj:" << oid << dendl;
9610
9611 OnodeRef o;
9612 o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
9613 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9614 mempool::bluestore_fsck::set<BlobRef> blobs;
9615
9616 for (auto& e : o->extent_map.extent_map) {
9617 blobs.insert(e.blob);
9618 }
9619 bool need_onode_update = false;
9620 bool first_dump = true;
9621 for(auto b : blobs) {
9622 bool broken_blob = false;
9623 auto& pextents = b->dirty_blob().dirty_extents();
9624 for (auto& e : pextents) {
9625 if (!e.is_valid()) {
9626 continue;
9627 }
9628 // for the sake of simplicity and proper shared blob handling
9629 // always rewrite the whole blob even when it's partially
9630 // misreferenced.
9631 if (misref_extents.intersects(e.offset, e.length)) {
9632 if (first_dump) {
9633 first_dump = false;
9634 _dump_onode<10>(cct, *o);
9635 }
9636 broken_blob = true;
9637 break;
9638 }
9639 }
9640 if (!broken_blob)
9641 continue;
9642 bool compressed = b->get_blob().is_compressed();
9643 need_onode_update = true;
9644 dout(10) << __func__
9645 << " fix misreferences in oid:" << oid
9646 << " " << *b << dendl;
9647 uint64_t b_off = 0;
9648 PExtentVector pext_to_release;
9649 pext_to_release.reserve(pextents.size());
9650 // rewriting all valid pextents
9651 for (auto e = pextents.begin(); e != pextents.end();
9652 e++) {
9653 auto b_off_cur = b_off;
9654 b_off += e->length;
9655 if (!e->is_valid()) {
9656 continue;
9657 }
9658 PExtentVector exts;
9659 dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
9660 int64_t alloc_len =
9661 alloc->allocate(e->length, min_alloc_size,
9662 0, 0, &exts);
9663 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
9664 derr << __func__
9665 << " failed to allocate 0x" << std::hex << e->length
9666 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
9667 << " min_alloc_size 0x" << min_alloc_size
9668 << " available 0x " << alloc->get_free()
9669 << std::dec << dendl;
9670 if (alloc_len > 0) {
9671 alloc->release(exts);
9672 }
9673 bypass_rest = true;
9674 break;
9675 }
9676 expected_statfs->allocated += e->length;
9677 if (compressed) {
9678 expected_statfs->data_compressed_allocated += e->length;
9679 }
9680
9681 bufferlist bl;
9682 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9683 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
9684 if (r < 0) {
9685 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
9686 <<"~" << e->length << std::dec << dendl;
9687 ceph_abort_msg("read failed, wtf");
9688 }
9689 pext_to_release.push_back(*e);
9690 e = pextents.erase(e);
9691 e = pextents.insert(e, exts.begin(), exts.end());
9692 b->get_blob().map_bl(
9693 b_off_cur, bl,
9694 [&](uint64_t offset, bufferlist& t) {
9695 int r = bdev->write(offset, t, false);
9696 ceph_assert(r == 0);
9697 });
9698 e += exts.size() - 1;
9699 for (auto& p : exts) {
9700 fm->allocate(p.offset, p.length, txn);
9701 }
9702 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
9703
9704 if (b->get_blob().is_shared()) {
9705 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
9706
9707 auto sbid = b->shared_blob->get_sbid();
9708 auto sb_it = sb_info.find(sbid);
9709 ceph_assert(sb_it != sb_info.end());
9710 sb_info_t& sbi = *sb_it;
9711
9712 if (sbi.allocated_chunks < 0) {
9713 // NB: it's crucial to use allocated_chunks from sb_info_t
9714 // as we originally used that value while accumulating
9715 // expected_statfs
9716 expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9717 expected_statfs->data_compressed_allocated -=
9718 uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9719 } else {
9720 expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
9721 }
9722 sbi.allocated_chunks = 0;
9723 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
9724
9725 // relying on blob's pextents to decide what to release.
9726 for (auto& p : pext_to_release) {
9727 to_release.union_insert(p.offset, p.length);
9728 }
9729 } else {
9730 for (auto& p : pext_to_release) {
9731 expected_statfs->allocated -= p.length;
9732 if (compressed) {
9733 expected_statfs->data_compressed_allocated -= p.length;
9734 }
9735 to_release.union_insert(p.offset, p.length);
9736 }
9737 }
9738 if (bypass_rest) {
9739 break;
9740 }
9741 } // for(auto b : blobs)
9742 if (need_onode_update) {
9743 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
9744 _record_onode(o, txn);
9745 }
9746 } // for (it->lower_bound(string()); it->valid(); it->next())
9747
9748 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
9749 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9750 << "~" << it.get_len() << std::dec << dendl;
9751 fm->release(it.get_start(), it.get_len(), txn);
9752 }
9753 alloc->release(to_release);
9754 to_release.clear();
9755 } // if (it) {
9756 } //if (repair && repairer.preprocess_misreference()) {
9757 sb_info.clear();
9758 sb_ref_counts.reset();
9759
9760 dout(1) << __func__ << " checking pool_statfs" << dendl;
9761 _fsck_check_statfs(expected_store_statfs, expected_pool_statfs,
9762 errors, warnings, repair ? &repairer : nullptr);
9763 if (depth != FSCK_SHALLOW) {
9764 dout(1) << __func__ << " checking for stray omap data " << dendl;
9765 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9766 if (it) {
9767 uint64_t last_omap_head = 0;
9768 for (it->lower_bound(string()); it->valid(); it->next()) {
9769 uint64_t omap_head;
9770
9771 _key_decode_u64(it->key().c_str(), &omap_head);
9772
9773 if (used_omap_head.count(omap_head) == 0 &&
9774 omap_head != last_omap_head) {
9775 pair<string,string> rk = it->raw_key();
9776 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9777 << "fsck error: found stray omap data on omap_head "
9778 << omap_head << " " << last_omap_head
9779 << " prefix/key: " << url_escape(rk.first)
9780 << " " << url_escape(rk.second)
9781 << fsck_dendl;
9782 ++errors;
9783 last_omap_head = omap_head;
9784 }
9785 }
9786 }
9787 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9788 if (it) {
9789 uint64_t last_omap_head = 0;
9790 for (it->lower_bound(string()); it->valid(); it->next()) {
9791 uint64_t omap_head;
9792 _key_decode_u64(it->key().c_str(), &omap_head);
9793 if (used_omap_head.count(omap_head) == 0 &&
9794 omap_head != last_omap_head) {
9795 pair<string,string> rk = it->raw_key();
9796 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9797 << "fsck error: found stray (pgmeta) omap data on omap_head "
9798 << omap_head << " " << last_omap_head
9799 << " prefix/key: " << url_escape(rk.first)
9800 << " " << url_escape(rk.second)
9801 << fsck_dendl;
9802 last_omap_head = omap_head;
9803 ++errors;
9804 }
9805 }
9806 }
9807 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9808 if (it) {
9809 uint64_t last_omap_head = 0;
9810 for (it->lower_bound(string()); it->valid(); it->next()) {
9811 uint64_t pool;
9812 uint64_t omap_head;
9813 string k = it->key();
9814 const char *c = k.c_str();
9815 c = _key_decode_u64(c, &pool);
9816 c = _key_decode_u64(c, &omap_head);
9817 if (used_omap_head.count(omap_head) == 0 &&
9818 omap_head != last_omap_head) {
9819 pair<string,string> rk = it->raw_key();
9820 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9821 << "fsck error: found stray (per-pool) omap data on omap_head "
9822 << omap_head << " " << last_omap_head
9823 << " prefix/key: " << url_escape(rk.first)
9824 << " " << url_escape(rk.second)
9825 << fsck_dendl;
9826 ++errors;
9827 last_omap_head = omap_head;
9828 }
9829 }
9830 }
9831 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9832 if (it) {
9833 uint64_t last_omap_head = 0;
9834 for (it->lower_bound(string()); it->valid(); it->next()) {
9835 uint64_t pool;
9836 uint32_t hash;
9837 uint64_t omap_head;
9838 string k = it->key();
9839 const char* c = k.c_str();
9840 c = _key_decode_u64(c, &pool);
9841 c = _key_decode_u32(c, &hash);
9842 c = _key_decode_u64(c, &omap_head);
9843 if (used_omap_head.count(omap_head) == 0 &&
9844 omap_head != last_omap_head) {
9845 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9846 << "fsck error: found stray (per-pg) omap data on omap_head "
9847 << " key " << pretty_binary_string(it->key())
9848 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9849 ++errors;
9850 last_omap_head = omap_head;
9851 }
9852 }
9853 }
9854 dout(1) << __func__ << " checking deferred events" << dendl;
9855 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
9856 if (it) {
9857 for (it->lower_bound(string()); it->valid(); it->next()) {
9858 bufferlist bl = it->value();
9859 auto p = bl.cbegin();
9860 bluestore_deferred_transaction_t wt;
9861 try {
9862 decode(wt, p);
9863 } catch (ceph::buffer::error& e) {
9864 derr << "fsck error: failed to decode deferred txn "
9865 << pretty_binary_string(it->key()) << dendl;
9866 if (repair) {
9867 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9868 << pretty_binary_string(it->key())
9869 << "', removing" << dendl;
9870 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9871 }
9872 continue;
9873 }
9874 dout(20) << __func__ << " deferred " << wt.seq
9875 << " ops " << wt.ops.size()
9876 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9877 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9878 apply_for_bitset_range(
9879 e.get_start(), e.get_len(), alloc_size, used_blocks,
9880 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9881 bs.set(pos);
9882 }
9883 );
9884 }
9885 }
9886 }
9887
9888 // skip freelist vs allocated compare when we have Null fm
9889 if (!fm->is_null_manager()) {
9890 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9891#ifdef HAVE_LIBZBD
9892 if (freelist_type == "zoned") {
9893 // verify per-zone state
9894 // - verify no allocations beyond write pointer
9895 // - verify num_dead_bytes count (neither allocated nor
9896 // free space past the write pointer)
9897 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9898 auto num_zones = bdev->get_size() / zone_size;
9899
9900 // mark the free space past the write pointer
9901 for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
9902 auto wp = a->get_write_pointer(zone);
9903 uint64_t offset = zone_size * zone + wp;
9904 uint64_t length = zone_size - wp;
9905 if (!length) {
9906 continue;
9907 }
9908 bool intersects = false;
9909 dout(10) << " marking zone 0x" << std::hex << zone
9910 << " region after wp 0x" << offset << "~" << length
9911 << std::dec << dendl;
9912 apply_for_bitset_range(
9913 offset, length, alloc_size, used_blocks,
9914 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9915 if (bs.test(pos)) {
9916 derr << "fsck error: zone 0x" << std::hex << zone
9917 << " has used space at 0x" << pos * alloc_size
9918 << " beyond write pointer 0x" << wp
9919 << std::dec << dendl;
9920 intersects = true;
9921 } else {
9922 bs.set(pos);
9923 }
9924 }
9925 );
9926 if (intersects) {
9927 ++errors;
9928 }
9929 }
9930
9931 used_blocks.flip();
9932
9933 // skip conventional zones
9934 uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
9935 pos = used_blocks.find_next(pos);
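      // note: find_next() returns the first set bit strictly after its
      // argument, so starting from (first sequential allocation unit - 1)
      // lands on the first candidate bit at or past the conventional /
      // sequential zone boundary.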
9936
9937 uint64_t zone_dead = 0;
9938 for (uint32_t zone = first_sequential_zone;
9939 zone < num_zones;
9940 ++zone, zone_dead = 0) {
9941 while (pos != decltype(used_blocks)::npos &&
9942 (pos * min_alloc_size) / zone_size == zone) {
9943 dout(40) << " zone 0x" << std::hex << zone
9944 << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
9945 << std::dec << dendl;
9946 zone_dead += min_alloc_size;
9947 pos = used_blocks.find_next(pos);
9948 }
9949 dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
9950 << std::dec << dendl;
9951 // cross-check dead bytes against zone state
9952 if (a->get_dead_bytes(zone) != zone_dead) {
9953 derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
9954 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
9955 << dendl;
9956 ++errors;
9957 // TODO: repair
9958 }
9959 }
9960 used_blocks.flip();
9961 } else
9962#endif
9963 {
9964 fm->enumerate_reset();
9965 uint64_t offset, length;
9966 while (fm->enumerate_next(db, &offset, &length)) {
9967 bool intersects = false;
9968 apply_for_bitset_range(
9969 offset, length, alloc_size, used_blocks,
9970 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9971 ceph_assert(pos < bs.size());
9972 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
9973 if (offset == SUPER_RESERVED &&
9974 length == min_alloc_size - SUPER_RESERVED) {
9975 // this is due to the change just after luminous to min_alloc_size
9976 // granularity allocations, and our baked in assumption at the top
9977 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9978 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9979 // since we will never allocate this region below min_alloc_size.
9980 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9981 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9982 << length << std::dec << dendl;
9983 } else {
9984 intersects = true;
9985 if (repair) {
9986 repairer.fix_false_free(db, fm,
9987 pos * min_alloc_size,
9988 min_alloc_size);
9989 }
9990 }
9991 } else {
9992 bs.set(pos);
9993 }
9994 }
9995 );
9996 if (intersects) {
9997 derr << "fsck error: free extent 0x" << std::hex << offset
9998 << "~" << length << std::dec
9999 << " intersects allocated blocks" << dendl;
10000 ++errors;
10001 }
10002 }
10003 fm->enumerate_reset();
10004
10005 // check for leaked extents
10006 size_t count = used_blocks.count();
10007 if (used_blocks.size() != count) {
10008 ceph_assert(used_blocks.size() > count);
10009 used_blocks.flip();
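        // after the flip the set bits are exactly the allocation units that
        // neither the freelist, bluefs, the superblock reservation nor any
        // object accounted for; the nested loops below coalesce runs of
        // consecutive set bits into single leaked-extent reports (and
        // repairs when in repair mode).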
10010 size_t start = used_blocks.find_first();
10011 while (start != decltype(used_blocks)::npos) {
10012 size_t cur = start;
10013 while (true) {
10014 size_t next = used_blocks.find_next(cur);
10015 if (next != cur + 1) {
10016 ++errors;
10017 derr << "fsck error: leaked extent 0x" << std::hex
10018 << ((uint64_t)start * fm->get_alloc_size()) << "~"
10019 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
10020 << dendl;
10021 if (repair) {
10022 repairer.fix_leaked(db,
10023 fm,
10024 start * min_alloc_size,
10025 (cur + 1 - start) * min_alloc_size);
10026 }
10027 start = next;
10028 break;
10029 }
10030 cur = next;
10031 }
10032 }
10033 used_blocks.flip();
10034 }
10035 }
10036 }
10037 }
10038 if (repair) {
10039 if (per_pool_omap != OMAP_PER_PG) {
10040 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
10041 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
10042 }
10043
10044 dout(5) << __func__ << " applying repair results" << dendl;
10045 repaired = repairer.apply(db);
10046 dout(5) << __func__ << " repair applied" << dendl;
10047 }
10048
10049out_scan:
10050 dout(2) << __func__ << " " << num_objects << " objects, "
10051 << num_sharded_objects << " of them sharded. "
10052 << dendl;
10053 dout(2) << __func__ << " " << num_extents << " extents to "
10054 << num_blobs << " blobs, "
10055 << num_spanning_blobs << " spanning, "
10056 << num_shared_blobs << " shared."
10057 << dendl;
10058
10059 utime_t duration = ceph_clock_now() - start;
10060 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
10061 << warnings << " warnings, "
10062 << repaired << " repaired, "
10063 << (errors + warnings - (int)repaired) << " remaining in "
10064 << duration << " seconds" << dendl;
10065
10066 // In non-repair mode we should return error count only as
10067 // it indicates if store status is OK.
10068 // In repair mode both errors and warnings are taken into account
10069 // since repaired counter relates to them both.
10070 return repair ? errors + warnings - (int)repaired : errors;
10071}
10072
10073/// methods to inject various errors fsck can repair
10074void BlueStore::inject_broken_shared_blob_key(const string& key,
10075 const bufferlist& bl)
10076{
10077 KeyValueDB::Transaction txn;
10078 txn = db->get_transaction();
10079 txn->set(PREFIX_SHARED_BLOB, key, bl);
10080 db->submit_transaction_sync(txn);
10081};
10082
10083void BlueStore::inject_no_shared_blob_key()
10084{
10085 KeyValueDB::Transaction txn;
10086 txn = db->get_transaction();
10087 ceph_assert(blobid_last > 0);
10088 // kill the last used sbid, this can be broken due to blobid preallocation
10089 // in rare cases, leaving as-is for the sake of simplicity
10090 uint64_t sbid = blobid_last;
10091
10092 string key;
10093 dout(5) << __func__<< " " << sbid << dendl;
10094 get_shared_blob_key(sbid, &key);
10095 txn->rmkey(PREFIX_SHARED_BLOB, key);
10096 db->submit_transaction_sync(txn);
10097};
10098
10099void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
10100{
10101 KeyValueDB::Transaction txn;
10102 txn = db->get_transaction();
10103
10104 dout(5) << __func__ << " " << sbid << dendl;
10105
10106 string key;
10107 get_shared_blob_key(sbid, &key);
10108 bluestore_shared_blob_t persistent(sbid);
10109 persistent.ref_map.get(0xdead0000, min_alloc_size);
10110 bufferlist bl;
10111 encode(persistent, bl);
10112 dout(20) << __func__ << " sbid " << sbid
10113 << " takes " << bl.length() << " bytes, updating"
10114 << dendl;
10115
10116 txn->set(PREFIX_SHARED_BLOB, key, bl);
10117 db->submit_transaction_sync(txn);
10118};
10119
10120
10121void BlueStore::inject_leaked(uint64_t len)
10122{
10123 PExtentVector exts;
10124 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
10125 min_alloc_size * 256, 0, &exts);
10126
10127 if (fm->is_null_manager()) {
10128 return;
10129 }
10130
10131 KeyValueDB::Transaction txn;
10132 txn = db->get_transaction();
10133
10134 ceph_assert(alloc_len >= (int64_t)len);
10135 for (auto& p : exts) {
10136 fm->allocate(p.offset, p.length, txn);
10137 }
10138 db->submit_transaction_sync(txn);
10139}
10140
10141void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
10142{
10143 ceph_assert(!fm->is_null_manager());
10144
10145 KeyValueDB::Transaction txn;
10146 OnodeRef o;
10147 CollectionRef c = _get_collection(cid);
10148 ceph_assert(c);
10149 {
10150 std::unique_lock l{c->lock}; // just to avoid internal asserts
10151 o = c->get_onode(oid, false);
10152 ceph_assert(o);
10153 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10154 }
10155
10156 bool injected = false;
10157 txn = db->get_transaction();
10158 auto& em = o->extent_map.extent_map;
10159 std::vector<const PExtentVector*> v;
10160 if (em.size()) {
10161 v.push_back(&em.begin()->blob->get_blob().get_extents());
10162 }
10163 if (em.size() > 1) {
10164 auto it = em.end();
10165 --it;
10166 v.push_back(&(it->blob->get_blob().get_extents()));
10167 }
10168 for (auto pext : v) {
10169 if (pext->size()) {
10170 auto p = pext->begin();
10171 while (p != pext->end()) {
10172 if (p->is_valid()) {
10173 dout(20) << __func__ << " release 0x" << std::hex << p->offset
10174 << "~" << p->length << std::dec << dendl;
10175 fm->release(p->offset, p->length, txn);
10176 injected = true;
10177 break;
10178 }
10179 ++p;
10180 }
10181 }
10182 }
10183 ceph_assert(injected);
10184 db->submit_transaction_sync(txn);
10185}
10186
10187void BlueStore::inject_legacy_omap()
10188{
10189 dout(1) << __func__ << dendl;
10190 per_pool_omap = OMAP_BULK;
10191 KeyValueDB::Transaction txn;
10192 txn = db->get_transaction();
10193 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
10194 db->submit_transaction_sync(txn);
10195}
10196
10197void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
10198{
10199 dout(1) << __func__ << " "
10200 << cid << " " << oid
10201 <<dendl;
10202 KeyValueDB::Transaction txn;
10203 OnodeRef o;
10204 CollectionRef c = _get_collection(cid);
10205 ceph_assert(c);
10206 {
10207 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10208 o = c->get_onode(oid, false);
10209 ceph_assert(o);
10210 }
10211 o->onode.clear_flag(
10212 bluestore_onode_t::FLAG_PERPG_OMAP |
10213 bluestore_onode_t::FLAG_PERPOOL_OMAP |
10214 bluestore_onode_t::FLAG_PGMETA_OMAP);
10215 txn = db->get_transaction();
10216 _record_onode(o, txn);
10217 db->submit_transaction_sync(txn);
10218}
10219
10220void BlueStore::inject_stray_omap(uint64_t head, const string& name)
10221{
10222 dout(1) << __func__ << dendl;
10223 KeyValueDB::Transaction txn = db->get_transaction();
10224
10225 string key;
10226 bufferlist bl;
10227 _key_encode_u64(head, &key);
10228 key.append(name);
10229 txn->set(PREFIX_OMAP, key, bl);
10230
10231 db->submit_transaction_sync(txn);
10232}
10233
10234void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
10235{
10236 BlueStoreRepairer repairer;
10237 repairer.fix_statfs(db, key, new_statfs);
10238 repairer.apply(db);
10239}
10240
10241void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
10242{
10243 KeyValueDB::Transaction t = db->get_transaction();
10244 volatile_statfs v;
10245 v = new_statfs;
10246 bufferlist bl;
10247 v.encode(bl);
10248 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
10249 db->submit_transaction_sync(t);
10250}
10251
10252void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
10253 coll_t cid2, ghobject_t oid2,
10254 uint64_t offset)
10255{
10256 OnodeRef o1;
10257 CollectionRef c1 = _get_collection(cid1);
10258 ceph_assert(c1);
10259 {
10260 std::unique_lock l{c1->lock}; // just to avoid internal asserts
10261 o1 = c1->get_onode(oid1, false);
10262 ceph_assert(o1);
10263 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10264 }
10265 OnodeRef o2;
10266 CollectionRef c2 = _get_collection(cid2);
10267 ceph_assert(c2);
10268 {
10269 std::unique_lock l{c2->lock}; // just to avoid internal asserts
10270 o2 = c2->get_onode(oid2, false);
10271 ceph_assert(o2);
10272 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10273 }
10274 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
10275 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
10276
10277 // require onode/extent layout to be the same (and simple)
10278 // to make things easier
10279 ceph_assert(o1->onode.extent_map_shards.empty());
10280 ceph_assert(o2->onode.extent_map_shards.empty());
10281 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
10282 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
10283 ceph_assert(e1.logical_offset == e2.logical_offset);
10284 ceph_assert(e1.length == e2.length);
10285 ceph_assert(e1.blob_offset == e2.blob_offset);
10286
10287 KeyValueDB::Transaction txn;
10288 txn = db->get_transaction();
10289
10290 // along with misreference error this will create space leaks errors
10291 e2.blob->dirty_blob() = e1.blob->get_blob();
10292 o2->extent_map.dirty_range(offset, e2.length);
10293 o2->extent_map.update(txn, false);
10294
10295 _record_onode(o2, txn);
10296 db->submit_transaction_sync(txn);
10297}
10298
10299void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
10300 int16_t blob_id)
10301{
10302 OnodeRef o;
10303 CollectionRef c = _get_collection(cid);
10304 ceph_assert(c);
10305 {
10306 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10307 o = c->get_onode(oid, false);
10308 ceph_assert(o);
10309 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10310 }
10311
10312 BlobRef b = c->new_blob();
10313 b->id = blob_id;
10314 o->extent_map.spanning_blob_map[blob_id] = b;
10315
10316 KeyValueDB::Transaction txn;
10317 txn = db->get_transaction();
10318
10319 _record_onode(o, txn);
10320 db->submit_transaction_sync(txn);
10321}
10322
10323void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
10324{
10325 ceph_assert(bluefs);
10326
10327 BlueFS::FileWriter* p_handle = nullptr;
10328 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
10329 ceph_assert(ret == 0);
10330
10331 std::string s(new_size, '0');
10332 bufferlist bl;
10333 bl.append(s);
10334 p_handle->append(bl);
10335
10336 bluefs->fsync(p_handle);
10337 bluefs->close_writer(p_handle);
10338}
10339
10340void BlueStore::collect_metadata(map<string,string> *pm)
10341{
10342 dout(10) << __func__ << dendl;
10343 bdev->collect_metadata("bluestore_bdev_", pm);
10344 if (bluefs) {
10345 (*pm)["bluefs"] = "1";
10346 // this value is for backward compatibility only
10347 (*pm)["bluefs_single_shared_device"] = \
10348 stringify((int)bluefs_layout.single_shared_device());
10349 (*pm)["bluefs_dedicated_db"] = \
10350 stringify((int)bluefs_layout.dedicated_db);
10351 (*pm)["bluefs_dedicated_wal"] = \
10352 stringify((int)bluefs_layout.dedicated_wal);
10353 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
10354 } else {
10355 (*pm)["bluefs"] = "0";
10356 }
10357
10358 // report numa mapping for underlying devices
10359 int node = -1;
10360 set<int> nodes;
10361 set<string> failed;
10362 int r = get_numa_node(&node, &nodes, &failed);
10363 if (r >= 0) {
10364 if (!failed.empty()) {
10365 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
10366 }
10367 if (!nodes.empty()) {
10368 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
10369 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
10370 }
10371 if (node >= 0) {
10372 (*pm)["objectstore_numa_node"] = stringify(node);
10373 }
10374 }
10375 (*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size);
10376}
10377
10378int BlueStore::get_numa_node(
10379 int *final_node,
10380 set<int> *out_nodes,
10381 set<string> *out_failed)
10382{
10383 int node = -1;
10384 set<string> devices;
10385 get_devices(&devices);
10386 set<int> nodes;
10387 set<string> failed;
10388 for (auto& devname : devices) {
10389 int n;
10390 BlkDev bdev(devname);
10391 int r = bdev.get_numa_node(&n);
10392 if (r < 0) {
10393 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
10394 << dendl;
10395 failed.insert(devname);
10396 continue;
10397 }
10398 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
10399 << dendl;
10400 nodes.insert(n);
10401 if (node < 0) {
10402 node = n;
10403 }
10404 }
10405 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
10406 *final_node = node;
10407 }
10408 if (out_nodes) {
10409 *out_nodes = nodes;
10410 }
10411 if (out_failed) {
10412 *out_failed = failed;
10413 }
10414 return 0;
10415}
10416
10417void BlueStore::prepare_for_fast_shutdown()
10418{
10419 m_fast_shutdown = true;
10420}
10421
10422int BlueStore::get_devices(set<string> *ls)
10423{
10424 if (bdev) {
10425 bdev->get_devices(ls);
10426 if (bluefs) {
10427 bluefs->get_devices(ls);
10428 }
10429 return 0;
10430 }
10431
10432 // grumble, we haven't started up yet.
10433 if (int r = _open_path(); r < 0) {
10434 return r;
10435 }
10436 auto close_path = make_scope_guard([&] {
10437 _close_path();
10438 });
10439 if (int r = _open_fsid(false); r < 0) {
10440 return r;
10441 }
10442 auto close_fsid = make_scope_guard([&] {
10443 _close_fsid();
10444 });
10445 if (int r = _read_fsid(&fsid); r < 0) {
10446 return r;
10447 }
10448 if (int r = _lock_fsid(); r < 0) {
10449 return r;
10450 }
10451 if (int r = _open_bdev(false); r < 0) {
10452 return r;
10453 }
10454 auto close_bdev = make_scope_guard([&] {
10455 _close_bdev();
10456 });
10457 if (int r = _minimal_open_bluefs(false); r < 0) {
10458 return r;
10459 }
10460 bdev->get_devices(ls);
10461 if (bluefs) {
10462 bluefs->get_devices(ls);
10463 }
10464 _minimal_close_bluefs();
10465 return 0;
10466}
10467
10468void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
10469{
10470 buf->reset();
10471
10472 auto prefix = per_pool_omap == OMAP_BULK ?
10473 PREFIX_OMAP :
10474 per_pool_omap == OMAP_PER_POOL ?
10475 PREFIX_PERPOOL_OMAP :
10476 PREFIX_PERPG_OMAP;
10477 buf->omap_allocated =
10478 db->estimate_prefix_size(prefix, string());
10479
10480 uint64_t bfree = alloc->get_free();
10481
10482 if (bluefs) {
10483 buf->internally_reserved = 0;
10484 // include dedicated db, too, if that isn't the shared device.
10485 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
10486 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
10487 }
10488 // call any non-omap bluefs space "internal metadata"
10489 buf->internal_metadata =
10490 bluefs->get_used()
10491 - buf->omap_allocated;
10492 }
10493
10494 ExtBlkDevState ebd_state;
10495 int rc = bdev->get_ebd_state(ebd_state);
10496 if (rc == 0) {
10497 buf->total += ebd_state.get_physical_total();
10498
10499 // we are limited by both the size of the virtual device and the
10500 // underlying physical device.
10501 bfree = std::min(bfree, ebd_state.get_physical_avail());
10502
10503 buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();
10504 } else {
10505 buf->total += bdev->get_size();
10506 }
10507 buf->available = bfree;
10508}
10509
10510int BlueStore::statfs(struct store_statfs_t *buf,
10511 osd_alert_list_t* alerts)
10512{
10513 if (alerts) {
10514 alerts->clear();
10515 _log_alerts(*alerts);
10516 }
10517 _get_statfs_overall(buf);
10518 {
10519 std::lock_guard l(vstatfs_lock);
10520 buf->allocated = vstatfs.allocated();
10521 buf->data_stored = vstatfs.stored();
10522 buf->data_compressed = vstatfs.compressed();
10523 buf->data_compressed_original = vstatfs.compressed_original();
10524 buf->data_compressed_allocated = vstatfs.compressed_allocated();
10525 }
10526
10527 dout(20) << __func__ << " " << *buf << dendl;
10528 return 0;
10529}
10530
10531int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
10532 bool *out_per_pool_omap)
10533{
10534 dout(20) << __func__ << " pool " << pool_id << dendl;
10535
10536 if (!per_pool_stat_collection) {
10537 dout(20) << __func__ << " not supported in legacy mode " << dendl;
10538 return -ENOTSUP;
10539 }
10540 buf->reset();
10541
10542 {
10543 std::lock_guard l(vstatfs_lock);
10544 osd_pools[pool_id].publish(buf);
10545 }
10546
10547 string key_prefix;
10548 _key_encode_u64(pool_id, &key_prefix);
10549 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
10550 // stop calls after db was closed
10551 if (*out_per_pool_omap && db) {
10552 auto prefix = per_pool_omap == OMAP_PER_POOL ?
10553 PREFIX_PERPOOL_OMAP :
10554 PREFIX_PERPG_OMAP;
10555 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
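    // note: per-pool/per-pg omap keys start with the encoded pool id, so the
    // prefix estimate above approximates the omap space consumed by this
    // pool; it is an estimate from the kv store, not an exact byte count.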
10556 }
10557
10558 dout(10) << __func__ << " " << *buf << dendl;
10559 return 0;
10560}
10561
10562void BlueStore::_check_legacy_statfs_alert()
10563{
10564 string s;
10565 if (!per_pool_stat_collection &&
10566 cct->_conf->bluestore_warn_on_legacy_statfs) {
10567 s = "legacy statfs reporting detected, "
10568 "suggest to run store repair to get consistent statistic reports";
10569 }
10570 std::lock_guard l(qlock);
10571 legacy_statfs_alert = s;
10572}
10573
10574void BlueStore::_check_no_per_pg_or_pool_omap_alert()
10575{
10576 string per_pg, per_pool;
10577 if (per_pool_omap != OMAP_PER_PG) {
10578 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
10579 per_pg = "legacy (not per-pg) omap detected, "
10580 "suggest to run store repair to benefit from faster PG removal";
10581 }
10582 if (per_pool_omap != OMAP_PER_POOL) {
10583 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
10584 per_pool = "legacy (not per-pool) omap detected, "
10585 "suggest to run store repair to benefit from per-pool omap usage statistics";
10586 }
10587 }
10588 }
10589 std::lock_guard l(qlock);
10590 no_per_pg_omap_alert = per_pg;
10591 no_per_pool_omap_alert = per_pool;
10592}
10593
10594// ---------------
10595// cache
10596
10597BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
10598{
10599 std::shared_lock l(coll_lock);
10600 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
10601 if (cp == coll_map.end())
10602 return CollectionRef();
10603 return cp->second;
10604}
10605
10606BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
10607{
10608 std::shared_lock l(coll_lock);
10609
10610 // FIXME: we must replace this with something more efficient
10611
10612 for (auto& i : coll_map) {
10613 spg_t spgid;
10614 if (i.first.is_pg(&spgid) &&
10615 i.second->contains(oid)) {
10616 return i.second;
10617 }
10618 }
10619 return CollectionRef();
10620}
10621
10622void BlueStore::_queue_reap_collection(CollectionRef& c)
10623{
10624 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
10625 // _reap_collections and this run in the same thread,
10626 // so no lock is needed.
10627 removed_collections.push_back(c);
10628}
10629
10630void BlueStore::_reap_collections()
10631{
10632
10633 list<CollectionRef> removed_colls;
10634 {
10635 // _queue_reap_collection and this run in the same thread,
10636 // so no lock is needed.
10637 if (!removed_collections.empty())
10638 removed_colls.swap(removed_collections);
10639 else
10640 return;
10641 }
10642
10643 list<CollectionRef>::iterator p = removed_colls.begin();
10644 while (p != removed_colls.end()) {
10645 CollectionRef c = *p;
10646 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
10647 if (c->onode_space.map_any([&](Onode* o) {
10648 ceph_assert(!o->exists);
10649 if (o->flushing_count.load()) {
10650 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
10651 << " flush_txns " << o->flushing_count << dendl;
10652 return true;
10653 }
10654 return false;
10655 })) {
10656 ++p;
10657 continue;
10658 }
10659 c->onode_space.clear();
10660 p = removed_colls.erase(p);
10661 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
10662 }
10663 if (removed_colls.empty()) {
10664 dout(10) << __func__ << " all reaped" << dendl;
10665 } else {
10666 removed_collections.splice(removed_collections.begin(), removed_colls);
10667 }
10668}
10669
10670void BlueStore::_update_logger()
10671{
10672 uint64_t num_onodes = 0;
10673 uint64_t num_pinned_onodes = 0;
10674 uint64_t num_extents = 0;
10675 uint64_t num_blobs = 0;
10676 uint64_t num_buffers = 0;
10677 uint64_t num_buffer_bytes = 0;
10678 for (auto c : onode_cache_shards) {
10679 c->add_stats(&num_onodes, &num_pinned_onodes);
10680 }
10681 for (auto c : buffer_cache_shards) {
10682 c->add_stats(&num_extents, &num_blobs,
10683 &num_buffers, &num_buffer_bytes);
10684 }
10685 logger->set(l_bluestore_onodes, num_onodes);
10686 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
10687 logger->set(l_bluestore_extents, num_extents);
10688 logger->set(l_bluestore_blobs, num_blobs);
10689 logger->set(l_bluestore_buffers, num_buffers);
10690 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
10691}
10692
10693// ---------------
10694// read operations
10695
10696ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
10697{
10698 return _get_collection(cid);
10699}
10700
10701ObjectStore::CollectionHandle BlueStore::create_new_collection(
10702 const coll_t& cid)
10703{
10704 std::unique_lock l{coll_lock};
10705 auto c = ceph::make_ref<Collection>(
10706 this,
10707 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
10708 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
10709 cid);
10710 new_coll_map[cid] = c;
10711 _osr_attach(c.get());
10712 return c;
10713}
10714
10715void BlueStore::set_collection_commit_queue(
10716 const coll_t& cid,
10717 ContextQueue *commit_queue)
10718{
10719 if (commit_queue) {
10720 std::shared_lock l(coll_lock);
10721 if (coll_map.count(cid)) {
10722 coll_map[cid]->commit_queue = commit_queue;
10723 } else if (new_coll_map.count(cid)) {
10724 new_coll_map[cid]->commit_queue = commit_queue;
10725 }
10726 }
10727}
10728
10729
10730bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
10731{
10732 Collection *c = static_cast<Collection *>(c_.get());
10733 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
10734 if (!c->exists)
10735 return false;
10736
10737 bool r = true;
10738
10739 {
10740 std::shared_lock l(c->lock);
10741 OnodeRef o = c->get_onode(oid, false);
10742 if (!o || !o->exists)
10743 r = false;
10744 }
10745
10746 return r;
10747}
10748
10749int BlueStore::stat(
10750 CollectionHandle &c_,
10751 const ghobject_t& oid,
10752 struct stat *st,
10753 bool allow_eio)
10754{
10755 Collection *c = static_cast<Collection *>(c_.get());
10756 if (!c->exists)
10757 return -ENOENT;
10758 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10759
10760 {
10761 std::shared_lock l(c->lock);
10762 OnodeRef o = c->get_onode(oid, false);
10763 if (!o || !o->exists)
10764 return -ENOENT;
10765 st->st_size = o->onode.size;
10766 st->st_blksize = 4096;
10767 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
10768 st->st_nlink = 1;
10769 }
10770
10771 int r = 0;
10772 if (_debug_mdata_eio(oid)) {
10773 r = -EIO;
10774 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10775 }
10776 return r;
10777}
10778int BlueStore::set_collection_opts(
10779 CollectionHandle& ch,
10780 const pool_opts_t& opts)
10781{
10782 Collection *c = static_cast<Collection *>(ch.get());
10783 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
10784 if (!c->exists)
10785 return -ENOENT;
10786 std::unique_lock l{c->lock};
10787 c->pool_opts = opts;
10788 return 0;
10789}
10790
10791int BlueStore::read(
10792 CollectionHandle &c_,
10793 const ghobject_t& oid,
10794 uint64_t offset,
10795 size_t length,
10796 bufferlist& bl,
10797 uint32_t op_flags)
10798{
10799 auto start = mono_clock::now();
10800 Collection *c = static_cast<Collection *>(c_.get());
10801 const coll_t &cid = c->get_cid();
10802 dout(15) << __func__ << " " << cid << " " << oid
10803 << " 0x" << std::hex << offset << "~" << length << std::dec
10804 << dendl;
10805 if (!c->exists)
10806 return -ENOENT;
10807
10808 bl.clear();
10809 int r;
10810 {
10811 std::shared_lock l(c->lock);
10812 auto start1 = mono_clock::now();
10813 OnodeRef o = c->get_onode(oid, false);
10814 log_latency("get_onode@read",
10815 l_bluestore_read_onode_meta_lat,
10816 mono_clock::now() - start1,
10817 cct->_conf->bluestore_log_op_age);
10818 if (!o || !o->exists) {
10819 r = -ENOENT;
10820 goto out;
10821 }
10822
10823 if (offset == length && offset == 0)
10824 length = o->onode.size;
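    // offset == 0 && length == 0 is the conventional "read the whole
    // object" request, hence the substitution of the current object size.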
10825
10826 r = _do_read(c, o, offset, length, bl, op_flags);
10827 if (r == -EIO) {
10828 logger->inc(l_bluestore_read_eio);
10829 }
10830 }
10831
10832 out:
10833 if (r >= 0 && _debug_data_eio(oid)) {
10834 r = -EIO;
10835 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10836 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10837 cct->_conf->bluestore_debug_random_read_err &&
10838 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10839 100.0)) == 0) {
10840 dout(0) << __func__ << ": inject random EIO" << dendl;
10841 r = -EIO;
10842 }
10843 dout(10) << __func__ << " " << cid << " " << oid
10844 << " 0x" << std::hex << offset << "~" << length << std::dec
10845 << " = " << r << dendl;
10846 log_latency(__func__,
10847 l_bluestore_read_lat,
10848 mono_clock::now() - start,
10849 cct->_conf->bluestore_log_op_age);
10850 return r;
10851}
10852
10853void BlueStore::_read_cache(
10854 OnodeRef& o,
10855 uint64_t offset,
10856 size_t length,
10857 int read_cache_policy,
10858 ready_regions_t& ready_regions,
10859 blobs2read_t& blobs2read)
10860{
10861 // build blob-wise list of stuff to read (that isn't cached)
10862 unsigned left = length;
10863 uint64_t pos = offset;
10864 auto lp = o->extent_map.seek_lextent(offset);
10865 while (left > 0 && lp != o->extent_map.extent_map.end()) {
10866 if (pos < lp->logical_offset) {
10867 unsigned hole = lp->logical_offset - pos;
10868 if (hole >= left) {
10869 break;
10870 }
10871 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
10872 << std::dec << dendl;
10873 pos += hole;
10874 left -= hole;
10875 }
10876 BlobRef& bptr = lp->blob;
10877 unsigned l_off = pos - lp->logical_offset;
10878 unsigned b_off = l_off + lp->blob_offset;
10879 unsigned b_len = std::min(left, lp->length - l_off);
10880
10881 ready_regions_t cache_res;
10882 interval_set<uint32_t> cache_interval;
10883 bptr->shared_blob->bc.read(
10884 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
10885 read_cache_policy);
10886 dout(20) << __func__ << " blob " << *bptr << std::hex
10887 << " need 0x" << b_off << "~" << b_len
10888 << " cache has 0x" << cache_interval
10889 << std::dec << dendl;
10890
10891 auto pc = cache_res.begin();
10892 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
10893 while (b_len > 0) {
10894 unsigned l;
10895 if (pc != cache_res.end() &&
10896 pc->first == b_off) {
10897 l = pc->second.length();
10898 ready_regions[pos] = std::move(pc->second);
10899 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
10900 << b_off << "~" << l << std::dec << dendl;
10901 ++pc;
10902 } else {
10903 l = b_len;
10904 if (pc != cache_res.end()) {
10905 ceph_assert(pc->first > b_off);
10906 l = pc->first - b_off;
10907 }
10908 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
10909 << b_off << "~" << l << std::dec << dendl;
10910 // merge regions
10911 {
10912 uint64_t r_off = b_off;
10913 uint64_t r_len = l;
10914 uint64_t front = r_off % chunk_size;
10915 if (front) {
10916 r_off -= front;
10917 r_len += front;
10918 }
10919 unsigned tail = r_len % chunk_size;
10920 if (tail) {
10921 r_len += chunk_size - tail;
10922 }
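          // worked example for the rounding above (illustrative values):
          // with chunk_size = 0x1000, b_off = 0x1388 and l = 0x3e8 we get
          // front = 0x388, so r_off becomes 0x1000 and r_len 0x770, which
          // the tail padding then rounds up to 0x1000 - i.e. a single
          // aligned 4K read from the device.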
10923 bool merged = false;
10924 regions2read_t& r2r = blobs2read[bptr];
10925 if (r2r.size()) {
10926 read_req_t& pre = r2r.back();
10927 if (r_off <= (pre.r_off + pre.r_len)) {
10928 front += (r_off - pre.r_off);
10929 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
10930 pre.regs.emplace_back(region_t(pos, b_off, l, front));
10931 merged = true;
10932 }
10933 }
10934 if (!merged) {
10935 read_req_t req(r_off, r_len);
10936 req.regs.emplace_back(region_t(pos, b_off, l, front));
10937 r2r.emplace_back(std::move(req));
10938 }
10939 }
10940 }
10941 pos += l;
10942 b_off += l;
10943 left -= l;
10944 b_len -= l;
10945 }
10946 ++lp;
10947 }
10948}
10949
10950int BlueStore::_prepare_read_ioc(
10951 blobs2read_t& blobs2read,
10952 vector<bufferlist>* compressed_blob_bls,
10953 IOContext* ioc)
10954{
10955 for (auto& p : blobs2read) {
10956 const BlobRef& bptr = p.first;
10957 regions2read_t& r2r = p.second;
10958 dout(20) << __func__ << " blob " << *bptr << " need "
10959 << r2r << dendl;
10960 if (bptr->get_blob().is_compressed()) {
10961 // read the whole thing
10962 if (compressed_blob_bls->empty()) {
10963 // ensure we avoid any reallocation on subsequent blobs
10964 compressed_blob_bls->reserve(blobs2read.size());
10965 }
10966 compressed_blob_bls->push_back(bufferlist());
10967 bufferlist& bl = compressed_blob_bls->back();
10968 auto r = bptr->get_blob().map(
10969 0, bptr->get_blob().get_ondisk_length(),
10970 [&](uint64_t offset, uint64_t length) {
10971 int r = bdev->aio_read(offset, length, &bl, ioc);
10972 if (r < 0)
10973 return r;
10974 return 0;
10975 });
10976 if (r < 0) {
10977 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10978 if (r == -EIO) {
10979 // propagate EIO to caller
10980 return r;
10981 }
10982 ceph_assert(r == 0);
10983 }
10984 } else {
10985 // read the pieces
10986 for (auto& req : r2r) {
10987 dout(20) << __func__ << " region 0x" << std::hex
10988 << req.regs.front().logical_offset
10989 << ": 0x" << req.regs.front().blob_xoffset
10990 << " reading 0x" << req.r_off
10991 << "~" << req.r_len << std::dec
10992 << dendl;
10993
10994 // read it
10995 auto r = bptr->get_blob().map(
10996 req.r_off, req.r_len,
10997 [&](uint64_t offset, uint64_t length) {
10998 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10999 if (r < 0)
11000 return r;
11001 return 0;
11002 });
11003 if (r < 0) {
11004 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
11005 << dendl;
11006 if (r == -EIO) {
11007 // propagate EIO to caller
11008 return r;
11009 }
11010 ceph_assert(r == 0);
11011 }
11012 ceph_assert(req.bl.length() == req.r_len);
11013 }
11014 }
11015 }
11016 return 0;
11017}
11018
11019int BlueStore::_generate_read_result_bl(
11020 OnodeRef& o,
11021 uint64_t offset,
11022 size_t length,
11023 ready_regions_t& ready_regions,
11024 vector<bufferlist>& compressed_blob_bls,
11025 blobs2read_t& blobs2read,
11026 bool buffered,
11027 bool* csum_error,
11028 bufferlist& bl)
11029{
11030 // enumerate and decompress desired blobs
11031 auto p = compressed_blob_bls.begin();
11032 blobs2read_t::iterator b2r_it = blobs2read.begin();
11033 while (b2r_it != blobs2read.end()) {
11034 const BlobRef& bptr = b2r_it->first;
11035 regions2read_t& r2r = b2r_it->second;
11036 dout(20) << __func__ << " blob " << *bptr << " need "
11037 << r2r << dendl;
11038 if (bptr->get_blob().is_compressed()) {
11039 ceph_assert(p != compressed_blob_bls.end());
11040 bufferlist& compressed_bl = *p++;
11041 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
11042 r2r.front().regs.front().logical_offset) < 0) {
11043 *csum_error = true;
11044 return -EIO;
11045 }
11046 bufferlist raw_bl;
11047 auto r = _decompress(compressed_bl, &raw_bl);
11048 if (r < 0)
11049 return r;
11050 if (buffered) {
11051 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
11052 raw_bl);
11053 }
11054 for (auto& req : r2r) {
11055 for (auto& r : req.regs) {
11056 ready_regions[r.logical_offset].substr_of(
11057 raw_bl, r.blob_xoffset, r.length);
11058 }
11059 }
11060 } else {
11061 for (auto& req : r2r) {
11062 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
11063 req.regs.front().logical_offset) < 0) {
11064 *csum_error = true;
11065 return -EIO;
11066 }
11067 if (buffered) {
11068 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
11069 req.r_off, req.bl);
11070 }
11071
11072 // prune and keep result
11073 for (const auto& r : req.regs) {
11074 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11075 }
11076 }
11077 }
11078 ++b2r_it;
11079 }
11080
11081 // generate a resulting buffer
11082 auto pr = ready_regions.begin();
11083 auto pr_end = ready_regions.end();
11084 uint64_t pos = 0;
11085 while (pos < length) {
11086 if (pr != pr_end && pr->first == pos + offset) {
11087 dout(30) << __func__ << " assemble 0x" << std::hex << pos
11088 << ": data from 0x" << pr->first << "~" << pr->second.length()
11089 << std::dec << dendl;
11090 pos += pr->second.length();
11091 bl.claim_append(pr->second);
11092 ++pr;
11093 } else {
11094 uint64_t l = length - pos;
11095 if (pr != pr_end) {
11096 ceph_assert(pr->first > pos + offset);
11097 l = pr->first - (pos + offset);
11098 }
11099 dout(30) << __func__ << " assemble 0x" << std::hex << pos
11100 << ": zeros for 0x" << (pos + offset) << "~" << l
11101 << std::dec << dendl;
11102 bl.append_zero(l);
11103 pos += l;
11104 }
11105 }
11106 ceph_assert(bl.length() == length);
11107 ceph_assert(pos == length);
11108 ceph_assert(pr == pr_end);
11109 return 0;
11110}
11111
11112int BlueStore::_do_read(
11113 Collection *c,
11114 OnodeRef& o,
11115 uint64_t offset,
11116 size_t length,
11117 bufferlist& bl,
11118 uint32_t op_flags,
11119 uint64_t retry_count)
11120{
11121 FUNCTRACE(cct);
11122 int r = 0;
11123 int read_cache_policy = 0; // do not bypass clean or dirty cache
11124
11125 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11126 << " size 0x" << o->onode.size << " (" << std::dec
11127 << o->onode.size << ")" << dendl;
11128 bl.clear();
11129
11130 if (offset >= o->onode.size) {
11131 return r;
11132 }
11133
11134 // generally, don't buffer anything, unless the client explicitly requests
11135 // it.
11136 bool buffered = false;
11137 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11138 dout(20) << __func__ << " will do buffered read" << dendl;
11139 buffered = true;
11140 } else if (cct->_conf->bluestore_default_buffered_read &&
11141 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11142 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11143 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11144 buffered = true;
11145 }
11146
11147 if (offset + length > o->onode.size) {
11148 length = o->onode.size - offset;
11149 }
11150
11151 auto start = mono_clock::now();
11152 o->extent_map.fault_range(db, offset, length);
11153 log_latency(__func__,
11154 l_bluestore_read_onode_meta_lat,
11155 mono_clock::now() - start,
11156 cct->_conf->bluestore_log_op_age);
11157 _dump_onode<30>(cct, *o);
11158
11159 // for deep-scrub, we only read dirty cache and bypass clean cache in
11160 // order to read the underlying block device in case there are silent disk errors.
11161 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
11162 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
11163 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
11164 }
11165
11166 // build blob-wise list of stuff to read (that isn't cached)
11167 ready_regions_t ready_regions;
11168 blobs2read_t blobs2read;
11169 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
11170
11171
11172 // read raw blob data.
11173 start = mono_clock::now(); // for simplicity, measure the whole
11174 // block below; the measurement
11175 // error this adds is negligible.
11176 vector<bufferlist> compressed_blob_bls;
11177 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11178 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
11179 // we always issue aio for reading, so errors other than EIO are not allowed
11180 if (r < 0)
11181 return r;
11182
11183 int64_t num_ios = blobs2read.size();
11184 if (ioc.has_pending_aios()) {
11185 num_ios = ioc.get_num_ios();
11186 bdev->aio_submit(&ioc);
11187 dout(20) << __func__ << " waiting for aio" << dendl;
11188 ioc.aio_wait();
11189 r = ioc.get_return_value();
11190 if (r < 0) {
11191 ceph_assert(r == -EIO); // no other errors allowed
11192 return -EIO;
11193 }
11194 }
11195 log_latency_fn(__func__,
11196 l_bluestore_read_wait_aio_lat,
11197 mono_clock::now() - start,
11198 cct->_conf->bluestore_log_op_age,
11199 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11200 );
11201
11202 bool csum_error = false;
11203 r = _generate_read_result_bl(o, offset, length, ready_regions,
11204 compressed_blob_bls, blobs2read,
11205 buffered && !ioc.skip_cache(),
11206 &csum_error, bl);
11207 if (csum_error) {
11208 // Handles spurious read errors caused by a kernel bug.
11209 // We sometimes get all-zero pages as a result of the read under
11210 // high memory pressure. Retrying the failing read succeeds in most
11211 // cases.
11212 // See also: http://tracker.ceph.com/issues/22464
11213 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11214 return -EIO;
11215 }
11216 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
11217 }
11218 r = bl.length();
11219 if (retry_count) {
11220 logger->inc(l_bluestore_reads_with_retries);
11221 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
11222 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
11223 stringstream s;
11224 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
11225 _set_spurious_read_errors_alert(s.str());
11226 }
11227 return r;
11228}
11229
11230int BlueStore::_verify_csum(OnodeRef& o,
11231 const bluestore_blob_t* blob, uint64_t blob_xoffset,
11232 const bufferlist& bl,
11233 uint64_t logical_offset) const
11234{
11235 int bad;
11236 uint64_t bad_csum;
11237 auto start = mono_clock::now();
11238 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
11239 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
11240 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
11241 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
11242 bad = blob_xoffset;
11243 r = -1;
11244 bad_csum = 0xDEADBEEF;
11245 }
11246 if (r < 0) {
11247 if (r == -1) {
11248 PExtentVector pex;
11249 blob->map(
11250 bad,
11251 blob->get_csum_chunk_size(),
11252 [&](uint64_t offset, uint64_t length) {
11253 pex.emplace_back(bluestore_pextent_t(offset, length));
11254 return 0;
11255 });
11256 derr << __func__ << " bad "
11257 << Checksummer::get_csum_type_string(blob->csum_type)
11258 << "/0x" << std::hex << blob->get_csum_chunk_size()
11259 << " checksum at blob offset 0x" << bad
11260 << ", got 0x" << bad_csum << ", expected 0x"
11261 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
11262 << ", device location " << pex
11263 << ", logical extent 0x" << std::hex
11264 << (logical_offset + bad - blob_xoffset) << "~"
11265 << blob->get_csum_chunk_size() << std::dec
11266 << ", object " << o->oid
11267 << dendl;
11268 } else {
11269 derr << __func__ << " failed with error code: " << cpp_strerror(r) << dendl;
11270 }
11271 }
11272 log_latency(__func__,
11273 l_bluestore_csum_lat,
11274 mono_clock::now() - start,
11275 cct->_conf->bluestore_log_op_age);
11276 if (cct->_conf->bluestore_ignore_data_csum) {
11277 return 0;
11278 }
11279 return r;
11280}
11281
11282int BlueStore::_decompress(bufferlist& source, bufferlist* result)
11283{
11284 int r = 0;
11285 auto start = mono_clock::now();
11286 auto i = source.cbegin();
11287 bluestore_compression_header_t chdr;
11288 decode(chdr, i);
11289 int alg = int(chdr.type);
11290 CompressorRef cp = compressor;
11291 if (!cp || (int)cp->get_type() != alg) {
11292 cp = Compressor::create(cct, alg);
11293 }
11294
11295 if (!cp.get()) {
11296 // if the compressor isn't available we have to fail, since we cannot
11297 // return the decompressed data
11298
11299 const char* alg_name = Compressor::get_comp_alg_name(alg);
11300 derr << __func__ << " can't load decompressor " << alg_name << dendl;
11301 _set_compression_alert(false, alg_name);
11302 r = -EIO;
11303 } else {
11304 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
11305 if (r < 0) {
11306 derr << __func__ << " decompression failed with error code " << r << dendl;
11307 r = -EIO;
11308 }
11309 }
11310 log_latency(__func__,
11311 l_bluestore_decompress_lat,
11312 mono_clock::now() - start,
11313 cct->_conf->bluestore_log_op_age);
11314 return r;
11315}
11316
11317// this stores fiemap into interval_set, other variations
11318// use it internally
11319int BlueStore::_fiemap(
11320 CollectionHandle &c_,
11321 const ghobject_t& oid,
11322 uint64_t offset,
11323 size_t length,
11324 interval_set<uint64_t>& destset)
11325{
11326 Collection *c = static_cast<Collection *>(c_.get());
11327 if (!c->exists)
11328 return -ENOENT;
11329 {
11330 std::shared_lock l(c->lock);
11331
11332 OnodeRef o = c->get_onode(oid, false);
11333 if (!o || !o->exists) {
11334 return -ENOENT;
11335 }
11336 _dump_onode<30>(cct, *o);
11337
11338 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11339 << " size 0x" << o->onode.size << std::dec << dendl;
11340
11341 boost::intrusive::set<Extent>::iterator ep, eend;
11342 if (offset >= o->onode.size)
11343 goto out;
11344
11345 if (offset + length > o->onode.size) {
11346 length = o->onode.size - offset;
11347 }
11348
11349 o->extent_map.fault_range(db, offset, length);
11350 eend = o->extent_map.extent_map.end();
11351 ep = o->extent_map.seek_lextent(offset);
11352 while (length > 0) {
11353 dout(20) << __func__ << " offset " << offset << dendl;
11354 if (ep != eend && ep->logical_offset + ep->length <= offset) {
11355 ++ep;
11356 continue;
11357 }
11358
11359 uint64_t x_len = length;
11360 if (ep != eend && ep->logical_offset <= offset) {
11361 uint64_t x_off = offset - ep->logical_offset;
11362 x_len = std::min(x_len, ep->length - x_off);
11363 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
11364 << x_len << std::dec << " blob " << ep->blob << dendl;
11365 destset.insert(offset, x_len);
11366 length -= x_len;
11367 offset += x_len;
11368 if (x_off + x_len == ep->length)
11369 ++ep;
11370 continue;
11371 }
11372 if (ep != eend &&
11373 ep->logical_offset > offset &&
11374 ep->logical_offset - offset < x_len) {
11375 x_len = ep->logical_offset - offset;
11376 }
11377 offset += x_len;
11378 length -= x_len;
11379 }
11380 }
11381
11382 out:
11383 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11384 << " size = 0x(" << destset << ")" << std::dec << dendl;
11385 return 0;
11386}
11387
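// Public fiemap variants: both delegate to _fiemap and either encode the
// resulting interval_set into a bufferlist or detach it into a plain map.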
11388int BlueStore::fiemap(
11389 CollectionHandle &c_,
11390 const ghobject_t& oid,
11391 uint64_t offset,
11392 size_t length,
11393 bufferlist& bl)
11394{
11395 interval_set<uint64_t> m;
11396 int r = _fiemap(c_, oid, offset, length, m);
11397 if (r >= 0) {
11398 encode(m, bl);
11399 }
11400 return r;
11401}
11402
11403int BlueStore::fiemap(
11404 CollectionHandle &c_,
11405 const ghobject_t& oid,
11406 uint64_t offset,
11407 size_t length,
11408 map<uint64_t, uint64_t>& destmap)
11409{
11410 interval_set<uint64_t> m;
11411 int r = _fiemap(c_, oid, offset, length, m);
11412 if (r >= 0) {
11413 destmap = std::move(m).detach();
11414 }
11415 return r;
11416}
11417
11418int BlueStore::readv(
11419 CollectionHandle &c_,
11420 const ghobject_t& oid,
11421 interval_set<uint64_t>& m,
11422 bufferlist& bl,
11423 uint32_t op_flags)
11424{
11425 auto start = mono_clock::now();
11426 Collection *c = static_cast<Collection *>(c_.get());
11427 const coll_t &cid = c->get_cid();
11428 dout(15) << __func__ << " " << cid << " " << oid
11429 << " fiemap " << m
11430 << dendl;
11431 if (!c->exists)
11432 return -ENOENT;
11433
11434 bl.clear();
11435 int r;
11436 {
11437 std::shared_lock l(c->lock);
11438 auto start1 = mono_clock::now();
11439 OnodeRef o = c->get_onode(oid, false);
11440 log_latency("get_onode@read",
11441 l_bluestore_read_onode_meta_lat,
11442 mono_clock::now() - start1,
11443 cct->_conf->bluestore_log_op_age);
11444 if (!o || !o->exists) {
11445 r = -ENOENT;
11446 goto out;
11447 }
11448
11449 if (m.empty()) {
11450 r = 0;
11451 goto out;
11452 }
11453
11454 r = _do_readv(c, o, m, bl, op_flags);
11455 if (r == -EIO) {
11456 logger->inc(l_bluestore_read_eio);
11457 }
11458 }
11459
11460 out:
11461 if (r >= 0 && _debug_data_eio(oid)) {
11462 r = -EIO;
11463 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11464 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
11465 cct->_conf->bluestore_debug_random_read_err &&
11466 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
11467 100.0)) == 0) {
11468 dout(0) << __func__ << ": inject random EIO" << dendl;
11469 r = -EIO;
11470 }
11471 dout(10) << __func__ << " " << cid << " " << oid
11472 << " fiemap " << m << std::dec
11473 << " = " << r << dendl;
11474 log_latency(__func__,
11475 l_bluestore_read_lat,
11476 mono_clock::now() - start,
11477 cct->_conf->bluestore_log_op_age);
11478 return r;
11479}
11480
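// Vectored read: mirrors _do_read, but gathers cache hits and prepares aio
// for every interval of the fiemap set up front, waits on a single aio
// batch, and then assembles each interval's result in order.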
11481int BlueStore::_do_readv(
11482 Collection *c,
11483 OnodeRef& o,
11484 const interval_set<uint64_t>& m,
11485 bufferlist& bl,
11486 uint32_t op_flags,
11487 uint64_t retry_count)
11488{
11489 FUNCTRACE(cct);
11490 int r = 0;
11491 int read_cache_policy = 0; // do not bypass clean or dirty cache
11492
11493 dout(20) << __func__ << " fiemap " << m << std::hex
11494 << " size 0x" << o->onode.size << " (" << std::dec
11495 << o->onode.size << ")" << dendl;
11496
11497 // generally, don't buffer anything, unless the client explicitly requests
11498 // it.
11499 bool buffered = false;
11500 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11501 dout(20) << __func__ << " will do buffered read" << dendl;
11502 buffered = true;
11503 } else if (cct->_conf->bluestore_default_buffered_read &&
11504 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11505 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11506 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11507 buffered = true;
11508 }
11509 // this method must be idempotent since we may call it several times
11510 // before we finally read the expected result.
11511 bl.clear();
11512
11513 // call fiemap first!
11514 ceph_assert(m.range_start() <= o->onode.size);
11515 ceph_assert(m.range_end() <= o->onode.size);
11516 auto start = mono_clock::now();
11517 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
11518 log_latency(__func__,
11519 l_bluestore_read_onode_meta_lat,
11520 mono_clock::now() - start,
11521 cct->_conf->bluestore_log_op_age);
11522 _dump_onode<30>(cct, *o);
11523
11524 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11525 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
11526 raw_results.reserve(m.num_intervals());
11527 int i = 0;
11528 for (auto p = m.begin(); p != m.end(); p++, i++) {
11529 raw_results.push_back({});
11530 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
11531 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
11532 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
11533 // we always issue aio for reading, so errors other than EIO are not allowed
11534 if (r < 0)
11535 return r;
11536 }
11537
11538 auto num_ios = m.size();
11539 if (ioc.has_pending_aios()) {
11540 num_ios = ioc.get_num_ios();
11541 bdev->aio_submit(&ioc);
11542 dout(20) << __func__ << " waiting for aio" << dendl;
11543 ioc.aio_wait();
11544 r = ioc.get_return_value();
11545 if (r < 0) {
11546 ceph_assert(r == -EIO); // no other errors allowed
11547 return -EIO;
11548 }
11549 }
11550 log_latency_fn(__func__,
11551 l_bluestore_read_wait_aio_lat,
11552 mono_clock::now() - start,
11553 cct->_conf->bluestore_log_op_age,
11554 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11555 );
11556
11557 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
11558 i = 0;
11559 for (auto p = m.begin(); p != m.end(); p++, i++) {
11560 bool csum_error = false;
11561 bufferlist t;
11562 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
11563 std::get<0>(raw_results[i]),
11564 std::get<1>(raw_results[i]),
11565 std::get<2>(raw_results[i]),
11566 buffered, &csum_error, t);
11567 if (csum_error) {
11568 // Handles spurious read errors caused by a kernel bug.
11569 // We sometimes get all-zero pages as a result of the read under
11570 // high memory pressure. Retrying the failing read succeeds in most
11571 // cases.
11572 // See also: http://tracker.ceph.com/issues/22464
11573 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11574 return -EIO;
11575 }
11576 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
11577 }
11578 bl.claim_append(t);
11579 }
11580 if (retry_count) {
11581 logger->inc(l_bluestore_reads_with_retries);
11582 dout(5) << __func__ << " read fiemap " << m
11583 << " failed " << retry_count << " times before succeeding"
11584 << dendl;
11585 }
11586 return bl.length();
11587}
11588
11589int BlueStore::dump_onode(CollectionHandle &c_,
11590 const ghobject_t& oid,
11591 const string& section_name,
11592 Formatter *f)
11593{
11594 Collection *c = static_cast<Collection *>(c_.get());
11595 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11596 if (!c->exists)
11597 return -ENOENT;
11598
11599 int r;
11600 {
11601 std::shared_lock l(c->lock);
11602
11603 OnodeRef o = c->get_onode(oid, false);
11604 if (!o || !o->exists) {
11605 r = -ENOENT;
11606 goto out;
11607 }
11608 // FIXME minor: actually the next line isn't enough to
11609 // load shared blobs. Leaving as-is for now...
11610 //
11611 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
11612
11613 _dump_onode<0>(cct, *o);
11614 f->open_object_section(section_name.c_str());
11615 o->dump(f);
11616 f->close_section();
11617 r = 0;
11618 }
11619 out:
11620 dout(10) << __func__ << " " << c->cid << " " << oid
11621 << " = " << r << dendl;
11622 return r;
11623}
11624
11625int BlueStore::getattr(
11626 CollectionHandle &c_,
11627 const ghobject_t& oid,
11628 const char *name,
11629 bufferptr& value)
11630{
11631 Collection *c = static_cast<Collection *>(c_.get());
11632 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
11633 if (!c->exists)
11634 return -ENOENT;
11635
11636 int r;
11637 {
11638 std::shared_lock l(c->lock);
11639 mempool::bluestore_cache_meta::string k(name);
11640
11641 OnodeRef o = c->get_onode(oid, false);
11642 if (!o || !o->exists) {
11643 r = -ENOENT;
11644 goto out;
11645 }
11646
11647 if (!o->onode.attrs.count(k)) {
11648 r = -ENODATA;
11649 goto out;
11650 }
11651 value = o->onode.attrs[k];
11652 r = 0;
11653 }
11654 out:
11655 if (r == 0 && _debug_mdata_eio(oid)) {
11656 r = -EIO;
11657 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11658 }
11659 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
11660 << " = " << r << dendl;
11661 return r;
11662}
11663
11664int BlueStore::getattrs(
11665 CollectionHandle &c_,
11666 const ghobject_t& oid,
11667 map<string,bufferptr,less<>>& aset)
11668{
11669 Collection *c = static_cast<Collection *>(c_.get());
11670 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11671 if (!c->exists)
11672 return -ENOENT;
11673
11674 int r;
11675 {
11676 std::shared_lock l(c->lock);
11677
11678 OnodeRef o = c->get_onode(oid, false);
11679 if (!o || !o->exists) {
11680 r = -ENOENT;
11681 goto out;
11682 }
11683 for (auto& i : o->onode.attrs) {
11684 aset.emplace(i.first.c_str(), i.second);
11685 }
11686 r = 0;
11687 }
11688
11689 out:
11690 if (r == 0 && _debug_mdata_eio(oid)) {
11691 r = -EIO;
11692 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11693 }
11694 dout(10) << __func__ << " " << c->cid << " " << oid
11695 << " = " << r << dendl;
11696 return r;
11697}
11698
11699int BlueStore::list_collections(vector<coll_t>& ls)
11700{
11701 std::shared_lock l(coll_lock);
11702 ls.reserve(coll_map.size());
11703 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
11704 p != coll_map.end();
11705 ++p)
11706 ls.push_back(p->first);
11707 return 0;
11708}
11709
11710bool BlueStore::collection_exists(const coll_t& c)
11711{
11712 std::shared_lock l(coll_lock);
11713 return coll_map.count(c);
11714}
11715
11716int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
11717{
11718 dout(15) << __func__ << " " << ch->cid << dendl;
11719 vector<ghobject_t> ls;
11720 ghobject_t next;
11721 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
11722 &ls, &next);
11723 if (r < 0) {
11724 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
11725 << dendl;
11726 return r;
11727 }
11728 *empty = ls.empty();
11729 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
11730 return 0;
11731}
11732
11733int BlueStore::collection_bits(CollectionHandle& ch)
11734{
11735 dout(15) << __func__ << " " << ch->cid << dendl;
11736 Collection *c = static_cast<Collection*>(ch.get());
11737 std::shared_lock l(c->lock);
11738 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
11739 return c->cnode.bits;
11740}
11741
11742int BlueStore::collection_list(
11743 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11744 vector<ghobject_t> *ls, ghobject_t *pnext)
11745{
11746 Collection *c = static_cast<Collection *>(c_.get());
11747 c->flush();
11748 dout(15) << __func__ << " " << c->cid
11749 << " start " << start << " end " << end << " max " << max << dendl;
11750 int r;
11751 {
11752 std::shared_lock l(c->lock);
11753 r = _collection_list(c, start, end, max, false, ls, pnext);
11754 }
11755
11756 dout(10) << __func__ << " " << c->cid
11757 << " start " << start << " end " << end << " max " << max
11758 << " = " << r << ", ls.size() = " << ls->size()
11759 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11760 return r;
11761}
11762
11763int BlueStore::collection_list_legacy(
11764 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11765 vector<ghobject_t> *ls, ghobject_t *pnext)
11766{
11767 Collection *c = static_cast<Collection *>(c_.get());
11768 c->flush();
11769 dout(15) << __func__ << " " << c->cid
11770 << " start " << start << " end " << end << " max " << max << dendl;
11771 int r;
11772 {
11773 std::shared_lock l(c->lock);
11774 r = _collection_list(c, start, end, max, true, ls, pnext);
11775 }
11776
11777 dout(10) << __func__ << " " << c->cid
11778 << " start " << start << " end " << end << " max " << max
11779 << " = " << r << ", ls.size() = " << ls->size()
11780 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11781 return r;
11782}
11783
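// Shared listing helper: a collection's objects live in two key ranges, a
// temp namespace and a regular one. We start in whichever range contains
// 'start', bound the scan by 'pend', and once the temp range is exhausted
// switch to the regular range until 'end' or 'max' entries are reached.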
11784int BlueStore::_collection_list(
11785 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
11786 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
11787{
11788
11789 if (!c->exists)
11790 return -ENOENT;
11791
11792 ghobject_t static_next;
11793 std::unique_ptr<CollectionListIterator> it;
11794 ghobject_t coll_range_temp_start, coll_range_temp_end;
11795 ghobject_t coll_range_start, coll_range_end;
11796 ghobject_t pend;
11797 bool temp;
11798
11799 if (!pnext)
11800 pnext = &static_next;
11801
11802 auto log_latency = make_scope_guard(
11803 [&, start_time = mono_clock::now(), func_name = __func__] {
11804 log_latency_fn(
11805 func_name,
11806 l_bluestore_clist_lat,
11807 mono_clock::now() - start_time,
11808 cct->_conf->bluestore_log_collection_list_age,
11809 [&](const ceph::timespan& lat) {
11810 ostringstream ostr;
11811 ostr << ", lat = " << timespan_str(lat)
11812 << " cid =" << c->cid
11813 << " start " << start << " end " << end
11814 << " max " << max;
11815 return ostr.str();
11816 });
11817 });
11818
11819 if (start.is_max() || start.hobj.is_max()) {
11820 *pnext = ghobject_t::get_max();
11821 return 0;
11822 }
11823 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
11824 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
11825 dout(20) << __func__
11826 << " range " << coll_range_temp_start
11827 << " to " << coll_range_temp_end
11828 << " and " << coll_range_start
11829 << " to " << coll_range_end
11830 << " start " << start << dendl;
11831 if (legacy) {
11832 it = std::make_unique<SimpleCollectionListIterator>(
11833 cct, db->get_iterator(PREFIX_OBJ));
11834 } else {
11835 it = std::make_unique<SortedCollectionListIterator>(
11836 db->get_iterator(PREFIX_OBJ));
11837 }
11838 if (start == ghobject_t() ||
11839 start.hobj == hobject_t() ||
11840 start == c->cid.get_min_hobj()) {
11841 it->upper_bound(coll_range_temp_start);
11842 temp = true;
11843 } else {
11844 if (start.hobj.is_temp()) {
11845 temp = true;
11846 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
11847 } else {
11848 temp = false;
11849 ceph_assert(start >= coll_range_start && start < coll_range_end);
11850 }
11851 dout(20) << __func__ << " temp=" << (int)temp << dendl;
11852 it->lower_bound(start);
11853 }
11854 if (end.hobj.is_max()) {
11855 pend = temp ? coll_range_temp_end : coll_range_end;
11856 } else {
11857 if (end.hobj.is_temp()) {
11858 if (temp) {
11859 pend = end;
11860 } else {
11861 *pnext = ghobject_t::get_max();
11862 return 0;
11863 }
11864 } else {
11865 pend = temp ? coll_range_temp_end : end;
11866 }
11867 }
11868 dout(20) << __func__ << " pend " << pend << dendl;
11869 while (true) {
11870 if (!it->valid() || it->is_ge(pend)) {
11871 if (!it->valid())
11872 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
11873 else
11874 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
11875 if (temp) {
11876 if (end.hobj.is_temp()) {
11877 if (it->valid() && it->is_lt(coll_range_temp_end)) {
11878 *pnext = it->oid();
11879 return 0;
11880 }
11881 break;
11882 }
11883 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
11884 temp = false;
11885 it->upper_bound(coll_range_start);
11886 if (end.hobj.is_max())
11887 pend = coll_range_end;
11888 else
11889 pend = end;
11890 dout(30) << __func__ << " pend " << pend << dendl;
11891 continue;
11892 }
11893 if (it->valid() && it->is_lt(coll_range_end)) {
11894 *pnext = it->oid();
11895 return 0;
11896 }
11897 break;
11898 }
11899 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
11900 if (ls->size() >= (unsigned)max) {
11901 dout(20) << __func__ << " reached max " << max << dendl;
11902 *pnext = it->oid();
11903 return 0;
11904 }
11905 ls->push_back(it->oid());
11906 it->next();
11907 }
11908 *pnext = ghobject_t::get_max();
11909 return 0;
11910}
11911
11912int BlueStore::omap_get(
11913 CollectionHandle &c_, ///< [in] Collection containing oid
11914 const ghobject_t &oid, ///< [in] Object containing omap
11915 bufferlist *header, ///< [out] omap header
11916 map<string, bufferlist> *out ///< [out] Key to value map
11917 )
11918{
11919 Collection *c = static_cast<Collection *>(c_.get());
11920 return _omap_get(c, oid, header, out);
11921}
11922
11923int BlueStore::_omap_get(
11924 Collection *c, ///< [in] Collection containing oid
11925 const ghobject_t &oid, ///< [in] Object containing omap
11926 bufferlist *header, ///< [out] omap header
11927 map<string, bufferlist> *out ///< [out] Key to value map
11928 )
11929{
11930 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11931 if (!c->exists)
11932 return -ENOENT;
11933 std::shared_lock l(c->lock);
11934 int r = 0;
11935 OnodeRef o = c->get_onode(oid, false);
11936 if (!o || !o->exists) {
11937 r = -ENOENT;
11938 goto out;
11939 }
11940 r = _onode_omap_get(o, header, out);
11941 out:
11942 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11943 << dendl;
11944 return r;
11945}
11946
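// Omap enumeration walks the object's key range in the kv store: the header
// key comes first, user keys follow under a per-object prefix, and a tail
// sentinel marks the end of this object's omap.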
11947int BlueStore::_onode_omap_get(
11948 const OnodeRef &o, ///< [in] Object containing omap
11949 bufferlist *header, ///< [out] omap header
11950 map<string, bufferlist> *out ///< [out] Key to value map
11951)
11952{
11953 int r = 0;
11954 if (!o || !o->exists) {
11955 r = -ENOENT;
11956 goto out;
11957 }
11958 if (!o->onode.has_omap())
11959 goto out;
11960 o->flush();
11961 {
11962 const string& prefix = o->get_omap_prefix();
11963 string head, tail;
11964 o->get_omap_header(&head);
11965 o->get_omap_tail(&tail);
11966 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
11967 it->lower_bound(head);
11968 while (it->valid()) {
11969 if (it->key() == head) {
11970 dout(30) << __func__ << " got header" << dendl;
11971 *header = it->value();
11972 } else if (it->key() >= tail) {
11973 dout(30) << __func__ << " reached tail" << dendl;
11974 break;
11975 } else {
11976 string user_key;
11977 o->decode_omap_key(it->key(), &user_key);
11978 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11979 << " -> " << user_key << dendl;
11980 (*out)[user_key] = it->value();
11981 }
11982 it->next();
11983 }
11984 }
11985out:
11986 return r;
11987}
11988
11989int BlueStore::omap_get_header(
11990 CollectionHandle &c_, ///< [in] Collection containing oid
11991 const ghobject_t &oid, ///< [in] Object containing omap
11992 bufferlist *header, ///< [out] omap header
11993 bool allow_eio ///< [in] don't assert on eio
11994 )
11995{
11996 Collection *c = static_cast<Collection *>(c_.get());
11997 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11998 if (!c->exists)
11999 return -ENOENT;
12000 std::shared_lock l(c->lock);
12001 int r = 0;
12002 OnodeRef o = c->get_onode(oid, false);
12003 if (!o || !o->exists) {
12004 r = -ENOENT;
12005 goto out;
12006 }
12007 if (!o->onode.has_omap())
12008 goto out;
12009 o->flush();
12010 {
12011 string head;
12012 o->get_omap_header(&head);
12013 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
12014 dout(30) << __func__ << " got header" << dendl;
12015 } else {
12016 dout(30) << __func__ << " no header" << dendl;
12017 }
12018 }
12019 out:
12020 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12021 << dendl;
12022 return r;
12023}
12024
12025int BlueStore::omap_get_keys(
12026 CollectionHandle &c_, ///< [in] Collection containing oid
12027 const ghobject_t &oid, ///< [in] Object containing omap
12028 set<string> *keys ///< [out] Keys defined on oid
12029 )
12030{
12031 Collection *c = static_cast<Collection *>(c_.get());
12032 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12033 if (!c->exists)
12034 return -ENOENT;
12035 auto start1 = mono_clock::now();
12036 std::shared_lock l(c->lock);
12037 int r = 0;
12038 OnodeRef o = c->get_onode(oid, false);
12039 if (!o || !o->exists) {
12040 r = -ENOENT;
12041 goto out;
12042 }
12043 if (!o->onode.has_omap())
12044 goto out;
12045 o->flush();
12046 {
12047 const string& prefix = o->get_omap_prefix();
12048 string head, tail;
12049 o->get_omap_key(string(), &head);
12050 o->get_omap_tail(&tail);
12051 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
12052 it->lower_bound(head);
12053 while (it->valid()) {
12054 if (it->key() >= tail) {
12055 dout(30) << __func__ << " reached tail" << dendl;
12056 break;
12057 }
12058 string user_key;
12059 o->decode_omap_key(it->key(), &user_key);
12060 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
12061 << " -> " << user_key << dendl;
12062 keys->insert(user_key);
12063 it->next();
12064 }
12065 }
12066 out:
12067 c->store->log_latency(
12068 __func__,
12069 l_bluestore_omap_get_keys_lat,
12070 mono_clock::now() - start1,
12071 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12072
12073 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12074 << dendl;
12075 return r;
12076}
12077
12078int BlueStore::omap_get_values(
12079 CollectionHandle &c_, ///< [in] Collection containing oid
12080 const ghobject_t &oid, ///< [in] Object containing omap
12081 const set<string> &keys, ///< [in] Keys to get
12082 map<string, bufferlist> *out ///< [out] Returned keys and values
12083 )
12084{
12085 Collection *c = static_cast<Collection *>(c_.get());
12086 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12087 if (!c->exists)
12088 return -ENOENT;
12089 std::shared_lock l(c->lock);
12090 auto start1 = mono_clock::now();
12091 int r = 0;
12092 string final_key;
12093 OnodeRef o = c->get_onode(oid, false);
12094 if (!o || !o->exists) {
12095 r = -ENOENT;
12096 goto out;
12097 }
12098 if (!o->onode.has_omap()) {
12099 goto out;
12100 }
12101 o->flush();
12102 {
12103 const string& prefix = o->get_omap_prefix();
12104 o->get_omap_key(string(), &final_key);
12105 size_t base_key_len = final_key.size();
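    // remember the per-object key prefix length so each requested key can be
    // appended to the same prefix below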
12106 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
12107 final_key.resize(base_key_len); // keep prefix
12108 final_key += *p;
12109 bufferlist val;
12110 if (db->get(prefix, final_key, &val) >= 0) {
12111 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
12112 << " -> " << *p << dendl;
12113 out->insert(make_pair(*p, val));
12114 }
12115 }
12116 }
12117 out:
12118 c->store->log_latency(
12119 __func__,
12120 l_bluestore_omap_get_values_lat,
12121 mono_clock::now() - start1,
12122 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12123
12124 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12125 << dendl;
12126 return r;
12127}
12128
12129#ifdef WITH_SEASTAR
12130int BlueStore::omap_get_values(
12131 CollectionHandle &c_, ///< [in] Collection containing oid
12132 const ghobject_t &oid, ///< [in] Object containing omap
12133 const std::optional<string> &start_after, ///< [in] return only keys after this one
12134 map<string, bufferlist> *output ///< [out] Returned keys and values
12135 )
12136{
12137 Collection *c = static_cast<Collection *>(c_.get());
12138 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12139 if (!c->exists)
12140 return -ENOENT;
12141 std::shared_lock l(c->lock);
12142 int r = 0;
12143 OnodeRef o = c->get_onode(oid, false);
12144 if (!o || !o->exists) {
12145 r = -ENOENT;
12146 goto out;
12147 }
12148 if (!o->onode.has_omap()) {
12149 goto out;
12150 }
12151 o->flush();
12152 {
12153 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
12154 if (!iter) {
12155 r = -ENOENT;
12156 goto out;
12157 }
12158 iter->upper_bound(*start_after);
12159 for (; iter->valid(); iter->next()) {
12160 output->insert(make_pair(iter->key(), iter->value()));
12161 }
12162 }
12163
12164out:
12165 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12166 << dendl;
12167 return r;
12168}
12169#endif
12170
12171int BlueStore::omap_check_keys(
12172 CollectionHandle &c_, ///< [in] Collection containing oid
12173 const ghobject_t &oid, ///< [in] Object containing omap
12174 const set<string> &keys, ///< [in] Keys to check
12175 set<string> *out ///< [out] Subset of keys defined on oid
12176 )
12177{
12178 Collection *c = static_cast<Collection *>(c_.get());
12179 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12180 if (!c->exists)
12181 return -ENOENT;
12182 std::shared_lock l(c->lock);
12183 int r = 0;
12184 string final_key;
12185 OnodeRef o = c->get_onode(oid, false);
12186 if (!o || !o->exists) {
12187 r = -ENOENT;
12188 goto out;
12189 }
12190 if (!o->onode.has_omap()) {
12191 goto out;
12192 }
12193 o->flush();
12194 {
12195 const string& prefix = o->get_omap_prefix();
12196 o->get_omap_key(string(), &final_key);
12197 size_t base_key_len = final_key.size();
12198 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
12199 final_key.resize(base_key_len); // keep prefix
12200 final_key += *p;
12201 bufferlist val;
12202 if (db->get(prefix, final_key, &val) >= 0) {
12203 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
12204 << " -> " << *p << dendl;
12205 out->insert(*p);
12206 } else {
12207 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
12208 << " -> " << *p << dendl;
12209 }
12210 }
12211 }
12212 out:
12213 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12214 << dendl;
12215 return r;
12216}
12217
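// Returns an iterator over a single object's omap. When the onode has omap
// data we constrain the kv iterator to [first user key, tail) so the
// underlying iteration never scans past this object's keys.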
12218ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
12219 CollectionHandle &c_, ///< [in] collection
12220 const ghobject_t &oid ///< [in] object
12221 )
12222{
12223 Collection *c = static_cast<Collection *>(c_.get());
12224 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
12225 if (!c->exists) {
12226 return ObjectMap::ObjectMapIterator();
12227 }
12228 std::shared_lock l(c->lock);
12229 OnodeRef o = c->get_onode(oid, false);
12230 if (!o || !o->exists) {
12231 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
12232 return ObjectMap::ObjectMapIterator();
12233 }
12234 o->flush();
12235 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
12236 auto bounds = KeyValueDB::IteratorBounds();
12237 if (o->onode.has_omap()) {
12238 std::string lower_bound, upper_bound;
12239 o->get_omap_key(string(), &lower_bound);
12240 o->get_omap_tail(&upper_bound);
12241 bounds.lower_bound = std::move(lower_bound);
12242 bounds.upper_bound = std::move(upper_bound);
12243 }
12244 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
12245 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it));
12246}
12247
12248// -----------------
12249// write helpers
12250
12251uint64_t BlueStore::_get_ondisk_reserved() const {
12252 ceph_assert(min_alloc_size);
12253 return round_up_to(
12254 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
12255}
12256
12257void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
12258{
12259 dout(10) << __func__ << " ondisk_format " << ondisk_format
12260 << " min_compat_ondisk_format " << min_compat_ondisk_format
12261 << dendl;
12262 ceph_assert(ondisk_format == latest_ondisk_format);
12263 {
12264 bufferlist bl;
12265 encode(ondisk_format, bl);
12266 t->set(PREFIX_SUPER, "ondisk_format", bl);
12267 }
12268 {
12269 bufferlist bl;
12270 encode(min_compat_ondisk_format, bl);
12271 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
12272 }
12273}
12274
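// Load persistent store-wide settings from the SUPER prefix: nid/blobid
// high-water marks, freelist type, on-disk format (and compat version),
// min_alloc_size, and SMR zone parameters when present.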
12275int BlueStore::_open_super_meta()
12276{
12277 // nid
12278 {
12279 nid_max = 0;
12280 bufferlist bl;
12281 db->get(PREFIX_SUPER, "nid_max", &bl);
12282 auto p = bl.cbegin();
12283 try {
12284 uint64_t v;
12285 decode(v, p);
12286 nid_max = v;
12287 } catch (ceph::buffer::error& e) {
12288 derr << __func__ << " unable to read nid_max" << dendl;
12289 return -EIO;
12290 }
12291 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
12292 nid_last = nid_max.load();
12293 }
12294
12295 // blobid
12296 {
12297 blobid_max = 0;
12298 bufferlist bl;
12299 db->get(PREFIX_SUPER, "blobid_max", &bl);
12300 auto p = bl.cbegin();
12301 try {
12302 uint64_t v;
12303 decode(v, p);
12304 blobid_max = v;
12305 } catch (ceph::buffer::error& e) {
12306 derr << __func__ << " unable to read blobid_max" << dendl;
12307 return -EIO;
12308 }
12309 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
12310 blobid_last = blobid_max.load();
12311 }
12312
12313 // freelist
12314 {
12315 bufferlist bl;
12316 db->get(PREFIX_SUPER, "freelist_type", &bl);
12317 if (bl.length()) {
12318 freelist_type = std::string(bl.c_str(), bl.length());
12319 } else {
12320 ceph_abort_msg("unsupported extent freelist manager");
12321 }
12322 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
12323 }
12324 // ondisk format
12325 int32_t compat_ondisk_format = 0;
12326 {
12327 bufferlist bl;
12328 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
12329 if (r < 0) {
12330 // base case: kraken bluestore is v1 and readable by v1
12331 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
12332 << dendl;
12333 ondisk_format = 1;
12334 compat_ondisk_format = 1;
12335 } else {
12336 auto p = bl.cbegin();
12337 try {
12338 decode(ondisk_format, p);
12339 } catch (ceph::buffer::error& e) {
12340 derr << __func__ << " unable to read ondisk_format" << dendl;
12341 return -EIO;
12342 }
12343 bl.clear();
12344 {
12345 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
12346 ceph_assert(!r);
12347 auto p = bl.cbegin();
12348 try {
12349 decode(compat_ondisk_format, p);
12350 } catch (ceph::buffer::error& e) {
12351 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
12352 return -EIO;
12353 }
12354 }
12355 }
12356 dout(1) << __func__ << " ondisk_format " << ondisk_format
12357 << " compat_ondisk_format " << compat_ondisk_format
12358 << dendl;
12359 }
12360
12361 if (latest_ondisk_format < compat_ondisk_format) {
12362 derr << __func__ << " compat_ondisk_format is "
12363 << compat_ondisk_format << " but we only understand version "
12364 << latest_ondisk_format << dendl;
12365 return -EPERM;
12366 }
12367
12368 {
12369 bufferlist bl;
12370 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
12371 auto p = bl.cbegin();
12372 try {
12373 uint64_t val;
12374 decode(val, p);
12375 min_alloc_size = val;
12376 min_alloc_size_order = std::countr_zero(val);
12377 min_alloc_size_mask = min_alloc_size - 1;
12378
12379 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
12380 } catch (ceph::buffer::error& e) {
12381 derr << __func__ << " unable to read min_alloc_size" << dendl;
12382 return -EIO;
12383 }
12384 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
12385 << std::dec << dendl;
12386 logger->set(l_bluestore_alloc_unit, min_alloc_size);
12387 }
12388
12389 // smr fields
12390 {
12391 bufferlist bl;
12392 int r = db->get(PREFIX_SUPER, "zone_size", &bl);
12393 if (r >= 0) {
12394 auto p = bl.cbegin();
12395 decode(zone_size, p);
12396 dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
12397 ceph_assert(bdev->is_smr());
12398 } else {
12399 ceph_assert(!bdev->is_smr());
12400 }
12401 }
12402 {
12403 bufferlist bl;
12404 int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
12405 if (r >= 0) {
12406 auto p = bl.cbegin();
12407 decode(first_sequential_zone, p);
12408 dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
12409 << first_sequential_zone << std::dec << dendl;
12410 ceph_assert(bdev->is_smr());
12411 } else {
12412 ceph_assert(!bdev->is_smr());
12413 }
12414 }
12415
12416 _set_per_pool_omap();
12417
12418 _open_statfs();
12419 _set_alloc_sizes();
12420 _set_throttle_params();
12421
12422 _set_csum();
12423 _set_compression();
12424 _set_blob_size();
12425
12426 _validate_bdev();
12427 return 0;
12428}
12429
12430int BlueStore::_upgrade_super()
12431{
12432 dout(1) << __func__ << " from " << ondisk_format << ", latest "
12433 << latest_ondisk_format << dendl;
12434 if (ondisk_format < latest_ondisk_format) {
12435 ceph_assert(ondisk_format > 0);
12436 ceph_assert(ondisk_format < latest_ondisk_format);
12437
12438 KeyValueDB::Transaction t = db->get_transaction();
12439 if (ondisk_format == 1) {
12440 // changes:
12441 // - super: added ondisk_format
12442 // - super: added min_readable_ondisk_format
12443 // - super: added min_compat_ondisk_format
12444 // - super: added min_alloc_size
12445 // - super: removed min_min_alloc_size
12446 {
12447 bufferlist bl;
12448 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
12449 auto p = bl.cbegin();
12450 try {
12451 uint64_t val;
12452 decode(val, p);
12453 min_alloc_size = val;
12454 } catch (ceph::buffer::error& e) {
12455 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
12456 return -EIO;
12457 }
12458 t->set(PREFIX_SUPER, "min_alloc_size", bl);
12459 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
12460 }
12461 ondisk_format = 2;
12462 }
12463 if (ondisk_format == 2) {
12464 // changes:
12465 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
12466 // onodes are using the per-pool prefix until a repair is run; at that
12467 // point the per_pool_omap=1 key will be set.
12468 // - super: added per_pool_omap key, which indicates that *all* objects
12469 // are using the new prefix and key format
12470 ondisk_format = 3;
12471 }
12472 if (ondisk_format == 3) {
12473 // changes:
12474 // - FreelistManager keeps meta within bdev label
12475 int r = _write_out_fm_meta(0);
12476 ceph_assert(r == 0);
12477 ondisk_format = 4;
12478 }
12479 // This needs to be the last operation
12480 _prepare_ondisk_format_super(t);
12481 int r = db->submit_transaction_sync(t);
12482 ceph_assert(r == 0);
12483 }
12484 // done
12485 dout(1) << __func__ << " done" << dendl;
12486 return 0;
12487}
12488
12489void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o)
12490{
12491 if (o->onode.nid) {
12492 ceph_assert(o->exists);
12493 return;
12494 }
12495 uint64_t nid = ++nid_last;
12496 dout(20) << __func__ << " " << nid << dendl;
12497 o->onode.nid = nid;
12498 txc->last_nid = nid;
12499 o->exists = true;
12500}
12501
12502uint64_t BlueStore::_assign_blobid(TransContext *txc)
12503{
12504 uint64_t bid = ++blobid_last;
12505 dout(20) << __func__ << " " << bid << dendl;
12506 txc->last_blobid = bid;
12507 return bid;
12508}
12509
12510void BlueStore::get_db_statistics(Formatter *f)
12511{
12512 db->get_statistics(f);
12513}
12514
12515BlueStore::TransContext *BlueStore::_txc_create(
12516 Collection *c, OpSequencer *osr,
12517 list<Context*> *on_commits,
12518 TrackedOpRef osd_op)
12519{
12520 TransContext *txc = new TransContext(cct, c, osr, on_commits);
12521 txc->t = db->get_transaction();
12522
12523#ifdef WITH_BLKIN
12524 if (osd_op && osd_op->pg_trace) {
12525 txc->trace.init("TransContext", &trace_endpoint,
12526 &osd_op->pg_trace);
12527 txc->trace.event("txc create");
12528 txc->trace.keyval("txc seq", txc->seq);
12529 }
12530#endif
12531
12532 osr->queue_new(txc);
12533 dout(20) << __func__ << " osr " << osr << " = " << txc
12534 << " seq " << txc->seq << dendl;
12535 return txc;
12536}
12537
12538void BlueStore::_txc_calc_cost(TransContext *txc)
12539{
12540 // one "io" for the kv commit
12541 auto ios = 1 + txc->ioc.get_num_ios();
12542 auto cost = throttle_cost_per_io.load();
12543 txc->cost = ios * cost + txc->bytes;
12544 txc->ios = ios;
12545 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
12546 << ios << " ios * " << cost << " + " << txc->bytes
12547 << " bytes)" << dendl;
12548}
12549
12550void BlueStore::_txc_update_store_statfs(TransContext *txc)
12551{
12552 if (txc->statfs_delta.is_empty())
12553 return;
12554
12555 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
12556 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
12557 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
12558 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
12559 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
12560
12561 if (per_pool_stat_collection) {
12562 if (!is_statfs_recoverable()) {
12563 bufferlist bl;
12564 txc->statfs_delta.encode(bl);
12565 string key;
12566 get_pool_stat_key(txc->osd_pool_id, &key);
12567 txc->t->merge(PREFIX_STAT, key, bl);
12568 }
12569
12570 std::lock_guard l(vstatfs_lock);
12571 auto& stats = osd_pools[txc->osd_pool_id];
12572 stats += txc->statfs_delta;
12573
12574 vstatfs += txc->statfs_delta; //non-persistent in this mode
12575
12576 } else {
12577 if (!is_statfs_recoverable()) {
12578 bufferlist bl;
12579 txc->statfs_delta.encode(bl);
12580 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
12581 }
12582
12583 std::lock_guard l(vstatfs_lock);
12584 vstatfs += txc->statfs_delta;
12585 }
12586 txc->statfs_delta.reset();
12587}
12588
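// Transaction state machine, roughly:
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//   -> (DEFERRED_QUEUED -> DEFERRED_CLEANUP ->) FINISHING -> DONE
// Each call advances the txc as far as it can without blocking, then hands
// off to the aio completion, the kv sync thread, or the deferred machinery.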
12589void BlueStore::_txc_state_proc(TransContext *txc)
12590{
12591 while (true) {
12592 dout(10) << __func__ << " txc " << txc
12593 << " " << txc->get_state_name() << dendl;
12594 switch (txc->get_state()) {
12595 case TransContext::STATE_PREPARE:
12596 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
12597 if (txc->ioc.has_pending_aios()) {
12598 txc->set_state(TransContext::STATE_AIO_WAIT);
12599#ifdef WITH_BLKIN
12600 if (txc->trace) {
12601 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
12602 }
12603#endif
12604 txc->had_ios = true;
12605 _txc_aio_submit(txc);
12606 return;
12607 }
12608 // ** fall-thru **
12609
12610 case TransContext::STATE_AIO_WAIT:
12611 {
12612 mono_clock::duration lat = throttle.log_state_latency(
12613 *txc, logger, l_bluestore_state_aio_wait_lat);
12614 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
12615 dout(0) << __func__ << " slow aio_wait, txc = " << txc
12616 << ", latency = " << lat
12617 << dendl;
12618 }
12619 }
12620
12621 _txc_finish_io(txc); // may trigger blocked txc's too
12622 return;
12623
12624 case TransContext::STATE_IO_DONE:
12625 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
12626 if (txc->had_ios) {
12627 ++txc->osr->txc_with_unstable_io;
12628 }
12629 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
12630 txc->set_state(TransContext::STATE_KV_QUEUED);
12631 if (cct->_conf->bluestore_sync_submit_transaction) {
12632 if (txc->last_nid >= nid_max ||
12633 txc->last_blobid >= blobid_max) {
12634 dout(20) << __func__
12635 << " last_{nid,blobid} exceeds max, submit via kv thread"
12636 << dendl;
12637 } else if (txc->osr->kv_committing_serially) {
12638 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
12639 << dendl;
12640 // note: this is starvation-prone. once we have a txc in a busy
12641 // sequencer that is committing serially it is possible to keep
12642 // submitting new transactions fast enough that we get stuck doing
12643 // so. the alternative is to block here... fixme?
12644 } else if (txc->osr->txc_with_unstable_io) {
12645 dout(20) << __func__ << " prior txc(s) with unstable ios "
12646 << txc->osr->txc_with_unstable_io.load() << dendl;
12647 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
12648 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
12649 == 0) {
12650 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
12651 << dendl;
12652 } else {
12653 _txc_apply_kv(txc, true);
12654 }
12655 }
12656 {
12657 std::lock_guard l(kv_lock);
12658 kv_queue.push_back(txc);
12659 if (!kv_sync_in_progress) {
12660 kv_sync_in_progress = true;
12661 kv_cond.notify_one();
12662 }
12663 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
12664 kv_queue_unsubmitted.push_back(txc);
12665 ++txc->osr->kv_committing_serially;
12666 }
12667 if (txc->had_ios)
12668 kv_ios++;
12669 kv_throttle_costs += txc->cost;
12670 }
12671 return;
12672 case TransContext::STATE_KV_SUBMITTED:
12673 _txc_committed_kv(txc);
12674 // ** fall-thru **
12675
12676 case TransContext::STATE_KV_DONE:
12677 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
12678 if (txc->deferred_txn) {
12679 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
12680 _deferred_queue(txc);
12681 return;
12682 }
12683 txc->set_state(TransContext::STATE_FINISHING);
12684 break;
12685
12686 case TransContext::STATE_DEFERRED_CLEANUP:
12687 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
12688 txc->set_state(TransContext::STATE_FINISHING);
12689 // ** fall-thru **
12690
12691 case TransContext::STATE_FINISHING:
12692 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
12693 _txc_finish(txc);
12694 return;
12695
12696 default:
12697 derr << __func__ << " unexpected txc " << txc
12698 << " state " << txc->get_state_name() << dendl;
12699 ceph_abort_msg("unexpected txc state");
12700 return;
12701 }
12702 }
12703}
12704
12705void BlueStore::_txc_finish_io(TransContext *txc)
12706{
12707 dout(20) << __func__ << " " << txc << dendl;
12708
12709 /*
12710 * we need to preserve the order of kv transactions,
12711 * even though aio will complete in any order.
12712 */
12713
12714 OpSequencer *osr = txc->osr.get();
12715 std::lock_guard l(osr->qlock);
12716 txc->set_state(TransContext::STATE_IO_DONE);
12717 txc->ioc.release_running_aios();
12718 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
12719 while (p != osr->q.begin()) {
12720 --p;
12721 if (p->get_state() < TransContext::STATE_IO_DONE) {
12722 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
12723 << p->get_state_name() << dendl;
12724 return;
12725 }
12726 if (p->get_state() > TransContext::STATE_IO_DONE) {
12727 ++p;
12728 break;
12729 }
12730 }
12731 do {
12732 _txc_state_proc(&*p++);
12733 } while (p != osr->q.end() &&
12734 p->get_state() == TransContext::STATE_IO_DONE);
12735
12736 if (osr->kv_submitted_waiters) {
12737 osr->qcond.notify_all();
12738 }
12739}
12740
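// Serialize the metadata this transaction dirtied into the kv transaction:
// every touched onode is re-encoded, and shared blob records are written or
// removed depending on whether any references remain.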
12741void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
12742{
12743 dout(20) << __func__ << " txc " << txc
12744 << " onodes " << txc->onodes
12745 << " shared_blobs " << txc->shared_blobs
12746 << dendl;
12747
12748 // finalize onodes
12749 for (auto o : txc->onodes) {
12750 _record_onode(o, t);
12751 o->flushing_count++;
12752 }
12753
12754 // objects we modified but didn't affect the onode
12755 auto p = txc->modified_objects.begin();
12756 while (p != txc->modified_objects.end()) {
12757 if (txc->onodes.count(*p) == 0) {
12758 (*p)->flushing_count++;
12759 ++p;
12760 } else {
12761 // remove dups with onodes list to avoid problems in _txc_finish
12762 p = txc->modified_objects.erase(p);
12763 }
12764 }
12765
12766 // finalize shared_blobs
12767 for (auto sb : txc->shared_blobs) {
12768 string key;
12769 auto sbid = sb->get_sbid();
12770 get_shared_blob_key(sbid, &key);
12771 if (sb->persistent->empty()) {
12772 dout(20) << __func__ << " shared_blob 0x"
12773 << std::hex << sbid << std::dec
12774 << " is empty" << dendl;
12775 t->rmkey(PREFIX_SHARED_BLOB, key);
12776 } else {
12777 bufferlist bl;
12778 encode(*(sb->persistent), bl);
12779 dout(20) << __func__ << " shared_blob 0x"
12780 << std::hex << sbid << std::dec
12781 << " is " << bl.length() << " " << *sb << dendl;
12782 t->set(PREFIX_SHARED_BLOB, key, bl);
12783 }
12784 }
12785}
12786
12787void BlueStore::BSPerfTracker::update_from_perfcounters(
12788 PerfCounters &logger)
12789{
12790 os_commit_latency_ns.consume_next(
12791 logger.get_tavg_ns(
12792 l_bluestore_commit_lat));
12793 os_apply_latency_ns.consume_next(
12794 logger.get_tavg_ns(
12795 l_bluestore_commit_lat));
12796}
12797
12798void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
12799{
12800 dout(20) << __func__ << " txc " << txc << std::hex
12801 << " allocated 0x" << txc->allocated
12802 << " released 0x" << txc->released
12803 << std::dec << dendl;
12804
12805 if (!fm->is_null_manager())
12806 {
12807 // We have to handle the case where we allocate *and* deallocate the
12808 // same region in this transaction. The freelist doesn't like that.
12809 // (Actually, the only thing that cares is the BitmapFreelistManager
12810 // debug check. But that's important.)
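    // e.g. if this transaction both frees extent X and reallocates it, X
    // shows up in txc->allocated and txc->released; subtracting the overlap
    // from both leaves the freelist untouched for X.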
12811 interval_set<uint64_t> tmp_allocated, tmp_released;
12812 interval_set<uint64_t> *pallocated = &txc->allocated;
12813 interval_set<uint64_t> *preleased = &txc->released;
12814 if (!txc->allocated.empty() && !txc->released.empty()) {
12815 interval_set<uint64_t> overlap;
12816 overlap.intersection_of(txc->allocated, txc->released);
12817 if (!overlap.empty()) {
12818 tmp_allocated = txc->allocated;
12819 tmp_allocated.subtract(overlap);
12820 tmp_released = txc->released;
12821 tmp_released.subtract(overlap);
12822 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
12823 << ", new allocated 0x" << tmp_allocated
12824 << " released 0x" << tmp_released << std::dec
12825 << dendl;
12826 pallocated = &tmp_allocated;
12827 preleased = &tmp_released;
12828 }
12829 }
12830
12831 // update freelist with non-overlap sets
12832 for (interval_set<uint64_t>::iterator p = pallocated->begin();
12833 p != pallocated->end();
12834 ++p) {
12835 fm->allocate(p.get_start(), p.get_len(), t);
12836 }
12837 for (interval_set<uint64_t>::iterator p = preleased->begin();
12838 p != preleased->end();
12839 ++p) {
12840 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
12841 << "~" << p.get_len() << std::dec << dendl;
12842 fm->release(p.get_start(), p.get_len(), t);
12843 }
12844 }
12845
12846#ifdef HAVE_LIBZBD
12847 if (bdev->is_smr()) {
12848 for (auto& i : txc->old_zone_offset_refs) {
12849 dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
12850 << " offset 0x" << i.second << std::dec
12851 << " -> " << i.first.first->oid << dendl;
12852 string key;
12853 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12854 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
12855 }
12856 for (auto& i : txc->new_zone_offset_refs) {
12857 // (zone, offset) -> oid
12858 dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
12859 << " offset 0x" << i.second << std::dec
12860 << " -> " << i.first.first->oid << dendl;
12861 string key;
12862 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12863 bufferlist v;
12864 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
12865 }
12866 }
12867#endif
12868
12869 _txc_update_store_statfs(txc);
12870}
12871
12872void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
12873{
12874 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
12875 {
12876#if defined(WITH_LTTNG)
12877 auto start = mono_clock::now();
12878#endif
12879
12880#ifdef WITH_BLKIN
12881 if (txc->trace) {
12882 txc->trace.event("db async submit");
12883 }
12884#endif
12885
12886 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
12887 ceph_assert(r == 0);
12888 txc->set_state(TransContext::STATE_KV_SUBMITTED);
12889 if (txc->osr->kv_submitted_waiters) {
12890 std::lock_guard l(txc->osr->qlock);
12891 txc->osr->qcond.notify_all();
12892 }
12893
12894#if defined(WITH_LTTNG)
12895 if (txc->tracing) {
12896 tracepoint(
12897 bluestore,
12898 transaction_kv_submit_latency,
12899 txc->osr->get_sequencer_id(),
12900 txc->seq,
12901 sync_submit_transaction,
12902 ceph::to_seconds<double>(mono_clock::now() - start));
12903 }
12904#endif
12905 }
12906
12907 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
12908 for (auto& o : *ls) {
12909 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
12910 << dendl;
12911 if (--o->flushing_count == 0 && o->waiting_count.load()) {
12912 std::lock_guard l(o->flush_lock);
12913 o->flush_cond.notify_all();
12914 }
12915 }
12916 }
12917}
12918
12919void BlueStore::_txc_committed_kv(TransContext *txc)
12920{
12921 dout(20) << __func__ << " txc " << txc << dendl;
12922 throttle.complete_kv(*txc);
12923 {
12924 std::lock_guard l(txc->osr->qlock);
12925 txc->set_state(TransContext::STATE_KV_DONE);
12926 if (txc->ch->commit_queue) {
12927 txc->ch->commit_queue->queue(txc->oncommits);
12928 } else {
12929 finisher.queue(txc->oncommits);
12930 }
12931 }
12932 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
12933 log_latency_fn(
12934 __func__,
12935 l_bluestore_commit_lat,
12936 mono_clock::now() - txc->start,
12937 cct->_conf->bluestore_log_op_age,
12938 [&](auto lat) {
12939 return ", txc = " + stringify(txc);
12940 }
12941 );
12942}
12943
12944void BlueStore::_txc_finish(TransContext *txc)
12945{
12946 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
12947 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
12948
12949 for (auto& sb : txc->shared_blobs_written) {
12950 sb->finish_write(txc->seq);
12951 }
12952 txc->shared_blobs_written.clear();
12953
12954 while (!txc->removed_collections.empty()) {
12955 _queue_reap_collection(txc->removed_collections.front());
12956 txc->removed_collections.pop_front();
12957 }
12958
12959 OpSequencerRef osr = txc->osr;
12960 bool empty = false;
12961 bool submit_deferred = false;
12962 OpSequencer::q_list_t releasing_txc;
12963 {
12964 std::lock_guard l(osr->qlock);
12965 txc->set_state(TransContext::STATE_DONE);
12966 bool notify = false;
12967 while (!osr->q.empty()) {
12968 TransContext *txc = &osr->q.front();
12969 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
12970 << dendl;
12971 if (txc->get_state() != TransContext::STATE_DONE) {
12972 if (txc->get_state() == TransContext::STATE_PREPARE &&
12973 deferred_aggressive) {
12974 // for _osr_drain_preceding()
12975 notify = true;
12976 }
12977 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
12978 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
12979 submit_deferred = true;
12980 }
12981 break;
12982 }
12983
12984 osr->q.pop_front();
12985 releasing_txc.push_back(*txc);
12986 }
12987
12988 if (osr->q.empty()) {
12989 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
12990 empty = true;
12991 }
12992
12993 // only drain()/drain_preceding() need wakeup,
12994 // other cases use kv_submitted_waiters
12995 if (notify || empty) {
12996 osr->qcond.notify_all();
12997 }
12998 }
12999
13000 while (!releasing_txc.empty()) {
13001 // release to allocator only after all preceding txc's have also
13002 // finished any deferred writes that potentially land in these
13003 // blocks
13004 auto txc = &releasing_txc.front();
13005 _txc_release_alloc(txc);
13006 releasing_txc.pop_front();
13007 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
13008 throttle.complete(*txc);
13009 delete txc;
13010 }
13011
13012 if (submit_deferred) {
13013    // we're pinning memory; flush! We could be more fine-grained here but
13014    // I'm not sure it's worth the bother.
13015 deferred_try_submit();
13016 }
13017
13018 if (empty && osr->zombie) {
13019 std::lock_guard l(zombie_osr_lock);
13020 if (zombie_osr_set.erase(osr->cid)) {
13021 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13022 } else {
13023 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
13024 << dendl;
13025 }
13026 }
13027}
13028
13029void BlueStore::_txc_release_alloc(TransContext *txc)
13030{
13031 bool discard_queued = false;
13032 // it's expected we're called with lazy_release_lock already taken!
13033 if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) {
13034 goto out;
13035 }
13036 discard_queued = bdev->try_discard(txc->released);
13037  // if the async discard was queued, alloc->release will happen in the discard
13038  // completion callback; otherwise we should release here
13039 if (!discard_queued) {
13040 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
13041 << txc->released << std::dec << dendl;
13042 alloc->release(txc->released);
13043 }
13044
13045out:
13046 txc->allocated.clear();
13047 txc->released.clear();
13048}
13049
13050void BlueStore::_osr_attach(Collection *c)
13051{
13052 // note: caller has coll_lock
13053 auto q = coll_map.find(c->cid);
13054 if (q != coll_map.end()) {
13055 c->osr = q->second->osr;
13056 ldout(cct, 10) << __func__ << " " << c->cid
13057 << " reusing osr " << c->osr << " from existing coll "
13058 << q->second << dendl;
13059 } else {
13060 std::lock_guard l(zombie_osr_lock);
13061 auto p = zombie_osr_set.find(c->cid);
13062 if (p == zombie_osr_set.end()) {
13063 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
13064 ldout(cct, 10) << __func__ << " " << c->cid
13065 << " fresh osr " << c->osr << dendl;
13066 } else {
13067 c->osr = p->second;
13068 zombie_osr_set.erase(p);
13069 ldout(cct, 10) << __func__ << " " << c->cid
13070 << " resurrecting zombie osr " << c->osr << dendl;
13071 c->osr->zombie = false;
13072 }
13073 }
13074}
13075
13076void BlueStore::_osr_register_zombie(OpSequencer *osr)
13077{
13078 std::lock_guard l(zombie_osr_lock);
13079 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
13080 osr->zombie = true;
13081 auto i = zombie_osr_set.emplace(osr->cid, osr);
13082 // this is either a new insertion or the same osr is already there
13083 ceph_assert(i.second || i.first->second == osr);
13084}
13085
13086void BlueStore::_osr_drain_preceding(TransContext *txc)
13087{
13088 OpSequencer *osr = txc->osr.get();
13089 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
13090 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13091 {
13092 // submit anything pending
13093 osr->deferred_lock.lock();
13094 if (osr->deferred_pending && !osr->deferred_running) {
13095 _deferred_submit_unlock(osr);
13096 } else {
13097 osr->deferred_lock.unlock();
13098 }
13099 }
13100 {
13101 // wake up any previously finished deferred events
13102 std::lock_guard l(kv_lock);
13103 if (!kv_sync_in_progress) {
13104 kv_sync_in_progress = true;
13105 kv_cond.notify_one();
13106 }
13107 }
13108 osr->drain_preceding(txc);
13109 --deferred_aggressive;
13110 dout(10) << __func__ << " " << osr << " done" << dendl;
13111}
13112
13113void BlueStore::_osr_drain(OpSequencer *osr)
13114{
13115 dout(10) << __func__ << " " << osr << dendl;
13116 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13117 {
13118 // submit anything pending
13119 osr->deferred_lock.lock();
13120 if (osr->deferred_pending && !osr->deferred_running) {
13121 _deferred_submit_unlock(osr);
13122 } else {
13123 osr->deferred_lock.unlock();
13124 }
13125 }
13126 {
13127 // wake up any previously finished deferred events
13128 std::lock_guard l(kv_lock);
13129 if (!kv_sync_in_progress) {
13130 kv_sync_in_progress = true;
13131 kv_cond.notify_one();
13132 }
13133 }
13134 osr->drain();
13135 --deferred_aggressive;
13136 dout(10) << __func__ << " " << osr << " done" << dendl;
13137}
13138
13139void BlueStore::_osr_drain_all()
13140{
13141 dout(10) << __func__ << dendl;
13142
13143 set<OpSequencerRef> s;
13144 vector<OpSequencerRef> zombies;
13145 {
13146 std::shared_lock l(coll_lock);
13147 for (auto& i : coll_map) {
13148 s.insert(i.second->osr);
13149 }
13150 }
13151 {
13152 std::lock_guard l(zombie_osr_lock);
13153 for (auto& i : zombie_osr_set) {
13154 s.insert(i.second);
13155 zombies.push_back(i.second);
13156 }
13157 }
13158 dout(20) << __func__ << " osr_set " << s << dendl;
13159
13160 ++deferred_aggressive;
13161 {
13162 // submit anything pending
13163 deferred_try_submit();
13164 }
13165 {
13166 // wake up any previously finished deferred events
13167 std::lock_guard l(kv_lock);
13168 kv_cond.notify_one();
13169 }
13170 {
13171 std::lock_guard l(kv_finalize_lock);
13172 kv_finalize_cond.notify_one();
13173 }
13174 for (auto osr : s) {
13175 dout(20) << __func__ << " drain " << osr << dendl;
13176 osr->drain();
13177 }
13178 --deferred_aggressive;
13179
13180 {
13181 std::lock_guard l(zombie_osr_lock);
13182 for (auto& osr : zombies) {
13183 if (zombie_osr_set.erase(osr->cid)) {
13184 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13185 ceph_assert(osr->q.empty());
13186 } else if (osr->zombie) {
13187 dout(10) << __func__ << " empty zombie osr " << osr
13188 << " already reaped" << dendl;
13189 ceph_assert(osr->q.empty());
13190 } else {
13191 dout(10) << __func__ << " empty zombie osr " << osr
13192 << " resurrected" << dendl;
13193 }
13194 }
13195 }
13196
13197 dout(10) << __func__ << " done" << dendl;
13198}
13199
13200
13201void BlueStore::_kv_start()
13202{
13203 dout(10) << __func__ << dendl;
13204
13205 finisher.start();
13206 kv_sync_thread.create("bstore_kv_sync");
13207 kv_finalize_thread.create("bstore_kv_final");
13208}
13209
13210void BlueStore::_kv_stop()
13211{
13212 dout(10) << __func__ << dendl;
13213 {
13214 std::unique_lock l{kv_lock};
13215 while (!kv_sync_started) {
13216 kv_cond.wait(l);
13217 }
13218 kv_stop = true;
13219 kv_cond.notify_all();
13220 }
13221 {
13222 std::unique_lock l{kv_finalize_lock};
13223 while (!kv_finalize_started) {
13224 kv_finalize_cond.wait(l);
13225 }
13226 kv_finalize_stop = true;
13227 kv_finalize_cond.notify_all();
13228 }
13229 kv_sync_thread.join();
13230 kv_finalize_thread.join();
13231 ceph_assert(removed_collections.empty());
13232 {
13233 std::lock_guard l(kv_lock);
13234 kv_stop = false;
13235 }
13236 {
13237 std::lock_guard l(kv_finalize_lock);
13238 kv_finalize_stop = false;
13239 }
13240 dout(10) << __func__ << " stopping finishers" << dendl;
13241 finisher.wait_for_empty();
13242 finisher.stop();
13243 dout(10) << __func__ << " stopped" << dendl;
13244}
13245
13246void BlueStore::_kv_sync_thread()
13247{
13248 dout(10) << __func__ << " start" << dendl;
13249 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
13250 std::unique_lock l{kv_lock};
13251 ceph_assert(!kv_sync_started);
13252 kv_sync_started = true;
13253 kv_cond.notify_all();
13254
13255 auto t0 = mono_clock::now();
13256 timespan twait = ceph::make_timespan(0);
13257 size_t kv_submitted = 0;
13258
13259 while (true) {
13260 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
13261 auto observation_period =
13262 ceph::make_timespan(period);
13263 auto elapsed = mono_clock::now() - t0;
13264 if (period && elapsed >= observation_period) {
13265 dout(5) << __func__ << " utilization: idle "
13266 << twait << " of " << elapsed
13267 << ", submitted: " << kv_submitted
13268 <<dendl;
13269 t0 = mono_clock::now();
13270 twait = ceph::make_timespan(0);
13271 kv_submitted = 0;
13272 }
13273 ceph_assert(kv_committing.empty());
13274 if (kv_queue.empty() &&
13275 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
13276 !deferred_aggressive)) {
13277 if (kv_stop)
13278 break;
13279 dout(20) << __func__ << " sleep" << dendl;
13280 auto t = mono_clock::now();
13281 kv_sync_in_progress = false;
13282 kv_cond.wait(l);
13283 twait += mono_clock::now() - t;
13284
13285 dout(20) << __func__ << " wake" << dendl;
13286 } else {
13287 deque<TransContext*> kv_submitting;
13288 deque<DeferredBatch*> deferred_done, deferred_stable;
13289 uint64_t aios = 0, costs = 0;
13290
13291 dout(20) << __func__ << " committing " << kv_queue.size()
13292 << " submitting " << kv_queue_unsubmitted.size()
13293 << " deferred done " << deferred_done_queue.size()
13294 << " stable " << deferred_stable_queue.size()
13295 << dendl;
13296 kv_committing.swap(kv_queue);
13297 kv_submitting.swap(kv_queue_unsubmitted);
13298 deferred_done.swap(deferred_done_queue);
13299 deferred_stable.swap(deferred_stable_queue);
13300 aios = kv_ios;
13301 costs = kv_throttle_costs;
13302 kv_ios = 0;
13303 kv_throttle_costs = 0;
13304 l.unlock();
13305
13306 dout(30) << __func__ << " committing " << kv_committing << dendl;
13307 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
13308 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
13309 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
13310
13311 auto start = mono_clock::now();
13312
13313 bool force_flush = false;
13314      // if bluefs is sharing the same device as data (only), then we
13315      // can rely on the bluefs commit to flush the device and make
13316      // deferred aios stable. that means that if we do have completed deferred
13317      // txcs AND we are not on a single shared device, we need to force a flush.
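      //
      // In short, mirroring the branches below: when bluefs shares the single
      // data device we force a flush if there were any aios, if there is
      // nothing else to commit, or if deferred_aggressive is set; otherwise
      // (no bluefs, or bluefs on separate devices) we force a flush whenever
      // there were aios or newly completed deferred batches.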
13318 if (bluefs && bluefs_layout.single_shared_device()) {
13319 if (aios) {
13320 force_flush = true;
13321 } else if (kv_committing.empty() && deferred_stable.empty()) {
13322 force_flush = true; // there's nothing else to commit!
13323 } else if (deferred_aggressive) {
13324 force_flush = true;
13325 }
13326 } else {
13327 if (aios || !deferred_done.empty()) {
13328 force_flush = true;
13329 } else {
13330 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
13331 }
13332 }
13333
13334 if (force_flush) {
13335 dout(20) << __func__ << " num_aios=" << aios
13336 << " force_flush=" << (int)force_flush
13337 << ", flushing, deferred done->stable" << dendl;
13338 // flush/barrier on block device
13339 bdev->flush();
13340
13341 // if we flush then deferred done are now deferred stable
13342 if (deferred_stable.empty()) {
13343 deferred_stable.swap(deferred_done);
13344 } else {
13345 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
13346 deferred_done.end());
13347 deferred_done.clear();
13348 }
13349 }
13350 auto after_flush = mono_clock::now();
13351
13352 // we will use one final transaction to force a sync
13353 KeyValueDB::Transaction synct = db->get_transaction();
13354
13355      // increase {nid,blobid}_max? note that this covers both the
13356      // case where we are approaching the max and the case where we have
13357      // passed it. in either case, we increase the max in the earliest txn
13358      // we submit in this batch.
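      //
      // Worked example (assuming a hypothetical bluestore_nid_prealloc of 1024;
      // the configured value may differ): once nid_last exceeds nid_max - 512
      // we persist nid_max = nid_last + 1024 via the earliest transaction of
      // this batch, so onode ids never outrun the persisted maximum; blobids
      // are handled the same way just below.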
13359 uint64_t new_nid_max = 0, new_blobid_max = 0;
13360 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
13361 KeyValueDB::Transaction t =
13362 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13363 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
13364 bufferlist bl;
13365 encode(new_nid_max, bl);
13366 t->set(PREFIX_SUPER, "nid_max", bl);
13367 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
13368 }
13369 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
13370 KeyValueDB::Transaction t =
13371 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13372 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
13373 bufferlist bl;
13374 encode(new_blobid_max, bl);
13375 t->set(PREFIX_SUPER, "blobid_max", bl);
13376 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
13377 }
13378
13379 for (auto txc : kv_committing) {
13380 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
13381 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
13382 ++kv_submitted;
13383 _txc_apply_kv(txc, false);
13384 --txc->osr->kv_committing_serially;
13385 } else {
13386 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
13387 }
13388 if (txc->had_ios) {
13389 --txc->osr->txc_with_unstable_io;
13390 }
13391 }
13392
13393 // release throttle *before* we commit. this allows new ops
13394 // to be prepared and enter pipeline while we are waiting on
13395 // the kv commit sync/flush. then hopefully on the next
13396 // iteration there will already be ops awake. otherwise, we
13397 // end up going to sleep, and then wake up when the very first
13398 // transaction is ready for commit.
13399 throttle.release_kv_throttle(costs);
13400
13401 // cleanup sync deferred keys
13402 for (auto b : deferred_stable) {
13403 for (auto& txc : b->txcs) {
13404 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
13405 ceph_assert(wt.released.empty()); // only kraken did this
13406 string key;
13407 get_deferred_key(wt.seq, &key);
13408 synct->rm_single_key(PREFIX_DEFERRED, key);
13409 }
13410 }
13411
13412#if defined(WITH_LTTNG)
13413 auto sync_start = mono_clock::now();
13414#endif
13415 // submit synct synchronously (block and wait for it to commit)
13416 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
13417 ceph_assert(r == 0);
13418
13419#ifdef WITH_BLKIN
13420 for (auto txc : kv_committing) {
13421 if (txc->trace) {
13422 txc->trace.event("db sync submit");
13423 txc->trace.keyval("kv_committing size", kv_committing.size());
13424 }
13425 }
13426#endif
13427
13428 int committing_size = kv_committing.size();
13429 int deferred_size = deferred_stable.size();
13430
13431#if defined(WITH_LTTNG)
13432 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
13433 for (auto txc: kv_committing) {
13434 if (txc->tracing) {
13435 tracepoint(
13436 bluestore,
13437 transaction_kv_sync_latency,
13438 txc->osr->get_sequencer_id(),
13439 txc->seq,
13440 kv_committing.size(),
13441 deferred_done.size(),
13442 deferred_stable.size(),
13443 sync_latency);
13444 }
13445 }
13446#endif
13447
13448 {
13449 std::unique_lock m{kv_finalize_lock};
13450 if (kv_committing_to_finalize.empty()) {
13451 kv_committing_to_finalize.swap(kv_committing);
13452 } else {
13453 kv_committing_to_finalize.insert(
13454 kv_committing_to_finalize.end(),
13455 kv_committing.begin(),
13456 kv_committing.end());
13457 kv_committing.clear();
13458 }
13459 if (deferred_stable_to_finalize.empty()) {
13460 deferred_stable_to_finalize.swap(deferred_stable);
13461 } else {
13462 deferred_stable_to_finalize.insert(
13463 deferred_stable_to_finalize.end(),
13464 deferred_stable.begin(),
13465 deferred_stable.end());
13466 deferred_stable.clear();
13467 }
13468 if (!kv_finalize_in_progress) {
13469 kv_finalize_in_progress = true;
13470 kv_finalize_cond.notify_one();
13471 }
13472 }
13473
13474 if (new_nid_max) {
13475 nid_max = new_nid_max;
13476 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
13477 }
13478 if (new_blobid_max) {
13479 blobid_max = new_blobid_max;
13480 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
13481 }
13482
13483 {
13484 auto finish = mono_clock::now();
13485 ceph::timespan dur_flush = after_flush - start;
13486 ceph::timespan dur_kv = finish - after_flush;
13487 ceph::timespan dur = finish - start;
13488 dout(20) << __func__ << " committed " << committing_size
13489 << " cleaned " << deferred_size
13490 << " in " << dur
13491 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
13492 << dendl;
13493 log_latency("kv_flush",
13494 l_bluestore_kv_flush_lat,
13495 dur_flush,
13496 cct->_conf->bluestore_log_op_age);
13497 log_latency("kv_commit",
13498 l_bluestore_kv_commit_lat,
13499 dur_kv,
13500 cct->_conf->bluestore_log_op_age);
13501 log_latency("kv_sync",
13502 l_bluestore_kv_sync_lat,
13503 dur,
13504 cct->_conf->bluestore_log_op_age);
13505 }
13506
13507 l.lock();
13508 // previously deferred "done" are now "stable" by virtue of this
13509 // commit cycle.
13510 deferred_stable_queue.swap(deferred_done);
13511 }
13512 }
13513 dout(10) << __func__ << " finish" << dendl;
13514 kv_sync_started = false;
13515}
13516
13517void BlueStore::_kv_finalize_thread()
13518{
13519 deque<TransContext*> kv_committed;
13520 deque<DeferredBatch*> deferred_stable;
13521 dout(10) << __func__ << " start" << dendl;
13522 std::unique_lock l(kv_finalize_lock);
13523 ceph_assert(!kv_finalize_started);
13524 kv_finalize_started = true;
13525 kv_finalize_cond.notify_all();
13526 while (true) {
13527 ceph_assert(kv_committed.empty());
13528 ceph_assert(deferred_stable.empty());
13529 if (kv_committing_to_finalize.empty() &&
13530 deferred_stable_to_finalize.empty()) {
13531 if (kv_finalize_stop)
13532 break;
13533 dout(20) << __func__ << " sleep" << dendl;
13534 kv_finalize_in_progress = false;
13535 kv_finalize_cond.wait(l);
13536 dout(20) << __func__ << " wake" << dendl;
13537 } else {
13538 kv_committed.swap(kv_committing_to_finalize);
13539 deferred_stable.swap(deferred_stable_to_finalize);
13540 l.unlock();
13541 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
13542 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
13543
13544 auto start = mono_clock::now();
13545
13546 while (!kv_committed.empty()) {
13547 TransContext *txc = kv_committed.front();
13548 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
13549 _txc_state_proc(txc);
13550 kv_committed.pop_front();
13551 }
13552
13553 for (auto b : deferred_stable) {
13554 auto p = b->txcs.begin();
13555 while (p != b->txcs.end()) {
13556 TransContext *txc = &*p;
13557 p = b->txcs.erase(p); // unlink here because
13558 _txc_state_proc(txc); // this may destroy txc
13559 }
13560 delete b;
13561 }
13562 deferred_stable.clear();
13563
13564 if (!deferred_aggressive) {
13565 if (deferred_queue_size >= deferred_batch_ops.load() ||
13566 throttle.should_submit_deferred()) {
13567 deferred_try_submit();
13568 }
13569 }
13570
13571 // this is as good a place as any ...
13572 _reap_collections();
13573
13574 logger->set(l_bluestore_fragmentation,
13575 (uint64_t)(alloc->get_fragmentation() * 1000));
13576
13577 log_latency("kv_final",
13578 l_bluestore_kv_final_lat,
13579 mono_clock::now() - start,
13580 cct->_conf->bluestore_log_op_age);
13581
13582 l.lock();
13583 }
13584 }
13585 dout(10) << __func__ << " finish" << dendl;
13586 kv_finalize_started = false;
13587}
13588
13589#ifdef HAVE_LIBZBD
13590void BlueStore::_zoned_cleaner_start()
13591{
13592 dout(10) << __func__ << dendl;
13593 zoned_cleaner_thread.create("bstore_zcleaner");
13594}
13595
13596void BlueStore::_zoned_cleaner_stop()
13597{
13598 dout(10) << __func__ << dendl;
13599 {
13600 std::unique_lock l{zoned_cleaner_lock};
13601 while (!zoned_cleaner_started) {
13602 zoned_cleaner_cond.wait(l);
13603 }
13604 zoned_cleaner_stop = true;
13605 zoned_cleaner_cond.notify_all();
13606 }
13607 zoned_cleaner_thread.join();
13608 {
13609 std::lock_guard l{zoned_cleaner_lock};
13610 zoned_cleaner_stop = false;
13611 }
13612 dout(10) << __func__ << " done" << dendl;
13613}
13614
13615void BlueStore::_zoned_cleaner_thread()
13616{
13617 dout(10) << __func__ << " start" << dendl;
13618 std::unique_lock l{zoned_cleaner_lock};
13619 ceph_assert(!zoned_cleaner_started);
13620 zoned_cleaner_started = true;
13621 zoned_cleaner_cond.notify_all();
13622 auto a = dynamic_cast<ZonedAllocator*>(alloc);
13623 ceph_assert(a);
13624 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
13625 ceph_assert(f);
13626 while (true) {
13627 // thresholds to trigger cleaning
13628 // FIXME
13629 float min_score = .05; // score: bytes saved / bytes moved
13630 uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning
13631 auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
13632 if (zone_to_clean < 0) {
13633 if (zoned_cleaner_stop) {
13634 break;
13635 }
13636 auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
13637 dout(20) << __func__ << " sleep for " << period << dendl;
13638 zoned_cleaner_cond.wait_for(l, period);
13639 dout(20) << __func__ << " wake" << dendl;
13640 } else {
13641 l.unlock();
13642 a->set_cleaning_zone(zone_to_clean);
13643 _zoned_clean_zone(zone_to_clean, a, f);
13644 a->clear_cleaning_zone(zone_to_clean);
13645 l.lock();
13646 }
13647 }
13648 dout(10) << __func__ << " finish" << dendl;
13649 zoned_cleaner_started = false;
13650}
13651
13652void BlueStore::_zoned_clean_zone(
13653 uint64_t zone,
13654 ZonedAllocator *a,
13655 ZonedFreelistManager *f
13656 )
13657{
13658 dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
13659
13660 KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
13661 std::string zone_start;
13662 get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
13663 for (it->lower_bound(zone_start); it->valid(); it->next()) {
13664 uint32_t z;
13665 uint64_t offset;
13666 ghobject_t oid;
13667 string k = it->key();
13668 int r = get_key_zone_offset_object(k, &z, &offset, &oid);
13669 if (r < 0) {
13670 derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
13671 << dendl;
13672 continue;
13673 }
13674 if (zone != z) {
13675 dout(10) << __func__ << " reached end of zone refs" << dendl;
13676 break;
13677 }
13678 dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
13679 << std::dec << " " << oid << dendl;
13680 _clean_some(oid, zone);
13681 }
13682
13683 if (a->get_live_bytes(zone) > 0) {
13684 derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
13685 << " live bytes" << std::dec << dendl;
13686 // should we do something else here to avoid a live-lock in the event of a problem?
13687 return;
13688 }
13689
13690 // make sure transactions flush/drain/commit (and data is all rewritten
13691 // safely elsewhere) before we blow away the cleaned zone
13692 _osr_drain_all();
13693
13694 // reset the device zone
13695 dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
13696 bdev->reset_zone(zone);
13697
13698 // record that we can now write there
13699 f->mark_zone_to_clean_free(zone, db);
13700 bdev->flush();
13701
13702 // then allow ourselves to start allocating there
13703 dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
13704 << dendl;
13705 a->reset_zone(zone);
13706}
13707
13708void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
13709{
13710 dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
13711 << dendl;
13712
13713 CollectionRef cref = _get_collection_by_oid(oid);
13714 if (!cref) {
13715 dout(10) << __func__ << " can't find collection for " << oid << dendl;
13716 return;
13717 }
13718 Collection *c = cref.get();
13719
13720 // serialize io dispatch vs other transactions
13721 std::lock_guard l(atomic_alloc_and_submit_lock);
13722 std::unique_lock l2(c->lock);
13723
13724 auto o = c->get_onode(oid, false);
13725 if (!o) {
13726 dout(10) << __func__ << " can't find " << oid << dendl;
13727 return;
13728 }
13729
13730 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
13731 _dump_onode<30>(cct, *o);
13732
13733 // NOTE: This is a naive rewrite strategy. If any blobs are
13734 // shared, they will be duplicated for each object that references
13735 // them. That means any cloned/snapshotted objects will explode
13736 // their utilization. This won't matter for RGW workloads, but
13737 // for RBD and CephFS it is completely unacceptable, and it's
13738 // entirely reasonable to have "archival" data workloads on SMR
13739 // for CephFS and (possibly/probably) RBD.
13740 //
13741 // At some point we need to replace this with something more
13742 // sophisticated that ensures that a shared blob gets moved once
13743 // and all referencing objects get updated to point to the new
13744 // location.
13745
13746 map<uint32_t, uint32_t> to_move;
13747 for (auto& e : o->extent_map.extent_map) {
13748 bool touches_zone = false;
13749 for (auto& be : e.blob->get_blob().get_extents()) {
13750 if (be.is_valid()) {
13751 uint32_t z = be.offset / zone_size;
13752 if (z == zone) {
13753 touches_zone = true;
13754 break;
13755 }
13756 }
13757 }
13758 if (touches_zone) {
13759 to_move[e.logical_offset] = e.length;
13760 }
13761 }
13762 if (to_move.empty()) {
13763 dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
13764 << std::dec << " from " << oid << dendl;
13765 return;
13766 }
13767
13768 dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
13769 << std::dec << dendl;
13770 OpSequencer *osr = c->osr.get();
13771 TransContext *txc = _txc_create(c, osr, nullptr);
13772
13773 spg_t pgid;
13774 if (c->cid.is_pg(&pgid)) {
13775 txc->osd_pool_id = pgid.pool();
13776 }
13777
13778 for (auto& [offset, length] : to_move) {
13779 bufferlist bl;
13780 int r = _do_read(c, o, offset, length, bl, 0);
13781 ceph_assert(r == (int)length);
13782
13783 r = _do_write(txc, cref, o, offset, length, bl, 0);
13784 ceph_assert(r >= 0);
13785 }
13786 txc->write_onode(o);
13787
13788 _txc_write_nodes(txc, txc->t);
13789 _txc_finalize_kv(txc, txc->t);
13790 _txc_state_proc(txc);
13791}
13792#endif
13793
13794bluestore_deferred_op_t *BlueStore::_get_deferred_op(
13795 TransContext *txc, uint64_t len)
13796{
13797 if (!txc->deferred_txn) {
13798 txc->deferred_txn = new bluestore_deferred_transaction_t;
13799 }
13800 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
13801 logger->inc(l_bluestore_issued_deferred_writes);
13802 logger->inc(l_bluestore_issued_deferred_write_bytes, len);
13803 return &txc->deferred_txn->ops.back();
13804}
13805
13806void BlueStore::_deferred_queue(TransContext *txc)
13807{
13808 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
13809
13810 DeferredBatch *tmp;
13811 txc->osr->deferred_lock.lock();
13812 {
13813 if (!txc->osr->deferred_pending) {
13814 tmp = new DeferredBatch(cct, txc->osr.get());
13815 } else {
13816 tmp = txc->osr->deferred_pending;
13817 }
13818 }
13819
13820 tmp->txcs.push_back(*txc);
13821 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
13822 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
13823 const auto& op = *opi;
13824 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
13825 bufferlist::const_iterator p = op.data.begin();
13826 for (auto e : op.extents) {
13827 tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
13828 }
13829 }
13830
13831 {
13832 ++deferred_queue_size;
13833 txc->osr->deferred_pending = tmp;
13834    // the condition "tmp->txcs.size() == 1" means deferred_pending was originally
13835    // empty, so we should add this osr to deferred_queue.
13836 if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
13837 deferred_lock.lock();
13838 deferred_queue.push_back(*txc->osr);
13839 deferred_lock.unlock();
13840 }
13841
13842 if (deferred_aggressive &&
13843 !txc->osr->deferred_running) {
13844 _deferred_submit_unlock(txc->osr.get());
13845 } else {
13846 txc->osr->deferred_lock.unlock();
13847 }
13848 }
13849 }
13850
13851void BlueStore::deferred_try_submit()
13852{
13853 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
13854 << deferred_queue_size << " txcs" << dendl;
13855 vector<OpSequencerRef> osrs;
13856
13857 {
13858 std::lock_guard l(deferred_lock);
13859 osrs.reserve(deferred_queue.size());
13860 for (auto& osr : deferred_queue) {
13861 osrs.push_back(&osr);
13862 }
13863 }
13864
13865 for (auto& osr : osrs) {
13866 osr->deferred_lock.lock();
13867 if (osr->deferred_pending) {
13868 if (!osr->deferred_running) {
13869 _deferred_submit_unlock(osr.get());
13870 } else {
13871 osr->deferred_lock.unlock();
13872 dout(20) << __func__ << " osr " << osr << " already has running"
13873 << dendl;
13874 }
13875 } else {
13876 osr->deferred_lock.unlock();
13877 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
13878 }
13879 }
13880
13881 {
13882 std::lock_guard l(deferred_lock);
13883 deferred_last_submitted = ceph_clock_now();
13884 }
13885}
13886
13887void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
13888{
13889 dout(10) << __func__ << " osr " << osr
13890 << " " << osr->deferred_pending->iomap.size() << " ios pending "
13891 << dendl;
13892 ceph_assert(osr->deferred_pending);
13893 ceph_assert(!osr->deferred_running);
13894
13895 auto b = osr->deferred_pending;
13896 deferred_queue_size -= b->seq_bytes.size();
13897 ceph_assert(deferred_queue_size >= 0);
13898
13899 osr->deferred_running = osr->deferred_pending;
13900 osr->deferred_pending = nullptr;
13901
13902 osr->deferred_lock.unlock();
13903
13904 for (auto& txc : b->txcs) {
13905 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
13906 }
13907 uint64_t start = 0, pos = 0;
13908 bufferlist bl;
13909 auto i = b->iomap.begin();
13910 while (true) {
13911 if (i == b->iomap.end() || i->first != pos) {
13912 if (bl.length()) {
13913 dout(20) << __func__ << " write 0x" << std::hex
13914 << start << "~" << bl.length()
13915 << " crc " << bl.crc32c(-1) << std::dec << dendl;
13916 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13917 logger->inc(l_bluestore_submitted_deferred_writes);
13918 logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
13919 int r = bdev->aio_write(start, bl, &b->ioc, false);
13920 ceph_assert(r == 0);
13921 }
13922 }
13923 if (i == b->iomap.end()) {
13924 break;
13925 }
13926 start = 0;
13927 pos = i->first;
13928 bl.clear();
13929 }
13930 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
13931 << std::hex << pos << "~" << i->second.bl.length() << std::dec
13932 << dendl;
13933 if (!bl.length()) {
13934 start = pos;
13935 }
13936 pos += i->second.bl.length();
13937 bl.claim_append(i->second.bl);
13938 ++i;
13939 }
13940
13941 bdev->aio_submit(&b->ioc);
13942}
13943
13944struct C_DeferredTrySubmit : public Context {
13945 BlueStore *store;
13946 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
13947 void finish(int r) {
13948 store->deferred_try_submit();
13949 }
13950};
13951
13952void BlueStore::_deferred_aio_finish(OpSequencer *osr)
13953{
13954 dout(10) << __func__ << " osr " << osr << dendl;
13955 ceph_assert(osr->deferred_running);
13956 DeferredBatch *b = osr->deferred_running;
13957
13958 {
13959 osr->deferred_lock.lock();
13960 ceph_assert(osr->deferred_running == b);
13961 osr->deferred_running = nullptr;
13962 if (!osr->deferred_pending) {
13963 dout(20) << __func__ << " dequeueing" << dendl;
13964 {
13965 deferred_lock.lock();
13966 auto q = deferred_queue.iterator_to(*osr);
13967 deferred_queue.erase(q);
13968 deferred_lock.unlock();
13969 }
13970 osr->deferred_lock.unlock();
13971 } else {
13972 osr->deferred_lock.unlock();
13973 if (deferred_aggressive) {
13974 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
13975 finisher.queue(new C_DeferredTrySubmit(this));
13976 } else {
13977 dout(20) << __func__ << " leaving queued, more pending" << dendl;
13978 }
13979 }
13980 }
13981
13982 {
13983 uint64_t costs = 0;
13984 {
13985 for (auto& i : b->txcs) {
13986 TransContext *txc = &i;
13987 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
13988 txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
13989 costs += txc->cost;
13990 }
13991 }
13992 throttle.release_deferred_throttle(costs);
13993 }
13994
13995 {
13996 std::lock_guard l(kv_lock);
13997 deferred_done_queue.emplace_back(b);
13998
13999 // in the normal case, do not bother waking up the kv thread; it will
14000 // catch us on the next commit anyway.
14001 if (deferred_aggressive && !kv_sync_in_progress) {
14002 kv_sync_in_progress = true;
14003 kv_cond.notify_one();
14004 }
14005 }
14006}
14007
14008int BlueStore::_deferred_replay()
14009{
14010 dout(10) << __func__ << " start" << dendl;
14011 int count = 0;
14012 int r = 0;
14013 interval_set<uint64_t> bluefs_extents;
14014 if (bluefs) {
14015 bluefs->foreach_block_extents(
14016 bluefs_layout.shared_bdev,
14017 [&] (uint64_t start, uint32_t len) {
14018 bluefs_extents.insert(start, len);
14019 }
14020 );
14021 }
14022 CollectionRef ch = _get_collection(coll_t::meta());
14023 bool fake_ch = false;
14024 if (!ch) {
14025 // hmm, replaying initial mkfs?
14026 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
14027 fake_ch = true;
14028 }
14029 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
14030 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
14031 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
14032 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
14033 << dendl;
14034 bluestore_deferred_transaction_t *deferred_txn =
14035 new bluestore_deferred_transaction_t;
14036 bufferlist bl = it->value();
14037 auto p = bl.cbegin();
14038 try {
14039 decode(*deferred_txn, p);
14040 } catch (ceph::buffer::error& e) {
14041 derr << __func__ << " failed to decode deferred txn "
14042 << pretty_binary_string(it->key()) << dendl;
14043 delete deferred_txn;
14044 r = -EIO;
14045 goto out;
14046 }
14047 bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
14048 if (has_some) {
14049 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
14050 txc->deferred_txn = deferred_txn;
14051 txc->set_state(TransContext::STATE_KV_DONE);
14052 _txc_state_proc(txc);
14053 } else {
14054 delete deferred_txn;
14055 }
14056 }
14057 out:
14058 dout(20) << __func__ << " draining osr" << dendl;
14059 _osr_register_zombie(osr);
14060 _osr_drain_all();
14061 if (fake_ch) {
14062 new_coll_map.clear();
14063 }
14064 dout(10) << __func__ << " completed " << count << " events" << dendl;
14065 return r;
14066}
14067
14068bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
14069 interval_set<uint64_t>& bluefs_extents)
14070{
14071 bool has_some = false;
14072 dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
14073 auto it = deferred_txn->ops.begin();
14074 while (it != deferred_txn->ops.end()) {
14075 // We process a pair of _data_/_extents_ (here: it->data/it->extents)
14076 // by eliminating _extents_ that belong to bluefs, removing relevant parts of _data_
14077 // example:
14078 // +------------+---------------+---------------+---------------+
14079 // | data | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
14080 // | extent | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
14081 // | in bluefs? | no | yes | no |
14082 // +------------+---------------+---------------+---------------+
14083 // result:
14084 // +------------+---------------+---------------+
14085 // | data | aaaaaaaabbbbb | ddddeeeeeefff |
14086 // | extent | 40000 - 44000 | 58000 - 60000 |
14087 // +------------+---------------+---------------+
14088 PExtentVector new_extents;
14089 ceph::buffer::list new_data;
14090 uint32_t data_offset = 0; // this tracks location of extent 'e' inside it->data
14091 dout(30) << __func__ << " input extents: " << it->extents << dendl;
14092 for (auto& e: it->extents) {
14093 interval_set<uint64_t> region;
14094 region.insert(e.offset, e.length);
14095
14096 auto mi = bluefs_extents.lower_bound(e.offset);
14097 if (mi != bluefs_extents.begin()) {
14098 --mi;
14099 if (mi.get_end() <= e.offset) {
14100 ++mi;
14101 }
14102 }
14103 while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
14104        // The interval_set does not like it (asserts) when we erase an interval that does not exist.
14105        // Hence we implement (region - mi) as ((region + mi) - mi).
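        //
        // e.g., for the middle extent in the table above: region starts as
        // [50000,58000) and mi = [50000,58000) covers it entirely, so the
        // union_insert adds nothing new and the erase empties region; that
        // extent and its data are therefore dropped.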
14106 region.union_insert(mi.get_start(), mi.get_len());
14107 region.erase(mi.get_start(), mi.get_len());
14108 ++mi;
14109 }
14110      // 'region' is now a subset of e, without the parts used by bluefs;
14111      // we trim the corresponding parts from it->data (actually constructing new_data / new_extents)
14112 for (auto ki = region.begin(); ki != region.end(); ki++) {
14113 ceph::buffer::list chunk;
14114        // A chunk from it->data; data_offset is the offset where 'e' was located;
14115        // 'ki.get_start() - e.offset' is the offset of ki inside 'e'.
14116 chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
14117 new_data.claim_append(chunk);
14118 new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
14119 }
14120 data_offset += e.length;
14121 }
14122 dout(30) << __func__ << " output extents: " << new_extents << dendl;
14123 if (it->data.length() != new_data.length()) {
14124 dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
14125 }
14126 if (new_extents.size() == 0) {
14127 it = deferred_txn->ops.erase(it);
14128 } else {
14129 has_some = true;
14130 std::swap(it->extents, new_extents);
14131 std::swap(it->data, new_data);
14132 ++it;
14133 }
14134 }
14135 return has_some;
14136}
14137
14138// ---------------------------
14139// transactions
14140
14141int BlueStore::queue_transactions(
14142 CollectionHandle& ch,
14143 vector<Transaction>& tls,
14144 TrackedOpRef op,
14145 ThreadPool::TPHandle *handle)
14146{
14147 FUNCTRACE(cct);
14148 list<Context *> on_applied, on_commit, on_applied_sync;
14149 ObjectStore::Transaction::collect_contexts(
14150 tls, &on_applied, &on_commit, &on_applied_sync);
14151
14152 auto start = mono_clock::now();
14153
14154 Collection *c = static_cast<Collection*>(ch.get());
14155 OpSequencer *osr = c->osr.get();
14156 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
14157
14158 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
14159 // submission to happen atomically because if I/O submission happens in a
14160 // different order than I/O allocation, we end up issuing non-sequential
14161 // writes to the drive. This is a temporary solution until ZONE APPEND
14162 // support matures in the kernel. For more information please see:
14163 // https://www.usenix.org/conference/vault20/presentation/bjorling
14164 if (bdev->is_smr()) {
14165 atomic_alloc_and_submit_lock.lock();
14166 }
14167
14168 // prepare
14169 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
14170 &on_commit, op);
14171
14172 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
14173 txc->bytes += (*p).get_num_bytes();
14174 _txc_add_transaction(txc, &(*p));
14175 }
14176 _txc_calc_cost(txc);
14177
14178 _txc_write_nodes(txc, txc->t);
14179
14180 // journal deferred items
14181 if (txc->deferred_txn) {
14182 txc->deferred_txn->seq = ++deferred_seq;
14183 bufferlist bl;
14184 encode(*txc->deferred_txn, bl);
14185 string key;
14186 get_deferred_key(txc->deferred_txn->seq, &key);
14187 txc->t->set(PREFIX_DEFERRED, key, bl);
14188 }
14189
14190 _txc_finalize_kv(txc, txc->t);
14191
14192#ifdef WITH_BLKIN
14193 if (txc->trace) {
14194 txc->trace.event("txc encode finished");
14195 }
14196#endif
14197
14198 if (handle)
14199 handle->suspend_tp_timeout();
14200
14201 auto tstart = mono_clock::now();
14202
14203 if (!throttle.try_start_transaction(
14204 *db,
14205 *txc,
14206 tstart)) {
14207 // ensure we do not block here because of deferred writes
14208 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
14209 << dendl;
14210 ++deferred_aggressive;
14211 deferred_try_submit();
14212 {
14213 // wake up any previously finished deferred events
14214 std::lock_guard l(kv_lock);
14215 if (!kv_sync_in_progress) {
14216 kv_sync_in_progress = true;
14217 kv_cond.notify_one();
14218 }
14219 }
14220 throttle.finish_start_transaction(*db, *txc, tstart);
14221 --deferred_aggressive;
14222 }
14223 auto tend = mono_clock::now();
14224
14225 if (handle)
14226 handle->reset_tp_timeout();
14227
14228 logger->inc(l_bluestore_txc);
14229
14230 // execute (start)
14231 _txc_state_proc(txc);
14232
14233 if (bdev->is_smr()) {
14234 atomic_alloc_and_submit_lock.unlock();
14235 }
14236
14237 // we're immediately readable (unlike FileStore)
14238 for (auto c : on_applied_sync) {
14239 c->complete(0);
14240 }
14241 if (!on_applied.empty()) {
14242 if (c->commit_queue) {
14243 c->commit_queue->queue(on_applied);
14244 } else {
14245 finisher.queue(on_applied);
14246 }
14247 }
14248
14249#ifdef WITH_BLKIN
14250 if (txc->trace) {
14251 txc->trace.event("txc applied");
14252 }
14253#endif
14254
14255 log_latency("submit_transact",
14256 l_bluestore_submit_lat,
14257 mono_clock::now() - start,
14258 cct->_conf->bluestore_log_op_age);
14259 log_latency("throttle_transact",
14260 l_bluestore_throttle_lat,
14261 tend - tstart,
14262 cct->_conf->bluestore_log_op_age);
14263 return 0;
14264}
14265
14266void BlueStore::_txc_aio_submit(TransContext *txc)
14267{
14268 dout(10) << __func__ << " txc " << txc << dendl;
14269 bdev->aio_submit(&txc->ioc);
14270}
14271
14272void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
14273{
14274 Transaction::iterator i = t->begin();
14275
14276 _dump_transaction<30>(cct, t);
14277
14278 vector<CollectionRef> cvec(i.colls.size());
14279 unsigned j = 0;
14280 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
14281 ++p, ++j) {
14282 cvec[j] = _get_collection(*p);
14283 }
14284
14285 vector<OnodeRef> ovec(i.objects.size());
14286
14287 for (int pos = 0; i.have_op(); ++pos) {
14288 Transaction::Op *op = i.decode_op();
14289 int r = 0;
14290
14291 // no coll or obj
14292 if (op->op == Transaction::OP_NOP)
14293 continue;
14294
14295
14296 // collection operations
14297 CollectionRef &c = cvec[op->cid];
14298
14299 // initialize osd_pool_id and do a smoke test that all collections belong
14300 // to the same pool
14301 spg_t pgid;
14302 if (!!c ? c->cid.is_pg(&pgid) : false) {
14303 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
14304 txc->osd_pool_id == pgid.pool());
14305 txc->osd_pool_id = pgid.pool();
14306 }
14307
14308 switch (op->op) {
14309 case Transaction::OP_RMCOLL:
14310 {
14311 const coll_t &cid = i.get_cid(op->cid);
14312 r = _remove_collection(txc, cid, &c);
14313 if (!r)
14314 continue;
14315 }
14316 break;
14317
14318 case Transaction::OP_MKCOLL:
14319 {
14320 ceph_assert(!c);
14321 const coll_t &cid = i.get_cid(op->cid);
14322 r = _create_collection(txc, cid, op->split_bits, &c);
14323 if (!r)
14324 continue;
14325 }
14326 break;
14327
14328 case Transaction::OP_SPLIT_COLLECTION:
14329 ceph_abort_msg("deprecated");
14330 break;
14331
14332 case Transaction::OP_SPLIT_COLLECTION2:
14333 {
14334 uint32_t bits = op->split_bits;
14335 uint32_t rem = op->split_rem;
14336 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
14337 if (!r)
14338 continue;
14339 }
14340 break;
14341
14342 case Transaction::OP_MERGE_COLLECTION:
14343 {
14344 uint32_t bits = op->split_bits;
14345 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
14346 if (!r)
14347 continue;
14348 }
14349 break;
14350
14351 case Transaction::OP_COLL_HINT:
14352 {
14353 uint32_t type = op->hint;
14354 bufferlist hint;
14355 i.decode_bl(hint);
14356 auto hiter = hint.cbegin();
14357 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
14358 uint32_t pg_num;
14359 uint64_t num_objs;
14360 decode(pg_num, hiter);
14361 decode(num_objs, hiter);
14362 dout(10) << __func__ << " collection hint objects is a no-op, "
14363 << " pg_num " << pg_num << " num_objects " << num_objs
14364 << dendl;
14365 } else {
14366 // Ignore the hint
14367 dout(10) << __func__ << " unknown collection hint " << type << dendl;
14368 }
14369 continue;
14370 }
14371 break;
14372
14373 case Transaction::OP_COLL_SETATTR:
14374 r = -EOPNOTSUPP;
14375 break;
14376
14377 case Transaction::OP_COLL_RMATTR:
14378 r = -EOPNOTSUPP;
14379 break;
14380
14381 case Transaction::OP_COLL_RENAME:
14382 ceph_abort_msg("not implemented");
14383 break;
14384 }
14385 if (r < 0) {
14386 derr << __func__ << " error " << cpp_strerror(r)
14387 << " not handled on operation " << op->op
14388 << " (op " << pos << ", counting from 0)" << dendl;
14389 _dump_transaction<0>(cct, t);
14390 ceph_abort_msg("unexpected error");
14391 }
14392
14393    // these operations implicitly create the object
14394 bool create = false;
14395 if (op->op == Transaction::OP_TOUCH ||
14396 op->op == Transaction::OP_CREATE ||
14397 op->op == Transaction::OP_WRITE ||
14398 op->op == Transaction::OP_ZERO) {
14399 create = true;
14400 }
14401
14402 // object operations
14403 std::unique_lock l(c->lock);
14404 OnodeRef &o = ovec[op->oid];
14405 if (!o) {
14406 ghobject_t oid = i.get_oid(op->oid);
14407 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
14408 }
14409 if (!create && (!o || !o->exists)) {
14410 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
14411 << i.get_oid(op->oid) << dendl;
14412 r = -ENOENT;
14413 goto endop;
14414 }
14415
14416 switch (op->op) {
14417 case Transaction::OP_CREATE:
14418 case Transaction::OP_TOUCH:
14419 r = _touch(txc, c, o);
14420 break;
14421
14422 case Transaction::OP_WRITE:
14423 {
14424 uint64_t off = op->off;
14425 uint64_t len = op->len;
14426 uint32_t fadvise_flags = i.get_fadvise_flags();
14427 bufferlist bl;
14428 i.decode_bl(bl);
14429 r = _write(txc, c, o, off, len, bl, fadvise_flags);
14430 }
14431 break;
14432
14433 case Transaction::OP_ZERO:
14434 {
14435 uint64_t off = op->off;
14436 uint64_t len = op->len;
14437 r = _zero(txc, c, o, off, len);
14438 }
14439 break;
14440
14441 case Transaction::OP_TRIMCACHE:
14442 {
14443 // deprecated, no-op
14444 }
14445 break;
14446
14447 case Transaction::OP_TRUNCATE:
14448 {
14449 uint64_t off = op->off;
14450 r = _truncate(txc, c, o, off);
14451 }
14452 break;
14453
14454 case Transaction::OP_REMOVE:
14455 {
14456 r = _remove(txc, c, o);
14457 }
14458 break;
14459
14460 case Transaction::OP_SETATTR:
14461 {
14462 string name = i.decode_string();
14463 bufferptr bp;
14464 i.decode_bp(bp);
14465 r = _setattr(txc, c, o, name, bp);
14466 }
14467 break;
14468
14469 case Transaction::OP_SETATTRS:
14470 {
14471 map<string, bufferptr> aset;
14472 i.decode_attrset(aset);
14473 r = _setattrs(txc, c, o, aset);
14474 }
14475 break;
14476
14477 case Transaction::OP_RMATTR:
14478 {
14479 string name = i.decode_string();
14480 r = _rmattr(txc, c, o, name);
14481 }
14482 break;
14483
14484 case Transaction::OP_RMATTRS:
14485 {
14486 r = _rmattrs(txc, c, o);
14487 }
14488 break;
14489
14490 case Transaction::OP_CLONE:
14491 {
14492 OnodeRef& no = ovec[op->dest_oid];
14493 if (!no) {
14494 const ghobject_t& noid = i.get_oid(op->dest_oid);
14495 no = c->get_onode(noid, true);
14496 }
14497 r = _clone(txc, c, o, no);
14498 }
14499 break;
14500
14501 case Transaction::OP_CLONERANGE:
14502 ceph_abort_msg("deprecated");
14503 break;
14504
14505 case Transaction::OP_CLONERANGE2:
14506 {
14507 OnodeRef& no = ovec[op->dest_oid];
14508 if (!no) {
14509 const ghobject_t& noid = i.get_oid(op->dest_oid);
14510 no = c->get_onode(noid, true);
14511 }
14512 uint64_t srcoff = op->off;
14513 uint64_t len = op->len;
14514 uint64_t dstoff = op->dest_off;
14515 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
14516 }
14517 break;
14518
14519 case Transaction::OP_COLL_ADD:
14520 ceph_abort_msg("not implemented");
14521 break;
14522
14523 case Transaction::OP_COLL_REMOVE:
14524 ceph_abort_msg("not implemented");
14525 break;
14526
14527 case Transaction::OP_COLL_MOVE:
14528 ceph_abort_msg("deprecated");
14529 break;
14530
14531 case Transaction::OP_COLL_MOVE_RENAME:
14532 case Transaction::OP_TRY_RENAME:
14533 {
14534 ceph_assert(op->cid == op->dest_cid);
14535 const ghobject_t& noid = i.get_oid(op->dest_oid);
14536 OnodeRef& no = ovec[op->dest_oid];
14537 if (!no) {
14538 no = c->get_onode(noid, false);
14539 }
14540 r = _rename(txc, c, o, no, noid);
14541 }
14542 break;
14543
14544 case Transaction::OP_OMAP_CLEAR:
14545 {
14546 r = _omap_clear(txc, c, o);
14547 }
14548 break;
14549 case Transaction::OP_OMAP_SETKEYS:
14550 {
14551 bufferlist aset_bl;
14552 i.decode_attrset_bl(&aset_bl);
14553 r = _omap_setkeys(txc, c, o, aset_bl);
14554 }
14555 break;
14556 case Transaction::OP_OMAP_RMKEYS:
14557 {
14558 bufferlist keys_bl;
14559 i.decode_keyset_bl(&keys_bl);
14560 r = _omap_rmkeys(txc, c, o, keys_bl);
14561 }
14562 break;
14563 case Transaction::OP_OMAP_RMKEYRANGE:
14564 {
14565 string first, last;
14566 first = i.decode_string();
14567 last = i.decode_string();
14568 r = _omap_rmkey_range(txc, c, o, first, last);
14569 }
14570 break;
14571 case Transaction::OP_OMAP_SETHEADER:
14572 {
14573 bufferlist bl;
14574 i.decode_bl(bl);
14575 r = _omap_setheader(txc, c, o, bl);
14576 }
14577 break;
14578
14579 case Transaction::OP_SETALLOCHINT:
14580 {
14581 r = _set_alloc_hint(txc, c, o,
14582 op->expected_object_size,
14583 op->expected_write_size,
14584 op->hint);
14585 }
14586 break;
14587
14588 default:
14589 derr << __func__ << " bad op " << op->op << dendl;
14590 ceph_abort();
14591 }
14592
14593 endop:
14594 if (r < 0) {
14595 bool ok = false;
14596
14597 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
14598 op->op == Transaction::OP_CLONE ||
14599 op->op == Transaction::OP_CLONERANGE2 ||
14600 op->op == Transaction::OP_COLL_ADD ||
14601 op->op == Transaction::OP_SETATTR ||
14602 op->op == Transaction::OP_SETATTRS ||
14603 op->op == Transaction::OP_RMATTR ||
14604 op->op == Transaction::OP_OMAP_SETKEYS ||
14605 op->op == Transaction::OP_OMAP_RMKEYS ||
14606 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
14607 op->op == Transaction::OP_OMAP_SETHEADER))
14608 // -ENOENT is usually okay
14609 ok = true;
14610 if (r == -ENODATA)
14611 ok = true;
14612
14613 if (!ok) {
14614 const char *msg = "unexpected error code";
14615
14616 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
14617 op->op == Transaction::OP_CLONE ||
14618 op->op == Transaction::OP_CLONERANGE2))
14619 msg = "ENOENT on clone suggests osd bug";
14620
14621 if (r == -ENOSPC)
14622 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
14623 // by partially applying transactions.
14624 msg = "ENOSPC from bluestore, misconfigured cluster";
14625
14626 if (r == -ENOTEMPTY) {
14627 msg = "ENOTEMPTY suggests garbage data in osd data dir";
14628 }
14629
14630 derr << __func__ << " error " << cpp_strerror(r)
14631 << " not handled on operation " << op->op
14632 << " (op " << pos << ", counting from 0)"
14633 << dendl;
14634 derr << msg << dendl;
14635 _dump_transaction<0>(cct, t);
14636 ceph_abort_msg("unexpected error");
14637 }
14638 }
14639 }
14640}
14641
14642
14643
14644// -----------------
14645// write operations
14646
14647int BlueStore::_touch(TransContext *txc,
14648 CollectionRef& c,
14649 OnodeRef& o)
14650{
14651 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14652 int r = 0;
14653 _assign_nid(txc, o);
14654 txc->write_onode(o);
14655 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14656 return r;
14657}
14658
14659void BlueStore::_pad_zeros(
14660 bufferlist *bl, uint64_t *offset,
14661 uint64_t chunk_size)
14662{
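  // Illustrative sketch (hypothetical small numbers, not a real chunk size):
  // with *offset = 0x2, a 4-byte bl and chunk_size = 0x8, front_pad = 2 and
  // back_pad = 2, so bl becomes a single 8-byte chunk laid out as
  // [2 zero bytes][4 data bytes][2 zero bytes] and *offset is rewound to 0x0.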
14663 auto length = bl->length();
14664 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
14665 << " chunk_size 0x" << chunk_size << std::dec << dendl;
14666 dout(40) << "before:\n";
14667 bl->hexdump(*_dout);
14668 *_dout << dendl;
14669 // front
14670 size_t front_pad = *offset % chunk_size;
14671 size_t back_pad = 0;
14672 size_t pad_count = 0;
14673 if (front_pad) {
14674 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
14675 bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
14676 z.zero(0, front_pad, false);
14677 pad_count += front_pad;
14678 bl->begin().copy(front_copy, z.c_str() + front_pad);
14679 if (front_copy + front_pad < chunk_size) {
14680 back_pad = chunk_size - (length + front_pad);
14681 z.zero(front_pad + length, back_pad, false);
14682 pad_count += back_pad;
14683 }
14684 bufferlist old, t;
14685 old.swap(*bl);
14686 t.substr_of(old, front_copy, length - front_copy);
14687 bl->append(z);
14688 bl->claim_append(t);
14689 *offset -= front_pad;
14690 length += pad_count;
14691 }
14692
14693 // back
14694 uint64_t end = *offset + length;
14695 unsigned back_copy = end % chunk_size;
14696 if (back_copy) {
14697 ceph_assert(back_pad == 0);
14698 back_pad = chunk_size - back_copy;
14699 ceph_assert(back_copy <= length);
14700 bufferptr tail(chunk_size);
14701 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
14702 tail.zero(back_copy, back_pad, false);
14703 bufferlist old;
14704 old.swap(*bl);
14705 bl->substr_of(old, 0, length - back_copy);
14706 bl->append(tail);
14707 length += back_pad;
14708 pad_count += back_pad;
14709 }
14710 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
14711 << back_pad << " on front/back, now 0x" << *offset << "~"
14712 << length << std::dec << dendl;
14713 dout(40) << "after:\n";
14714 bl->hexdump(*_dout);
14715 *_dout << dendl;
14716 if (pad_count)
14717 logger->inc(l_bluestore_write_pad_bytes, pad_count);
14718 ceph_assert(bl->length() == length);
14719}
14720
14721void BlueStore::_do_write_small(
14722 TransContext *txc,
14723 CollectionRef &c,
14724 OnodeRef& o,
14725 uint64_t offset, uint64_t length,
14726 bufferlist::iterator& blp,
14727 WriteContext *wctx)
14728{
14729 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
14730 << std::dec << dendl;
14731 ceph_assert(length < min_alloc_size);
14732
14733 uint64_t end_offs = offset + length;
14734
14735 logger->inc(l_bluestore_write_small);
14736 logger->inc(l_bluestore_write_small_bytes, length);
14737
14738 bufferlist bl;
14739 blp.copy(length, bl);
14740
14741 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
14742 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
14743 uint32_t alloc_len = min_alloc_size;
14744 auto offset0 = p2align<uint64_t>(offset, alloc_len);
14745
14746 bool any_change;
14747
14748 // search for a suitable extent in both the forward and reverse directions in
14749 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
14750 // then check whether the blob can be reused via can_reuse_blob() or apply a
14751 // direct/deferred write (the latter only for extents covering 'offset' or
14752 // beyond).
14753 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
14754
14755#ifdef HAVE_LIBZBD
14756 // On zoned devices, the first goal is to support non-overwrite workloads,
14757 // such as RGW, with large, aligned objects. Therefore, for user writes
14758 // _do_write_small should not trigger. OSDs, however, write and update a tiny
14759 // amount of metadata, such as OSD maps, to disk. For those cases, we
14760 // temporarily just pad them to min_alloc_size and write them to a new place
14761 // on every update.
14762 if (bdev->is_smr()) {
14763 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
14764 uint64_t b_off0 = b_off;
14765 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
14766
14767 // Zero detection -- small block
14768 if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
14769 BlobRef b = c->new_blob();
14770 _pad_zeros(&bl, &b_off0, min_alloc_size);
14771 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
14772 } else { // if (bl.is_zero())
14773 dout(20) << __func__ << " skip small zero block " << std::hex
14774 << " (0x" << b_off0 << "~" << bl.length() << ")"
14775 << " (0x" << b_off << "~" << length << ")"
14776 << std::dec << dendl;
14777 logger->inc(l_bluestore_write_small_skipped);
14778 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14779 }
14780
14781 return;
14782 }
14783#endif
14784
14785 // Look for an existing mutable blob we can use.
14786 auto begin = o->extent_map.extent_map.begin();
14787 auto end = o->extent_map.extent_map.end();
14788 auto ep = o->extent_map.seek_lextent(offset);
14789 if (ep != begin) {
14790 --ep;
14791 if (ep->blob_end() <= offset) {
14792 ++ep;
14793 }
14794 }
14795 auto prev_ep = end;
14796 if (ep != begin) {
14797 prev_ep = ep;
14798 --prev_ep;
14799 }
14800
14801 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
14802 // We don't want to inspect more blobs than the number of min alloc units
14803 // that fit into 2 max-size blobs
14804 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
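// E.g. (illustrative values only) with max_blob_size 0x10000 and
// min_alloc_size 0x1000 this allows up to 16 * 2 + 1 = 33 distinct blobs
// in the inspected range before GC is requested below.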
14805 bool above_blob_threshold = false;
14806
14807 inspected_blobs.reserve(blob_threshold);
14808
14809 uint64_t max_off = 0;
14810 auto start_ep = ep;
14811 auto end_ep = ep; // exclusively
14812 do {
14813 any_change = false;
14814
14815 if (ep != end && ep->logical_offset < offset + max_bsize) {
14816 BlobRef b = ep->blob;
14817 if (!above_blob_threshold) {
14818 inspected_blobs.insert(&b->get_blob());
14819 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
14820 }
14821 max_off = ep->logical_end();
14822 auto bstart = ep->blob_start();
14823
14824 dout(20) << __func__ << " considering " << *b
14825 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
14826 if (bstart >= end_offs) {
14827 dout(20) << __func__ << " ignoring distant " << *b << dendl;
14828 } else if (!b->get_blob().is_mutable()) {
14829 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
14830 } else if (ep->logical_offset % min_alloc_size !=
14831 ep->blob_offset % min_alloc_size) {
14832 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
14833 } else {
14834 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
14835 // can we pad our head/tail out with zeros?
14836 uint64_t head_pad, tail_pad;
14837 head_pad = p2phase(offset, chunk_size);
14838 tail_pad = p2nphase(end_offs, chunk_size);
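// p2phase gives the distance back to the previous chunk boundary and
// p2nphase the distance forward to the next one. Illustrative values only:
// with chunk_size 0x1000, offset 0x2300 and end_offs 0x2a00, head_pad is
// 0x300 and tail_pad 0x600, i.e. the padded write covers 0x2000~0x1000.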
14839 if (head_pad || tail_pad) {
14840 o->extent_map.fault_range(db, offset - head_pad,
14841 end_offs - offset + head_pad + tail_pad);
14842 }
14843 if (head_pad &&
14844 o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
14845 head_pad = 0;
14846 }
14847 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
14848 tail_pad = 0;
14849 }
14850
14851 uint64_t b_off = offset - head_pad - bstart;
14852 uint64_t b_len = length + head_pad + tail_pad;
14853
14854 // direct write into unused blocks of an existing mutable blob?
14855 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
14856 b->get_blob().get_ondisk_length() >= b_off + b_len &&
14857 b->get_blob().is_unused(b_off, b_len) &&
14858 b->get_blob().is_allocated(b_off, b_len)) {
14859 _apply_padding(head_pad, tail_pad, bl);
14860
14861 dout(20) << __func__ << " write to unused 0x" << std::hex
14862 << b_off << "~" << b_len
14863 << " pad 0x" << head_pad << " + 0x" << tail_pad
14864 << std::dec << " of mutable " << *b << dendl;
14865 _buffer_cache_write(txc, b, b_off, bl,
14866 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14867
14868 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14869 if (b_len < prefer_deferred_size) {
14870 dout(20) << __func__ << " deferring small 0x" << std::hex
14871 << b_len << std::dec << " unused write via deferred" << dendl;
14872 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
14873 op->op = bluestore_deferred_op_t::OP_WRITE;
14874 b->get_blob().map(
14875 b_off, b_len,
14876 [&](uint64_t offset, uint64_t length) {
14877 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14878 return 0;
14879 });
14880 op->data = bl;
14881 } else {
14882 b->get_blob().map_bl(
14883 b_off, bl,
14884 [&](uint64_t offset, bufferlist& t) {
14885 bdev->aio_write(offset, t,
14886 &txc->ioc, wctx->buffered);
14887 });
14888 }
14889 }
14890 b->dirty_blob().calc_csum(b_off, bl);
14891 dout(20) << __func__ << " lex old " << *ep << dendl;
14892 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
14893 b,
14894 &wctx->old_extents);
14895 b->dirty_blob().mark_used(le->blob_offset, le->length);
14896
14897 txc->statfs_delta.stored() += le->length;
14898 dout(20) << __func__ << " lex " << *le << dendl;
14899 logger->inc(l_bluestore_write_small_unused);
14900 return;
14901 }
14902 // read some data to fill out the chunk?
14903 uint64_t head_read = p2phase(b_off, chunk_size);
14904 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
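// head_read/tail_read extend the write to chunk boundaries using existing
// data instead of zeros. Illustrative values only: with chunk_size 0x1000,
// b_off 0x1300 and b_len 0xa00, head_read and tail_read are both 0x300,
// giving a chunk-aligned b_off 0x1000 and b_len 0x1000.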
14905 if ((head_read || tail_read) &&
14906 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
14907 head_read + tail_read < min_alloc_size) {
14908 b_off -= head_read;
14909 b_len += head_read + tail_read;
14910
14911 } else {
14912 head_read = tail_read = 0;
14913 }
14914
14915 // chunk-aligned deferred overwrite?
14916 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
14917 b_off % chunk_size == 0 &&
14918 b_len % chunk_size == 0 &&
14919 b->get_blob().is_allocated(b_off, b_len)) {
14920
14921 _apply_padding(head_pad, tail_pad, bl);
14922
14923 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
14924 << " and tail 0x" << tail_read << std::dec << dendl;
14925 if (head_read) {
14926 bufferlist head_bl;
14927 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
14928 head_bl, 0);
14929 ceph_assert(r >= 0 && r <= (int)head_read);
14930 size_t zlen = head_read - r;
14931 if (zlen) {
14932 head_bl.append_zero(zlen);
14933 logger->inc(l_bluestore_write_pad_bytes, zlen);
14934 }
14935 head_bl.claim_append(bl);
14936 bl.swap(head_bl);
14937 logger->inc(l_bluestore_write_penalty_read_ops);
14938 }
14939 if (tail_read) {
14940 bufferlist tail_bl;
14941 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
14942 tail_bl, 0);
14943 ceph_assert(r >= 0 && r <= (int)tail_read);
14944 size_t zlen = tail_read - r;
14945 if (zlen) {
14946 tail_bl.append_zero(zlen);
14947 logger->inc(l_bluestore_write_pad_bytes, zlen);
14948 }
14949 bl.claim_append(tail_bl);
14950 logger->inc(l_bluestore_write_penalty_read_ops);
14951 }
14952 logger->inc(l_bluestore_write_small_pre_read);
14953
14954 _buffer_cache_write(txc, b, b_off, bl,
14955 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14956
14957 b->dirty_blob().calc_csum(b_off, bl);
14958
14959 if (!g_conf()->bluestore_debug_omit_block_device_write) {
14960 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
14961 op->op = bluestore_deferred_op_t::OP_WRITE;
14962 int r = b->get_blob().map(
14963 b_off, b_len,
14964 [&](uint64_t offset, uint64_t length) {
14965 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14966 return 0;
14967 });
14968 ceph_assert(r == 0);
14969 op->data = std::move(bl);
14970 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
14971 << b_len << std::dec << " of mutable " << *b
14972 << " at " << op->extents << dendl;
14973 }
14974
14975 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
14976 b, &wctx->old_extents);
14977 b->dirty_blob().mark_used(le->blob_offset, le->length);
14978 txc->statfs_delta.stored() += le->length;
14979 dout(20) << __func__ << " lex " << *le << dendl;
14980 return;
14981 }
14982 // try to reuse blob if we can
14983 if (b->can_reuse_blob(min_alloc_size,
14984 max_bsize,
14985 offset0 - bstart,
14986 &alloc_len)) {
14987 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
14988 // fit into the reused blob
14989 // Need to check for pending writes that want to
14990 // reuse the same pextent. The rationale is that during GC two chunks
14991 // from garbage (compressed?) blobs can share logical space within the same
14992 // AU. That in turn might be caused by an unaligned len in clone_range2.
14993 // Hence the second write would fail when attempting to reuse the blob in
14994 // _do_alloc_write().
14995 if (!wctx->has_conflict(b,
14996 offset0,
14997 offset0 + alloc_len,
14998 min_alloc_size)) {
14999
15000 // we can't reuse pad_head/pad_tail since they might be truncated
15001 // due to existing extents
15002 uint64_t b_off = offset - bstart;
15003 uint64_t b_off0 = b_off;
15004 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
15005
15006 // Zero detection -- small block
15007 if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
15008 _pad_zeros(&bl, &b_off0, chunk_size);
15009
15010 dout(20) << __func__ << " reuse blob " << *b << std::hex
15011 << " (0x" << b_off0 << "~" << bl.length() << ")"
15012 << " (0x" << b_off << "~" << length << ")"
15013 << std::dec << dendl;
15014
15015 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
15016 false, false);
15017 logger->inc(l_bluestore_write_small_unused);
15018 } else { // if (bl.is_zero())
15019 dout(20) << __func__ << " skip small zero block " << std::hex
15020 << " (0x" << b_off0 << "~" << bl.length() << ")"
15021 << " (0x" << b_off << "~" << length << ")"
15022 << std::dec << dendl;
15023 logger->inc(l_bluestore_write_small_skipped);
15024 logger->inc(l_bluestore_write_small_skipped_bytes, length);
15025 }
15026
15027 return;
15028 }
15029 }
15030 }
15031 ++ep;
15032 end_ep = ep;
15033 any_change = true;
15034 } // if (ep != end && ep->logical_offset < offset + max_bsize)
15035
15036 // check extent for reuse in reverse order
15037 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
15038 BlobRef b = prev_ep->blob;
15039 if (!above_blob_threshold) {
15040 inspected_blobs.insert(&b->get_blob());
15041 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
15042 }
15043 start_ep = prev_ep;
15044 auto bstart = prev_ep->blob_start();
15045 dout(20) << __func__ << " considering " << *b
15046 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
15047 if (b->can_reuse_blob(min_alloc_size,
15048 max_bsize,
15049 offset0 - bstart,
15050 &alloc_len)) {
15051 ceph_assert(alloc_len == min_alloc_size); // expecting data to always
15052 // fit into the reused blob
15053 // Need to check for pending writes that want to
15054 // reuse the same pextent. The rationale is that during GC two chunks
15055 // from garbage (compressed?) blobs can share logical space within the same
15056 // AU. That in turn might be caused by an unaligned len in clone_range2.
15057 // Hence the second write would fail when attempting to reuse the blob in
15058 // _do_alloc_write().
15059 if (!wctx->has_conflict(b,
15060 offset0,
15061 offset0 + alloc_len,
15062 min_alloc_size)) {
15063
15064 uint64_t b_off = offset - bstart;
15065 uint64_t b_off0 = b_off;
15066 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
15067
15068 // Zero detection -- small block
15069 if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
15070 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
15071 _pad_zeros(&bl, &b_off0, chunk_size);
15072
15073 dout(20) << __func__ << " reuse blob " << *b << std::hex
15074 << " (0x" << b_off0 << "~" << bl.length() << ")"
15075 << " (0x" << b_off << "~" << length << ")"
15076 << std::dec << dendl;
15077
15078 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
15079 false, false);
15080 logger->inc(l_bluestore_write_small_unused);
15081 } else { // if (bl.is_zero())
15082 dout(20) << __func__ << " skip small zero block " << std::hex
15083 << " (0x" << b_off0 << "~" << bl.length() << ")"
15084 << " (0x" << b_off << "~" << length << ")"
15085 << std::dec << dendl;
15086 logger->inc(l_bluestore_write_small_skipped);
15087 logger->inc(l_bluestore_write_small_skipped_bytes, length);
15088 }
15089
15090 return;
15091 }
15092 }
15093 if (prev_ep != begin) {
15094 --prev_ep;
15095 any_change = true;
15096 } else {
15097 prev_ep = end; // to avoid useless first extent re-check
15098 }
15099 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
15100 } while (any_change);
15101
15102 if (above_blob_threshold) {
15103 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
15104 << " " << std::hex << min_off << "~" << max_off << std::dec
15105 << dendl;
15106 ceph_assert(start_ep != end_ep);
15107 for (auto ep = start_ep; ep != end_ep; ++ep) {
15108 dout(20) << __func__ << " inserting for GC "
15109 << std::hex << ep->logical_offset << "~" << ep->length
15110 << std::dec << dendl;
15111
15112 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
15113 }
15114 // insert newly written extent to GC
15115 wctx->extents_to_gc.union_insert(offset, length);
15116 dout(20) << __func__ << " inserting (last) for GC "
15117 << std::hex << offset << "~" << length
15118 << std::dec << dendl;
15119 }
15120 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
15121 uint64_t b_off0 = b_off;
15122 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
15123
15124 // Zero detection -- small block
15125 if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
15126 // new blob.
15127 BlobRef b = c->new_blob();
15128 _pad_zeros(&bl, &b_off0, block_size);
15129 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
15130 min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
15131 // doesn't match disk one only
15132 true);
15133 } else { // if (bl.is_zero())
15134 dout(20) << __func__ << " skip small zero block " << std::hex
15135 << " (0x" << b_off0 << "~" << bl.length() << ")"
15136 << " (0x" << b_off << "~" << length << ")"
15137 << std::dec << dendl;
15138 logger->inc(l_bluestore_write_small_skipped);
15139 logger->inc(l_bluestore_write_small_skipped_bytes, length);
15140 }
15141
15142 return;
15143}
15144
15145bool BlueStore::BigDeferredWriteContext::can_defer(
15146 BlueStore::extent_map_t::iterator ep,
15147 uint64_t prefer_deferred_size,
15148 uint64_t block_size,
15149 uint64_t offset,
15150 uint64_t l)
15151{
15152 bool res = false;
15153 auto& blob = ep->blob->get_blob();
15154 if (offset >= ep->blob_start() &&
15155 blob.is_mutable()) {
15156 off = offset;
15157 b_off = offset - ep->blob_start();
15158 uint64_t chunk_size = blob.get_chunk_size(block_size);
15159 uint64_t ondisk = blob.get_ondisk_length();
15160 used = std::min(l, ondisk - b_off);
15161
15162 // will read some data to fill out the chunk?
15163 head_read = p2phase<uint64_t>(b_off, chunk_size);
15164 tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
15165 b_off -= head_read;
15166
15167 ceph_assert(b_off % chunk_size == 0);
15168 ceph_assert(blob_aligned_len() % chunk_size == 0);
15169
15170 res = blob_aligned_len() < prefer_deferred_size &&
15171 blob_aligned_len() <= ondisk &&
15172 blob.is_allocated(b_off, blob_aligned_len());
15173 if (res) {
15174 blob_ref = ep->blob;
15175 blob_start = ep->blob_start();
15176 }
15177 }
15178 return res;
15179}
15180
15181bool BlueStore::BigDeferredWriteContext::apply_defer()
15182{
15183 int r = blob_ref->get_blob().map(
15184 b_off, blob_aligned_len(),
15185 [&](const bluestore_pextent_t& pext,
15186 uint64_t offset,
15187 uint64_t length) {
15188 // apply a deferred write only if the overwrite breaks blob continuity;
15189 // if it totally overlaps some pextent, fall back to a regular write
15190 if (pext.offset < offset ||
15191 pext.end() > offset + length) {
15192 res_extents.emplace_back(bluestore_pextent_t(offset, length));
15193 return 0;
15194 }
15195 return -1;
15196 });
15197 return r >= 0;
15198}
15199
15200void BlueStore::_do_write_big_apply_deferred(
15201 TransContext* txc,
15202 CollectionRef& c,
15203 OnodeRef& o,
15204 BlueStore::BigDeferredWriteContext& dctx,
15205 bufferlist::iterator& blp,
15206 WriteContext* wctx)
15207{
15208 bufferlist bl;
15209 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
15210 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
15211 if (dctx.head_read) {
15212 int r = _do_read(c.get(), o,
15213 dctx.off - dctx.head_read,
15214 dctx.head_read,
15215 bl,
15216 0);
15217 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
15218 size_t zlen = dctx.head_read - r;
15219 if (zlen) {
15220 bl.append_zero(zlen);
15221 logger->inc(l_bluestore_write_pad_bytes, zlen);
15222 }
15223 logger->inc(l_bluestore_write_penalty_read_ops);
15224 }
15225 blp.copy(dctx.used, bl);
15226
15227 if (dctx.tail_read) {
15228 bufferlist tail_bl;
15229 int r = _do_read(c.get(), o,
15230 dctx.off + dctx.used, dctx.tail_read,
15231 tail_bl, 0);
15232 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
15233 size_t zlen = dctx.tail_read - r;
15234 if (zlen) {
15235 tail_bl.append_zero(zlen);
15236 logger->inc(l_bluestore_write_pad_bytes, zlen);
15237 }
15238 bl.claim_append(tail_bl);
15239 logger->inc(l_bluestore_write_penalty_read_ops);
15240 }
15241 auto& b0 = dctx.blob_ref;
15242 _buffer_cache_write(txc, b0, dctx.b_off, bl,
15243 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15244
15245 b0->dirty_blob().calc_csum(dctx.b_off, bl);
15246
15247 Extent* le = o->extent_map.set_lextent(c, dctx.off,
15248 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
15249
15250 // in fact this is a no-op for big writes, but it is left here to maintain
15251 // uniformity and to avoid it being missed after some future refactoring.
15252 b0->dirty_blob().mark_used(le->blob_offset, le->length);
15253 txc->statfs_delta.stored() += le->length;
15254
15255 if (!g_conf()->bluestore_debug_omit_block_device_write) {
15256 bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
15257 op->op = bluestore_deferred_op_t::OP_WRITE;
15258 op->extents.swap(dctx.res_extents);
15259 op->data = std::move(bl);
15260 }
15261}
15262
15263void BlueStore::_do_write_big(
15264 TransContext *txc,
15265 CollectionRef &c,
15266 OnodeRef& o,
15267 uint64_t offset, uint64_t length,
15268 bufferlist::iterator& blp,
15269 WriteContext *wctx)
15270{
15271 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
15272 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
15273 << " compress " << (int)wctx->compress
15274 << dendl;
15275 logger->inc(l_bluestore_write_big);
15276 logger->inc(l_bluestore_write_big_bytes, length);
15277 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
15278 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
15279 while (length > 0) {
15280 bool new_blob = false;
15281 BlobRef b;
15282 uint32_t b_off = 0;
15283 uint32_t l = 0;
15284
15285 // attempting to reuse an existing blob
15286 if (!wctx->compress) {
15287 // enforce target blob alignment with max_bsize
15288 l = max_bsize - p2phase(offset, max_bsize);
15289 l = std::min(uint64_t(l), length);
15290
15291 auto end = o->extent_map.extent_map.end();
15292
15293 dout(20) << __func__ << " maybe defer: 0x" << std::hex
15294 << offset << "~" << l
15295 << std::dec << dendl;
15296
15297 if (prefer_deferred_size_snapshot &&
15298 l <= prefer_deferred_size_snapshot * 2) {
15299 // A single write that spans two adjacent existing blobs can result
15300 // in up to two deferred blocks of 'prefer_deferred_size'.
15301 // So we try to minimize the number of resulting blobs
15302 // and preserve the 2 existing blobs rather than inserting one more in between.
15303 // E.g. for a write of 0x10000~20000 over existing blobs
15304 // (0x0~20000 and 0x20000~20000) it is better (from a subsequent read
15305 // performance point of view) to issue two deferred writes to the
15306 // existing blobs than to end up with 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
15307
15308 // look for an existing mutable blob we can write into
15309 auto ep = o->extent_map.seek_lextent(offset);
15310 auto ep_next = end;
15311 BigDeferredWriteContext head_info, tail_info;
15312
15313 bool will_defer = ep != end ?
15314 head_info.can_defer(ep,
15315 prefer_deferred_size_snapshot,
15316 block_size,
15317 offset,
15318 l) :
15319 false;
15320 auto offset_next = offset + head_info.used;
15321 auto remaining = l - head_info.used;
15322 if (will_defer && remaining) {
15323 will_defer = false;
15324 if (remaining <= prefer_deferred_size_snapshot) {
15325 ep_next = o->extent_map.seek_lextent(offset_next);
15326 // check if we can defer remaining totally
15327 will_defer = ep_next == end ?
15328 false :
15329 tail_info.can_defer(ep_next,
15330 prefer_deferred_size_snapshot,
15331 block_size,
15332 offset_next,
15333 remaining);
15334 will_defer = will_defer && remaining == tail_info.used;
15335 }
15336 }
15337 if (will_defer) {
15338 dout(20) << __func__ << " " << *(head_info.blob_ref)
15339 << " deferring big " << std::hex
15340 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
15341 << std::dec << " write via deferred"
15342 << dendl;
15343 if (remaining) {
15344 dout(20) << __func__ << " " << *(tail_info.blob_ref)
15345 << " deferring big " << std::hex
15346 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
15347 << std::dec << " write via deferred"
15348 << dendl;
15349 }
15350
15351 will_defer = head_info.apply_defer();
15352 if (!will_defer) {
15353 dout(20) << __func__
15354 << " deferring big fell back, head isn't continuous"
15355 << dendl;
15356 } else if (remaining) {
15357 will_defer = tail_info.apply_defer();
15358 if (!will_defer) {
15359 dout(20) << __func__
15360 << " deferring big fell back, tail isn't continuous"
15361 << dendl;
15362 }
15363 }
15364 }
15365 if (will_defer) {
15366 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
15367 if (remaining) {
15368 _do_write_big_apply_deferred(txc, c, o, tail_info,
15369 blp, wctx);
15370 }
15371 dout(20) << __func__ << " defer big: 0x" << std::hex
15372 << offset << "~" << l
15373 << std::dec << dendl;
15374 offset += l;
15375 length -= l;
15376 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
15377 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
15378 continue;
15379 }
15380 }
15381 dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
15382
15383 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15384
15385 // seek again as punch_hole could invalidate ep
15386 auto ep = o->extent_map.seek_lextent(offset);
15387 auto begin = o->extent_map.extent_map.begin();
15388 auto prev_ep = end;
15389 if (ep != begin) {
15390 prev_ep = ep;
15391 --prev_ep;
15392 }
15393
15394 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
15395 // search for a suitable extent in both the forward and reverse directions in
15396 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
15397 // then check whether the blob can be reused via can_reuse_blob().
15398 bool any_change;
15399 do {
15400 any_change = false;
15401 if (ep != end && ep->logical_offset < offset + max_bsize) {
15402 dout(20) << __func__ << " considering " << *ep
15403 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
15404
15405 if (offset >= ep->blob_start() &&
15406 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
15407 offset - ep->blob_start(),
15408 &l)) {
15409 b = ep->blob;
15410 b_off = offset - ep->blob_start();
15411 prev_ep = end; // to avoid check below
15412 dout(20) << __func__ << " reuse blob " << *b << std::hex
15413 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
15414 } else {
15415 ++ep;
15416 any_change = true;
15417 }
15418 }
15419
15420 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
15421 dout(20) << __func__ << " considering rev " << *prev_ep
15422 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
15423 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
15424 offset - prev_ep->blob_start(),
15425 &l)) {
15426 b = prev_ep->blob;
15427 b_off = offset - prev_ep->blob_start();
15428 dout(20) << __func__ << " reuse blob " << *b << std::hex
15429 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
15430 } else if (prev_ep != begin) {
15431 --prev_ep;
15432 any_change = true;
15433 } else {
15434 prev_ep = end; // to avoid useless first extent re-check
15435 }
15436 }
15437 } while (b == nullptr && any_change);
15438 } else {
15439 // try to use as long a chunk as permitted in the compression case.
15440 l = std::min(max_bsize, length);
15441 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15442 } // if (!wctx->compress)
15443
15444 if (b == nullptr) {
15445 b = c->new_blob();
15446 b_off = 0;
15447 new_blob = true;
15448 }
15449 bufferlist t;
15450 blp.copy(l, t);
15451
15452 // Zero detection -- big block
15453 if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) {
15454 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
15455
15456 dout(20) << __func__ << " schedule write big: 0x"
15457 << std::hex << offset << "~" << l << std::dec
15458 << (new_blob ? " new " : " reuse ")
15459 << *b << dendl;
15460
15461 logger->inc(l_bluestore_write_big_blobs);
15462 } else { // if (t.is_zero())
15463 dout(20) << __func__ << " skip big zero block " << std::hex
15464 << " (0x" << b_off << "~" << t.length() << ")"
15465 << " (0x" << b_off << "~" << l << ")"
15466 << std::dec << dendl;
15467 logger->inc(l_bluestore_write_big_skipped_blobs);
15468 logger->inc(l_bluestore_write_big_skipped_bytes, l);
15469 }
15470
15471 offset += l;
15472 length -= l;
15473 }
15474}
15475
15476int BlueStore::_do_alloc_write(
15477 TransContext *txc,
15478 CollectionRef coll,
15479 OnodeRef& o,
15480 WriteContext *wctx)
15481{
15482 dout(20) << __func__ << " txc " << txc
15483 << " " << wctx->writes.size() << " blobs"
15484 << dendl;
15485 if (wctx->writes.empty()) {
15486 return 0;
15487 }
15488
15489 CompressorRef c;
15490 double crr = 0;
15491 if (wctx->compress) {
15492 c = select_option(
15493 "compression_algorithm",
15494 compressor,
15495 [&]() {
15496 string val;
15497 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
15498 CompressorRef cp = compressor;
15499 if (!cp || cp->get_type_name() != val) {
15500 cp = Compressor::create(cct, val);
15501 if (!cp) {
15502 if (_set_compression_alert(false, val.c_str())) {
15503 derr << __func__ << " unable to initialize " << val.c_str()
15504 << " compressor" << dendl;
15505 }
15506 }
15507 }
15508 return std::optional<CompressorRef>(cp);
15509 }
15510 return std::optional<CompressorRef>();
15511 }
15512 );
15513
15514 crr = select_option(
15515 "compression_required_ratio",
15516 cct->_conf->bluestore_compression_required_ratio,
15517 [&]() {
15518 double val;
15519 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
15520 return std::optional<double>(val);
15521 }
15522 return std::optional<double>();
15523 }
15524 );
15525 }
15526
15527 // checksum
15528 int64_t csum = csum_type.load();
15529 csum = select_option(
15530 "csum_type",
15531 csum,
15532 [&]() {
15533 int64_t val;
15534 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
15535 return std::optional<int64_t>(val);
15536 }
15537 return std::optional<int64_t>();
15538 }
15539 );
15540
15541 // compress (as needed) and calc needed space
15542 uint64_t need = 0;
15543 uint64_t data_size = 0;
15544 // 'need' is the amount of space that must be provided by the allocator.
15545 // 'data_size' is the size of the data that will be transferred to disk.
15546 // Note that data_size is always <= need; data_size can be smaller because:
15547 // - the write to a blob was unaligned, and there is free space
15548 // - the data has been compressed
15549 //
15550 // We make one decision and apply it to all blobs:
15551 // either all blobs will be deferred or none will.
15552 // We assume that the allocator does its best to provide contiguous space,
15553 // and the deferral condition is (data_size < prefer_deferred_size).
15554
15555 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
15556 for (auto& wi : wctx->writes) {
15557 if (c && wi.blob_length > min_alloc_size) {
15558 auto start = mono_clock::now();
15559
15560 // compress
15561 ceph_assert(wi.b_off == 0);
15562 ceph_assert(wi.blob_length == wi.bl.length());
15563
15564 // FIXME: memory alignment here is bad
15565 bufferlist t;
15566 std::optional<int32_t> compressor_message;
15567 int r = c->compress(wi.bl, t, compressor_message);
15568 uint64_t want_len_raw = wi.blob_length * crr;
15569 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
15570 bool rejected = false;
15571 uint64_t compressed_len = t.length();
15572 // do an approximate (fast) estimation for resulting blob size
15573 // that doesn't take header overhead into account
15574 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
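// The blob stays compressed only if result_len <= want_len and it actually
// shrank. Illustrative values only: blob_length 0x10000 with a required
// ratio of 0.875 gives want_len 0xe000; a payload compressed to 0x9500
// rounds up to 0xa000 and is accepted, one compressed to 0xe800 rounds up
// to 0xf000 and is rejected below.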
15575 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
15576 bluestore_compression_header_t chdr;
15577 chdr.type = c->get_type();
15578 chdr.length = t.length();
15579 chdr.compressor_message = compressor_message;
15580 encode(chdr, wi.compressed_bl);
15581 wi.compressed_bl.claim_append(t);
15582
15583 compressed_len = wi.compressed_bl.length();
15584 result_len = p2roundup(compressed_len, min_alloc_size);
15585 if (result_len <= want_len && result_len < wi.blob_length) {
15586 // Cool. We compressed at least as much as we were hoping to.
15587 // pad out to min_alloc_size
15588 wi.compressed_bl.append_zero(result_len - compressed_len);
15589 wi.compressed_len = compressed_len;
15590 wi.compressed = true;
15591 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
15592 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
15593 << " -> 0x" << compressed_len << " => 0x" << result_len
15594 << " with " << c->get_type()
15595 << std::dec << dendl;
15596 txc->statfs_delta.compressed() += compressed_len;
15597 txc->statfs_delta.compressed_original() += wi.blob_length;
15598 txc->statfs_delta.compressed_allocated() += result_len;
15599 logger->inc(l_bluestore_compress_success_count);
15600 need += result_len;
15601 data_size += result_len;
15602 } else {
15603 rejected = true;
15604 }
15605 } else if (r != 0) {
15606 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
15607 << " bytes compressed using " << c->get_type_name()
15608 << std::dec
15609 << " failed with errcode = " << r
15610 << ", leaving uncompressed"
15611 << dendl;
15612 logger->inc(l_bluestore_compress_rejected_count);
15613 need += wi.blob_length;
15614 data_size += wi.bl.length();
15615 } else {
15616 rejected = true;
15617 }
15618
15619 if (rejected) {
15620 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
15621 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
15622 << " with " << c->get_type()
15623 << ", which is more than required 0x" << want_len_raw
15624 << " -> 0x" << want_len
15625 << ", leaving uncompressed"
15626 << std::dec << dendl;
15627 logger->inc(l_bluestore_compress_rejected_count);
15628 need += wi.blob_length;
15629 data_size += wi.bl.length();
15630 }
15631 log_latency("compress@_do_alloc_write",
15632 l_bluestore_compress_lat,
15633 mono_clock::now() - start,
15634 cct->_conf->bluestore_log_op_age );
15635 } else {
15636 need += wi.blob_length;
15637 data_size += wi.bl.length();
15638 }
15639 }
15640 PExtentVector prealloc;
15641 prealloc.reserve(2 * wctx->writes.size());
15642 int64_t prealloc_left = 0;
15643 prealloc_left = alloc->allocate(
15644 need, min_alloc_size, need,
15645 0, &prealloc);
15646 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
15647 derr << __func__ << " failed to allocate 0x" << std::hex << need
15648 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
15649 << " min_alloc_size 0x" << min_alloc_size
15650 << " available 0x " << alloc->get_free()
15651 << std::dec << dendl;
15652 if (prealloc.size()) {
15653 alloc->release(prealloc);
15654 }
15655 return -ENOSPC;
15656 }
15657 _collect_allocation_stats(need, min_alloc_size, prealloc);
15658
15659 dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
15660 << " prealloc " << prealloc << dendl;
15661 auto prealloc_pos = prealloc.begin();
15662 ceph_assert(prealloc_pos != prealloc.end());
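// prealloc now holds the space obtained for all blobs of this write context;
// the loop below carves final_length bytes per blob off its front via
// prealloc_pos.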
15663
15664 for (auto& wi : wctx->writes) {
15665 bluestore_blob_t& dblob = wi.b->dirty_blob();
15666 uint64_t b_off = wi.b_off;
15667 bufferlist *l = &wi.bl;
15668 uint64_t final_length = wi.blob_length;
15669 uint64_t csum_length = wi.blob_length;
15670 if (wi.compressed) {
15671 final_length = wi.compressed_bl.length();
15672 csum_length = final_length;
15673 unsigned csum_order = std::countr_zero(csum_length);
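// csum_order is the log2 of the checksum chunk; countr_zero picks the
// largest power-of-two chunk that evenly divides the compressed payload
// (illustrative value only: csum_length 0x6000 -> order 13, i.e. three
// 0x2000 csum chunks).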
15674 l = &wi.compressed_bl;
15675 dblob.set_compressed(wi.blob_length, wi.compressed_len);
15676 if (csum != Checksummer::CSUM_NONE) {
15677 dout(20) << __func__
15678 << " initialize csum setting for compressed blob " << *wi.b
15679 << " csum_type " << Checksummer::get_csum_type_string(csum)
15680 << " csum_order " << csum_order
15681 << " csum_length 0x" << std::hex << csum_length
15682 << " blob_length 0x" << wi.blob_length
15683 << " compressed_length 0x" << wi.compressed_len << std::dec
15684 << dendl;
15685 dblob.init_csum(csum, csum_order, csum_length);
15686 }
15687 } else if (wi.new_blob) {
15688 unsigned csum_order;
15689 // initialize newly created blob only
15690 ceph_assert(dblob.is_mutable());
15691 if (l->length() != wi.blob_length) {
15692 // hrm, maybe we could do better here, but let's not bother.
15693 dout(20) << __func__ << " forcing csum_order to block_size_order "
15694 << block_size_order << dendl;
15695 csum_order = block_size_order;
15696 } else {
15697 csum_order = std::min<unsigned>(wctx->csum_order, std::countr_zero(l->length()));
15698 }
15699 // try to align the blob with max_blob_size to improve
15700 // its reuse ratio, e.g. in the case of reverse-order writes
15701 uint32_t suggested_boff =
15702 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
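// Illustrative values only: a write at logical offset 0x16000 with
// max_bsize 0x10000 and no front padding yields suggested_boff 0x6000;
// bumping b_off from 0 to 0x6000 keeps the in-blob offset congruent with
// the logical offset, so a later write at 0x10000~0x6000 can still reuse
// offsets 0x0~0x6000 of this blob.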
15703 if ((suggested_boff % (1 << csum_order)) == 0 &&
15704 suggested_boff + final_length <= max_bsize &&
15705 suggested_boff > b_off) {
15706 dout(20) << __func__ << " forcing blob_offset to 0x"
15707 << std::hex << suggested_boff << std::dec << dendl;
15708 ceph_assert(suggested_boff >= b_off);
15709 csum_length += suggested_boff - b_off;
15710 b_off = suggested_boff;
15711 }
15712 if (csum != Checksummer::CSUM_NONE) {
15713 dout(20) << __func__
15714 << " initialize csum setting for new blob " << *wi.b
15715 << " csum_type " << Checksummer::get_csum_type_string(csum)
15716 << " csum_order " << csum_order
15717 << " csum_length 0x" << std::hex << csum_length << std::dec
15718 << dendl;
15719 dblob.init_csum(csum, csum_order, csum_length);
15720 }
15721 }
15722
15723 PExtentVector extents;
15724 int64_t left = final_length;
15725 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
15726 while (left > 0) {
15727 ceph_assert(prealloc_left > 0);
15728 if (prealloc_pos->length <= left) {
15729 prealloc_left -= prealloc_pos->length;
15730 left -= prealloc_pos->length;
15731 txc->statfs_delta.allocated() += prealloc_pos->length;
15732 extents.push_back(*prealloc_pos);
15733 ++prealloc_pos;
15734 } else {
15735 extents.emplace_back(prealloc_pos->offset, left);
15736 prealloc_pos->offset += left;
15737 prealloc_pos->length -= left;
15738 prealloc_left -= left;
15739 txc->statfs_delta.allocated() += left;
15740 left = 0;
15741 break;
15742 }
15743 }
15744 for (auto& p : extents) {
15745 txc->allocated.insert(p.offset, p.length);
15746 }
15747 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
15748
15749 dout(20) << __func__ << " blob " << *wi.b << dendl;
15750 if (dblob.has_csum()) {
15751 dblob.calc_csum(b_off, *l);
15752 }
15753
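// 'mark_unused' is requested for new small-write blobs when the allocation
// unit is larger than what was actually written; the untouched head/tail of
// the blob is marked 'unused' so a later small write can land there directly
// (see the is_unused() fast path in _do_write_small above).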
15754 if (wi.mark_unused) {
15755 ceph_assert(!dblob.is_compressed());
15756 auto b_end = b_off + wi.bl.length();
15757 if (b_off) {
15758 dblob.add_unused(0, b_off);
15759 }
15760 uint64_t llen = dblob.get_logical_length();
15761 if (b_end < llen) {
15762 dblob.add_unused(b_end, llen - b_end);
15763 }
15764 }
15765
15766 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
15767 b_off + (wi.b_off0 - wi.b_off),
15768 wi.length0,
15769 wi.b,
15770 nullptr);
15771 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
15772 txc->statfs_delta.stored() += le->length;
15773 dout(20) << __func__ << " lex " << *le << dendl;
15774 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
15775 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15776
15777 // queue io
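// Payloads smaller than prefer_deferred_size are queued as deferred ops,
// i.e. recorded alongside the transaction in the KV store and replayed to
// the block device later; larger payloads are submitted directly via
// aio_write below.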
15778 if (!g_conf()->bluestore_debug_omit_block_device_write) {
15779 if (data_size < prefer_deferred_size_snapshot) {
15780 dout(20) << __func__ << " deferring 0x" << std::hex
15781 << l->length() << std::dec << " write via deferred" << dendl;
15782 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
15783 op->op = bluestore_deferred_op_t::OP_WRITE;
15784 int r = wi.b->get_blob().map(
15785 b_off, l->length(),
15786 [&](uint64_t offset, uint64_t length) {
15787 op->extents.emplace_back(bluestore_pextent_t(offset, length));
15788 return 0;
15789 });
15790 ceph_assert(r == 0);
15791 op->data = *l;
15792 } else {
15793 wi.b->get_blob().map_bl(
15794 b_off, *l,
15795 [&](uint64_t offset, bufferlist& t) {
15796 bdev->aio_write(offset, t, &txc->ioc, false);
15797 });
15798 logger->inc(l_bluestore_write_new);
15799 }
15800 }
15801 }
15802 ceph_assert(prealloc_pos == prealloc.end());
15803 ceph_assert(prealloc_left == 0);
15804 return 0;
15805}
15806
15807void BlueStore::_wctx_finish(
15808 TransContext *txc,
15809 CollectionRef& c,
15810 OnodeRef& o,
15811 WriteContext *wctx,
15812 set<SharedBlob*> *maybe_unshared_blobs)
15813{
15814#ifdef HAVE_LIBZBD
15815 if (bdev->is_smr()) {
15816 for (auto& w : wctx->writes) {
15817 for (auto& e : w.b->get_blob().get_extents()) {
15818 if (!e.is_valid()) {
15819 continue;
15820 }
15821 uint32_t zone = e.offset / zone_size;
15822 if (!o->onode.zone_offset_refs.count(zone)) {
15823 uint64_t zoff = e.offset % zone_size;
15824 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
15825 << " offset 0x" << zoff << std::dec << dendl;
15826 txc->note_write_zone_offset(o, zone, zoff);
15827 }
15828 }
15829 }
15830 }
15831 set<uint32_t> zones_with_releases;
15832#endif
15833
15834 auto oep = wctx->old_extents.begin();
15835 while (oep != wctx->old_extents.end()) {
15836 auto &lo = *oep;
15837 oep = wctx->old_extents.erase(oep);
15838 dout(20) << __func__ << " lex_old " << lo.e << dendl;
15839 BlobRef b = lo.e.blob;
15840 const bluestore_blob_t& blob = b->get_blob();
15841 if (blob.is_compressed()) {
15842 if (lo.blob_empty) {
15843 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
15844 }
15845 txc->statfs_delta.compressed_original() -= lo.e.length;
15846 }
15847 auto& r = lo.r;
15848 txc->statfs_delta.stored() -= lo.e.length;
15849 if (!r.empty()) {
15850 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
15851 if (blob.is_shared()) {
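// For shared (cloned) blobs the release is reference counted: put_ref()
// returns in 'final' only the regions whose refcount dropped to zero, and
// if the blob may now have a single user it is reported via
// maybe_unshared_blobs so the caller can try to unshare it.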
15852 PExtentVector final;
15853 c->load_shared_blob(b->shared_blob);
15854 bool unshare = false;
15855 bool* unshare_ptr =
15856 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
15857 for (auto e : r) {
15858 b->shared_blob->put_ref(
15859 e.offset, e.length, &final,
15860 unshare_ptr);
15861#ifdef HAVE_LIBZBD
15862 // we also drop zone ref for shared blob extents
15863 if (bdev->is_smr() && e.is_valid()) {
15864 zones_with_releases.insert(e.offset / zone_size);
15865 }
15866#endif
15867 }
15868 if (unshare) {
15869 ceph_assert(maybe_unshared_blobs);
15870 maybe_unshared_blobs->insert(b->shared_blob.get());
15871 }
15872 dout(20) << __func__ << " shared_blob release " << final
15873 << " from " << *b->shared_blob << dendl;
15874 txc->write_shared_blob(b->shared_blob);
15875 r.clear();
15876 r.swap(final);
15877 }
15878 }
15879 // we can't invalidate our logical extents as we drop them because
15880 // other lextents (either in our onode or others) may still
15881 // reference them. But we can throw out anything that is no
15882 // longer allocated. Note that this will leave behind edge bits
15883 // that are no longer referenced but not deallocated (until they
15884 // age out of the cache naturally).
15885 b->discard_unallocated(c.get());
15886 for (auto e : r) {
15887 dout(20) << __func__ << " release " << e << dendl;
15888 txc->released.insert(e.offset, e.length);
15889 txc->statfs_delta.allocated() -= e.length;
15890 if (blob.is_compressed()) {
15891 txc->statfs_delta.compressed_allocated() -= e.length;
15892 }
15893#ifdef HAVE_LIBZBD
15894 if (bdev->is_smr() && e.is_valid()) {
15895 zones_with_releases.insert(e.offset / zone_size);
15896 }
15897#endif
15898 }
15899
15900 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
15901 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
15902 << dendl;
15903 o->extent_map.spanning_blob_map.erase(b->id);
15904 }
15905 delete &lo;
15906 }
15907
15908#ifdef HAVE_LIBZBD
15909 if (!zones_with_releases.empty()) {
15910 // we need to fault the entire extent range in here to determine if we've dropped
15911 // all refs to a zone.
15912 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
15913 for (auto& b : o->extent_map.extent_map) {
15914 for (auto& e : b.blob->get_blob().get_extents()) {
15915 if (e.is_valid()) {
15916 zones_with_releases.erase(e.offset / zone_size);
15917 }
15918 }
15919 }
15920 for (auto zone : zones_with_releases) {
15921 auto p = o->onode.zone_offset_refs.find(zone);
15922 if (p != o->onode.zone_offset_refs.end()) {
15923 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
15924 << " offset 0x" << p->second << std::dec << dendl;
15925 txc->note_release_zone_offset(o, zone, p->second);
15926 }
15927 }
15928 }
15929#endif
15930}
15931
15932void BlueStore::_do_write_data(
15933 TransContext *txc,
15934 CollectionRef& c,
15935 OnodeRef& o,
15936 uint64_t offset,
15937 uint64_t length,
15938 bufferlist& bl,
15939 WriteContext *wctx)
15940{
15941 uint64_t end = offset + length;
15942 bufferlist::iterator p = bl.begin();
15943
15944 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
15945 (length != min_alloc_size)) {
15946 // we fall within the same block
15947 _do_write_small(txc, c, o, offset, length, p, wctx);
15948 } else {
15949 uint64_t head_offset, head_length;
15950 uint64_t middle_offset, middle_length;
15951 uint64_t tail_offset, tail_length;
15952
15953 head_offset = offset;
15954 head_length = p2nphase(offset, min_alloc_size);
15955
15956 tail_offset = p2align(end, min_alloc_size);
15957 tail_length = p2phase(end, min_alloc_size);
15958
15959 middle_offset = head_offset + head_length;
15960 middle_length = length - head_length - tail_length;
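// Illustrative values only: with min_alloc_size 0x1000, offset 0x1800 and
// length 0x2c00 the split is head 0x1800~0x800, middle 0x2000~0x2000 and
// tail 0x4000~0x400.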
15961
15962 if (head_length) {
15963 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
15964 }
15965
15966 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
15967
15968 if (tail_length) {
15969 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
15970 }
15971 }
15972}
15973
15974void BlueStore::_choose_write_options(
15975 CollectionRef& c,
15976 OnodeRef& o,
15977 uint32_t fadvise_flags,
15978 WriteContext *wctx)
15979{
15980 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
15981 dout(20) << __func__ << " will do buffered write" << dendl;
15982 wctx->buffered = true;
15983 } else if (cct->_conf->bluestore_default_buffered_write &&
15984 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
15985 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
15986 dout(20) << __func__ << " defaulting to buffered write" << dendl;
15987 wctx->buffered = true;
15988 }
15989
15990 // apply basic csum block size
15991 wctx->csum_order = block_size_order;
15992
15993 // compression parameters
15994 unsigned alloc_hints = o->onode.alloc_hint_flags;
15995 auto cm = select_option(
15996 "compression_mode",
15997 comp_mode.load(),
15998 [&]() {
15999 string val;
16000 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
16001 return std::optional<Compressor::CompressionMode>(
16002 Compressor::get_comp_mode_type(val));
16003 }
16004 return std::optional<Compressor::CompressionMode>();
16005 }
16006 );
16007
16008 wctx->compress = (cm != Compressor::COMP_NONE) &&
16009 ((cm == Compressor::COMP_FORCE) ||
16010 (cm == Compressor::COMP_AGGRESSIVE &&
16011 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
16012 (cm == Compressor::COMP_PASSIVE &&
16013 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
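// i.e. compress when the mode is 'force'; when it is 'aggressive' and the
// client did not hint the data incompressible; or when it is 'passive' and
// the client hinted it compressible.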
16014
16015 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
16016 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
16017 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
16018 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
16019 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
16020
16021 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
16022
16023 if (o->onode.expected_write_size) {
16024 wctx->csum_order = std::max(min_alloc_size_order,
16025 (uint8_t)std::countr_zero(o->onode.expected_write_size));
16026 } else {
16027 wctx->csum_order = min_alloc_size_order;
16028 }
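// E.g. (illustrative values only, assuming a 4 KiB min_alloc_size) an
// expected_write_size of 0x8000 gives csum_order 15, i.e. 32 KiB checksum
// chunks.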
16029
16030 if (wctx->compress) {
16031 wctx->target_blob_size = select_option(
16032 "compression_max_blob_size",
16033 comp_max_blob_size.load(),
16034 [&]() {
16035 int64_t val;
16036 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
16037 return std::optional<uint64_t>((uint64_t)val);
16038 }
16039 return std::optional<uint64_t>();
16040 }
16041 );
16042 }
16043 } else {
16044 if (wctx->compress) {
16045 wctx->target_blob_size = select_option(
16046 "compression_min_blob_size",
16047 comp_min_blob_size.load(),
16048 [&]() {
16049 int64_t val;
16050 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
16051 return std::optional<uint64_t>((uint64_t)val);
16052 }
16053 return std::optional<uint64_t>();
16054 }
16055 );
16056 }
16057 }
16058
16059 uint64_t max_bsize = max_blob_size.load();
16060 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
16061 wctx->target_blob_size = max_bsize;
16062 }
16063
16064 // set the min blob size floor at 2x the min_alloc_size, or else we
16065 // won't be able to allocate a smaller extent for the compressed
16066 // data.
16067 if (wctx->compress &&
16068 wctx->target_blob_size < min_alloc_size * 2) {
16069 wctx->target_blob_size = min_alloc_size * 2;
16070 }
16071
16072 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
16073 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
16074 << " compress=" << (int)wctx->compress
16075 << " buffered=" << (int)wctx->buffered
16076 << std::dec << dendl;
16077}
16078
16079int BlueStore::_do_gc(
16080 TransContext *txc,
16081 CollectionRef& c,
16082 OnodeRef& o,
16083 const WriteContext& wctx,
16084 uint64_t *dirty_start,
16085 uint64_t *dirty_end)
16086{
16087
16088 bool dirty_range_updated = false;
16089 WriteContext wctx_gc;
16090 wctx_gc.fork(wctx); // make a clone for garbage collection
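// Each range selected for GC is read back and funnelled through the normal
// write path into wctx_gc, so it gets re-allocated (and possibly
// re-compressed) into fresh blobs; the dirty range is widened so the
// affected extent map shards are persisted.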
16091
16092 auto & extents_to_collect = wctx.extents_to_gc;
16093 for (auto it = extents_to_collect.begin();
16094 it != extents_to_collect.end();
16095 ++it) {
16096 bufferlist bl;
16097 auto offset = (*it).first;
16098 auto length = (*it).second;
16099 dout(20) << __func__ << " processing " << std::hex
16100 << offset << "~" << length << std::dec
16101 << dendl;
16102 int r = _do_read(c.get(), o, offset, length, bl, 0);
16103 ceph_assert(r == (int)length);
16104
16105 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
16106 logger->inc(l_bluestore_gc_merged, length);
16107
16108 if (*dirty_start > offset) {
16109 *dirty_start = offset;
16110 dirty_range_updated = true;
16111 }
16112
16113 if (*dirty_end < offset + length) {
16114 *dirty_end = offset + length;
16115 dirty_range_updated = true;
16116 }
16117 }
16118 if (dirty_range_updated) {
16119 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
16120 }
16121
16122 dout(30) << __func__ << " alloc write" << dendl;
16123 int r = _do_alloc_write(txc, c, o, &wctx_gc);
16124 if (r < 0) {
16125 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
16126 << dendl;
16127 return r;
16128 }
16129
16130 _wctx_finish(txc, c, o, &wctx_gc);
16131 return 0;
16132}
16133
16134int BlueStore::_do_write(
16135 TransContext *txc,
16136 CollectionRef& c,
16137 OnodeRef& o,
16138 uint64_t offset,
16139 uint64_t length,
16140 bufferlist& bl,
16141 uint32_t fadvise_flags)
16142{
16143 int r = 0;
16144
16145 dout(20) << __func__
16146 << " " << o->oid
16147 << " 0x" << std::hex << offset << "~" << length
16148 << " - have 0x" << o->onode.size
16149 << " (" << std::dec << o->onode.size << ")"
16150 << " bytes" << std::hex
16151 << " fadvise_flags 0x" << fadvise_flags
16152 << " alloc_hint 0x" << o->onode.alloc_hint_flags
16153 << " expected_object_size " << o->onode.expected_object_size
16154 << " expected_write_size " << o->onode.expected_write_size
16155 << std::dec
16156 << dendl;
16157 _dump_onode<30>(cct, *o);
16158
16159 if (length == 0) {
16160 return 0;
16161 }
16162
16163 uint64_t end = offset + length;
16164
16165 GarbageCollector gc(c->store->cct);
16166 int64_t benefit = 0;
16167 auto dirty_start = offset;
16168 auto dirty_end = end;
16169
16170 WriteContext wctx;
16171 _choose_write_options(c, o, fadvise_flags, &wctx);
16172 o->extent_map.fault_range(db, offset, length);
16173 _do_write_data(txc, c, o, offset, length, bl, &wctx);
16174 r = _do_alloc_write(txc, c, o, &wctx);
16175 if (r < 0) {
16176 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
16177 << dendl;
16178 goto out;
16179 }
16180
16181 if (wctx.extents_to_gc.empty() ||
16182 wctx.extents_to_gc.range_start() > offset ||
16183 wctx.extents_to_gc.range_end() < offset + length) {
16184 benefit = gc.estimate(offset,
16185 length,
16186 o->extent_map,
16187 wctx.old_extents,
16188 min_alloc_size);
16189 }
16190
16191 // NB: _wctx_finish() will empty old_extents
16192 // so we must do gc estimation before that
16193 _wctx_finish(txc, c, o, &wctx);
16194 if (end > o->onode.size) {
16195 dout(20) << __func__ << " extending size to 0x" << std::hex << end
16196 << std::dec << dendl;
16197 o->onode.size = end;
16198 }
16199
16200 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
16201 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
16202 dout(20) << __func__
16203 << " perform garbage collection for compressed extents, "
16204 << "expected benefit = " << benefit << " AUs" << dendl;
16205 }
16206 if (!wctx.extents_to_gc.empty()) {
16207 dout(20) << __func__ << " perform garbage collection" << dendl;
16208
16209 r = _do_gc(txc, c, o,
16210 wctx,
16211 &dirty_start, &dirty_end);
16212 if (r < 0) {
16213 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
16214 << dendl;
16215 goto out;
16216 }
16217 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
16218 << "~" << dirty_end - dirty_start << std::dec << dendl;
16219 }
16220 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
16221 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
16222
16223 r = 0;
16224
16225 out:
16226 return r;
16227}
16228
16229int BlueStore::_write(TransContext *txc,
16230 CollectionRef& c,
16231 OnodeRef& o,
16232 uint64_t offset, size_t length,
16233 bufferlist& bl,
16234 uint32_t fadvise_flags)
16235{
16236 dout(15) << __func__ << " " << c->cid << " " << o->oid
16237 << " 0x" << std::hex << offset << "~" << length << std::dec
16238 << dendl;
16239 int r = 0;
16240 if (offset + length >= OBJECT_MAX_SIZE) {
16241 r = -E2BIG;
16242 } else {
16243 _assign_nid(txc, o);
16244 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
16245 txc->write_onode(o);
16246 }
16247 dout(10) << __func__ << " " << c->cid << " " << o->oid
16248 << " 0x" << std::hex << offset << "~" << length << std::dec
16249 << " = " << r << dendl;
16250 return r;
16251}
16252
16253int BlueStore::_zero(TransContext *txc,
16254 CollectionRef& c,
16255 OnodeRef& o,
16256 uint64_t offset, size_t length)
16257{
16258 dout(15) << __func__ << " " << c->cid << " " << o->oid
16259 << " 0x" << std::hex << offset << "~" << length << std::dec
16260 << dendl;
16261 int r = 0;
16262 if (offset + length >= OBJECT_MAX_SIZE) {
16263 r = -E2BIG;
16264 } else {
16265 _assign_nid(txc, o);
16266 r = _do_zero(txc, c, o, offset, length);
16267 }
16268 dout(10) << __func__ << " " << c->cid << " " << o->oid
16269 << " 0x" << std::hex << offset << "~" << length << std::dec
16270 << " = " << r << dendl;
16271 return r;
16272}
16273
16274int BlueStore::_do_zero(TransContext *txc,
16275 CollectionRef& c,
16276 OnodeRef& o,
16277 uint64_t offset, size_t length)
16278{
16279 dout(15) << __func__ << " " << c->cid << " " << o->oid
16280 << " 0x" << std::hex << offset << "~" << length << std::dec
16281 << dendl;
16282 int r = 0;
16283
16284 _dump_onode<30>(cct, *o);
16285
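// Zeroing is implemented as hole punching: the logical range is unmapped and
// its backing extents are released in _wctx_finish(); subsequent reads of
// the range return zeros without any on-disk allocation.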
16286 WriteContext wctx;
16287 o->extent_map.fault_range(db, offset, length);
16288 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
16289 o->extent_map.dirty_range(offset, length);
16290 _wctx_finish(txc, c, o, &wctx);
16291
16292 if (length > 0 && offset + length > o->onode.size) {
16293 o->onode.size = offset + length;
16294 dout(20) << __func__ << " extending size to " << offset + length
16295 << dendl;
16296 }
16297 txc->write_onode(o);
16298
16299 dout(10) << __func__ << " " << c->cid << " " << o->oid
16300 << " 0x" << std::hex << offset << "~" << length << std::dec
16301 << " = " << r << dendl;
16302 return r;
16303}
16304
16305void BlueStore::_do_truncate(
16306 TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset,
16307 set<SharedBlob*> *maybe_unshared_blobs)
16308{
16309 dout(15) << __func__ << " " << c->cid << " " << o->oid
16310 << " 0x" << std::hex << offset << std::dec << dendl;
16311
16312 _dump_onode<30>(cct, *o);
16313
16314 if (offset == o->onode.size)
16315 return;
16316
16317 WriteContext wctx;
16318 if (offset < o->onode.size) {
16319 uint64_t length = o->onode.size - offset;
16320 o->extent_map.fault_range(db, offset, length);
16321 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
16322 o->extent_map.dirty_range(offset, length);
16323
16324 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
16325
16326 // if we have shards past EOF, ask for a reshard
16327 if (!o->onode.extent_map_shards.empty() &&
16328 o->onode.extent_map_shards.back().offset >= offset) {
16329 dout(10) << __func__ << " request reshard past EOF" << dendl;
16330 if (offset) {
16331 o->extent_map.request_reshard(offset - 1, offset + length);
16332 } else {
16333 o->extent_map.request_reshard(0, length);
16334 }
16335 }
16336 }
16337
16338 o->onode.size = offset;
16339
16340 txc->write_onode(o);
16341}
16342
16343int BlueStore::_truncate(TransContext *txc,
16344 CollectionRef& c,
16345 OnodeRef& o,
16346 uint64_t offset)
16347{
16348 dout(15) << __func__ << " " << c->cid << " " << o->oid
16349 << " 0x" << std::hex << offset << std::dec
16350 << dendl;
16351
16352 auto start_time = mono_clock::now();
16353 int r = 0;
16354 if (offset >= OBJECT_MAX_SIZE) {
16355 r = -E2BIG;
16356 } else {
16357 _do_truncate(txc, c, o, offset);
16358 }
16359 log_latency_fn(
16360 __func__,
16361 l_bluestore_truncate_lat,
16362 mono_clock::now() - start_time,
16363 cct->_conf->bluestore_log_op_age,
16364 [&](const ceph::timespan& lat) {
16365 ostringstream ostr;
16366 ostr << ", lat = " << timespan_str(lat)
16367 << " cid =" << c->cid
16368 << " oid =" << o->oid;
16369 return ostr.str();
16370 }
16371 );
16372 dout(10) << __func__ << " " << c->cid << " " << o->oid
16373 << " 0x" << std::hex << offset << std::dec
16374 << " = " << r << dendl;
16375 return r;
16376}
16377
16378int BlueStore::_do_remove(
16379 TransContext *txc,
16380 CollectionRef& c,
16381 OnodeRef& o)
16382{
16383 set<SharedBlob*> maybe_unshared_blobs;
16384 bool is_gen = !o->oid.is_no_gen();
16385 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
16386 if (o->onode.has_omap()) {
16387 o->flush();
16388 _do_omap_clear(txc, o);
16389 }
16390 o->exists = false;
16391 string key;
16392 for (auto &s : o->extent_map.shards) {
16393 dout(20) << __func__ << " removing shard 0x" << std::hex
16394 << s.shard_info->offset << std::dec << dendl;
16395 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
16396 [&](const string& final_key) {
16397 txc->t->rmkey(PREFIX_OBJ, final_key);
16398 }
16399 );
16400 }
16401 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
16402 txc->note_removed_object(o);
16403 o->extent_map.clear();
16404 o->onode = bluestore_onode_t();
16405 _debug_obj_on_delete(o->oid);
16406
16407 if (!is_gen || maybe_unshared_blobs.empty()) {
16408 return 0;
16409 }
16410
16411 // see if we can unshare blobs still referenced by the head
16412 dout(10) << __func__ << " gen and maybe_unshared_blobs "
16413 << maybe_unshared_blobs << dendl;
16414 ghobject_t nogen = o->oid;
16415 nogen.generation = ghobject_t::NO_GEN;
16416 OnodeRef h = c->get_onode(nogen, false);
16417
16418 if (!h || !h->exists) {
16419 return 0;
16420 }
16421
16422 dout(20) << __func__ << " checking for unshareable blobs on " << h
16423 << " " << h->oid << dendl;
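// Build, for each candidate shared blob, the ref map we would expect if the
// head object were its only remaining user; if that matches the persistent
// ref_map below, the blob can be unshared.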
16424 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
16425 for (auto& e : h->extent_map.extent_map) {
16426 const bluestore_blob_t& b = e.blob->get_blob();
16427 SharedBlob *sb = e.blob->shared_blob.get();
16428 if (b.is_shared() &&
16429 sb->loaded &&
16430 maybe_unshared_blobs.count(sb)) {
16431 if (b.is_compressed()) {
16432 expect[sb].get(0, b.get_ondisk_length());
16433 } else {
16434 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
16435 expect[sb].get(off, len);
16436 return 0;
16437 });
16438 }
16439 }
16440 }
16441
16442 vector<SharedBlob*> unshared_blobs;
16443 unshared_blobs.reserve(maybe_unshared_blobs.size());
16444 for (auto& p : expect) {
16445 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
16446 if (p.first->persistent->ref_map == p.second) {
16447 SharedBlob *sb = p.first;
16448 dout(20) << __func__ << " unsharing " << *sb << dendl;
16449 unshared_blobs.push_back(sb);
16450 txc->unshare_blob(sb);
16451 uint64_t sbid = c->make_blob_unshared(sb);
16452 string key;
16453 get_shared_blob_key(sbid, &key);
16454 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
16455 }
16456 }
16457
16458 if (unshared_blobs.empty()) {
16459 return 0;
16460 }
16461
16462 for (auto& e : h->extent_map.extent_map) {
16463 const bluestore_blob_t& b = e.blob->get_blob();
16464 SharedBlob *sb = e.blob->shared_blob.get();
16465 if (b.is_shared() &&
16466 std::find(unshared_blobs.begin(), unshared_blobs.end(),
16467 sb) != unshared_blobs.end()) {
16468 dout(20) << __func__ << " unsharing " << e << dendl;
16469 bluestore_blob_t& blob = e.blob->dirty_blob();
16470 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
16471 h->extent_map.dirty_range(e.logical_offset, 1);
16472 }
16473 }
16474 txc->write_onode(h);
16475
16476 return 0;
16477}
16478
16479int BlueStore::_remove(TransContext *txc,
16480 CollectionRef& c,
16481 OnodeRef& o)
16482{
16483 dout(15) << __func__ << " " << c->cid << " " << o->oid
16484 << " onode " << o.get()
16485 << " txc "<< txc << dendl;
16486 auto start_time = mono_clock::now();
16487 int r = _do_remove(txc, c, o);
16488
16489 log_latency_fn(
16490 __func__,
16491 l_bluestore_remove_lat,
16492 mono_clock::now() - start_time,
16493 cct->_conf->bluestore_log_op_age,
16494 [&](const ceph::timespan& lat) {
16495 ostringstream ostr;
16496 ostr << ", lat = " << timespan_str(lat)
16497 << " cid =" << c->cid
16498 << " oid =" << o->oid;
16499 return ostr.str();
16500 }
16501 );
16502
16503 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16504 return r;
16505}
16506
16507int BlueStore::_setattr(TransContext *txc,
16508 CollectionRef& c,
16509 OnodeRef& o,
16510 const string& name,
16511 bufferptr& val)
16512{
16513 dout(15) << __func__ << " " << c->cid << " " << o->oid
16514 << " " << name << " (" << val.length() << " bytes)"
16515 << dendl;
16516 int r = 0;
16517 if (val.is_partial()) {
16518 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
16519 val.length());
16520 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16521 } else {
16522 auto& b = o->onode.attrs[name.c_str()] = val;
16523 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16524 }
16525 txc->write_onode(o);
16526 dout(10) << __func__ << " " << c->cid << " " << o->oid
16527 << " " << name << " (" << val.length() << " bytes)"
16528 << " = " << r << dendl;
16529 return r;
16530}
16531
16532int BlueStore::_setattrs(TransContext *txc,
16533 CollectionRef& c,
16534 OnodeRef& o,
16535 const map<string,bufferptr>& aset)
16536{
16537 dout(15) << __func__ << " " << c->cid << " " << o->oid
16538 << " " << aset.size() << " keys"
16539 << dendl;
16540 int r = 0;
16541 for (map<string,bufferptr>::const_iterator p = aset.begin();
16542 p != aset.end(); ++p) {
16543 if (p->second.is_partial()) {
16544 auto& b = o->onode.attrs[p->first.c_str()] =
16545 bufferptr(p->second.c_str(), p->second.length());
16546 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16547 } else {
16548 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
16549 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
16550 }
16551 }
16552 txc->write_onode(o);
16553 dout(10) << __func__ << " " << c->cid << " " << o->oid
16554 << " " << aset.size() << " keys"
16555 << " = " << r << dendl;
16556 return r;
16557}
16558
16559
16560int BlueStore::_rmattr(TransContext *txc,
16561 CollectionRef& c,
16562 OnodeRef& o,
16563 const string& name)
16564{
16565 dout(15) << __func__ << " " << c->cid << " " << o->oid
16566 << " " << name << dendl;
16567 int r = 0;
16568 auto it = o->onode.attrs.find(name.c_str());
16569 if (it == o->onode.attrs.end())
16570 goto out;
16571
16572 o->onode.attrs.erase(it);
16573 txc->write_onode(o);
16574
16575 out:
16576 dout(10) << __func__ << " " << c->cid << " " << o->oid
16577 << " " << name << " = " << r << dendl;
16578 return r;
16579}
16580
16581int BlueStore::_rmattrs(TransContext *txc,
16582 CollectionRef& c,
16583 OnodeRef& o)
16584{
16585 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16586 int r = 0;
16587
16588 if (o->onode.attrs.empty())
16589 goto out;
16590
16591 o->onode.attrs.clear();
16592 txc->write_onode(o);
16593
16594 out:
16595 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16596 return r;
16597}
16598
16599void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
16600{
16601 const string& omap_prefix = o->get_omap_prefix();
16602 string prefix, tail;
16603 o->get_omap_header(&prefix);
16604 o->get_omap_tail(&tail);
16605 txc->t->rm_range_keys(omap_prefix, prefix, tail);
16606 txc->t->rmkey(omap_prefix, tail);
16607 o->onode.clear_omap_flag();
16608 dout(20) << __func__ << " remove range start: "
16609 << pretty_binary_string(prefix) << " end: "
16610 << pretty_binary_string(tail) << dendl;
16611}
16612
16613int BlueStore::_omap_clear(TransContext *txc,
16614 CollectionRef& c,
16615 OnodeRef& o)
16616{
16617 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16618 auto t0 = mono_clock::now();
16619
16620 int r = 0;
16621 if (o->onode.has_omap()) {
16622 o->flush();
16623 _do_omap_clear(txc, o);
16624 txc->write_onode(o);
16625 }
16626 logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);
16627
16628 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16629 return r;
16630}
16631
16632int BlueStore::_omap_setkeys(TransContext *txc,
16633 CollectionRef& c,
16634 OnodeRef& o,
16635 bufferlist &bl)
16636{
16637 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16638 int r;
16639 auto p = bl.cbegin();
16640 __u32 num;
16641 if (!o->onode.has_omap()) {
16642 if (o->oid.is_pgmeta()) {
16643 o->onode.set_omap_flags_pgmeta();
16644 } else {
16645 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16646 }
16647 txc->write_onode(o);
16648
16649 const string& prefix = o->get_omap_prefix();
16650 string key_tail;
16651 bufferlist tail;
16652 o->get_omap_tail(&key_tail);
16653 txc->t->set(prefix, key_tail, tail);
16654 } else {
16655 txc->note_modified_object(o);
16656 }
16657 const string& prefix = o->get_omap_prefix();
16658 string final_key;
16659 o->get_omap_key(string(), &final_key);
16660 size_t base_key_len = final_key.size();
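// bl encodes a __u32 count followed by that many (key, value) pairs; each
// stored key is the object's omap key prefix plus the user-supplied key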
16661 decode(num, p);
16662 while (num--) {
16663 string key;
16664 bufferlist value;
16665 decode(key, p);
16666 decode(value, p);
16667 final_key.resize(base_key_len); // keep prefix
16668 final_key += key;
16669 dout(20) << __func__ << " " << pretty_binary_string(final_key)
16670 << " <- " << key << dendl;
16671 txc->t->set(prefix, final_key, value);
16672 }
16673 r = 0;
16674 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16675 return r;
16676}
16677
16678int BlueStore::_omap_setheader(TransContext *txc,
16679 CollectionRef& c,
16680 OnodeRef& o,
16681 bufferlist& bl)
16682{
16683 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16684 int r;
16685 string key;
16686 if (!o->onode.has_omap()) {
16687 if (o->oid.is_pgmeta()) {
16688 o->onode.set_omap_flags_pgmeta();
16689 } else {
16690 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16691 }
16692 txc->write_onode(o);
16693
16694 const string& prefix = o->get_omap_prefix();
16695 string key_tail;
16696 bufferlist tail;
16697 o->get_omap_tail(&key_tail);
16698 txc->t->set(prefix, key_tail, tail);
16699 } else {
16700 txc->note_modified_object(o);
16701 }
16702 const string& prefix = o->get_omap_prefix();
16703 o->get_omap_header(&key);
16704 txc->t->set(prefix, key, bl);
16705 r = 0;
16706 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16707 return r;
16708}
16709
16710int BlueStore::_omap_rmkeys(TransContext *txc,
16711 CollectionRef& c,
16712 OnodeRef& o,
16713 bufferlist& bl)
16714{
16715 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16716 int r = 0;
16717 auto p = bl.cbegin();
16718 __u32 num;
16719 string final_key;
16720 if (!o->onode.has_omap()) {
16721 goto out;
16722 }
16723 {
16724 const string& prefix = o->get_omap_prefix();
16725 o->get_omap_key(string(), &final_key);
16726 size_t base_key_len = final_key.size();
16727 decode(num, p);
16728 logger->inc(l_bluestore_omap_rmkeys_count, num);
16729 while (num--) {
16730 string key;
16731 decode(key, p);
16732 final_key.resize(base_key_len); // keep prefix
16733 final_key += key;
16734 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
16735 << " <- " << key << dendl;
16736 txc->t->rmkey(prefix, final_key);
16737 }
16738 }
16739 txc->note_modified_object(o);
16740
16741 out:
16742 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16743 return r;
16744}
16745
16746int BlueStore::_omap_rmkey_range(TransContext *txc,
16747 CollectionRef& c,
16748 OnodeRef& o,
16749 const string& first, const string& last)
16750{
16751 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16752 string key_first, key_last;
16753 int r = 0;
16754 if (!o->onode.has_omap()) {
16755 goto out;
16756 }
16757 {
16758 const string& prefix = o->get_omap_prefix();
16759 o->flush();
16760 o->get_omap_key(first, &key_first);
16761 o->get_omap_key(last, &key_last);
16762 logger->inc(l_bluestore_omap_rmkey_ranges_count);
16763 txc->t->rm_range_keys(prefix, key_first, key_last);
16764 dout(20) << __func__ << " remove range start: "
16765 << pretty_binary_string(key_first) << " end: "
16766 << pretty_binary_string(key_last) << dendl;
16767 }
16768 txc->note_modified_object(o);
16769
16770 out:
16771 return r;
16772}
16773
16774int BlueStore::_set_alloc_hint(
16775 TransContext *txc,
16776 CollectionRef& c,
16777 OnodeRef& o,
16778 uint64_t expected_object_size,
16779 uint64_t expected_write_size,
16780 uint32_t flags)
16781{
16782 dout(15) << __func__ << " " << c->cid << " " << o->oid
16783 << " object_size " << expected_object_size
16784 << " write_size " << expected_write_size
16785 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16786 << dendl;
16787 int r = 0;
16788 o->onode.expected_object_size = expected_object_size;
16789 o->onode.expected_write_size = expected_write_size;
16790 o->onode.alloc_hint_flags = flags;
16791 txc->write_onode(o);
16792 dout(10) << __func__ << " " << c->cid << " " << o->oid
16793 << " object_size " << expected_object_size
16794 << " write_size " << expected_write_size
16795 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16796 << " = " << r << dendl;
16797 return r;
16798}
16799
16800int BlueStore::_clone(TransContext *txc,
16801 CollectionRef& c,
16802 OnodeRef& oldo,
16803 OnodeRef& newo)
16804{
16805 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16806 << newo->oid << dendl;
16807 int r = 0;
16808 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
16809 derr << __func__ << " mismatched hash on " << oldo->oid
16810 << " and " << newo->oid << dendl;
16811 return -EINVAL;
16812 }
16813
16814 _assign_nid(txc, newo);
16815
16816 // clone data
16817 oldo->flush();
16818 _do_truncate(txc, c, newo, 0);
16819 if (cct->_conf->bluestore_clone_cow) {
16820 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
16821 } else {
16822 bufferlist bl;
16823 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
16824 if (r < 0)
16825 goto out;
16826 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
16827 if (r < 0)
16828 goto out;
16829 }
16830
16831 // clone attrs
16832 newo->onode.attrs = oldo->onode.attrs;
16833
16834 // clone omap
16835 if (newo->onode.has_omap()) {
16836 dout(20) << __func__ << " clearing old omap data" << dendl;
16837 newo->flush();
16838 _do_omap_clear(txc, newo);
16839 }
16840 if (oldo->onode.has_omap()) {
16841 dout(20) << __func__ << " copying omap data" << dendl;
16842 if (newo->oid.is_pgmeta()) {
16843 newo->onode.set_omap_flags_pgmeta();
16844 } else {
16845 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
16846 }
16847 // check that the omap key prefix is exactly the same size for both objects,
16848 // otherwise rewrite_omap_key will corrupt data
16849 ceph_assert(oldo->onode.flags == newo->onode.flags);
16850 const string& prefix = newo->get_omap_prefix();
16851 string head, tail;
16852 oldo->get_omap_header(&head);
16853 oldo->get_omap_tail(&tail);
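// copy every omap row of the source object, rewriting the per-object key
// prefix so each value lands under the clone's own omap namespace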
16854 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
16855 it->lower_bound(head);
16856 while (it->valid()) {
16857 if (it->key() >= tail) {
16858 dout(30) << __func__ << " reached tail" << dendl;
16859 break;
16860 } else {
16861 dout(30) << __func__ << " got header/data "
16862 << pretty_binary_string(it->key()) << dendl;
16863 string key;
16864 newo->rewrite_omap_key(it->key(), &key);
16865 txc->t->set(prefix, key, it->value());
16866 }
16867 it->next();
16868 }
16869 string new_tail;
16870 bufferlist new_tail_value;
16871 newo->get_omap_tail(&new_tail);
16872 txc->t->set(prefix, new_tail, new_tail_value);
16873 }
16874
16875 txc->write_onode(newo);
16876 r = 0;
16877
16878 out:
16879 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16880 << newo->oid << " = " << r << dendl;
16881 return r;
16882}
16883
16884int BlueStore::_do_clone_range(
16885 TransContext *txc,
16886 CollectionRef& c,
16887 OnodeRef& oldo,
16888 OnodeRef& newo,
16889 uint64_t srcoff,
16890 uint64_t length,
16891 uint64_t dstoff)
16892{
16893 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16894 << newo->oid
16895 << " 0x" << std::hex << srcoff << "~" << length << " -> "
16896 << " 0x" << dstoff << "~" << length << std::dec << dendl;
16897 oldo->extent_map.fault_range(db, srcoff, length);
16898 newo->extent_map.fault_range(db, dstoff, length);
16899 _dump_onode<30>(cct, *oldo);
16900 _dump_onode<30>(cct, *newo);
16901
16902 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
16903
16904#ifdef HAVE_LIBZBD
16905 if (bdev->is_smr()) {
16906 // duplicate the refs for the shared region.
16907 Extent dummy(dstoff);
16908 for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
16909 e != newo->extent_map.extent_map.end();
16910 ++e) {
16911 if (e->logical_offset >= dstoff + length) {
16912 break;
16913 }
16914 for (auto& ex : e->blob->get_blob().get_extents()) {
16915 // note that we may introduce a new extent reference that is
16916 // earlier than the first zone ref. we allow this since it is
16917 // a lot of work to avoid and has marginal impact on cleaning
16918 // performance.
16919 if (!ex.is_valid()) {
16920 continue;
16921 }
16922 uint32_t zone = ex.offset / zone_size;
16923 if (!newo->onode.zone_offset_refs.count(zone)) {
16924 uint64_t zoff = ex.offset % zone_size;
16925 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16926 << " offset 0x" << zoff << std::dec
16927 << " -> " << newo->oid << dendl;
16928 txc->note_write_zone_offset(newo, zone, zoff);
16929 }
16930 }
16931 }
16932 }
16933#endif
16934
16935 _dump_onode<30>(cct, *oldo);
16936 _dump_onode<30>(cct, *newo);
16937 return 0;
16938}
16939
16940int BlueStore::_clone_range(TransContext *txc,
16941 CollectionRef& c,
16942 OnodeRef& oldo,
16943 OnodeRef& newo,
16944 uint64_t srcoff, uint64_t length, uint64_t dstoff)
16945{
16946 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16947 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16948 << " to offset 0x" << dstoff << std::dec << dendl;
16949 int r = 0;
16950
16951 if (srcoff + length >= OBJECT_MAX_SIZE ||
16952 dstoff + length >= OBJECT_MAX_SIZE) {
16953 r = -E2BIG;
16954 goto out;
16955 }
16956 if (srcoff + length > oldo->onode.size) {
16957 r = -EINVAL;
16958 goto out;
16959 }
16960
16961 _assign_nid(txc, newo);
16962
16963 if (length > 0) {
16964 if (cct->_conf->bluestore_clone_cow) {
16965 _do_zero(txc, c, newo, dstoff, length);
16966 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
16967 } else {
16968 bufferlist bl;
16969 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
16970 if (r < 0)
16971 goto out;
16972 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
16973 if (r < 0)
16974 goto out;
16975 }
16976 }
16977
16978 txc->write_onode(newo);
16979 r = 0;
16980
16981 out:
16982 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16983 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16984 << " to offset 0x" << dstoff << std::dec
16985 << " = " << r << dendl;
16986 return r;
16987}
16988
16989int BlueStore::_rename(TransContext *txc,
16990 CollectionRef& c,
16991 OnodeRef& oldo,
16992 OnodeRef& newo,
16993 const ghobject_t& new_oid)
16994{
16995 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16996 << new_oid << dendl;
16997 int r;
16998 ghobject_t old_oid = oldo->oid;
16999 mempool::bluestore_cache_meta::string new_okey;
17000
17001 if (newo) {
17002 if (newo->exists) {
17003 r = -EEXIST;
17004 goto out;
17005 }
17006 ceph_assert(txc->onodes.count(newo) == 0);
17007 }
17008
17009 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
17010
17011 // rewrite shards
17012 {
17013 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
17014 get_object_key(cct, new_oid, &new_okey);
17015 string key;
17016 for (auto &s : oldo->extent_map.shards) {
17017 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
17018 [&](const string& final_key) {
17019 txc->t->rmkey(PREFIX_OBJ, final_key);
17020 }
17021 );
17022 s.dirty = true;
17023 }
17024 }
17025
17026 newo = oldo;
17027 txc->write_onode(newo);
17028
17029 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
17030 // Onode in the old slot
17031 c->onode_space.rename(oldo, old_oid, new_oid, new_okey);
17032 r = 0;
17033
17034 // hold a ref to new Onode in old name position, to ensure we don't drop
17035 // it from the cache before this txc commits (or else someone may come along
17036 // and read newo's metadata via the old name).
17037 txc->note_modified_object(oldo);
17038
17039#ifdef HAVE_LIBZBD
17040 if (bdev->is_smr()) {
17041 // adjust zone refs
17042 for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
17043 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
17044 << " offset 0x" << offset << std::dec
17045 << " -> " << oldo->oid << dendl;
17046 string key;
17047 get_zone_offset_object_key(zone, offset, oldo->oid, &key);
17048 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
17049
17050 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
17051 << " offset 0x" << offset << std::dec
17052 << " -> " << newo->oid << dendl;
17053 get_zone_offset_object_key(zone, offset, newo->oid, &key);
17054 bufferlist v;
17055 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
17056 }
17057 }
17058#endif
17059
17060 out:
17061 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
17062 << new_oid << " = " << r << dendl;
17063 return r;
17064}
17065
17066// collections
17067
17068int BlueStore::_create_collection(
17069 TransContext *txc,
17070 const coll_t &cid,
17071 unsigned bits,
17072 CollectionRef *c)
17073{
17074 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
17075 int r;
17076 bufferlist bl;
17077
17078 {
17079 std::unique_lock l(coll_lock);
17080 if (*c) {
17081 r = -EEXIST;
17082 goto out;
17083 }
17084 auto p = new_coll_map.find(cid);
17085 ceph_assert(p != new_coll_map.end());
17086 *c = p->second;
17087 (*c)->cnode.bits = bits;
17088 coll_map[cid] = *c;
17089 new_coll_map.erase(p);
17090 }
17091 encode((*c)->cnode, bl);
17092 txc->t->set(PREFIX_COLL, stringify(cid), bl);
17093 r = 0;
17094
17095 out:
17096 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
17097 return r;
17098}
17099
17100int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
17101 CollectionRef *c)
17102{
17103 dout(15) << __func__ << " " << cid << dendl;
17104 int r;
17105
17106 (*c)->flush_all_but_last();
17107 {
17108 std::unique_lock l(coll_lock);
17109 if (!*c) {
17110 r = -ENOENT;
17111 goto out;
17112 }
17113 size_t nonexistent_count = 0;
17114 ceph_assert((*c)->exists);
17115 if ((*c)->onode_space.map_any([&](Onode* o) {
17116 if (o->exists) {
17117 dout(1) << __func__ << " " << o->oid << " " << o
17118 << " exists in onode_map" << dendl;
17119 return true;
17120 }
17121 ++nonexistent_count;
17122 return false;
17123 })) {
17124 r = -ENOTEMPTY;
17125 goto out;
17126 }
17127 vector<ghobject_t> ls;
17128 ghobject_t next;
17129 // Enumerate onodes in the db, up to nonexistent_count + 1,
17130 // then check that all of them are marked as non-existent.
17131 // Bypass the check if next != ghobject_t::get_max().
17132 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
17133 nonexistent_count + 1, false, &ls, &next);
17134 if (r >= 0) {
17135 // If true, the collection has more objects than nonexistent_count,
17136 // so bypass the check.
17137 bool exists = (!next.is_max());
17138 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
17139 dout(10) << __func__ << " oid " << *it << dendl;
17140 auto onode = (*c)->onode_space.lookup(*it);
17141 exists = !onode || onode->exists;
17142 if (exists) {
17143 dout(1) << __func__ << " " << *it
17144 << " exists in db, "
17145 << (!onode ? "not present in ram" : "present in ram")
17146 << dendl;
17147 }
17148 }
17149 if (!exists) {
17150 _do_remove_collection(txc, c);
17151 r = 0;
17152 } else {
17153 dout(10) << __func__ << " " << cid
17154 << " is non-empty" << dendl;
17155 r = -ENOTEMPTY;
17156 }
17157 }
17158 }
17159out:
17160 dout(10) << __func__ << " " << cid << " = " << r << dendl;
17161 return r;
17162}
17163
17164void BlueStore::_do_remove_collection(TransContext *txc,
17165 CollectionRef *c)
17166{
17167 coll_map.erase((*c)->cid);
17168 txc->removed_collections.push_back(*c);
17169 (*c)->exists = false;
17170 _osr_register_zombie((*c)->osr.get());
17171 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
17172 c->reset();
17173}
17174
17175int BlueStore::_split_collection(TransContext *txc,
17176 CollectionRef& c,
17177 CollectionRef& d,
17178 unsigned bits, int rem)
17179{
17180 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
17181 << " bits " << bits << dendl;
17182 std::unique_lock l(c->lock);
17183 std::unique_lock l2(d->lock);
17184 int r;
17185
17186 // flush all previous deferred writes on this sequencer. this is a bit
17187 // heavyweight, but we need to make sure all deferred writes complete
17188 // before we split, as the new collection's sequencer may need to order
17189 // this after those writes, and we don't bother with the complexity of
17190 // moving those TransContexts over to the new osr.
17191 _osr_drain_preceding(txc);
17192
17193 // move any cached items (onodes and referenced shared blobs) that will
17194 // belong to the child collection post-split. leave everything else behind.
17195 // this may include things that don't strictly belong to the now-smaller
17196 // parent split, but the OSD will always send us a split for every new
17197 // child.
17198
17199 spg_t pgid, dest_pgid;
17200 bool is_pg = c->cid.is_pg(&pgid);
17201 ceph_assert(is_pg);
17202 is_pg = d->cid.is_pg(&dest_pgid);
17203 ceph_assert(is_pg);
17204
17205 // the destination should initially be empty.
17206 ceph_assert(d->onode_space.empty());
17207 ceph_assert(d->shared_blob_set.empty());
17208 ceph_assert(d->cnode.bits == bits);
17209
17210 c->split_cache(d.get());
17211
17212 // adjust bits. note that this will be redundant for all but the first
17213 // split call for this parent (first child).
17214 c->cnode.bits = bits;
17215 ceph_assert(d->cnode.bits == bits);
17216 r = 0;
17217
17218 bufferlist bl;
17219 encode(c->cnode, bl);
17220 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
17221
17222 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
17223 << " bits " << bits << " = " << r << dendl;
17224 return r;
17225}
17226
17227int BlueStore::_merge_collection(
17228 TransContext *txc,
17229 CollectionRef *c,
17230 CollectionRef& d,
17231 unsigned bits)
17232{
17233 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
17234 << " bits " << bits << dendl;
17235 std::unique_lock l((*c)->lock);
17236 std::unique_lock l2(d->lock);
17237 int r;
17238
17239 coll_t cid = (*c)->cid;
17240
17241 // flush all previous deferred writes on the source collection to ensure
17242 // that all deferred writes complete before we merge, as the target
17243 // collection's sequencer may need to order new ops after those writes.
17244
17245 _osr_drain((*c)->osr.get());
17246
17247 // move any cached items (onodes and referenced shared blobs) from the
17248 // source collection into the target (d); after the merge everything
17249 // cached for the source belongs to the target.
17252
17253 spg_t pgid, dest_pgid;
17254 bool is_pg = cid.is_pg(&pgid);
17255 ceph_assert(is_pg);
17256 is_pg = d->cid.is_pg(&dest_pgid);
17257 ceph_assert(is_pg);
17258
17259 // adjust bits. note that this will be redundant for all but the first
17260 // merge call for the parent/target.
17261 d->cnode.bits = bits;
17262
17263 // behavior depends on the target's (d) bits, so do this after they are updated.
17264 (*c)->split_cache(d.get());
17265
17266 // remove source collection
17267 {
17268 std::unique_lock l3(coll_lock);
17269 _do_remove_collection(txc, c);
17270 }
17271
17272 r = 0;
17273
17274 bufferlist bl;
17275 encode(d->cnode, bl);
17276 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
17277
17278 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
17279 << " bits " << bits << " = " << r << dendl;
17280 return r;
17281}
17282
17283void BlueStore::log_latency(
17284 const char* name,
17285 int idx,
17286 const ceph::timespan& l,
17287 double lat_threshold,
17288 const char* info) const
17289{
17290 logger->tinc(idx, l);
17291 if (lat_threshold > 0.0 &&
17292 l >= make_timespan(lat_threshold)) {
17293 dout(0) << __func__ << " slow operation observed for " << name
17294 << ", latency = " << l
17295 << info
17296 << dendl;
17297 }
17298}
17299
17300void BlueStore::log_latency_fn(
17301 const char* name,
17302 int idx,
17303 const ceph::timespan& l,
17304 double lat_threshold,
17305 std::function<string (const ceph::timespan& lat)> fn) const
17306{
17307 logger->tinc(idx, l);
17308 if (lat_threshold > 0.0 &&
17309 l >= make_timespan(lat_threshold)) {
17310 dout(0) << __func__ << " slow operation observed for " << name
17311 << ", latency = " << l
17312 << fn(l)
17313 << dendl;
17314 }
17315}
17316
17317#if defined(WITH_LTTNG)
17318void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
17319 KeyValueDB &db,
17320 TransContext &txc,
17321 mono_clock::time_point start_throttle_acquire)
17322{
17323 pending_kv_ios += txc.ios;
17324 if (txc.deferred_txn) {
17325 pending_deferred_ios += txc.ios;
17326 }
17327
17328 uint64_t started = 0;
17329 uint64_t completed = 0;
17330 if (should_trace(&started, &completed)) {
17331 txc.tracing = true;
17332 uint64_t rocksdb_base_level,
17333 rocksdb_estimate_pending_compaction_bytes,
17334 rocksdb_cur_size_all_mem_tables,
17335 rocksdb_compaction_pending,
17336 rocksdb_mem_table_flush_pending,
17337 rocksdb_num_running_compactions,
17338 rocksdb_num_running_flushes,
17339 rocksdb_actual_delayed_write_rate;
17340 db.get_property(
17341 "rocksdb.base-level",
17342 &rocksdb_base_level);
17343 db.get_property(
17344 "rocksdb.estimate-pending-compaction-bytes",
17345 &rocksdb_estimate_pending_compaction_bytes);
17346 db.get_property(
17347 "rocksdb.cur-size-all-mem-tables",
17348 &rocksdb_cur_size_all_mem_tables);
17349 db.get_property(
17350 "rocksdb.compaction-pending",
17351 &rocksdb_compaction_pending);
17352 db.get_property(
17353 "rocksdb.mem-table-flush-pending",
17354 &rocksdb_mem_table_flush_pending);
17355 db.get_property(
17356 "rocksdb.num-running-compactions",
17357 &rocksdb_num_running_compactions);
17358 db.get_property(
17359 "rocksdb.num-running-flushes",
17360 &rocksdb_num_running_flushes);
17361 db.get_property(
17362 "rocksdb.actual-delayed-write-rate",
17363 &rocksdb_actual_delayed_write_rate);
17364
17365
17366 tracepoint(
17367 bluestore,
17368 transaction_initial_state,
17369 txc.osr->get_sequencer_id(),
17370 txc.seq,
17371 throttle_bytes.get_current(),
17372 throttle_deferred_bytes.get_current(),
17373 pending_kv_ios,
17374 pending_deferred_ios,
17375 started,
17376 completed,
17377 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
17378
17379 tracepoint(
17380 bluestore,
17381 transaction_initial_state_rocksdb,
17382 txc.osr->get_sequencer_id(),
17383 txc.seq,
17384 rocksdb_base_level,
17385 rocksdb_estimate_pending_compaction_bytes,
17386 rocksdb_cur_size_all_mem_tables,
17387 rocksdb_compaction_pending,
17388 rocksdb_mem_table_flush_pending,
17389 rocksdb_num_running_compactions,
17390 rocksdb_num_running_flushes,
17391 rocksdb_actual_delayed_write_rate);
17392 }
17393}
17394#endif
17395
17396mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
17397 TransContext &txc, PerfCounters *logger, int state)
17398{
17399 mono_clock::time_point now = mono_clock::now();
17400 mono_clock::duration lat = now - txc.last_stamp;
17401 logger->tinc(state, lat);
17402#if defined(WITH_LTTNG)
17403 if (txc.tracing &&
17404 state >= l_bluestore_state_prepare_lat &&
17405 state <= l_bluestore_state_done_lat) {
17406 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
17407 tracepoint(
17408 bluestore,
17409 transaction_state_duration,
17410 txc.osr->get_sequencer_id(),
17411 txc.seq,
17412 state,
17413 ceph::to_seconds<double>(lat));
17414 }
17415#endif
17416 txc.last_stamp = now;
17417 return lat;
17418}
17419
17420bool BlueStore::BlueStoreThrottle::try_start_transaction(
17421 KeyValueDB &db,
17422 TransContext &txc,
17423 mono_clock::time_point start_throttle_acquire)
17424{
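// Reserve byte-throttle budget for this transaction. Deferred transactions
// must also reserve deferred-throttle budget; if that cannot be taken
// immediately we return false and the caller completes the reservation
// later via finish_start_transaction().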
17425 throttle_bytes.get(txc.cost);
17426
17427 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
17428 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17429 return true;
17430 } else {
17431 return false;
17432 }
17433}
17434
17435void BlueStore::BlueStoreThrottle::finish_start_transaction(
17436 KeyValueDB &db,
17437 TransContext &txc,
17438 mono_clock::time_point start_throttle_acquire)
17439{
17440 ceph_assert(txc.deferred_txn);
17441 throttle_deferred_bytes.get(txc.cost);
17442 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17443}
17444
17445#if defined(WITH_LTTNG)
17446void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
17447{
17448 pending_kv_ios -= 1;
17449 ios_completed_since_last_traced++;
17450 if (txc.tracing) {
17451 tracepoint(
17452 bluestore,
17453 transaction_commit_latency,
17454 txc.osr->get_sequencer_id(),
17455 txc.seq,
17456 ceph::to_seconds<double>(mono_clock::now() - txc.start));
17457 }
17458}
17459#endif
17460
17461#if defined(WITH_LTTNG)
17462void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
17463{
17464 if (txc.deferred_txn) {
17465 pending_deferred_ios -= 1;
17466 }
17467 if (txc.tracing) {
17468 mono_clock::time_point now = mono_clock::now();
17469 mono_clock::duration lat = now - txc.start;
17470 tracepoint(
17471 bluestore,
17472 transaction_total_duration,
17473 txc.osr->get_sequencer_id(),
17474 txc.seq,
17475 ceph::to_seconds<double>(lat));
17476 }
17477}
17478#endif
17479
17480const string prefix_onode = "o";
17481const string prefix_onode_shard = "x";
17482const string prefix_other = "Z";
17483 // Iterates through the db and collects the stats.
17484void BlueStore::generate_db_histogram(Formatter *f)
17485{
17486 //globals
17487 uint64_t num_onodes = 0;
17488 uint64_t num_shards = 0;
17489 uint64_t num_super = 0;
17490 uint64_t num_coll = 0;
17491 uint64_t num_omap = 0;
17492 uint64_t num_pgmeta_omap = 0;
17493 uint64_t num_deferred = 0;
17494 uint64_t num_alloc = 0;
17495 uint64_t num_stat = 0;
17496 uint64_t num_others = 0;
17497 uint64_t num_shared_shards = 0;
17498 size_t max_key_size =0, max_value_size = 0;
17499 uint64_t total_key_size = 0, total_value_size = 0;
17500 size_t key_size = 0, value_size = 0;
17501 KeyValueHistogram hist;
17502
17503 auto start = coarse_mono_clock::now();
17504
17505 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
17506 iter->seek_to_first();
17507 while (iter->valid()) {
17508 dout(30) << __func__ << " Key: " << iter->key() << dendl;
17509 key_size = iter->key_size();
17510 value_size = iter->value_size();
17511 hist.value_hist[hist.get_value_slab(value_size)]++;
17512 max_key_size = std::max(max_key_size, key_size);
17513 max_value_size = std::max(max_value_size, value_size);
17514 total_key_size += key_size;
17515 total_value_size += value_size;
17516
17517 pair<string,string> key(iter->raw_key());
17518
17519 if (key.first == PREFIX_SUPER) {
17520 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
17521 num_super++;
17522 } else if (key.first == PREFIX_STAT) {
17523 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
17524 num_stat++;
17525 } else if (key.first == PREFIX_COLL) {
17526 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
17527 num_coll++;
17528 } else if (key.first == PREFIX_OBJ) {
17529 if (key.second.back() == ONODE_KEY_SUFFIX) {
17530 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
17531 num_onodes++;
17532 } else {
17533 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
17534 num_shards++;
17535 }
17536 } else if (key.first == PREFIX_OMAP) {
17537 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
17538 num_omap++;
17539 } else if (key.first == PREFIX_PERPOOL_OMAP) {
17540 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
17541 num_omap++;
17542 } else if (key.first == PREFIX_PERPG_OMAP) {
17543 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
17544 num_omap++;
17545 } else if (key.first == PREFIX_PGMETA_OMAP) {
17546 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
17547 num_pgmeta_omap++;
17548 } else if (key.first == PREFIX_DEFERRED) {
17549 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
17550 num_deferred++;
17551 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
17552 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
17553 num_alloc++;
17554 } else if (key.first == PREFIX_SHARED_BLOB) {
17555 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
17556 num_shared_shards++;
17557 } else {
17558 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
17559 num_others++;
17560 }
17561 iter->next();
17562 }
17563
17564 ceph::timespan duration = coarse_mono_clock::now() - start;
17565 f->open_object_section("rocksdb_key_value_stats");
17566 f->dump_unsigned("num_onodes", num_onodes);
17567 f->dump_unsigned("num_shards", num_shards);
17568 f->dump_unsigned("num_super", num_super);
17569 f->dump_unsigned("num_coll", num_coll);
17570 f->dump_unsigned("num_omap", num_omap);
17571 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
17572 f->dump_unsigned("num_deferred", num_deferred);
17573 f->dump_unsigned("num_alloc", num_alloc);
17574 f->dump_unsigned("num_stat", num_stat);
17575 f->dump_unsigned("num_shared_shards", num_shared_shards);
17576 f->dump_unsigned("num_others", num_others);
17577 f->dump_unsigned("max_key_size", max_key_size);
17578 f->dump_unsigned("max_value_size", max_value_size);
17579 f->dump_unsigned("total_key_size", total_key_size);
17580 f->dump_unsigned("total_value_size", total_value_size);
17581 f->close_section();
17582
17583 hist.dump(f);
17584
17585 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
17586
17587}
17588
17589void BlueStore::_shutdown_cache()
17590{
17591 dout(10) << __func__ << dendl;
17592 for (auto i : buffer_cache_shards) {
17593 i->flush();
17594 ceph_assert(i->empty());
17595 }
17596 for (auto& p : coll_map) {
17597 p.second->onode_space.clear();
17598 if (!p.second->shared_blob_set.empty()) {
17599 derr << __func__ << " stray shared blobs on " << p.first << dendl;
17600 p.second->shared_blob_set.dump<0>(cct);
17601 }
17602 ceph_assert(p.second->onode_space.empty());
17603 ceph_assert(p.second->shared_blob_set.empty());
17604 }
17605 coll_map.clear();
17606 for (auto i : onode_cache_shards) {
17607 ceph_assert(i->empty());
17608 }
17609}
17610
17611 // For external callers.
17612 // We use a best-effort policy here, i.e. we don't care if some pinned
17613 // onodes/data remain in the cache after this command completes.
17615int BlueStore::flush_cache(ostream *os)
17616{
17617 dout(10) << __func__ << dendl;
17618 for (auto i : onode_cache_shards) {
17619 i->flush();
17620 }
17621 for (auto i : buffer_cache_shards) {
17622 i->flush();
17623 }
17624
17625 return 0;
17626}
17627
17628void BlueStore::_apply_padding(uint64_t head_pad,
17629 uint64_t tail_pad,
17630 bufferlist& padded)
17631{
17632 if (head_pad) {
17633 padded.prepend_zero(head_pad);
17634 }
17635 if (tail_pad) {
17636 padded.append_zero(tail_pad);
17637 }
17638 if (head_pad || tail_pad) {
17639 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
17640 << " tail 0x" << tail_pad << std::dec << dendl;
17641 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
17642 }
17643}
17644
17645void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
17646{
17647 // finalize extent_map shards
17648 o->extent_map.update(txn, false);
17649 if (o->extent_map.needs_reshard()) {
17650 o->extent_map.reshard(db, txn);
17651 o->extent_map.update(txn, true);
17652 if (o->extent_map.needs_reshard()) {
17653 dout(20) << __func__ << " warning: still wants reshard, check options?"
17654 << dendl;
17655 o->extent_map.clear_needs_reshard();
17656 }
17657 logger->inc(l_bluestore_onode_reshard);
17658 }
17659
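// two-pass encode: first compute an upper bound on the encoded size so a
// single contiguous appender can be reserved, then encode the onode, the
// spanning blobs and (if not sharded) the inline extent map into it.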
17660 // bound encode
17661 size_t bound = 0;
17662 denc(o->onode, bound);
17663 o->extent_map.bound_encode_spanning_blobs(bound);
17664 if (o->onode.extent_map_shards.empty()) {
17665 denc(o->extent_map.inline_bl, bound);
17666 }
17667
17668 // encode
17669 bufferlist bl;
17670 unsigned onode_part, blob_part, extent_part;
17671 {
17672 auto p = bl.get_contiguous_appender(bound, true);
17673 denc(o->onode, p);
17674 onode_part = p.get_logical_offset();
17675 o->extent_map.encode_spanning_blobs(p);
17676 blob_part = p.get_logical_offset() - onode_part;
17677 if (o->onode.extent_map_shards.empty()) {
17678 denc(o->extent_map.inline_bl, p);
17679 }
17680 extent_part = p.get_logical_offset() - onode_part - blob_part;
17681 }
17682
17683 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
17684 << " (" << onode_part << " bytes onode + "
17685 << blob_part << " bytes spanning blobs + "
17686 << extent_part << " bytes inline extents)"
17687 << dendl;
17688
17689
17690 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
17691}
17692
17693void BlueStore::_log_alerts(osd_alert_list_t& alerts)
17694{
17695 std::lock_guard l(qlock);
17696 size_t used = bluefs && bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ?
17697 bluefs->get_used(BlueFS::BDEV_SLOW) : 0;
17698 if (used > 0) {
17699 auto db_used = bluefs->get_used(BlueFS::BDEV_DB);
17700 auto db_total = bluefs->get_total(BlueFS::BDEV_DB);
17701 ostringstream ss;
17702 ss << "spilled over " << byte_u_t(used)
17703 << " metadata from 'db' device (" << byte_u_t(db_used)
17704 << " used of " << byte_u_t(db_total) << ") to slow device";
17705 spillover_alert = ss.str();
17706 } else if (!spillover_alert.empty()){
17707 spillover_alert.clear();
17708 }
17709
17710 if (!spurious_read_errors_alert.empty() &&
17711 cct->_conf->bluestore_warn_on_spurious_read_errors) {
17712 alerts.emplace(
17713 "BLUESTORE_SPURIOUS_READ_ERRORS",
17714 spurious_read_errors_alert);
17715 }
17716 if (!disk_size_mismatch_alert.empty()) {
17717 alerts.emplace(
17718 "BLUESTORE_DISK_SIZE_MISMATCH",
17719 disk_size_mismatch_alert);
17720 }
17721 if (!legacy_statfs_alert.empty()) {
17722 alerts.emplace(
17723 "BLUESTORE_LEGACY_STATFS",
17724 legacy_statfs_alert);
17725 }
17726 if (!spillover_alert.empty() &&
17727 cct->_conf->bluestore_warn_on_bluefs_spillover) {
17728 alerts.emplace(
17729 "BLUEFS_SPILLOVER",
17730 spillover_alert);
17731 }
17732 if (!no_per_pg_omap_alert.empty()) {
17733 alerts.emplace(
17734 "BLUESTORE_NO_PER_PG_OMAP",
17735 no_per_pg_omap_alert);
17736 }
17737 if (!no_per_pool_omap_alert.empty()) {
17738 alerts.emplace(
17739 "BLUESTORE_NO_PER_POOL_OMAP",
17740 no_per_pool_omap_alert);
17741 }
17742 string s0(failed_cmode);
17743
17744 if (!failed_compressors.empty()) {
17745 if (!s0.empty()) {
17746 s0 += ", ";
17747 }
17748 s0 += "unable to load:";
17749 bool first = true;
17750 for (auto& s : failed_compressors) {
17751 if (first) {
17752 first = false;
17753 } else {
17754 s0 += ", ";
17755 }
17756 s0 += s;
17757 }
17758 alerts.emplace(
17759 "BLUESTORE_NO_COMPRESSION",
17760 s0);
17761 }
17762}
17763
17764void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
17765 const PExtentVector& extents)
17766{
17767 alloc_stats_count++;
17768 alloc_stats_fragments += extents.size();
17769 alloc_stats_size += need;
17770
17771 for (auto& e : extents) {
17772 logger->hinc(l_bluestore_allocate_hist, e.length, need);
17773 }
17774}
17775
17776void BlueStore::_record_allocation_stats()
17777{
17778 // we don't care about strict consistency here; the fields may be
17779 // partially modified while the tuple is being built
17780 auto t0 = std::make_tuple(
17781 alloc_stats_count.exchange(0),
17782 alloc_stats_fragments.exchange(0),
17783 alloc_stats_size.exchange(0));
17784
17785 dout(0) << " allocation stats probe "
17786 << probe_count << ":"
17787 << " cnt: " << std::get<0>(t0)
17788 << " frags: " << std::get<1>(t0)
17789 << " size: " << std::get<2>(t0)
17790 << dendl;
17791
17792
17793 //
17794 // Keep the history for probes from the power-of-two sequence:
17795 // -1, -2, -4, -8, -16
17796 //
17797 size_t base = 1;
17798 for (auto& t : alloc_stats_history) {
17799 dout(0) << " probe -"
17800 << base + (probe_count % base) << ": "
17801 << std::get<0>(t)
17802 << ", " << std::get<1>(t)
17803 << ", " << std::get<2>(t)
17804 << dendl;
17805 base <<= 1;
17806 }
17807 dout(0) << "------------" << dendl;
17808
17809 ++ probe_count;
17810
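// shift the history so slot i keeps a probe from roughly 2^i probes ago,
// e.g. (assuming a 5-slot history) at probe_count == 8 slots 3, 2 and 1
// inherit their predecessor's tuple while slot 4 (the -16 slot) is untouched.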
17811 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
17812 if ((probe_count % (1 << i)) == 0) {
17813 alloc_stats_history[i] = alloc_stats_history[i - 1];
17814 }
17815 }
17816 alloc_stats_history[0].swap(t0);
17817}
17818
17819// ===========================================
17820// BlueStoreRepairer
17821
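// Shrink the per-granularity bloom-filter vectors down to just the non-empty
// buckets covered by the given (misreferenced) extents; collections_bfs and
// objects_bfs are reduced in lockstep since their positions map 1:1.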
17822size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
17823 const interval_set<uint64_t>& extents)
17824{
17825 ceph_assert(granularity); // initialized
17826 // can't call for the second time
17827 ceph_assert(!was_filtered_out);
17828 ceph_assert(collections_bfs.size() == objects_bfs.size());
17829
17830 uint64_t prev_pos = 0;
17831 uint64_t npos = collections_bfs.size();
17832
17833 bloom_vector collections_reduced;
17834 bloom_vector objects_reduced;
17835
17836 for (auto e : extents) {
17837 if (e.second == 0) {
17838 continue;
17839 }
17840 uint64_t pos = max(e.first / granularity, prev_pos);
17841 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
17842 while (pos != npos && pos < end_pos) {
17843 ceph_assert( collections_bfs[pos].element_count() ==
17844 objects_bfs[pos].element_count());
17845 if (collections_bfs[pos].element_count()) {
17846 collections_reduced.push_back(std::move(collections_bfs[pos]));
17847 objects_reduced.push_back(std::move(objects_bfs[pos]));
17848 }
17849 ++pos;
17850 }
17851 prev_pos = end_pos;
17852 }
17853 collections_reduced.swap(collections_bfs);
17854 objects_reduced.swap(objects_bfs);
17855 was_filtered_out = true;
17856 return collections_bfs.size();
17857}
17858
17859bool BlueStoreRepairer::remove_key(KeyValueDB *db,
17860 const string& prefix,
17861 const string& key)
17862{
17863 std::lock_guard l(lock);
17864 if (!remove_key_txn) {
17865 remove_key_txn = db->get_transaction();
17866 }
17867 ++to_repair_cnt;
17868 remove_key_txn->rmkey(prefix, key);
17869
17870 return true;
17871}
17872
17873void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
17874{
17875 std::lock_guard l(lock); // possibly redundant
17876 ceph_assert(fix_per_pool_omap_txn == nullptr);
17877 fix_per_pool_omap_txn = db->get_transaction();
17878 ++to_repair_cnt;
17879 bufferlist bl;
17880 bl.append(stringify(val));
17881 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
17882}
17883
17884bool BlueStoreRepairer::fix_shared_blob(
17885 KeyValueDB::Transaction txn,
17886 uint64_t sbid,
17887 bluestore_extent_ref_map_t* ref_map,
17888 size_t repaired)
17889{
17890 string key;
17891 get_shared_blob_key(sbid, &key);
17892 if (ref_map) {
17893 bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
17894 bufferlist bl;
17895 encode(persistent, bl);
17896 txn->set(PREFIX_SHARED_BLOB, key, bl);
17897 } else {
17898 txn->rmkey(PREFIX_SHARED_BLOB, key);
17899 }
17900 to_repair_cnt += repaired;
17901 return true;
17902}
17903
17904bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
17905 const string& key,
17906 const store_statfs_t& new_statfs)
17907{
17908 std::lock_guard l(lock);
17909 if (!fix_statfs_txn) {
17910 fix_statfs_txn = db->get_transaction();
17911 }
17912 BlueStore::volatile_statfs vstatfs;
17913 vstatfs = new_statfs;
17914 bufferlist bl;
17915 vstatfs.encode(bl);
17916 ++to_repair_cnt;
17917 fix_statfs_txn->set(PREFIX_STAT, key, bl);
17918 return true;
17919}
17920
17921bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
17922 FreelistManager* fm,
17923 uint64_t offset, uint64_t len)
17924{
17925 std::lock_guard l(lock);
17926 ceph_assert(!fm->is_null_manager());
17927
17928 if (!fix_fm_leaked_txn) {
17929 fix_fm_leaked_txn = db->get_transaction();
17930 }
17931 ++to_repair_cnt;
17932 fm->release(offset, len, fix_fm_leaked_txn);
17933 return true;
17934}
17935bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
17936 FreelistManager* fm,
17937 uint64_t offset, uint64_t len)
17938{
17939 std::lock_guard l(lock);
17940 ceph_assert(!fm->is_null_manager());
17941
17942 if (!fix_fm_false_free_txn) {
17943 fix_fm_false_free_txn = db->get_transaction();
17944 }
17945 ++to_repair_cnt;
17946 fm->allocate(offset, len, fix_fm_false_free_txn);
17947 return true;
17948}
17949
17950bool BlueStoreRepairer::fix_spanning_blobs(
17951 KeyValueDB* db,
17952 std::function<void(KeyValueDB::Transaction)> f)
17953{
17954 std::lock_guard l(lock);
17955 if (!fix_onode_txn) {
17956 fix_onode_txn = db->get_transaction();
17957 }
17958 f(fix_onode_txn);
17959 ++to_repair_cnt;
17960 return true;
17961}
17962
17963bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
17964{
17965 //NB: not for use in multithreading mode!!!
17966 if (misreferenced_extents.size()) {
17967 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
17968 ceph_assert(n > 0);
17969 if (!fix_misreferences_txn) {
17970 fix_misreferences_txn = db->get_transaction();
17971 }
17972 return true;
17973 }
17974 return false;
17975}
17976
17977unsigned BlueStoreRepairer::apply(KeyValueDB* db)
17978{
17979 //NB: not for use in multithreading mode!!!
17980 if (fix_per_pool_omap_txn) {
17981 auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
17982 ceph_assert(ok);
17983 fix_per_pool_omap_txn = nullptr;
17984 }
17985 if (fix_fm_leaked_txn) {
17986 auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
17987 ceph_assert(ok);
17988 fix_fm_leaked_txn = nullptr;
17989 }
17990 if (fix_fm_false_free_txn) {
17991 auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
17992 ceph_assert(ok);
17993 fix_fm_false_free_txn = nullptr;
17994 }
17995 if (remove_key_txn) {
17996 auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
17997 ceph_assert(ok);
17998 remove_key_txn = nullptr;
17999 }
18000 if (fix_misreferences_txn) {
18001 auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
18002 ceph_assert(ok);
18003 fix_misreferences_txn = nullptr;
18004 }
18005 if (fix_onode_txn) {
18006 auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
18007 ceph_assert(ok);
18008 fix_onode_txn = nullptr;
18009 }
18010 if (fix_shared_blob_txn) {
18011 auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
18012 ceph_assert(ok);
18013 fix_shared_blob_txn = nullptr;
18014 }
18015 if (fix_statfs_txn) {
18016 auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
18017 ceph_assert(ok);
18018 fix_statfs_txn = nullptr;
18019 }
18020 if (need_compact) {
18021 db->compact();
18022 need_compact = false;
18023 }
18024 unsigned repaired = to_repair_cnt;
18025 to_repair_cnt = 0;
18026 return repaired;
18027}
18028
18029// =======================================================
18030// RocksDBBlueFSVolumeSelector
18031
18032uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
18033 ceph_assert(h != nullptr);
18034 uint64_t hint = reinterpret_cast<uint64_t>(h);
18035 uint8_t res;
18036 switch (hint) {
18037 case LEVEL_SLOW:
18038 res = BlueFS::BDEV_SLOW;
18039 if (db_avail4slow > 0) {
18040 // considering statically available db space vs.
18041 // - observed maximums on DB dev for DB/WAL/UNSORTED data
18042 // - observed maximum spillovers
18043 uint64_t max_db_use = 0; // max db usage we potentially observed
18044 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
18045 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
18046 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
18047 // this could go to the db, hence we use it in the estimation
18048 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
18049
18050 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
18051 uint64_t avail = min(
18052 db_avail4slow,
18053 max_db_use < db_total ? db_total - max_db_use : 0);
18054
18055 // considering current DB dev usage for SLOW data
18056 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
18057 res = BlueFS::BDEV_DB;
18058 }
18059 }
18060 break;
18061 case LEVEL_LOG:
18062 case LEVEL_WAL:
18063 res = BlueFS::BDEV_WAL;
18064 break;
18065 case LEVEL_DB:
18066 default:
18067 res = BlueFS::BDEV_DB;
18068 break;
18069 }
18070 return res;
18071}
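// Illustrative example (hypothetical numbers): with a 30 GiB DB device,
// observed maxima of 1 GiB (LOG) + 2 GiB (WAL) + 20 GiB (DB) on the DB
// device plus 3 GiB of DB-level data spilled to the slow device give
// max_db_use = 26 GiB, so at most min(db_avail4slow, 4 GiB) of additional
// SLOW data may be steered to the DB device.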
18072
18073void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
18074{
18075 auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
18076 res.emplace_back(base, db_size);
18077 auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
18078 if (slow_size == 0) {
18079 slow_size = db_size;
18080 }
18081 res.emplace_back(base + ".slow", slow_size);
18082}
18083
18084void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
18085 uint8_t res = LEVEL_DB;
18086 if (dirname.length() > 5) {
18087 // the "db.slow" and "db.wal" directory names are hard-coded to
18088 // match up with bluestore. the slow device is always the second
18089 // one (when a dedicated block.db device is present and used at
18090 // bdev 0). the wal device is always last.
18091 if (boost::algorithm::ends_with(dirname, ".slow")) {
18092 res = LEVEL_SLOW;
18093 }
18094 else if (boost::algorithm::ends_with(dirname, ".wal")) {
18095 res = LEVEL_WAL;
18096 }
18097 }
18098 return reinterpret_cast<void*>(res);
18099}
18100
18101void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
18102 auto max_x = per_level_per_dev_usage.get_max_x();
18103 auto max_y = per_level_per_dev_usage.get_max_y();
18104
18105 sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
18106 constexpr std::array<const char*, 8> names{ {
18107 "DEV/LEV",
18108 "WAL",
18109 "DB",
18110 "SLOW",
18111 "*",
18112 "*",
18113 "REAL",
18114 "FILES",
18115 } };
18116 const size_t width = 12;
18117 for (size_t i = 0; i < names.size(); ++i) {
18118 sout.setf(std::ios::left, std::ios::adjustfield);
18119 sout.width(width);
18120 sout << names[i];
18121 }
18122 sout << std::endl;
18123 for (size_t l = 0; l < max_y; l++) {
18124 sout.setf(std::ios::left, std::ios::adjustfield);
18125 sout.width(width);
18126 switch (l + LEVEL_FIRST) {
18127 case LEVEL_LOG:
18128 sout << "LOG"; break;
18129 case LEVEL_WAL:
18130 sout << "WAL"; break;
18131 case LEVEL_DB:
18132 sout << "DB"; break;
18133 case LEVEL_SLOW:
18134 sout << "SLOW"; break;
18135 case LEVEL_MAX:
18136 sout << "TOTAL"; break;
18137 }
18138 for (size_t d = 0; d < max_x; d++) {
18139 sout.setf(std::ios::left, std::ios::adjustfield);
18140 sout.width(width);
18141 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
18142 }
18143 sout.setf(std::ios::left, std::ios::adjustfield);
18144 sout.width(width);
18145 sout << stringify(per_level_files[l]) << std::endl;
18146 }
18147 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
18148 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
18149 sout << "MAXIMUMS:" << std::endl;
18150 for (size_t l = 0; l < max_y; l++) {
18151 sout.setf(std::ios::left, std::ios::adjustfield);
18152 sout.width(width);
18153 switch (l + LEVEL_FIRST) {
18154 case LEVEL_LOG:
18155 sout << "LOG"; break;
18156 case LEVEL_WAL:
18157 sout << "WAL"; break;
18158 case LEVEL_DB:
18159 sout << "DB"; break;
18160 case LEVEL_SLOW:
18161 sout << "SLOW"; break;
18162 case LEVEL_MAX:
18163 sout << "TOTAL"; break;
18164 }
18165 for (size_t d = 0; d < max_x - 1; d++) {
18166 sout.setf(std::ios::left, std::ios::adjustfield);
18167 sout.width(width);
18168 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
18169 }
18170 sout.setf(std::ios::left, std::ios::adjustfield);
18171 sout.width(width);
18172 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
18173 sout << std::endl;
18174 }
18175 string sizes[] = {
18176 ">> SIZE <<",
18177 stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
18178 stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
18179 stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
18180 };
18181 for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) {
18182 sout.setf(std::ios::left, std::ios::adjustfield);
18183 sout.width(width);
18184 sout << sizes[i];
18185 }
18186 sout << std::endl;
18187}
18188
18189BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
18190 RocksDBBlueFSVolumeSelector* ns =
18191 new RocksDBBlueFSVolumeSelector(0, 0, 0,
18192 0, 0, 0,
18193 0, 0, false);
18194 return ns;
18195}
18196
18197bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
18198 RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
18199 ceph_assert(o);
18200 bool equal = true;
18201 for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
18202 for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) {
18203 equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
18204 }
18205 }
18206 for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
18207 equal &= (per_level_files[t] == o->per_level_files[t]);
18208 }
18209 return equal;
18210}
18211
18212// =======================================================
18213
18214//================================================================================================================
18215 // BlueStore commits all allocation information (alloc/release) into RocksDB before the client write is performed.
18216 // This causes a delay in the write path and adds significant load to the CPU/Memory/Disk.
18217 // The reason for the RocksDB updates is that they allow Ceph to survive any failure without losing the allocation state.
18218 //
18219 // We changed the code to skip RocksDB updates at allocation time and instead perform a full destage of the allocator object
18220 // with all the OSD allocation state in a single step during umount().
18221 // This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
18222 // to losing allocation info in failure cases where we don't call umount.
18223 // We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, which is used in failure cases.
18224// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file
18225// where we store the allocation-map during umount().
18226//================================================================================================================
18227
18228#undef dout_prefix
18229#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
18230
18231static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
18232static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
18233static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
18234static uint32_t s_serial = 0x01;
18235
18236#if 1
18237#define CEPHTOH_32 le32toh
18238#define CEPHTOH_64 le64toh
18239#define HTOCEPH_32 htole32
18240#define HTOCEPH_64 htole64
18241#else
18242// help debug the encode/decode by forcing alien format
18243#define CEPHTOH_32 be32toh
18244#define CEPHTOH_64 be64toh
18245#define HTOCEPH_32 htobe32
18246#define HTOCEPH_64 htobe64
18247#endif
18248
18249 // 48-byte header for the on-disk allocator image
18250const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
18251struct allocator_image_header {
18252 uint32_t format_version; // 0x00
18253 uint32_t valid_signature; // 0x04
18254 utime_t timestamp; // 0x08
18255 uint32_t serial; // 0x10
18256 uint32_t pad[0x7]; // 0x14
18257
18258 allocator_image_header() {
18259 memset((char*)this, 0, sizeof(allocator_image_header));
18260 }
18261
18262 // create header in CEPH format
18263 allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
18264 this->format_version = format_version;
18265 this->timestamp = timestamp;
18266 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18267 this->serial = serial;
18268 memset(this->pad, 0, sizeof(this->pad));
18269 }
18270
18271 friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
18272 out << "format_version = " << header.format_version << std::endl;
18273 out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18274 out << "timestamp = " << header.timestamp << std::endl;
18275 out << "serial = " << header.serial << std::endl;
18276 for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
18277 if (header.pad[i]) {
18278 out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
18279 }
18280 }
18281 return out;
18282 }
18283
18284 DENC(allocator_image_header, v, p) {
18285 denc(v.format_version, p);
18286 denc(v.valid_signature, p);
18287 denc(v.timestamp.tv.tv_sec, p);
18288 denc(v.timestamp.tv.tv_nsec, p);
18289 denc(v.serial, p);
18290 for (auto& pad: v.pad) {
18291 denc(pad, p);
18292 }
18293 }
18294
18295
18296 int verify(CephContext* cct, const std::string &path) {
18297 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18298 for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
18299 if (this->pad[i]) {
18300 derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
18301 return -1;
18302 }
18303 }
18304 return 0;
18305 }
18306 else {
18307 derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18308 return -1;
18309 }
18310 }
18311};
18312WRITE_CLASS_DENC(allocator_image_header)
18313
18314 // 56-byte trailer for the on-disk allocator image
18315struct allocator_image_trailer {
18316 extent_t null_extent; // 0x00
18317
18318 uint32_t format_version; // 0x10
18319 uint32_t valid_signature; // 0x14
18320
18321 utime_t timestamp; // 0x18
18322
18323 uint32_t serial; // 0x20
18324 uint32_t pad; // 0x24
18325 uint64_t entries_count; // 0x28
18326 uint64_t allocation_size; // 0x30
18327
18328 // trailer is created in CEPH format
18329 allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
18330 memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
18331 this->format_version = format_version;
18332 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18333 this->timestamp = timestamp;
18334 this->serial = serial;
18335 this->pad = 0;
18336 this->entries_count = entries_count;
18337 this->allocation_size = allocation_size;
18338 }
18339
18340 allocator_image_trailer() {
18341 memset((char*)this, 0, sizeof(allocator_image_trailer));
18342 }
18343
18344 friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
18345 if (trailer.null_extent.offset || trailer.null_extent.length) {
18346 out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
18347 out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
18348 }
18349 out << "format_version = " << trailer.format_version << std::endl;
18350 out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18351 out << "timestamp = " << trailer.timestamp << std::endl;
18352 out << "serial = " << trailer.serial << std::endl;
18353 if (trailer.pad) {
18354 out << "trailer.pad= " << trailer.pad << std::endl;
18355 }
18356 out << "entries_count = " << trailer.entries_count << std::endl;
18357 out << "allocation_size = " << trailer.allocation_size << std::endl;
18358 return out;
18359 }
18360
18361 int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
18362 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18363
18364 // trailer must start with a null extent (both fields set to zero) [no need to convert formats for zero]
18365 if (null_extent.offset || null_extent.length) {
18366 derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
18367 return -1;
18368 }
18369
18370 if (serial != p_header->serial) {
18371 derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
18372 return -1;
18373 }
18374
18375 if (format_version != p_header->format_version) {
18376 derr << "Illegal trailer: header->format_version(" << p_header->format_version
18377 << ") != trailer->format_version(" << format_version << ")" << dendl;
18378 return -1;
18379 }
18380
18381 if (timestamp != p_header->timestamp) {
18382 derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
18383 << ") != trailer->timestamp(" << timestamp << ")" << dendl;
18384 return -1;
18385 }
18386
18387 if (this->entries_count != entries_count) {
18388 derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
18389 << this->entries_count << ")" << dendl;
18390 return -1;
18391 }
18392
18393 if (this->allocation_size != allocation_size) {
18394 derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
18395 << this->allocation_size << ")" << dendl;
18396 return -1;
18397 }
18398
18399 if (pad) {
18400 derr << "Illegal Trailer - pad="<< pad << dendl;
18401 return -1;
18402 }
18403
18404 // if we arrived here -> the trailer is valid
18405 return 0;
18406 } else {
18407 derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18408 return -1;
18409 }
18410 }
18411
18412 DENC(allocator_image_trailer, v, p) {
18413 denc(v.null_extent.offset, p);
18414 denc(v.null_extent.length, p);
18415 denc(v.format_version, p);
18416 denc(v.valid_signature, p);
18417 denc(v.timestamp.tv.tv_sec, p);
18418 denc(v.timestamp.tv.tv_nsec, p);
18419 denc(v.serial, p);
18420 denc(v.pad, p);
18421 denc(v.entries_count, p);
18422 denc(v.allocation_size, p);
18423 }
18424};
18425WRITE_CLASS_DENC(allocator_image_trailer)
18426
18427
18428//-------------------------------------------------------------------------------------
18429 // invalidate the old allocation file if it exists so we will go directly to recovery after a failure
18430 // we can safely ignore a non-existing file
18431int BlueStore::invalidate_allocation_file_on_bluefs()
18432{
18433 // mark that the allocation-file was invalidated and we should destage a new copy when closing the db
18434 need_to_destage_allocation_file = true;
18435 dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl;
18436
18437 BlueFS::FileWriter *p_handle = nullptr;
18438 if (!bluefs->dir_exists(allocator_dir)) {
18439 dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
18440 // nothing to do -> return
18441 return 0;
18442 }
18443
18444 int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18445 if (ret != 0) {
18446 dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl;
18447 // nothing to do -> return
18448 return 0;
18449 }
18450
18451
18452 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
18453 if (ret != 0) {
18454 derr << __func__ << "::NCB:: Failed open_for_write with error-code "
18455 << ret << dendl;
18456 return -1;
18457 }
18458
18459 dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18460 ret = bluefs->truncate(p_handle, 0);
18461 if (ret != 0) {
18462 derr << __func__ << "::NCB:: Failed truncate with error-code "
18463 << ret << dendl;
18464 bluefs->close_writer(p_handle);
18465 return -1;
18466 }
18467
18468 bluefs->fsync(p_handle);
18469 bluefs->close_writer(p_handle);
18470
18471 return 0;
18472}
18473
18474//-----------------------------------------------------------------------------------
18475int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
18476{
18477 *p_num_entries = 0;
18478 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18479 (*p_num_entries)++;
18480 };
18481 src_alloc->foreach(count_entries);
18482
18483 dout(5) << "count num_entries=" << *p_num_entries << dendl;
18484
18485 // add 16K extra entries in case new allocations happened
18486 (*p_num_entries) += 16*1024;
18487 unique_ptr<extent_t[]> arr;
18488 try {
18489 arr = make_unique<extent_t[]>(*p_num_entries);
18490 } catch (std::bad_alloc&) {
18491 derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
18492 return -1;
18493 }
18494
18495 uint64_t idx = 0;
18496 auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18497 if (extent_length > 0) {
18498 if (idx < *p_num_entries) {
18499 arr[idx] = {extent_offset, extent_length};
18500 }
18501 idx++;
18502 }
18503 else {
18504 derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
18505 }
18506 };
18507 src_alloc->foreach(copy_entries);
18508
18509 dout(5) << "copy num_entries=" << idx << dendl;
18510 if (idx > *p_num_entries) {
18511 derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
18512 ceph_assert(idx <= *p_num_entries);
18513 }
18514
18515 *p_num_entries = idx;
18516
18517 for (idx = 0; idx < *p_num_entries; idx++) {
18518 const extent_t *p_extent = &arr[idx];
18519 dest_alloc->init_add_free(p_extent->offset, p_extent->length);
18520 }
18521
18522 return 0;
18523}
18524
18525//-----------------------------------------------------------------------------------
18526static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
18527{
18528 std::ptrdiff_t length = p_curr - buffer;
18529 p_handle->append(buffer, length);
18530
18531 crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
18532 uint32_t encoded_crc = HTOCEPH_32(crc);
18533 p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));
18534
18535 return crc;
18536}
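// Note: the crc returned above seeds the next call, so the crc32c stored after each chunk covers the
// running state of all previous chunks as well; __restore_allocator() recomputes the same chained
// value with ceph_crc32c(prev_crc, chunk, len) and compares it against the crc read after every chunk.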
18537
18538const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
18539// write the allocator to a flat bluefs file - 4K extents at a time
18540//-----------------------------------------------------------------------------------
18541int BlueStore::store_allocator(Allocator* src_allocator)
18542{
18543 // when storing allocations to a file we must be sure there are no background compactions
18544 // the easiest way to achieve that is to make sure the db is closed
18545 ceph_assert(db == nullptr);
18546 utime_t start_time = ceph_clock_now();
18547 int ret = 0;
18548
18549 // create the dir if it doesn't exist already
18550 if (!bluefs->dir_exists(allocator_dir) ) {
18551 ret = bluefs->mkdir(allocator_dir);
18552 if (ret != 0) {
18553 derr << "Failed mkdir with error-code " << ret << dendl;
18554 return -1;
18555 }
18556 }
18557 bluefs->compact_log();
18558 // reuse the previous file allocation if it exists
18559 ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18560 bool overwrite_file = (ret == 0);
18561 BlueFS::FileWriter *p_handle = nullptr;
18562 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
18563 if (ret != 0) {
18564 derr << __func__ << "Failed open_for_write with error-code " << ret << dendl;
18565 return -1;
18566 }
18567
18568 uint64_t file_size = p_handle->file->fnode.size;
18569 uint64_t allocated = p_handle->file->fnode.get_allocated();
18570 dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
18571
18572 bluefs->sync_metadata(false);
18573 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
18574 if (!allocator) {
18575 bluefs->close_writer(p_handle);
18576 return -1;
18577 }
18578
18579 // store all extents (except for the bluefs extents we removed) in a single flat file
18580 utime_t timestamp = ceph_clock_now();
18581 uint32_t crc = -1;
18582 {
18583 allocator_image_header header(timestamp, s_format_version, s_serial);
18584 bufferlist header_bl;
18585 encode(header, header_bl);
18586 crc = header_bl.crc32c(crc);
18587 encode(crc, header_bl);
18588 p_handle->append(header_bl);
18589 }
18590
18591 crc = -1; // reset crc
18592 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18593 extent_t *p_curr = buffer;
18594 const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
18595 uint64_t extent_count = 0;
18596 uint64_t allocation_size = 0;
18597 auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
18598 if (extent_length == 0) {
18599 derr << __func__ << "" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
18600 ret = -1;
18601 return;
18602 }
18603 p_curr->offset = HTOCEPH_64(extent_offset);
18604 p_curr->length = HTOCEPH_64(extent_length);
18605 extent_count++;
18606 allocation_size += extent_length;
18607 p_curr++;
18608
18609 if (p_curr == p_end) {
18610 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18611 p_curr = buffer; // recycle the buffer
18612 }
18613 };
18614 allocator->foreach(iterated_allocation);
18615 // if we got a null extent -> fail the operation
18616 if (ret != 0) {
18617 derr << "Illegal extent, fail store operation" << dendl;
18618 derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18619 bluefs->truncate(p_handle, 0);
18620 bluefs->close_writer(p_handle);
18621 return -1;
18622 }
18623
18624 // if we got any leftovers -> add crc and append to file
18625 if (p_curr > buffer) {
18626 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18627 }
18628
18629 {
18630 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18631 bufferlist trailer_bl;
18632 encode(trailer, trailer_bl);
18633 uint32_t crc = -1;
18634 crc = trailer_bl.crc32c(crc);
18635 encode(crc, trailer_bl);
18636 p_handle->append(trailer_bl);
18637 }
18638
18639 bluefs->fsync(p_handle);
18640 bluefs->truncate(p_handle, p_handle->pos);
18641 bluefs->fsync(p_handle);
18642
18643 utime_t duration = ceph_clock_now() - start_time;
18644 dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
18645 dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
18646
18647 bluefs->close_writer(p_handle);
18648 need_to_destage_allocation_file = false;
18649 return 0;
18650}
18651
18652//-----------------------------------------------------------------------------------
18653Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
18654 // create allocator
18655 uint64_t alloc_size = min_alloc_size;
18656 Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
18657 zone_size, first_sequential_zone,
18658 "recovery");
18659 if (alloc) {
18660 return alloc;
18661 } else {
18662 derr << "Failed Allocator Creation" << dendl;
18663 return nullptr;
18664 }
18665}
18666
18667//-----------------------------------------------------------------------------------
18668size_t calc_allocator_image_header_size()
18669{
18670 utime_t timestamp = ceph_clock_now();
18671 allocator_image_header header(timestamp, s_format_version, s_serial);
18672 bufferlist header_bl;
18673 encode(header, header_bl);
18674 uint32_t crc = -1;
18675 crc = header_bl.crc32c(crc);
18676 encode(crc, header_bl);
18677
18678 return header_bl.length();
18679}
18680
18681//-----------------------------------------------------------------------------------
18682int calc_allocator_image_trailer_size()
18683{
18684 utime_t timestamp = ceph_clock_now();
18685 uint64_t extent_count = -1;
18686 uint64_t allocation_size = -1;
18687 uint32_t crc = -1;
18688 bufferlist trailer_bl;
18689 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18690
18691 encode(trailer, trailer_bl);
18692 crc = trailer_bl.crc32c(crc);
18693 encode(crc, trailer_bl);
18694 return trailer_bl.length();
18695}
18696
18697//-----------------------------------------------------------------------------------
18698int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
18699{
18700 if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) {
18701 boost::mt11213b rng(time(NULL));
18702 boost::uniform_real<> ur(0, 1);
18703 if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) {
18704 derr << __func__ << " failure injected." << dendl;
18705 return -1;
18706 }
18707 }
18708 utime_t start_time = ceph_clock_now();
18709 BlueFS::FileReader *p_temp_handle = nullptr;
18710 int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
18711 if (ret != 0) {
18712 dout(1) << "Failed open_for_read with error-code " << ret << dendl;
18713 return -1;
18714 }
18715 unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
18716 uint64_t read_alloc_size = 0;
18717 uint64_t file_size = p_handle->file->fnode.size;
18718 dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
18719
18720 // make sure we were able to store a valid copy
18721 if (file_size == 0) {
18722 dout(1) << "No Valid allocation info on disk (empty file)" << dendl;
18723 return -1;
18724 }
18725
18726 // first read the header
18727 size_t offset = 0;
18728 allocator_image_header header;
18729 int header_size = calc_allocator_image_header_size();
18730 {
18731 bufferlist header_bl,temp_bl;
18732 int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
18733 if (read_bytes != header_size) {
18734 derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
18735 return -1;
18736 }
18737
18738 offset += read_bytes;
18739
18740 header_bl.claim_append(temp_bl);
18741 auto p = header_bl.cbegin();
18742 decode(header, p);
18743 if (header.verify(cct, path) != 0 ) {
18744 derr << "header = \n" << header << dendl;
18745 return -1;
18746 }
18747
18748 uint32_t crc_calc = -1, crc;
18749 crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18750 decode(crc, p);
18751 if (crc != crc_calc) {
18752 derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18753 derr << "header = \n" << header << dendl;
18754 return -1;
18755 }
18756
18757 // increment the serial for the next store
18758 s_serial = header.serial + 1;
18759 }
18760
18761 // then read the payload (extents list) using a recycled buffer
18762 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18763 uint32_t crc = -1;
18764 int trailer_size = calc_allocator_image_trailer_size();
18765 uint64_t extent_count = 0;
18766 uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
18767 while (extents_bytes_left) {
18768 int req_bytes = std::min(extents_bytes_left, static_cast<uint64_t>(sizeof(buffer)));
18769 int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
18770 if (read_bytes != req_bytes) {
18771 derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
18772 return -1;
18773 }
18774
18775 offset += read_bytes;
18776 extents_bytes_left -= read_bytes;
18777
18778 const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t);
18779 const extent_t *p_end = buffer + num_extent_in_buffer;
18780 for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
18781 uint64_t offset = CEPHTOH_64(p_ext->offset);
18782 uint64_t length = CEPHTOH_64(p_ext->length);
18783 read_alloc_size += length;
18784
18785 if (length > 0) {
18786 allocator->init_add_free(offset, length);
18787 extent_count ++;
18788 } else {
18789 derr << "extent with zero length at idx=" << extent_count << dendl;
18790 return -1;
18791 }
18792 }
18793
18794 uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
18795 read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
18796 if (read_bytes == sizeof(crc) ) {
18797 crc = CEPHTOH_32(crc);
18798 if (crc != calc_crc) {
18799 derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
18800 derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
18801 return -1;
18802 }
18803
18804 offset += read_bytes;
18805 if (extents_bytes_left) {
18806 extents_bytes_left -= read_bytes;
18807 }
18808 } else {
18809 derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
18810 return -1;
18811 }
18812
18813 }
18814
18815 // finally, read the trailer and verify it is in good shape and that we got all the extents
18816 {
18817 bufferlist trailer_bl,temp_bl;
18818 int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
18819 if (read_bytes != trailer_size) {
18820 derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
18821 return -1;
18822 }
18823 offset += read_bytes;
18824
18825 trailer_bl.claim_append(temp_bl);
18826 uint32_t crc_calc = -1;
18827 uint32_t crc;
18828 allocator_image_trailer trailer;
18829 auto p = trailer_bl.cbegin();
18830 decode(trailer, p);
18831 if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
18832 derr << "trailer=\n" << trailer << dendl;
18833 return -1;
18834 }
18835
18836 crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18837 decode(crc, p);
18838 if (crc != crc_calc) {
18839 derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18840 derr << "trailer=\n" << trailer << dendl;
18841 return -1;
18842 }
18843 }
18844
18845 utime_t duration = ceph_clock_now() - start_time;
18846 dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
18847 << read_alloc_size << ", file_size=" << file_size << dendl;
18848 dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
18849 *num = extent_count;
18850 *bytes = read_alloc_size;
18851 return 0;
18852}
18853
18854//-----------------------------------------------------------------------------------
18855int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
18856{
18857 utime_t start = ceph_clock_now();
18858 auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
18859 int ret = __restore_allocator(temp_allocator.get(), num, bytes);
18860 if (ret != 0) {
18861 return ret;
18862 }
18863
18864 uint64_t num_entries = 0;
18865 dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
18866 copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
18867 utime_t duration = ceph_clock_now() - start;
18868 dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
18869 return ret;
18870}
18871
18872//-----------------------------------------------------------------------------------
18873void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
18874{
18875 dout(30) << __func__ << " 0x" << std::hex
18876 << offset << "~" << length
18877 << " " << min_alloc_size_mask
18878 << dendl;
18879 ceph_assert((offset & min_alloc_size_mask) == 0);
18880 ceph_assert((length & min_alloc_size_mask) == 0);
18881 sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
18882}
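// Illustrative example (assuming min_alloc_size = 4 KiB, i.e. min_alloc_size_order = 12):
//   set_allocation_in_simple_bmap(sbmap, 0x10000, 0x8000);
// sets bits [16, 24) in the simple bitmap, since 0x10000 >> 12 = 16 and 0x8000 >> 12 = 8 chunks.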
18883
18884void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
18885 uint64_t extent_no,
18886 uint64_t sbid,
18887 BlobRef b)
18888{
18889 [[maybe_unused]] auto cct = store.cct;
18890 ceph_assert(per_pool_statfs);
18891 ceph_assert(oid != ghobject_t());
18892
18893 auto &blob = b->get_blob();
18894 if(spanning) {
18895 dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
18896 ceph_assert(b->id >= 0);
18897 spanning_blobs[b->id] = b;
18898 ++stats.spanning_blob_count;
18899 } else {
18900 dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
18901 blobs[extent_no] = b;
18902 }
18903 bool compressed = blob.is_compressed();
18904 if (!blob.is_shared()) {
18905 for (auto& pe : blob.get_extents()) {
18906 if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
18907 ++stats.skipped_illegal_extent;
18908 continue;
18909 }
18910 store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
18911
18912 per_pool_statfs->allocated() += pe.length;
18913 if (compressed) {
18914 per_pool_statfs->compressed_allocated() += pe.length;
18915 }
18916 }
18917 if (compressed) {
18918 per_pool_statfs->compressed() +=
18919 blob.get_compressed_payload_length();
18920 ++stats.compressed_blob_count;
18921 }
18922 } else {
18923 auto it = sb_info.find(sbid);
18924 if (it == sb_info.end()) {
18925 derr << __func__ << " shared blob not found:" << sbid
18926 << dendl;
18927 }
18928 auto &sbi = *it;
18929 auto pool_id = oid.hobj.get_logical_pool();
18930 if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
18931 sbi.pool_id = pool_id;
18932 size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
18933 per_pool_statfs->allocated() += alloc_delta;
18934 if (compressed) {
18935 per_pool_statfs->compressed_allocated() += alloc_delta;
18936 ++stats.compressed_blob_count;
18937 }
18938 }
18939 if (compressed) {
18940 per_pool_statfs->compressed() +=
18941 blob.get_compressed_payload_length();
18942 }
18943 }
18944}
18945
18946void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
18947 bool spanning,
18948 uint64_t blobid)
18949{
18950 [[maybe_unused]] auto cct = store.cct;
18951 dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
18952 auto &map = spanning ? spanning_blobs : blobs;
18953 auto it = map.find(blobid);
18954 ceph_assert(it != map.end());
18955 per_pool_statfs->stored() += le->length;
18956 if (it->second->get_blob().is_compressed()) {
18957 per_pool_statfs->compressed_original() += le->length;
18958 }
18959}
18960
18961void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
18962 uint64_t extent_no,
18963 uint64_t sbid,
18964 BlobRef b)
18965{
18966 _consume_new_blob(false, extent_no, sbid, b);
18967 per_pool_statfs->stored() += le->length;
18968 if (b->get_blob().is_compressed()) {
18969 per_pool_statfs->compressed_original() += le->length;
18970 }
18971}
18972
18973void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
18974 BlobRef b)
18975{
18976 _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
18977}
18978
18979void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
18980 volatile_statfs* _per_pool_statfs)
18981{
18982 oid = _oid;
18983 per_pool_statfs = _per_pool_statfs;
18984 blob_map_t empty;
18985 blob_map_t empty2;
18986 std::swap(blobs, empty);
18987 std::swap(spanning_blobs, empty2);
18988}
18989
18990int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
18991{
18992 sb_info_space_efficient_map_t sb_info;
18993 // iterate over all shared blobs
18994 auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
18995 if (!it) {
18996 derr << "failed getting shared blob's iterator" << dendl;
18997 return -ENOENT;
18998 }
18999 if (it) {
19000 for (it->lower_bound(string()); it->valid(); it->next()) {
19001 const auto& key = it->key();
19002 dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
19003 uint64_t sbid = 0;
19004 if (get_key_shared_blob(key, &sbid) != 0) {
19005 derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
19006 << "'" << dendl;
19007 }
19008 bluestore_shared_blob_t shared_blob(sbid);
19009 bufferlist bl = it->value();
19010 auto blp = bl.cbegin();
19011 try {
19012 decode(shared_blob, blp);
19013 }
19014 catch (ceph::buffer::error& e) {
19015 derr << __func__ << " failed to decode Shared Blob"
19016 << pretty_binary_string(key) << dendl;
19017 continue;
19018 }
19019 dout(20) << __func__ << " " << shared_blob << dendl;
19020 uint64_t allocated = 0;
19021 for (auto& r : shared_blob.ref_map.ref_map) {
19022 ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
19023 set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
19024 allocated += r.second.length;
19025 }
19026 auto &sbi = sb_info.add_or_adopt(sbid);
19027 ceph_assert(p2phase(allocated, min_alloc_size) == 0);
19028 sbi.allocated_chunks += (allocated >> min_alloc_size_order);
19029 ++stats.shared_blob_count;
19030 }
19031 }
19032
19033 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
19034 if (!it) {
19035 derr << "failed getting onode's iterator" << dendl;
19036 return -ENOENT;
19037 }
19038
19039 uint64_t kv_count = 0;
19040 uint64_t count_interval = 1'000'000;
19041 ExtentDecoderPartial edecoder(*this,
19042 stats,
19043 *sbmap,
19044 sb_info,
19045 min_alloc_size_order);
19046
19047 // iterate over all ONodes stored in RocksDB
19048 for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
19049 // trace an event after every million processed objects (typically every 5-10 seconds)
19050 if (kv_count && (kv_count % count_interval == 0) ) {
19051 dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
19052 }
19053
19054 auto key = it->key();
19055 auto okey = key;
19056 dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
19057 ghobject_t oid;
19058 if (!is_extent_shard_key(it->key())) {
19059 int r = get_key_object(okey, &oid);
19060 if (r != 0) {
19061 derr << __func__ << " failed to decode onode key = "
19062 << pretty_binary_string(okey) << dendl;
19063 return -EIO;
19064 }
19065 edecoder.reset(oid,
19066 &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
19067 Onode dummy_on(cct);
19068 Onode::decode_raw(&dummy_on,
19069 it->value(),
19070 edecoder);
19071 ++stats.onode_count;
19072 } else {
19073 uint32_t offset;
19074 int r = get_key_extent_shard(key, &okey, &offset);
19075 if (r != 0) {
19076 derr << __func__ << " failed to decode onode extent key = "
19077 << pretty_binary_string(key) << dendl;
19078 return -EIO;
19079 }
19080 r = get_key_object(okey, &oid);
19081 if (r != 0) {
19082 derr << __func__
19083 << " failed to decode onode key= " << pretty_binary_string(okey)
19084 << " from extent key= " << pretty_binary_string(key)
19085 << dendl;
19086 return -EIO;
19087 }
19088 ceph_assert(oid == edecoder.get_oid());
19089 edecoder.decode_some(it->value(), nullptr);
19090 ++stats.shard_count;
19091 }
19092 }
19093
19094 std::lock_guard l(vstatfs_lock);
19095 store_statfs_t s;
19096 osd_pools.clear();
19097 for (auto& p : stats.actual_pool_vstatfs) {
19098 if (per_pool_stat_collection) {
19099 osd_pools[p.first] = p.second;
19100 }
19101 stats.actual_store_vstatfs += p.second;
19102 p.second.publish(&s);
19103 dout(5) << __func__ << " recovered pool "
19104 << std::hex
19105 << p.first << "->" << s
19106 << std::dec
19107 << " per-pool:" << per_pool_stat_collection
19108 << dendl;
19109 }
19110 vstatfs = stats.actual_store_vstatfs;
19111 vstatfs.publish(&s);
19112 dout(5) << __func__ << " recovered " << s
19113 << dendl;
19114 return 0;
19115}
19116
19117//---------------------------------------------------------
19118int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
19119{
19120 // first set space used by superblock
19121 auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
19122 set_allocation_in_simple_bmap(sbmap, 0, super_length);
19123 stats.extent_count++;
19124
19125 // then set all space taken by Objects
19126 int ret = read_allocation_from_onodes(sbmap, stats);
19127 if (ret < 0) {
19128 derr << "failed read_allocation_from_onodes()" << dendl;
19129 return ret;
19130 }
19131
19132 return 0;
19133}
19134
19135//-----------------------------------------------------------------------------------
19136static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
19137{
19138 int alloc_size_shift = std::countr_zero(alloc_size);
19139 uint64_t offset = 0;
19140 extent_t ext = sbmap->get_next_clr_extent(offset);
19141 while (ext.length != 0) {
19142 dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
19143 offset = ext.offset + ext.length;
19144 ext = sbmap->get_next_clr_extent(offset);
19145 }
19146}
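// Illustrative example (assuming alloc_size = 4 KiB, so alloc_size_shift = 12): a clear extent
// {offset = 16, length = 8} reported by the simple bitmap is handed to the allocator as
// init_add_free(0x10000, 0x8000), i.e. 32 KiB of free space starting at offset 64 KiB.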
19147
19148//---------------------------------------------------------
19149int BlueStore::read_allocation_from_drive_on_startup()
19150{
19151 int ret = 0;
19152
19153 ret = _open_collections();
19154 if (ret < 0) {
19155 return ret;
19156 }
19157 auto shutdown_cache = make_scope_guard([&] {
19158 _shutdown_cache();
19159 });
19160
19161 utime_t start = ceph_clock_now();
19162 read_alloc_stats_t stats = {};
19163 SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
19164 ret = reconstruct_allocations(&sbmap, stats);
19165 if (ret != 0) {
19166 return ret;
19167 }
19168
19169 copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
19170
19171 utime_t duration = ceph_clock_now() - start;
19172 dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
19173 return ret;
19174}
19175
19176
19177
19178
19179// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
19180// Not meant to be run by customers
19181#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19182
19183#include <stdlib.h>
19184#include <algorithm>
19185//---------------------------------------------------------
19186int cmpfunc (const void * a, const void * b)
19187{
19188 if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) {
19189 return 1;
19190 }
19191 else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) {
19192 return -1;
19193 }
19194 else {
19195 return 0;
19196 }
19197}
19198
19199// compare the allocator built from Onodes with the system allocator (CF-B)
19200//---------------------------------------------------------
19201int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
19202{
19203 uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
19204 uint64_t extent_count = allocation_size/sizeof(extent_t);
19205 dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
19206
19207 unique_ptr<extent_t[]> arr1;
19208 unique_ptr<extent_t[]> arr2;
19209 try {
19210 arr1 = make_unique<extent_t[]>(extent_count);
19211 arr2 = make_unique<extent_t[]>(extent_count);
19212 } catch (std::bad_alloc&) {
19213 derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
19214 return -1;
19215 }
19216
19217 // copy the extents from the allocators into simple array and then compare them
19218 uint64_t size1 = 0, size2 = 0;
19219 uint64_t idx1 = 0, idx2 = 0;
19220 auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
19221 size1 += length;
19222 if (idx1 < extent_count) {
19223 arr1[idx1++] = {offset, length};
19224 }
19225 else if (idx1 == extent_count) {
19226 derr << "(2)compare_allocators:: spillover" << dendl;
19227 idx1 ++;
19228 }
19229
19230 };
19231
19232 auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
19233 size2 += length;
19234 if (idx2 < extent_count) {
19235 arr2[idx2++] = {offset, length};
19236 }
19237 else if (idx2 == extent_count) {
19238 derr << "(2)compare_allocators:: spillover" << dendl;
19239 idx2 ++;
19240 }
19241 };
19242
19243 alloc1->foreach(iterated_mapper1);
19244 alloc2->foreach(iterated_mapper2);
19245
19246 qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
19247 qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
19248
19249 if (idx1 == idx2) {
19250 idx1 = idx2 = std::min(idx1, extent_count);
19251 if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
19252 return 0;
19253 }
19254 derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
19255 for (uint64_t i = 0; i < idx1; i++) {
19256 if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) {
19257 derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
19258 derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
19259 return -1;
19260 }
19261 }
19262 return 0;
19263 } else {
19264 derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
19265 return -1;
19266 }
19267}
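// Note on sizing: each of the two temporary arrays above is capped at memory_target / 3 bytes
// (i.e. at most ~2/3 of osd_memory_target in total), so an extremely fragmented allocator may hit
// the "spillover" path, in which case only the first extent_count entries are compared.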
19268
19269//---------------------------------------------------------
19270int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
19271{
19272 // then add space used by bluefs to store rocksdb
19273 unsigned extent_count = 0;
19274 if (bluefs) {
19275 bluefs->foreach_block_extents(
19276 bluefs_layout.shared_bdev,
19277 [&](uint64_t start, uint32_t len) {
19278 allocator->init_rm_free(start, len);
19279 stats.extent_count++; extent_count++; // also bump the local counter reported by the dout below
19280 }
19281 );
19282 }
19283
19284 dout(5) << "bluefs extent_count=" << extent_count << dendl;
19285 return 0;
19286}
19287
19288//---------------------------------------------------------
19289int BlueStore::read_allocation_from_drive_for_bluestore_tool()
19290{
19291 dout(5) << __func__ << dendl;
19292 int ret = 0;
19293 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19294 ret = _open_db_and_around(true, false);
19295 if (ret < 0) {
19296 return ret;
19297 }
19298
19299 ret = _open_collections();
19300 if (ret < 0) {
19301 _close_db_and_around();
19302 return ret;
19303 }
19304
19305 utime_t duration;
19306 read_alloc_stats_t stats = {};
19307 utime_t start = ceph_clock_now();
19308
19309 auto shutdown_cache = make_scope_guard([&] {
19310 dout(1) << "Allocation Recovery was completed in " << duration
19311 << " seconds; insert_count=" << stats.insert_count
19312 << "; extent_count=" << stats.extent_count << dendl;
19313 _shutdown_cache();
19314 _close_db_and_around();
19315 });
19316
19317 {
19318 auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
19319 //reconstruct allocations into a temp simple-bitmap and copy into allocator
19320 {
19321 SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
19322 ret = reconstruct_allocations(&sbmap, stats);
19323 if (ret != 0) {
19324 return ret;
19325 }
19326 copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
19327 }
19328
19329 // add allocation space used by the bluefs itself
19330 ret = add_existing_bluefs_allocation(allocator.get(), stats);
19331 if (ret < 0) {
19332 return ret;
19333 }
19334
19335 duration = ceph_clock_now() - start;
19336 stats.insert_count = 0;
19337 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19338 stats.insert_count++;
19339 };
19340 allocator->foreach(count_entries);
19341 ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
19342 if (ret == 0) {
19343 dout(5) << "Allocator drive - file integrity check OK" << dendl;
19344 } else {
19345 derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
19346 }
19347 }
19348
19349 dout(1) << stats << dendl;
19350 return ret;
19351}
19352
19353//---------------------------------------------------------
19354Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
19355{
19356 uint64_t bdev_size = bdev->get_size();
19357 Allocator* allocator = create_bitmap_allocator(bdev_size);
19358 if (allocator) {
19359 dout(5) << "bitmap-allocator=" << allocator << dendl;
19360 } else {
19361 derr << "****failed create_bitmap_allocator()" << dendl;
19362 return nullptr;
19363 }
19364
19365 uint64_t num_entries = 0;
19366 copy_allocator(src_allocator, allocator, &num_entries);
19367
19368 // BlueFS stores its internal allocations outside RocksDB (FM) so we should not destage them to the allocator-file
19369 // we are going to hide the bluefs allocations during allocator-destage as they are stored elsewhere
19370 {
19371 bluefs->foreach_block_extents(
19372 bluefs_layout.shared_bdev,
19373 [&] (uint64_t start, uint32_t len) {
19374 allocator->init_add_free(start, len);
19375 }
19376 );
19377 }
19378
19379 return allocator;
19380}
19381
19382//---------------------------------------------------------
19383static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
19384{
19385 dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
19386 KeyValueDB::Transaction t = db->get_transaction();
19387 t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
19388 db->submit_transaction_sync(t);
19389}
19390
19391//---------------------------------------------------------
19392void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
19393{
19394 unsigned max_txn = 1024;
19395 dout(5) << "max_transaction_submit=" << max_txn << dendl;
19396 uint64_t size = 0, idx = 0;
19397 KeyValueDB::Transaction txn = db->get_transaction();
19398 auto iterated_insert = [&](uint64_t offset, uint64_t length) {
19399 size += length;
19400 real_fm->release(offset, length, txn);
19401 if ((++idx % max_txn) == 0) {
19402 db->submit_transaction_sync(txn);
19403 txn = db->get_transaction();
19404 }
19405 };
19406 allocator->foreach(iterated_insert);
19407 if (idx % max_txn != 0) {
19408 db->submit_transaction_sync(txn);
19409 }
19410 dout(5) << "size=" << size << ", num extents=" << idx << dendl;
19411}
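// Note: reset_fm_for_restore() reopens the freelist manager with the whole device marked as
// allocated, so releasing every free extent of the allocator above leaves the fm/RocksDB state
// describing exactly the same free space as the in-memory allocator.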
19412
19413//---------------------------------------------------------
19414Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
19415{
19416 dout(5) << "real_fm->enumerate_next" << dendl;
19417 Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
19418 if (allocator2) {
19419 dout(5) << "bitmap-allocator=" << allocator2 << dendl;
19420 } else {
19421 return nullptr;
19422 }
19423
19424 uint64_t size2 = 0, idx2 = 0;
19425 real_fm->enumerate_reset();
19426 uint64_t offset, length;
19427 while (real_fm->enumerate_next(db, &offset, &length)) {
19428 allocator2->init_add_free(offset, length);
19429 ++idx2;
19430 size2 += length;
19431 }
19432 real_fm->enumerate_reset();
19433
19434 dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
19435 return allocator2;
19436}
19437
19438//---------------------------------------------------------
19439// close the active fm and open it in a new mode like makefs()
19440// but make sure to mark the full device space as allocated
19441 // later we will mark all extents from the allocator as free
19442int BlueStore::reset_fm_for_restore()
19443{
19444 dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
19445 fm->shutdown();
19446 delete fm;
19447 fm = nullptr;
19448 freelist_type = "bitmap";
19449 KeyValueDB::Transaction t = db->get_transaction();
19450 // call _open_fm() with fm_restore set to TRUE
19451 // this will mark the full device space as allocated (and not just the reserved space)
19452 _open_fm(t, true, true, true);
19453 if (fm == nullptr) {
19454 derr << "Failed _open_fm()" << dendl;
19455 return -1;
19456 }
19457 db->submit_transaction_sync(t);
19458 ceph_assert(!fm->is_null_manager());
19459 dout(5) << "fm was reactivated in full mode" << dendl;
19460 return 0;
19461}
19462
19463
19464//---------------------------------------------------------
19465// create a temp allocator filled with allocation state from the fm
19466// and compare it to the base allocator passed in
19467int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
19468{
19469 dout(5) << "verify that alloc content is identical to FM" << dendl;
19470 // initialize from freelist
19471 Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
19472 if (temp_allocator == nullptr) {
19473 return -1;
19474 }
19475
19476 uint64_t insert_count = 0;
19477 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19478 insert_count++;
19479 };
19480 temp_allocator->foreach(count_entries);
19481 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19482 int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);
19483
19484 delete temp_allocator;
19485
19486 if (ret == 0) {
19487 dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
19488 return 0;
19489 } else {
19490 derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
19491 return -1;
19492 }
19493}
19494
19495//---------------------------------------------------------
19496int BlueStore::db_cleanup(int ret)
19497{
19498 _shutdown_cache();
19499 _close_db_and_around();
19500 return ret;
19501}
19502
19503//---------------------------------------------------------
19504 // convert the system back from the null-allocator to using rocksdb to store allocations
19505int BlueStore::push_allocation_to_rocksdb()
19506{
19507 if (cct->_conf->bluestore_allocation_from_file) {
19508 derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
19509 derr << "please change default to false in ceph.conf file>" << dendl;
19510 return -1;
19511 }
19512
19513 dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
19514 int ret = _open_db_and_around(false);
19515 if (ret < 0) {
19516 return ret;
19517 }
19518
19519 if (!fm->is_null_manager()) {
19520 derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
19521 return db_cleanup(0);
19522 }
19523
19524 // start by creating a clone copy of the shared-allocator
19525 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
19526 if (!allocator) {
19527 return db_cleanup(-1);
19528 }
19529
19530 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19531 clear_allocation_objects_from_rocksdb(db, cct, path);
19532
19533 // then open the fm in the new mode with the full device marked as allocated
19534 if (reset_fm_for_restore() != 0) {
19535 return db_cleanup(-1);
19536 }
19537
19538 // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
19539 copy_allocator_content_to_fm(allocator.get(), fm);
19540
19541 // compare the allocator info with the info stored in the fm/rocksdb
19542 if (verify_rocksdb_allocations(allocator.get()) == 0) {
19543 // all is good -> we can commit to rocksdb allocator
19544 commit_to_real_manager();
19545 } else {
19546 return db_cleanup(-1);
19547 }
19548
19549 // can't be too paranoid :-)
19550 dout(5) << "Running full scale verification..." << dendl;
19551 // close db/fm/allocator and start fresh
19552 db_cleanup(0);
19553 dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
19554 ret = _open_db_and_around(true);
19555 if (ret < 0) {
19556 return db_cleanup(ret);
19557 }
19558 ceph_assert(!fm->is_null_manager());
19559 ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
19560
19561 return db_cleanup(ret);
19562}
19563
19564#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19565
19566//-------------------------------------------------------------------------------------
19567int BlueStore::commit_freelist_type()
19568{
19569 // When freelist_type is set to "bitmap" we will store allocation info in RocksDB
19570 // When allocation-info is stored in a single file we set freelist_type to "null"
19571 // This will direct the startup code to read allocations from the file and not from RocksDB
19572 KeyValueDB::Transaction t = db->get_transaction();
19573 if (t == nullptr) {
19574 derr << "db->get_transaction() failed!!!" << dendl;
19575 return -1;
19576 }
19577
19578 bufferlist bl;
19579 bl.append(freelist_type);
19580 t->set(PREFIX_SUPER, "freelist_type", bl);
19581
19582 int ret = db->submit_transaction_sync(t);
19583 if (ret != 0) {
19584 derr << "Failed db->submit_transaction_sync(t)" << dendl;
19585 }
19586 return ret;
19587}
19588
19589//-------------------------------------------------------------------------------------
19590int BlueStore::commit_to_null_manager()
19591{
19592 dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
19593 fm->set_null_manager();
19594 freelist_type = "null";
19595#if 1
19596 return commit_freelist_type();
19597#else
19598 // should check how long this step takes on a big configuration as deletes are expensive
19599 if (commit_freelist_type() == 0) {
19600 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19601 clear_allocation_objects_from_rocksdb(db, cct, path);
19602 }
19603#endif
19604}
19605
19606
19607//-------------------------------------------------------------------------------------
19608int BlueStore::commit_to_real_manager()
19609{
19610 dout(5) << "Set FreelistManager to Real FM..." << dendl;
19611 ceph_assert(!fm->is_null_manager());
19612 freelist_type = "bitmap";
19613 int ret = commit_freelist_type();
19614 if (ret == 0) {
19615 //remove the allocation_file
19616 invalidate_allocation_file_on_bluefs();
19617 ret = bluefs->unlink(allocator_dir, allocator_file);
19618 bluefs->sync_metadata(false);
19619 if (ret == 0) {
19620 dout(5) << "Remove Allocation File successfully" << dendl;
19621 }
19622 else {
19623 derr << "Remove Allocation File ret_code=" << ret << dendl;
19624 }
19625 }
19626
19627 return ret;
19628}
19629
19630//================================================================================================================
19631//================================================================================================================