// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <bit>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <algorithm>

#include <boost/container/flat_set.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real.hpp>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "simple_bitmap.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/url_escape.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"
#include "kv/KeyValueHistogram.h"

#ifdef HAVE_LIBZBD
#include "ZonedAllocator.h"
#include "ZonedFreelistManager.h"
#endif

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_shared_blob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
using std::byte;
using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::less;
using std::list;
using std::make_unique;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 SB id -> shared_blob_t

#ifdef HAVE_LIBZBD
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
#endif

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192
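
// Resulting fixed layout at the start of the block device (illustrative):
//
//   [0,   4K)  bdev label         (BDEV_LABEL_BLOCK_SIZE)
//   [4K,  8K)  bluefs superblock
//   [8K, ...)  usable space       (first allocatable byte = SUPER_RESERVED)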

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS      4

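// Illustrative sketch (a reading aid, not a separate encoder): a blobid
// value packs the id in the high bits and the flags above in the low
// BLOBID_SHIFT_BITS, e.g. encoded = (id << BLOBID_SHIFT_BITS) | flags,
// and on decode flags = encoded & ((1 << BLOBID_SHIFT_BITS) - 1).
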
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
 */
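// Worked example (hypothetical input): append_escaped("a#b", &key) emits
// 'a', then "#23" for '#' (0x23), then 'b', then the '!' terminator, i.e.
// "a#23b!".  The 3-bytes-per-char worst case is why append_escaped sizes
// its scratch buffer as in.length() * 3 + 1.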
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {        // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr - buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr - buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
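
// Note on the 0x80 bias (a reading aid, not new behavior): shard ids are
// small signed values (NO_SHARD == -1), so adding 0x80 maps them into a
// range where the single encoded byte sorts NO_SHARD first, e.g.
// -1 -> 0x7f, 0 -> 0x80, 1 -> 0x81.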

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end, bool legacy)
{
  spg_t pgid;
  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
  // use different nspaces because we use different schemes when encoding
  // keys for listing objects
  const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > MAX_HASH) {
      // make sure end hobj is even greater than the maximum possible hobj
      end->hobj.set_bitwise_key_u32(MAX_HASH);
      temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
      end->hobj.nspace = MAX_NSPACE;
    } else {
      end->hobj.set_bitwise_key_u32(end_hash);
      temp_end->hobj.set_bitwise_key_u32(end_hash);
    }
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(MAX_HASH);
    end->hobj.nspace = MAX_NSPACE;
    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
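
// Illustrative arithmetic (hypothetical pg): for a PG owning ps() = 0x12
// with bits = 6, reverse_hash = _reverse_bits(0x12) and the listing range
// covers [reverse_hash, reverse_hash + 2^(32-6)), i.e. a 1/64th slice of
// the 32-bit hash space; temp objects use pool -2 - pool() so their keys
// land in a disjoint range that sorts before regular pools.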

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}


#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)

static int _get_key_object(const char *p, ghobject_t *oid)
{
  int r;

  p = _key_decode_prefix(p, oid);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // anything other than a null terminator here means the key is malformed
    return -8;
  }

  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;
  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;
  const char *p = key.c_str();
  return _get_key_object(p, oid);
}

template<typename S>
static void _get_object_key(const ghobject_t& oid, S *key)
{
  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();
  _get_object_key(oid, key);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
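
// Resulting layout (illustrative): [onode key ... 'o'][u32 shard offset]['x'],
// so an extent shard key is always its onode key plus 5 bytes, and
// is_extent_shard_key only needs to look at the final byte.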

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

#ifdef HAVE_LIBZBD
static void get_zone_offset_object_key(
  uint32_t zone,
  uint64_t offset,
  ghobject_t oid,
  std::string *key)
{
  key->clear();
  _key_encode_u32(zone, key);
  _key_encode_u64(offset, key);
  _get_object_key(oid, key);
}

static int get_key_zone_offset_object(
  const string& key,
  uint32_t *zone,
  uint64_t *offset,
  ghobject_t *oid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
    return -1;
  p = _key_decode_u32(p, zone);
  p = _key_decode_u64(p, offset);
  int r = _get_key_object(p, oid);
  if (r < 0) {
    return r;
  }
  return 0;
}
#endif

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "  csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "  0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto& [zone, offset] : o.onode.zone_offset_refs) {
    dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
                    << " offset 0x" << offset << std::dec << dendl;
  }
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in key string encoding (see the comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may appear in the
 * wrong order.
 *
 * This is the iterator wrapper that fixes the key order.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};
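
// In short (a reading aid, not new behavior): SortedCollectionListIterator
// buffers one (shard, pool, hash) bucket at a time into a std::map keyed by
// ghobject_t, whose comparator is correct, and iterates that map.  Only the
// ordering *within* a hash bucket can be wrong in the raw KV order, so
// re-sorting per bucket is sufficient to repair the listing.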

} // anonymous namespace

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
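
// Worked example (hypothetical numbers): a fully dereferenced compressed
// blob occupying 64K on disk with min_alloc_size = 16K yields
// expected4release = 4 AUs; if rewriting the surviving data that shares
// those AUs is expected to consume 1 new AU, benefit = 4 - 1 = 3, and the
// blob is collected once benefit >= bluestore_gc_enable_blob_threshold.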

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    o->set_cached();
    if (o->pin_nref == 1) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
      o->cache_age_bin = age_bins.front();
      *(o->cache_age_bin) += 1;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
             << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    o->clear_cached();
    if (o->lru_item.is_linked()) {
      *(o->cache_age_bin) -= 1;
      lru.erase(lru.iterator_to(*o));
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }

  void maybe_unpin(BlueStore::Onode* o) override
  {
    OnodeCacheShard* ocs = this;
    ocs->lock.lock();
    // It is possible that while we waited for the lock, split_cache moved us
    // to a different OnodeCacheShard.
    while (ocs != o->c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = o->c->get_onode_cache();
      ocs->lock.lock();
    }
    if (o->is_cached() && o->pin_nref == 1) {
      if (!o->lru_item.is_linked()) {
        if (o->exists) {
          lru.push_front(*o);
          o->cache_age_bin = age_bins.front();
          *(o->cache_age_bin) += 1;
          dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
                   << dendl;
        } else {
          ceph_assert(num);
          --num;
          o->clear_cached();
          dout(20) << __func__ << " " << this << " " << o->oid << " removed"
                   << dendl;
          // remove will also decrement nref
          o->c->onode_space._remove(o->oid);
        }
      } else if (o->exists) {
        // move onode within LRU
        lru.erase(lru.iterator_to(*o));
        lru.push_front(*o);
        if (o->cache_age_bin != age_bins.front()) {
          *(o->cache_age_bin) -= 1;
          o->cache_age_bin = age_bins.front();
          *(o->cache_age_bin) += 1;
        }
        dout(20) << __func__ << " " << this << " " << o->oid << " touched"
                 << dendl;
      }
    }
    ocs->lock.unlock();
  }

  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = num - new_size; // note: we might empty the LRU before
                                 // n == 0 due to pinned entries, and hence
                                 // be unable to reach the new_size target.
    while (n-- > 0 && lru.size() > 0) {
      BlueStore::Onode *o = &lru.back();
      lru.pop_back();

      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << dendl;

      *(o->cache_age_bin) -= 1;
      if (o->pin_nref > 1) {
        dout(20) << __func__ << " " << this << " " << o->oid << dendl;
      } else {
        ceph_assert(num);
        --num;
        o->clear_cached();
        o->c->onode_space._remove(o->oid);
      }
    }
  }
  void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    _rm(o);
    ceph_assert(o->nref > 1);
    to->_add(o, 0);
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    std::lock_guard l(lock);
    *onodes += num;
    *pinned_onodes += num - lru.size();
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    assert(*(b->cache_age_bin) >= b->length);
    *(b->cache_age_bin) -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert(*(b->cache_age_bin) + delta >= 0);
    *(b->cache_age_bin) += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,   ///< in warm_in
    BUFFER_WARM_OUT,  ///< in warm_out
    BUFFER_HOT,       ///< in hot
    BUFFER_TYPE_MAX
  };
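
  // 2Q in brief (a reading aid for the lists above): first-touch buffers
  // enter warm_in (A1in); when trimmed there they drop their data and move
  // to warm_out (A1out), which remembers them by position only; re-admitting
  // data that is still remembered in warm_out promotes it to hot (Am).  This
  // keeps one-shot scans from flushing the hot working set, per the classic
  // 2Q scheme.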

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    b->cache_age_bin = age_bins.front();
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
      assert(*(b->cache_age_bin) + delta >= 0);
      *(b->cache_age_bin) += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        assert(*(b->cache_age_bin) >= b->length);
        *(b->cache_age_bin) -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
                                        b->flags),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
                                    b->flags),
                  0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
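
// Overlap cases handled above (illustrative, for a discard of [offset, end)):
//   buffer spans both edges  -> split: keep [b.offset, offset) and [end, b.end())
//   buffer straddles start   -> truncate to [b.offset, offset)
//   buffer fully inside      -> remove entirely
//   buffer straddles end     -> keep only [end, b.end())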
1776
1777void BlueStore::BufferSpace::read(
9f95a23c 1778 BufferCacheShard* cache,
224ce89b
WB
1779 uint32_t offset,
1780 uint32_t length,
7c673cae 1781 BlueStore::ready_regions_t& res,
91327a77
AA
1782 interval_set<uint32_t>& res_intervals,
1783 int flags)
7c673cae 1784{
7c673cae
FG
1785 res.clear();
1786 res_intervals.clear();
1787 uint32_t want_bytes = length;
1788 uint32_t end = offset + length;
224ce89b
WB
1789
1790 {
11fdf7f2 1791 std::lock_guard l(cache->lock);
224ce89b
WB
1792 for (auto i = _data_lower_bound(offset);
1793 i != buffer_map.end() && offset < end && i->first < end;
1794 ++i) {
1795 Buffer *b = i->second.get();
11fdf7f2 1796 ceph_assert(b->end() > offset);
91327a77
AA
1797
1798 bool val = false;
1799 if (flags & BYPASS_CLEAN_CACHE)
1800 val = b->is_writing();
1801 else
1802 val = b->is_writing() || b->is_clean();
1803 if (val) {
224ce89b
WB
1804 if (b->offset < offset) {
1805 uint32_t skip = offset - b->offset;
11fdf7f2 1806 uint32_t l = min(length, b->length - skip);
224ce89b
WB
1807 res[offset].substr_of(b->data, skip, l);
1808 res_intervals.insert(offset, l);
1809 offset += l;
1810 length -= l;
1811 if (!b->is_writing()) {
9f95a23c 1812 cache->_touch(b);
f67539c2 1813 }
1814 continue;
1815 }
1816 if (b->offset > offset) {
1817 uint32_t gap = b->offset - offset;
1818 if (length <= gap) {
1819 break;
1820 }
1821 offset += gap;
1822 length -= gap;
1823 }
1824 if (!b->is_writing()) {
9f95a23c 1825 cache->_touch(b);
1826 }
1827 if (b->length > length) {
1828 res[offset].substr_of(b->data, 0, length);
1829 res_intervals.insert(offset, length);
7c673cae 1830 break;
1831 } else {
1832 res[offset].append(b->data);
1833 res_intervals.insert(offset, b->length);
1834 if (b->length == length)
1835 break;
1836 offset += b->length;
1837 length -= b->length;
1838 }
1839 }
1840 }
1841 }
1842
1843 uint64_t hit_bytes = res_intervals.size();
11fdf7f2 1844 ceph_assert(hit_bytes <= want_bytes);
1845 uint64_t miss_bytes = want_bytes - hit_bytes;
1846 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1847 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1848}
1849
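// Invoked once the IO for the transaction with sequence number 'seq' has
// committed. The writing list is ordered by seq: entries newer than 'seq'
// end the scan, entries older than it (belonging to other in-flight
// transactions) are skipped. A matching buffer is either dropped
// (FLAG_NOCACHE) or marked STATE_CLEAN and inserted into the cache shard.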
9f95a23c 1850void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
7c673cae 1851{
1852 auto i = writing.begin();
1853 while (i != writing.end()) {
1854 if (i->seq > seq) {
1855 break;
1856 }
1857 if (i->seq < seq) {
1858 ++i;
1859 continue;
1860 }
1861
1862 Buffer *b = &*i;
11fdf7f2 1863 ceph_assert(b->is_writing());
1864
1865 if (b->flags & Buffer::FLAG_NOCACHE) {
1866 writing.erase(i++);
1867 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1868 buffer_map.erase(b->offset);
1869 } else {
1870 b->state = Buffer::STATE_CLEAN;
1871 writing.erase(i++);
1872 b->maybe_rebuild();
1873 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
9f95a23c 1874 cache->_add(b, 1, nullptr);
1875 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1876 }
1877 }
9f95a23c 1878 cache->_trim();
1879 cache->_audit("finish_write end");
1880}
1881
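// Move every buffer at or beyond byte 'pos' into BufferSpace r, rebasing
// offsets so that 'pos' becomes r's origin; a buffer straddling 'pos' is
// cut in two. This supports object splits and assumes no writes are in
// flight (see the writing.empty() assert at the end).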
9f95a23c 1882void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
7c673cae 1883{
11fdf7f2 1884 std::lock_guard lk(cache->lock);
1885 if (buffer_map.empty())
1886 return;
1887
1888 auto p = --buffer_map.end();
1889 while (true) {
1890 if (p->second->end() <= pos)
1891 break;
1892
1893 if (p->second->offset < pos) {
1894 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1895 size_t left = pos - p->second->offset;
1896 size_t right = p->second->length - left;
1897 if (p->second->data.length()) {
1898 bufferlist bl;
1899 bl.substr_of(p->second->data, left, right);
1900 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1901 0, bl, p->second->flags),
1902 0, p->second.get());
1903 } else {
1904 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1905 0, right, p->second->flags),
1906 0, p->second.get());
1907 }
9f95a23c 1908 cache->_adjust_size(p->second.get(), -right);
1909 p->second->truncate(left);
1910 break;
1911 }
1912
11fdf7f2 1913 ceph_assert(p->second->end() > pos);
1914 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1915 if (p->second->data.length()) {
1916 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
f67539c2 1917 p->second->offset - pos, p->second->data, p->second->flags),
1918 0, p->second.get());
1919 } else {
1920 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
f67539c2 1921 p->second->offset - pos, p->second->length, p->second->flags),
1922 0, p->second.get());
1923 }
1924 if (p == buffer_map.begin()) {
1925 _rm_buffer(cache, p);
1926 break;
1927 } else {
1928 _rm_buffer(cache, p--);
1929 }
1930 }
11fdf7f2 1931 ceph_assert(writing.empty());
9f95a23c 1932 cache->_trim();
1933}
1934
1935// OnodeSpace
1936
1937#undef dout_prefix
1938#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1939
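// Note the insertion race below: two threads may try to instantiate the
// same onode concurrently. emplace() keeps whichever entry landed first,
// and the loser simply returns the winner's ref so there is only ever one
// in-memory Onode per oid.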
39ae355f 1940BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid,
f6b5b4d7 1941 OnodeRef& o)
7c673cae 1942{
11fdf7f2 1943 std::lock_guard l(cache->lock);
1944 // add entry or return existing one
1945 auto p = onode_map.emplace(oid, o);
1946 if (!p.second) {
7c673cae 1947 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
39ae355f 1948 << " raced, returning existing " << p.first->second
7c673cae 1949 << dendl;
39ae355f 1950 return p.first->second;
7c673cae 1951 }
f6b5b4d7 1952 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
f6b5b4d7 1953 cache->_add(o.get(), 1);
9f95a23c 1954 cache->_trim();
1955 return o;
1956}
1957
1958void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1959{
 1960 ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
1961 onode_map.erase(oid);
1962}
1963
1964BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1965{
7c673cae 1966 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b 1967 OnodeRef o;
1968
1969 {
11fdf7f2 1970 std::lock_guard l(cache->lock);
1971 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1972 if (p == onode_map.end()) {
1973 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
39ae355f 1974 cache->logger->inc(l_bluestore_onode_misses);
1975 } else {
1976 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1977 << " " << p->second->nref
1978 << " " << p->second->cached
224ce89b 1979 << dendl;
 1980 // This will pin the onode and implicitly touch the cache when the
 1981 // Onode eventually becomes unpinned
224ce89b 1982 o = p->second;
f6b5b4d7 1983
20effc67 1984 cache->logger->inc(l_bluestore_onode_hits);
1985 }
1986 }
1987
224ce89b 1988 return o;
1989}
1990
1991void BlueStore::OnodeSpace::clear()
1992{
11fdf7f2 1993 std::lock_guard l(cache->lock);
f6b5b4d7 1994 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
7c673cae 1995 for (auto &p : onode_map) {
f6b5b4d7 1996 cache->_rm(p.second.get());
1997 }
1998 onode_map.clear();
1999}
2000
2001bool BlueStore::OnodeSpace::empty()
2002{
11fdf7f2 2003 std::lock_guard l(cache->lock);
2004 return onode_map.empty();
2005}
2006
2007void BlueStore::OnodeSpace::rename(
2008 OnodeRef& oldo,
2009 const ghobject_t& old_oid,
2010 const ghobject_t& new_oid,
f91f0fd5 2011 const mempool::bluestore_cache_meta::string& new_okey)
7c673cae 2012{
11fdf7f2 2013 std::lock_guard l(cache->lock);
2014 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
2015 << dendl;
2016 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
2017 po = onode_map.find(old_oid);
2018 pn = onode_map.find(new_oid);
11fdf7f2 2019 ceph_assert(po != pn);
7c673cae 2020
11fdf7f2 2021 ceph_assert(po != onode_map.end());
2022 if (pn != onode_map.end()) {
2023 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
2024 << dendl;
f6b5b4d7 2025 cache->_rm(pn->second.get());
2026 onode_map.erase(pn);
2027 }
2028 OnodeRef o = po->second;
2029
2030 // install a non-existent onode at old location
2031 oldo.reset(new Onode(o->c, old_oid, o->key));
2032 po->second = oldo;
2033 cache->_add(oldo.get(), 1);
 2034 // add at new position and fix oid, key.
 2035 // This will pin 'o' and implicitly touch the cache
 2036 // when it eventually becomes unpinned
7c673cae 2037 onode_map.insert(make_pair(new_oid, o));
f6b5b4d7 2038
2039 o->oid = new_oid;
2040 o->key = new_okey;
9f95a23c 2041 cache->_trim();
2042}
2043
adb31ebb 2044bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
7c673cae 2045{
11fdf7f2 2046 std::lock_guard l(cache->lock);
2047 ldout(cache->cct, 20) << __func__ << dendl;
2048 for (auto& i : onode_map) {
adb31ebb 2049 if (f(i.second.get())) {
2050 return true;
2051 }
2052 }
2053 return false;
2054}
2055
2056template <int LogLevelV = 30>
2057void BlueStore::OnodeSpace::dump(CephContext *cct)
2058{
2059 for (auto& i : onode_map) {
2060 ldout(cct, LogLevelV) << i.first << " : " << i.second
2061 << " " << i.second->nref
2062 << " " << i.second->cached
f6b5b4d7 2063 << dendl;
2064 }
2065}
2066
2067// SharedBlob
2068
2069#undef dout_prefix
2070#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2071#undef dout_context
2072#define dout_context coll->store->cct
7c673cae 2073
9f95a23c 2074void BlueStore::SharedBlob::dump(Formatter* f) const
7c673cae 2075{
2076 f->dump_bool("loaded", loaded);
2077 if (loaded) {
2078 persistent->dump(f);
2079 } else {
2080 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2081 }
2082}
2083
2084ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2085{
2086 out << "SharedBlob(" << &sb;
2087
2088 if (sb.loaded) {
2089 out << " loaded " << *sb.persistent;
2090 } else {
2091 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2092 }
2093 return out << ")";
2094}
2095
2096BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2097 : coll(_coll), sbid_unloaded(i)
2098{
11fdf7f2 2099 ceph_assert(sbid_unloaded > 0);
2100 if (get_cache()) {
2101 get_cache()->add_blob();
2102 }
2103}
2104
2105BlueStore::SharedBlob::~SharedBlob()
2106{
2107 if (loaded && persistent) {
2108 delete persistent;
2109 }
2110}
2111
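// Dropping the last reference has to cope with two races: (a) the owning
// collection, and with it the cache shard whose lock protects us, may be
// swapped concurrently, hence the snapshot/re-check loop below; and (b) a
// lookup can revive this SharedBlob between our decrement and the removal,
// in which case shared_blob_set.remove(..., true) declines and we must not
// delete.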
2112void BlueStore::SharedBlob::put()
2113{
2114 if (--nref == 0) {
2115 dout(20) << __func__ << " " << this
2116 << " removing self from set " << get_parent()
2117 << dendl;
2118 again:
2119 auto coll_snap = coll;
2120 if (coll_snap) {
11fdf7f2 2121 std::lock_guard l(coll_snap->cache->lock);
2122 if (coll_snap != coll) {
2123 goto again;
2124 }
2125 if (!coll_snap->shared_blob_set.remove(this, true)) {
2126 // race with lookup
2127 return;
2128 }
2129 bc._clear(coll_snap->cache);
2130 coll_snap->cache->rm_blob();
7c673cae 2131 }
28e407b8 2132 delete this;
2133 }
2134}
2135
2136void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2137{
11fdf7f2 2138 ceph_assert(persistent);
2139 persistent->ref_map.get(offset, length);
2140}
2141
2142void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 2143 PExtentVector *r,
11fdf7f2 2144 bool *unshare)
7c673cae 2145{
2146 ceph_assert(persistent);
2147 persistent->ref_map.put(offset, length, r,
2148 unshare && !*unshare ? unshare : nullptr);
2149}
2150
2151void BlueStore::SharedBlob::finish_write(uint64_t seq)
2152{
2153 while (true) {
9f95a23c 2154 BufferCacheShard *cache = coll->cache;
11fdf7f2 2155 std::lock_guard l(cache->lock);
f64942e4 2156 if (coll->cache != cache) {
2157 dout(20) << __func__
2158 << " raced with sb cache update, was " << cache
2159 << ", now " << coll->cache << ", retrying"
2160 << dendl;
2161 continue;
2162 }
2163 bc._finish_write(cache, seq);
2164 break;
2165 }
2166}
2167
2168// SharedBlobSet
2169
2170#undef dout_prefix
2171#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2172
2173template <int LogLevelV = 30>
2174void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 2175{
11fdf7f2 2176 std::lock_guard l(lock);
3efd9988 2177 for (auto& i : sb_map) {
11fdf7f2 2178 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
2179 }
2180}
2181
2182// Blob
2183
2184#undef dout_prefix
2185#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2186
2187void BlueStore::Blob::dump(Formatter* f) const
2188{
2189 if (is_spanning()) {
2190 f->dump_unsigned("spanning_id ", id);
2191 }
2192 blob.dump(f);
2193 if (shared_blob) {
2194 f->dump_object("shared", *shared_blob);
2195 }
2196}
2197
2198ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2199{
2200 out << "Blob(" << &b;
2201 if (b.is_spanning()) {
2202 out << " spanning " << b.id;
2203 }
2204 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2205 if (b.shared_blob) {
2206 out << " " << *b.shared_blob;
2207 } else {
2208 out << " (shared_blob=NULL)";
2209 }
2210 out << ")";
2211 return out;
2212}
2213
2214void BlueStore::Blob::discard_unallocated(Collection *coll)
2215{
224ce89b 2216 if (get_blob().is_shared()) {
2217 return;
2218 }
224ce89b 2219 if (get_blob().is_compressed()) {
2220 bool discard = false;
2221 bool all_invalid = true;
224ce89b 2222 for (auto e : get_blob().get_extents()) {
2223 if (!e.is_valid()) {
2224 discard = true;
2225 } else {
2226 all_invalid = false;
2227 }
2228 }
11fdf7f2 2229 ceph_assert(discard == all_invalid); // in a compressed blob either all
 2230 // pextents are invalid or none are.
2231 if (discard) {
2232 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2233 get_blob().get_logical_length());
2234 }
2235 } else {
2236 size_t pos = 0;
224ce89b 2237 for (auto e : get_blob().get_extents()) {
7c673cae 2238 if (!e.is_valid()) {
2239 dout(20) << __func__ << " 0x" << std::hex << pos
2240 << "~" << e.length
2241 << std::dec << dendl;
2242 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2243 }
2244 pos += e.length;
2245 }
2246 if (get_blob().can_prune_tail()) {
2247 dirty_blob().prune_tail();
2248 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 2249 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
2250 }
2251 }
2252}
2253
2254void BlueStore::Blob::get_ref(
2255 Collection *coll,
2256 uint32_t offset,
2257 uint32_t length)
2258{
 2259 // The caller has to initialize the Blob's logical length prior to
 2260 // incrementing references. Otherwise we can neither determine the
 2261 // required number of counters for per-au tracking nor obtain
 2262 // min_release_size for single-counter mode.
11fdf7f2 2263 ceph_assert(get_blob().get_logical_length() != 0);
2264 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2265 << std::dec << " " << *this << dendl;
2266
2267 if (used_in_blob.is_empty()) {
2268 uint32_t min_release_size =
2269 get_blob().get_release_size(coll->store->min_alloc_size);
2270 uint64_t l = get_blob().get_logical_length();
2271 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2272 << min_release_size << std::dec << dendl;
2273 used_in_blob.init(l, min_release_size);
2274 }
2275 used_in_blob.get(
2276 offset,
2277 length);
2278}
2279
2280bool BlueStore::Blob::put_ref(
2281 Collection *coll,
2282 uint32_t offset,
2283 uint32_t length,
2284 PExtentVector *r)
2285{
2286 PExtentVector logical;
2287
2288 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2289 << std::dec << " " << *this << dendl;
2290
2291 bool empty = used_in_blob.put(
2292 offset,
2293 length,
2294 &logical);
2295 r->clear();
2296 // nothing to release
2297 if (!empty && logical.empty()) {
2298 return false;
2299 }
2300
2301 bluestore_blob_t& b = dirty_blob();
2302 return b.release_extents(empty, logical, r);
2303}
2304
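// A worked example of the tail-extension path below, with illustrative
// numbers only: blen = 0x8000, target_blob_size = 0x10000,
// b_offset = 0x8000, length = 0xC000. Then new_blen = end = 0x14000 and
// overflow = 0x14000 - 0x10000 = 0x4000, which is less than length, so the
// write is trimmed to length = 0x8000 (reported back via *length0), the
// blob tail grows to 0x10000, and the caller places the remaining 0x4000
// elsewhere.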
224ce89b 2305bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
2306 uint32_t target_blob_size,
2307 uint32_t b_offset,
2308 uint32_t *length0) {
2309 ceph_assert(min_alloc_size);
2310 ceph_assert(target_blob_size);
2311 if (!get_blob().is_mutable()) {
2312 return false;
2313 }
2314
2315 uint32_t length = *length0;
2316 uint32_t end = b_offset + length;
2317
 2318 // Currently, for the sake of simplicity, we omit blob reuse if the data
 2319 // is unaligned with the csum chunk. Later we can perform padding if needed.
2320 if (get_blob().has_csum() &&
2321 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2322 (end % get_blob().get_csum_chunk_size()) != 0)) {
2323 return false;
2324 }
2325
2326 auto blen = get_blob().get_logical_length();
2327 uint32_t new_blen = blen;
2328
2329 // make sure target_blob_size isn't less than current blob len
11fdf7f2 2330 target_blob_size = std::max(blen, target_blob_size);
2331
2332 if (b_offset >= blen) {
2333 // new data totally stands out of the existing blob
2334 new_blen = end;
7c673cae 2335 } else {
224ce89b 2336 // new data overlaps with the existing blob
11fdf7f2 2337 new_blen = std::max(blen, end);
2338
2339 uint32_t overlap = 0;
2340 if (new_blen > blen) {
2341 overlap = blen - b_offset;
2342 } else {
2343 overlap = length;
2344 }
2345
2346 if (!get_blob().is_unallocated(b_offset, overlap)) {
2347 // abort if any piece of the overlap has already been allocated
2348 return false;
2349 }
2350 }
224ce89b 2351
2352 if (new_blen > blen) {
2353 int64_t overflow = int64_t(new_blen) - target_blob_size;
 2354 // Unable to decrease the provided length enough to fit into target_blob_size
2355 if (overflow >= length) {
2356 return false;
2357 }
2358
2359 // FIXME: in some cases we could reduce unused resolution
2360 if (get_blob().has_unused()) {
2361 return false;
2362 }
2363
2364 if (overflow > 0) {
2365 new_blen -= overflow;
2366 length -= overflow;
2367 *length0 = length;
2368 }
224ce89b 2369
2370 if (new_blen > blen) {
2371 dirty_blob().add_tail(new_blen);
2372 used_in_blob.add_tail(new_blen,
224ce89b 2373 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
2374 }
2375 }
2376 return true;
2377}
2378
2379void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2380{
2381 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2382 << " start " << *this << dendl;
2383 ceph_assert(blob.can_split());
2384 ceph_assert(used_in_blob.can_split());
2385 bluestore_blob_t &lb = dirty_blob();
2386 bluestore_blob_t &rb = r->dirty_blob();
2387
2388 used_in_blob.split(
2389 blob_offset,
2390 &(r->used_in_blob));
2391
2392 lb.split(blob_offset, rb);
2393 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2394
2395 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2396 << " finish " << *this << dendl;
2397 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2398 << " and " << *r << dendl;
2399}
2400
2401#ifndef CACHE_BLOB_BL
2402void BlueStore::Blob::decode(
11fdf7f2 2403 bufferptr::const_iterator& p,
2404 uint64_t struct_v,
2405 uint64_t* sbid,
2406 bool include_ref_map,
2407 Collection *coll)
2408{
2409 denc(blob, p, struct_v);
2410 if (blob.is_shared()) {
2411 denc(*sbid, p);
2412 }
2413 if (include_ref_map) {
2414 if (struct_v > 1) {
2415 used_in_blob.decode(p);
2416 } else {
2417 used_in_blob.clear();
2418 bluestore_extent_ref_map_t legacy_ref_map;
2419 legacy_ref_map.decode(p);
2420 if (coll) {
2421 for (auto r : legacy_ref_map.ref_map) {
2422 get_ref(
2423 coll,
2424 r.first,
2425 r.second.refs * r.second.length);
2426 }
2427 }
2428 }
2429 }
2430}
2431#endif
2432
2433// Extent
2434
2435void BlueStore::Extent::dump(Formatter* f) const
2436{
2437 f->dump_unsigned("logical_offset", logical_offset);
2438 f->dump_unsigned("length", length);
2439 f->dump_unsigned("blob_offset", blob_offset);
2440 f->dump_object("blob", *blob);
2441}
2442
2443ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2444{
2445 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2446 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2447 << " " << *e.blob;
2448}
2449
2450// OldExtent
2451BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2452 uint32_t lo,
2453 uint32_t o,
2454 uint32_t l,
2455 BlobRef& b) {
2456 OldExtent* oe = new OldExtent(lo, o, l, b);
2457 b->put_ref(c.get(), o, l, &(oe->r));
adb31ebb 2458 oe->blob_empty = !b->is_referenced();
2459 return oe;
2460}
2461
2462// ExtentMap
2463
2464#undef dout_prefix
2465#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2466#undef dout_context
2467#define dout_context onode->c->store->cct
7c673cae 2468
39ae355f 2469BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
7c673cae 2470 : onode(o),
39ae355f 2471 inline_bl(inline_shard_prealloc_size) {
2472}
2473
2474void BlueStore::ExtentMap::dump(Formatter* f) const
2475{
2476 f->open_array_section("extents");
2477
2478 for (auto& e : extent_map) {
2479 f->dump_object("extent", e);
2480 }
2481 f->close_section();
2482}
2483
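// Clone [srcoff, srcoff+length) from oldo into newo at dstoff without
// copying data: each source blob is promoted to shared (or its shared blob
// loaded if it already is), the underlying physical extents gain a ref,
// and a trimmed extent pointing at the duplicated blob is inserted into
// the destination map. bluestore_debug_inject_bug21040 re-enables the
// legacy dirty-range bookkeeping for regression testing.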
2484void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2485 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2486 uint64_t& length, uint64_t& dstoff) {
2487
2488 auto cct = onode->c->store->cct;
2489 bool inject_21040 =
2490 cct->_conf->bluestore_debug_inject_bug21040;
2491 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2492 for (auto& e : oldo->extent_map.extent_map) {
2493 e.blob->last_encoded_id = -1;
2494 }
2495
2496 int n = 0;
2497 uint64_t end = srcoff + length;
2498 uint32_t dirty_range_begin = 0;
2499 uint32_t dirty_range_end = 0;
2500 bool src_dirty = false;
2501 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2502 ep != oldo->extent_map.extent_map.end();
2503 ++ep) {
2504 auto& e = *ep;
2505 if (e.logical_offset >= end) {
2506 break;
2507 }
2508 dout(20) << __func__ << " src " << e << dendl;
2509 BlobRef cb;
2510 bool blob_duped = true;
2511 if (e.blob->last_encoded_id >= 0) {
2512 cb = id_to_blob[e.blob->last_encoded_id];
2513 blob_duped = false;
2514 } else {
2515 // dup the blob
2516 const bluestore_blob_t& blob = e.blob->get_blob();
2517 // make sure it is shared
2518 if (!blob.is_shared()) {
2519 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2520 if (!inject_21040 && !src_dirty) {
2521 src_dirty = true;
2522 dirty_range_begin = e.logical_offset;
2523 } else if (inject_21040 &&
2524 dirty_range_begin == 0 && dirty_range_end == 0) {
2525 dirty_range_begin = e.logical_offset;
2526 }
2527 ceph_assert(e.logical_end() > 0);
2528 // -1 to exclude next potential shard
2529 dirty_range_end = e.logical_end() - 1;
2530 } else {
2531 c->load_shared_blob(e.blob->shared_blob);
2532 }
2533 cb = new Blob();
2534 e.blob->last_encoded_id = n;
2535 id_to_blob[n] = cb;
2536 e.blob->dup(*cb);
2537 // bump the extent refs on the copied blob's extents
2538 for (auto p : blob.get_extents()) {
2539 if (p.is_valid()) {
2540 e.blob->shared_blob->get_ref(p.offset, p.length);
2541 }
2542 }
2543 txc->write_shared_blob(e.blob->shared_blob);
2544 dout(20) << __func__ << " new " << *cb << dendl;
2545 }
2546
2547 int skip_front, skip_back;
2548 if (e.logical_offset < srcoff) {
2549 skip_front = srcoff - e.logical_offset;
2550 } else {
2551 skip_front = 0;
2552 }
2553 if (e.logical_end() > end) {
2554 skip_back = e.logical_end() - end;
2555 } else {
2556 skip_back = 0;
2557 }
2558
2559 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2560 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2561 newo->extent_map.extent_map.insert(*ne);
2562 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2563 // fixme: we may leave parts of new blob unreferenced that could
2564 // be freed (relative to the shared_blob).
2565 txc->statfs_delta.stored() += ne->length;
2566 if (e.blob->get_blob().is_compressed()) {
2567 txc->statfs_delta.compressed_original() += ne->length;
2568 if (blob_duped) {
2569 txc->statfs_delta.compressed() +=
2570 cb->get_blob().get_compressed_payload_length();
2571 }
2572 }
2573 dout(20) << __func__ << " dst " << *ne << dendl;
2574 ++n;
2575 }
2576 if ((!inject_21040 && src_dirty) ||
2577 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2578 oldo->extent_map.dirty_range(dirty_range_begin,
2579 dirty_range_end - dirty_range_begin);
2580 txc->write_onode(oldo);
2581 }
2582 txc->write_onode(newo);
2583
2584 if (dstoff + length > newo->onode.size) {
2585 newo->onode.size = dstoff + length;
2586 }
2587 newo->extent_map.dirty_range(dstoff, length);
2588}
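
// Persist dirty extent-map state. With no shards the whole map is encoded
// into inline_bl and will travel inside the onode key; otherwise each
// dirty shard is re-encoded and queued under its own PREFIX_OBJ key.
// Shards that come out too large (or, unforced, too small) schedule a
// reshard instead of being written.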
2589void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2590 bool force)
2591{
2592 auto cct = onode->c->store->cct; //used by dout
2593 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2594 if (onode->onode.extent_map_shards.empty()) {
2595 if (inline_bl.length() == 0) {
2596 unsigned n;
2597 // we need to encode inline_bl to measure encoded length
2598 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
f91f0fd5 2599 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
11fdf7f2 2600 ceph_assert(!never_happen);
2601 size_t len = inline_bl.length();
2602 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2603 << " extents" << dendl;
2604 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2605 request_reshard(0, OBJECT_MAX_SIZE);
2606 return;
2607 }
2608 }
2609 // will persist in the onode key.
2610 } else {
2611 // pending shard update
2612 struct dirty_shard_t {
2613 Shard *shard;
2614 bufferlist bl;
2615 dirty_shard_t(Shard *s) : shard(s) {}
2616 };
2617 vector<dirty_shard_t> encoded_shards;
2618 // allocate slots for all shards in a single call instead of
2619 // doing multiple allocations - one per each dirty shard
2620 encoded_shards.reserve(shards.size());
2621
2622 auto p = shards.begin();
2623 auto prev_p = p;
2624 while (p != shards.end()) {
11fdf7f2 2625 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
2626 auto n = p;
2627 ++n;
2628 if (p->dirty) {
2629 uint32_t endoff;
2630 if (n == shards.end()) {
2631 endoff = OBJECT_MAX_SIZE;
2632 } else {
2633 endoff = n->shard_info->offset;
2634 }
2635 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2636 bufferlist& bl = encoded_shards.back().bl;
2637 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2638 bl, &p->extents)) {
2639 if (force) {
2640 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2641 ceph_assert(!force);
2642 }
2643 }
2644 size_t len = bl.length();
2645
2646 dout(20) << __func__ << " shard 0x" << std::hex
2647 << p->shard_info->offset << std::dec << " is " << len
2648 << " bytes (was " << p->shard_info->bytes << ") from "
2649 << p->extents << " extents" << dendl;
2650
2651 if (!force) {
2652 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2653 // we are big; reshard ourselves
2654 request_reshard(p->shard_info->offset, endoff);
2655 }
2656 // avoid resharding the trailing shard, even if it is small
2657 else if (n != shards.end() &&
2658 len < g_conf()->bluestore_extent_map_shard_min_size) {
2659 ceph_assert(endoff != OBJECT_MAX_SIZE);
2660 if (p == shards.begin()) {
2661 // we are the first shard, combine with next shard
7c673cae 2662 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2663 } else {
2664 // combine either with the previous shard or the next,
2665 // whichever is smaller
2666 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2667 request_reshard(p->shard_info->offset, endoff + 1);
2668 } else {
2669 request_reshard(prev_p->shard_info->offset, endoff);
2670 }
2671 }
2672 }
2673 }
2674 }
2675 prev_p = p;
2676 p = n;
2677 }
2678 if (needs_reshard()) {
2679 return;
2680 }
2681
2682 // schedule DB update for dirty shards
2683 string key;
2684 for (auto& it : encoded_shards) {
2685 dout(20) << __func__ << " encoding key for shard 0x" << std::hex
2686 << it.shard->shard_info->offset << std::dec << dendl;
2687 it.shard->dirty = false;
2688 it.shard->shard_info->bytes = it.bl.length();
2689 generate_extent_shard_key_and_apply(
2690 onode->key,
2691 it.shard->shard_info->offset,
2692 &key,
2693 [&](const string& final_key) {
2694 t->set(PREFIX_OBJ, final_key, it.bl);
2695 }
2696 );
2697 }
2698 }
2699}
2700
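// Spanning blob ids are small signed integers. The fast path hands out
// max-in-use + 1; if that overflows to a negative value, fall back to a
// linear probe from a random starting id (wrapping at the type's maximum)
// and abort only if every id is in use.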
2701bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2702{
2703 if (spanning_blob_map.empty())
2704 return 0;
2705 bid_t bid = spanning_blob_map.rbegin()->first + 1;
 2706 // If the increment did not overflow, this bid is valid and unused.
2707 if (bid >= 0)
2708 return bid;
 2709 // Find the next unused bid.
2710 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2711 const auto begin_bid = bid;
2712 do {
2713 if (!spanning_blob_map.count(bid))
2714 return bid;
2715 else {
2716 bid++;
2717 if (bid < 0) bid = 0;
2718 }
2719 } while (bid != begin_bid);
2720 auto cct = onode->c->store->cct; // used by dout
2721 _dump_onode<0>(cct, *onode);
11fdf7f2 2722 ceph_abort_msg("no available blob id");
2723}
2724
2725void BlueStore::ExtentMap::reshard(
2726 KeyValueDB *db,
2727 KeyValueDB::Transaction t)
2728{
2729 auto cct = onode->c->store->cct; // used by dout
2730
2731 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2732 << needs_reshard_end << ")" << std::dec
2733 << " of " << onode->onode.extent_map_shards.size()
2734 << " shards on " << onode->oid << dendl;
2735 for (auto& p : spanning_blob_map) {
2736 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2737 << dendl;
2738 }
2739 // determine shard index range
2740 unsigned si_begin = 0, si_end = 0;
2741 if (!shards.empty()) {
2742 while (si_begin + 1 < shards.size() &&
2743 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2744 ++si_begin;
2745 }
2746 needs_reshard_begin = shards[si_begin].shard_info->offset;
2747 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2748 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2749 needs_reshard_end = shards[si_end].shard_info->offset;
2750 break;
2751 }
2752 }
2753 if (si_end == shards.size()) {
2754 needs_reshard_end = OBJECT_MAX_SIZE;
2755 }
2756 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2757 << " over 0x[" << std::hex << needs_reshard_begin << ","
2758 << needs_reshard_end << ")" << std::dec << dendl;
2759 }
2760
181888fb 2761 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
2762
 2763 // we may need to fault in a larger interval later; we must have all
 2764 // referring extents for spanning blobs loaded in order to have
 2765 // accurate use_tracker values.
2766 uint32_t spanning_scan_begin = needs_reshard_begin;
2767 uint32_t spanning_scan_end = needs_reshard_end;
2768
2769 // remove old keys
2770 string key;
2771 for (unsigned i = si_begin; i < si_end; ++i) {
2772 generate_extent_shard_key_and_apply(
2773 onode->key, shards[i].shard_info->offset, &key,
2774 [&](const string& final_key) {
2775 t->rmkey(PREFIX_OBJ, final_key);
2776 }
2777 );
2778 }
2779
2780 // calculate average extent size
2781 unsigned bytes = 0;
2782 unsigned extents = 0;
2783 if (onode->onode.extent_map_shards.empty()) {
2784 bytes = inline_bl.length();
2785 extents = extent_map.size();
2786 } else {
2787 for (unsigned i = si_begin; i < si_end; ++i) {
2788 bytes += shards[i].shard_info->bytes;
2789 extents += shards[i].extents;
2790 }
2791 }
2792 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2793 unsigned slop = target *
2794 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2795 unsigned extent_avg = bytes / std::max(1u, extents);
2796 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2797 << ", slop " << slop << dendl;
2798
2799 // reshard
2800 unsigned estimate = 0;
31f18b77 2801 unsigned offset = needs_reshard_begin;
2802 vector<bluestore_onode_t::shard_info> new_shard_info;
2803 unsigned max_blob_end = 0;
2804 Extent dummy(needs_reshard_begin);
2805 for (auto e = extent_map.lower_bound(dummy);
2806 e != extent_map.end();
2807 ++e) {
2808 if (e->logical_offset >= needs_reshard_end) {
2809 break;
2810 }
2811 dout(30) << " extent " << *e << dendl;
2812
2813 // disfavor shard boundaries that span a blob
2814 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2815 if (estimate &&
2816 estimate + extent_avg > target + (would_span ? slop : 0)) {
2817 // new shard
31f18b77 2818 if (offset == needs_reshard_begin) {
2819 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2820 new_shard_info.back().offset = offset;
2821 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2822 << std::dec << dendl;
2823 }
2824 offset = e->logical_offset;
2825 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2826 new_shard_info.back().offset = offset;
2827 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2828 << std::dec << dendl;
2829 estimate = 0;
2830 }
2831 estimate += extent_avg;
2832 unsigned bs = e->blob_start();
2833 if (bs < spanning_scan_begin) {
2834 spanning_scan_begin = bs;
2835 }
2836 uint32_t be = e->blob_end();
2837 if (be > max_blob_end) {
2838 max_blob_end = be;
2839 }
2840 if (be > spanning_scan_end) {
2841 spanning_scan_end = be;
2842 }
2843 }
2844 if (new_shard_info.empty() && (si_begin > 0 ||
2845 si_end < shards.size())) {
2846 // we resharded a partial range; we must produce at least one output
2847 // shard
2848 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2849 new_shard_info.back().offset = needs_reshard_begin;
2850 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2851 << std::dec << " (singleton degenerate case)" << dendl;
2852 }
2853
2854 auto& sv = onode->onode.extent_map_shards;
2855 dout(20) << __func__ << " new " << new_shard_info << dendl;
2856 dout(20) << __func__ << " old " << sv << dendl;
2857 if (sv.empty()) {
2858 // no old shards to keep
2859 sv.swap(new_shard_info);
2860 init_shards(true, true);
2861 } else {
2862 // splice in new shards
2863 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2864 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2865 sv.insert(
2866 sv.begin() + si_begin,
2867 new_shard_info.begin(),
2868 new_shard_info.end());
2869 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2870 si_end = si_begin + new_shard_info.size();
31f18b77 2871
11fdf7f2 2872 ceph_assert(sv.size() == shards.size());
2873
2874 // note that we need to update every shard_info of shards here,
2875 // as sv might have been totally re-allocated above
2876 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2877 shards[i].shard_info = &sv[i];
2878 }
2879
2880 // mark newly added shards as dirty
2881 for (unsigned i = si_begin; i < si_end; ++i) {
2882 shards[i].loaded = true;
2883 shards[i].dirty = true;
2884 }
2885 }
2886 dout(20) << __func__ << " fin " << sv << dendl;
2887 inline_bl.clear();
2888
2889 if (sv.empty()) {
2890 // no more shards; unspan all previously spanning blobs
2891 auto p = spanning_blob_map.begin();
2892 while (p != spanning_blob_map.end()) {
2893 p->second->id = -1;
2894 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2895 p = spanning_blob_map.erase(p);
2896 }
2897 } else {
2898 // identify new spanning blobs
2899 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2900 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2901 if (spanning_scan_begin < needs_reshard_begin) {
2902 fault_range(db, spanning_scan_begin,
2903 needs_reshard_begin - spanning_scan_begin);
2904 }
2905 if (spanning_scan_end > needs_reshard_end) {
2906 fault_range(db, needs_reshard_end,
31f18b77 2907 spanning_scan_end - needs_reshard_end);
2908 }
2909 auto sp = sv.begin() + si_begin;
2910 auto esp = sv.end();
2911 unsigned shard_start = sp->offset;
2912 unsigned shard_end;
2913 ++sp;
2914 if (sp == esp) {
2915 shard_end = OBJECT_MAX_SIZE;
2916 } else {
2917 shard_end = sp->offset;
2918 }
7c673cae 2919 Extent dummy(needs_reshard_begin);
2920
2921 bool was_too_many_blobs_check = false;
2922 auto too_many_blobs_threshold =
2923 g_conf()->bluestore_debug_too_many_blobs_threshold;
2924 auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes;
2925 decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2926 decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
9f95a23c 2927
2928 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2929 if (e->logical_offset >= needs_reshard_end) {
2930 break;
2931 }
2932 dout(30) << " extent " << *e << dendl;
2933 while (e->logical_offset >= shard_end) {
2934 shard_start = shard_end;
11fdf7f2 2935 ceph_assert(sp != esp);
2936 ++sp;
2937 if (sp == esp) {
2938 shard_end = OBJECT_MAX_SIZE;
2939 } else {
2940 shard_end = sp->offset;
2941 }
2942 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2943 << " to 0x" << shard_end << std::dec << dendl;
2944 }
9f95a23c 2945
2946 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2947 if (!e->blob->is_spanning()) {
2948 // We have two options: (1) split the blob into pieces at the
2949 // shard boundaries (and adjust extents accordingly), or (2)
2950 // mark it spanning. We prefer to cut the blob if we can. Note that
2951 // we may have to split it multiple times--potentially at every
2952 // shard boundary.
2953 bool must_span = false;
2954 BlobRef b = e->blob;
2955 if (b->can_split()) {
2956 uint32_t bstart = e->blob_start();
2957 uint32_t bend = e->blob_end();
2958 for (const auto& sh : shards) {
2959 if (bstart < sh.shard_info->offset &&
2960 bend > sh.shard_info->offset) {
2961 uint32_t blob_offset = sh.shard_info->offset - bstart;
2962 if (b->can_split_at(blob_offset)) {
2963 dout(20) << __func__ << " splitting blob, bstart 0x"
2964 << std::hex << bstart << " blob_offset 0x"
2965 << blob_offset << std::dec << " " << *b << dendl;
2966 b = split_blob(b, blob_offset, sh.shard_info->offset);
2967 // switch b to the new right-hand side, in case it
2968 // *also* has to get split.
2969 bstart += blob_offset;
2970 onode->c->store->logger->inc(l_bluestore_blob_split);
2971 } else {
2972 must_span = true;
2973 break;
2974 }
2975 }
2976 }
2977 } else {
2978 must_span = true;
2979 }
2980 if (must_span) {
2981 auto bid = allocate_spanning_blob_id();
2982 b->id = bid;
2983 spanning_blob_map[b->id] = b;
2984 dout(20) << __func__ << " adding spanning " << *b << dendl;
2985 if (!was_too_many_blobs_check &&
2986 too_many_blobs_threshold &&
2987 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2988
2989 was_too_many_blobs_check = true;
2990 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2991 if (dumped_onodes[i].first == onode->oid) {
2992 oid_slot = &dumped_onodes[i];
2993 break;
2994 }
2995 if (!oldest_slot || (oldest_slot &&
2996 dumped_onodes[i].second < oldest_slot->second)) {
2997 oldest_slot = &dumped_onodes[i];
2998 }
2999 }
3000 }
3001 }
3002 }
3003 } else {
3004 if (e->blob->is_spanning()) {
3005 spanning_blob_map.erase(e->blob->id);
3006 e->blob->id = -1;
3007 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
3008 }
3009 }
3010 }
3011 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
3012 (oid_slot &&
3013 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
3014 if (do_dump) {
3015 dout(0) << __func__
3016 << " spanning blob count exceeds threshold, "
3017 << spanning_blob_map.size() << " spanning blobs"
3018 << dendl;
3019 _dump_onode<0>(cct, *onode);
3020 if (oid_slot) {
3021 oid_slot->second = mono_clock::now();
3022 } else {
3023 ceph_assert(oldest_slot);
3024 oldest_slot->first = onode->oid;
3025 oldest_slot->second = mono_clock::now();
3026 }
3027 }
3028 }
3029
3030 clear_needs_reshard();
3031}
3032
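// Encoding runs in two passes: the first sizes the output and bails out,
// requesting a reshard, if a non-spanning blob escapes [offset,
// offset+length); the second emits one varint 'blobid' per extent whose
// low bits carry the spanning/contiguous/zero-offset/same-length flags,
// then only the fields those flags do not elide, plus inline blob metadata
// the first time each non-spanning blob is referenced.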
3033bool BlueStore::ExtentMap::encode_some(
3034 uint32_t offset,
3035 uint32_t length,
3036 bufferlist& bl,
3037 unsigned *pn)
3038{
3039 Extent dummy(offset);
3040 auto start = extent_map.lower_bound(dummy);
3041 uint32_t end = offset + length;
3042
3043 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
3044 // serialization only. Hence there is no specific
3045 // handling at ExtentMap level.
3046
3047 unsigned n = 0;
3048 size_t bound = 0;
3049 bool must_reshard = false;
3050 for (auto p = start;
3051 p != extent_map.end() && p->logical_offset < end;
3052 ++p, ++n) {
11fdf7f2 3053 ceph_assert(p->logical_offset >= offset);
3054 p->blob->last_encoded_id = -1;
3055 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
3056 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3057 << std::dec << " hit new spanning blob " << *p << dendl;
3058 request_reshard(p->blob_start(), p->blob_end());
3059 must_reshard = true;
3060 }
3061 if (!must_reshard) {
3062 denc_varint(0, bound); // blobid
3063 denc_varint(0, bound); // logical_offset
3064 denc_varint(0, bound); // len
3065 denc_varint(0, bound); // blob_offset
7c673cae 3066
3067 p->blob->bound_encode(
3068 bound,
3069 struct_v,
3070 p->blob->shared_blob->get_sbid(),
3071 false);
3072 }
3073 }
3074 if (must_reshard) {
3075 return true;
3076 }
3077
3078 denc(struct_v, bound);
3079 denc_varint(0, bound); // number of extents
3080
3081 {
3082 auto app = bl.get_contiguous_appender(bound);
3083 denc(struct_v, app);
3084 denc_varint(n, app);
3085 if (pn) {
3086 *pn = n;
3087 }
3088
3089 n = 0;
3090 uint64_t pos = 0;
3091 uint64_t prev_len = 0;
3092 for (auto p = start;
3093 p != extent_map.end() && p->logical_offset < end;
3094 ++p, ++n) {
3095 unsigned blobid;
3096 bool include_blob = false;
3097 if (p->blob->is_spanning()) {
3098 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3099 blobid |= BLOBID_FLAG_SPANNING;
3100 } else if (p->blob->last_encoded_id < 0) {
3101 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3102 include_blob = true;
3103 blobid = 0; // the decoder will infer the id from n
3104 } else {
3105 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3106 }
3107 if (p->logical_offset == pos) {
3108 blobid |= BLOBID_FLAG_CONTIGUOUS;
3109 }
3110 if (p->blob_offset == 0) {
3111 blobid |= BLOBID_FLAG_ZEROOFFSET;
3112 }
3113 if (p->length == prev_len) {
3114 blobid |= BLOBID_FLAG_SAMELENGTH;
3115 } else {
3116 prev_len = p->length;
3117 }
3118 denc_varint(blobid, app);
3119 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3120 denc_varint_lowz(p->logical_offset - pos, app);
3121 }
3122 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3123 denc_varint_lowz(p->blob_offset, app);
3124 }
3125 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3126 denc_varint_lowz(p->length, app);
3127 }
3128 pos = p->logical_end();
3129 if (include_blob) {
3130 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3131 }
3132 }
3133 }
3134 /*derr << __func__ << bl << dendl;
3135 derr << __func__ << ":";
3136 bl.hexdump(*_dout);
3137 *_dout << dendl;
3138 */
3139 return false;
3140}
3141
3142/////////////////// BlueStore::ExtentMap::DecoderExtent ///////////
3143void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
3144 Extent* le,
3145 __u8 struct_v,
3146 bptr_c_it_t& p,
3147 Collection* c)
7c673cae 3148{
3149 uint64_t blobid;
3150 denc_varint(blobid, p);
3151 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3152 uint64_t gap;
3153 denc_varint_lowz(gap, p);
3154 pos += gap;
3155 }
3156 le->logical_offset = pos;
3157 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3158 denc_varint_lowz(le->blob_offset, p);
3159 } else {
3160 le->blob_offset = 0;
3161 }
3162 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3163 denc_varint_lowz(prev_len, p);
3164 }
3165 le->length = prev_len;
3166 if (blobid & BLOBID_FLAG_SPANNING) {
3167 consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
3168 } else {
3169 blobid >>= BLOBID_SHIFT_BITS;
3170 if (blobid) {
3171 consume_blobid(le, false, blobid - 1);
3172 } else {
3173 Blob *b = new Blob();
3174 uint64_t sbid = 0;
3175 b->decode(p, struct_v, &sbid, false, c);
3176 consume_blob(le, extent_pos, sbid, b);
3177 }
3178 }
3179 pos += prev_len;
3180 ++extent_pos;
3181}
3182
3183unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
3184 const bufferlist& bl, Collection* c)
3185{
3186 __u8 struct_v;
3187 uint32_t num;
7c673cae 3188
11fdf7f2 3189 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae 3190 auto p = bl.front().begin_deep();
3191 denc(struct_v, p);
3192 // Version 2 differs from v1 in blob's ref_map
3193 // serialization only. Hence there is no specific
3194 // handling at ExtentMap level below.
11fdf7f2 3195 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae 3196 denc_varint(num, p);
7c673cae 3197
39ae355f 3198 extent_pos = 0;
7c673cae 3199 while (!p.end()) {
3200 Extent* le = get_next_extent();
3201 decode_extent(le, struct_v, p, c);
3202 add_extent(le);
3203 }
3204 ceph_assert(extent_pos == num);
3205 return num;
3206}
7c673cae 3207
3208void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
3209 bptr_c_it_t& p, Collection* c)
3210{
3211 __u8 struct_v;
3212 denc(struct_v, p);
3213 // Version 2 differs from v1 in blob's ref_map
3214 // serialization only. Hence there is no specific
3215 // handling at ExtentMap level.
3216 ceph_assert(struct_v == 1 || struct_v == 2);
3217
3218 unsigned n;
3219 denc_varint(n, p);
3220 while (n--) {
3221 BlueStore::BlobRef b(new Blob());
3222 denc_varint(b->id, p);
3223 uint64_t sbid = 0;
3224 b->decode(p, struct_v, &sbid, true, c);
3225 consume_spanning_blob(sbid, b);
7c673cae 3226 }
39ae355f 3227}
7c673cae 3228
3229/////////////////// BlueStore::ExtentMap::DecoderExtentFull ///////////
3230void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
3231 BlueStore::Extent* le, bool spanning, uint64_t blobid) {
3232 ceph_assert(le);
3233 if (spanning) {
3234 le->assign_blob(extent_map.get_spanning_blob(blobid));
3235 } else {
3236 ceph_assert(blobid < blobs.size());
3237 le->assign_blob(blobs[blobid]);
3238 // we build ref_map dynamically for non-spanning blobs
3239 le->blob->get_ref(
3240 extent_map.onode->c,
3241 le->blob_offset,
3242 le->length);
3243 }
3244}
3245
3246void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
3247 BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
3248 ceph_assert(le);
3249 blobs.resize(extent_no + 1);
3250 blobs[extent_no] = b;
3251 extent_map.onode->c->open_shared_blob(sbid, b);
3252 le->assign_blob(b);
3253 le->blob->get_ref(
3254 extent_map.onode->c,
3255 le->blob_offset,
3256 le->length);
3257}
3258
3259void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
3260 uint64_t sbid, BlueStore::BlobRef b) {
3261 extent_map.spanning_blob_map[b->id] = b;
3262 extent_map.onode->c->open_shared_blob(sbid, b);
3263}
3264
3265BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
3266{
3267 return new Extent();
3268}
3269
3270void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
3271{
3272 extent_map.extent_map.insert(*le);
3273}
3274
3275unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3276{
3277 ExtentDecoderFull edecoder(*this);
3278 unsigned n = edecoder.decode_some(bl, onode->c);
3279 return n;
3280}
3281
3282void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3283{
3284 // Version 2 differs from v1 in blob's ref_map
3285 // serialization only. Hence there is no specific
3286 // handling at ExtentMap level.
3287 __u8 struct_v = 2;
3288
3289 denc(struct_v, p);
3290 denc_varint((uint32_t)0, p);
3291 size_t key_size = 0;
3292 denc_varint((uint32_t)0, key_size);
3293 p += spanning_blob_map.size() * key_size;
3294 for (const auto& i : spanning_blob_map) {
3295 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3296 }
3297}
3298
3299void BlueStore::ExtentMap::encode_spanning_blobs(
3300 bufferlist::contiguous_appender& p)
3301{
3302 // Version 2 differs from v1 in blob's ref_map
3303 // serialization only. Hence there is no specific
3304 // handling at ExtentMap level.
3305 __u8 struct_v = 2;
3306
3307 denc(struct_v, p);
3308 denc_varint(spanning_blob_map.size(), p);
3309 for (auto& i : spanning_blob_map) {
3310 denc_varint(i.second->id, p);
3311 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3312 }
3313}
3314
3315void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3316{
3317 shards.resize(onode->onode.extent_map_shards.size());
3318 unsigned i = 0;
3319 for (auto &s : onode->onode.extent_map_shards) {
3320 shards[i].shard_info = &s;
3321 shards[i].loaded = loaded;
3322 shards[i].dirty = dirty;
3323 ++i;
3324 }
3325}
3326
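// Make sure every shard overlapping [offset, offset+length) is loaded,
// decoding each missing one from its PREFIX_OBJ key; a shard that cannot
// be read, or whose size disagrees with shard_info, is fatal.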
3327void BlueStore::ExtentMap::fault_range(
3328 KeyValueDB *db,
3329 uint32_t offset,
3330 uint32_t length)
3331{
3332 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3333 << std::dec << dendl;
3334 auto start = seek_shard(offset);
3335 auto last = seek_shard(offset + length);
3336
3337 if (start < 0)
3338 return;
3339
11fdf7f2 3340 ceph_assert(last >= start);
3341 string key;
3342 while (start <= last) {
11fdf7f2 3343 ceph_assert((size_t)start < shards.size());
3344 auto p = &shards[start];
3345 if (!p->loaded) {
3346 dout(30) << __func__ << " opening shard 0x" << std::hex
3347 << p->shard_info->offset << std::dec << dendl;
3348 bufferlist v;
3349 generate_extent_shard_key_and_apply(
3350 onode->key, p->shard_info->offset, &key,
3351 [&](const string& final_key) {
3352 int r = db->get(PREFIX_OBJ, final_key, &v);
3353 if (r < 0) {
3354 derr << __func__ << " missing shard 0x" << std::hex
3355 << p->shard_info->offset << std::dec << " for " << onode->oid
3356 << dendl;
11fdf7f2 3357 ceph_assert(r >= 0);
3358 }
3359 }
3360 );
3361 p->extents = decode_some(v);
3362 p->loaded = true;
3363 dout(20) << __func__ << " open shard 0x" << std::hex
3364 << p->shard_info->offset
3365 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 3366 << " (" << v.length() << " bytes)" << dendl;
3367 ceph_assert(p->dirty == false);
3368 ceph_assert(v.length() == p->shard_info->bytes);
3369 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3370 } else {
3371 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3372 }
3373 ++start;
3374 }
3375}
3376
3377void BlueStore::ExtentMap::dirty_range(
3378 uint32_t offset,
3379 uint32_t length)
3380{
3381 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3382 << std::dec << dendl;
3383 if (shards.empty()) {
3384 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3385 inline_bl.clear();
3386 return;
3387 }
3388 auto start = seek_shard(offset);
3389 if (length == 0) {
3390 length = 1;
3391 }
3392 auto last = seek_shard(offset + length - 1);
3393 if (start < 0)
3394 return;
3395
11fdf7f2 3396 ceph_assert(last >= start);
7c673cae 3397 while (start <= last) {
11fdf7f2 3398 ceph_assert((size_t)start < shards.size());
3399 auto p = &shards[start];
3400 if (!p->loaded) {
 3401 derr << __func__ << " on write 0x" << std::hex << offset
3402 << "~" << length << " shard 0x" << p->shard_info->offset
3403 << std::dec << " is not loaded, can't mark dirty" << dendl;
3404 ceph_abort_msg("can't mark unloaded shard dirty");
3405 }
3406 if (!p->dirty) {
3407 dout(20) << __func__ << " mark shard 0x" << std::hex
3408 << p->shard_info->offset << std::dec << " dirty" << dendl;
3409 p->dirty = true;
3410 }
3411 ++start;
3412 }
3413}
3414
3415BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3416 uint64_t offset)
3417{
3418 Extent dummy(offset);
3419 return extent_map.find(dummy);
3420}
3421
3422BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3423 uint64_t offset)
3424{
3425 Extent dummy(offset);
3426 auto fp = extent_map.lower_bound(dummy);
3427 if (fp != extent_map.begin()) {
3428 --fp;
3429 if (fp->logical_end() <= offset) {
3430 ++fp;
3431 }
3432 }
3433 return fp;
3434}
3435
3436BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3437 uint64_t offset) const
3438{
3439 Extent dummy(offset);
3440 auto fp = extent_map.lower_bound(dummy);
3441 if (fp != extent_map.begin()) {
3442 --fp;
3443 if (fp->logical_end() <= offset) {
3444 ++fp;
3445 }
3446 }
3447 return fp;
3448}
3449
3450bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3451{
3452 auto fp = seek_lextent(offset);
3453 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3454 return false;
3455 }
3456 return true;
3457}
3458
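// Merge adjacent lextents that continue one another both logically and
// inside the same blob, never across a shard boundary. For example,
// 0x0~0x1000 and 0x1000~0x1000 mapping to the same blob at blob offsets
// 0x0 and 0x1000 collapse into a single 0x0~0x2000 extent. Returns the
// number of extents removed.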
3459int BlueStore::ExtentMap::compress_extent_map(
3460 uint64_t offset,
3461 uint64_t length)
3462{
3463 if (extent_map.empty())
3464 return 0;
3465 int removed = 0;
3466 auto p = seek_lextent(offset);
3467 if (p != extent_map.begin()) {
3468 --p; // start to the left of offset
3469 }
3470 // the caller should have just written to this region
11fdf7f2 3471 ceph_assert(p != extent_map.end());
3472
3473 // identify the *next* shard
3474 auto pshard = shards.begin();
3475 while (pshard != shards.end() &&
3476 p->logical_offset >= pshard->shard_info->offset) {
3477 ++pshard;
3478 }
3479 uint64_t shard_end;
3480 if (pshard != shards.end()) {
3481 shard_end = pshard->shard_info->offset;
3482 } else {
3483 shard_end = OBJECT_MAX_SIZE;
3484 }
3485
3486 auto n = p;
3487 for (++n; n != extent_map.end(); p = n++) {
3488 if (n->logical_offset > offset + length) {
3489 break; // stop after end
3490 }
3491 while (n != extent_map.end() &&
3492 p->logical_end() == n->logical_offset &&
3493 p->blob == n->blob &&
3494 p->blob_offset + p->length == n->blob_offset &&
3495 n->logical_offset < shard_end) {
3496 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3497 << " next shard 0x" << shard_end << std::dec
3498 << " merging " << *p << " and " << *n << dendl;
3499 p->length += n->length;
3500 rm(n++);
3501 ++removed;
3502 }
3503 if (n == extent_map.end()) {
3504 break;
3505 }
3506 if (n->logical_offset >= shard_end) {
11fdf7f2 3507 ceph_assert(pshard != shards.end());
3508 ++pshard;
3509 if (pshard != shards.end()) {
3510 shard_end = pshard->shard_info->offset;
3511 } else {
3512 shard_end = OBJECT_MAX_SIZE;
3513 }
3514 }
3515 }
11fdf7f2 3516 if (removed) {
3517 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3518 }
3519 return removed;
3520}
3521
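// Remove [offset, offset+length) from the lextent map, handling the same
// four overlap cases as BufferSpace::_discard; every dereferenced piece is
// recorded in old_extents so the caller can release blob space once the
// transaction commits.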
3522void BlueStore::ExtentMap::punch_hole(
3523 CollectionRef &c,
3524 uint64_t offset,
3525 uint64_t length,
3526 old_extent_map_t *old_extents)
3527{
3528 auto p = seek_lextent(offset);
3529 uint64_t end = offset + length;
3530 while (p != extent_map.end()) {
3531 if (p->logical_offset >= end) {
3532 break;
3533 }
3534 if (p->logical_offset < offset) {
3535 if (p->logical_end() > end) {
3536 // split and deref middle
3537 uint64_t front = offset - p->logical_offset;
3538 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3539 length, p->blob);
3540 old_extents->push_back(*oe);
3541 add(end,
3542 p->blob_offset + front + length,
3543 p->length - front - length,
3544 p->blob);
3545 p->length = front;
3546 break;
3547 } else {
3548 // deref tail
11fdf7f2 3549 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
3550 uint64_t keep = offset - p->logical_offset;
3551 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3552 p->length - keep, p->blob);
3553 old_extents->push_back(*oe);
3554 p->length = keep;
3555 ++p;
3556 continue;
3557 }
3558 }
3559 if (p->logical_offset + p->length <= end) {
3560 // deref whole lextent
3561 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3562 p->length, p->blob);
3563 old_extents->push_back(*oe);
3564 rm(p++);
3565 continue;
3566 }
3567 // deref head
3568 uint64_t keep = p->logical_end() - end;
3569 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3570 p->length - keep, p->blob);
3571 old_extents->push_back(*oe);
3572
3573 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3574 rm(p);
3575 break;
3576 }
3577}
3578
3579BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3580 CollectionRef &c,
3581 uint64_t logical_offset,
3582 uint64_t blob_offset, uint64_t length, BlobRef b,
3583 old_extent_map_t *old_extents)
3584{
 3585 // We need a completely initialized Blob in order to increment its ref counters.
11fdf7f2 3586 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3587
3588 // Do get_ref prior to punch_hole to prevent putting a reused blob into
3589 // the old_extents list if we overwrite the blob totally.
3590 // This might happen during WAL overwrite.
3591 b->get_ref(onode->c, blob_offset, length);
3592
3593 if (old_extents) {
3594 punch_hole(c, logical_offset, length, old_extents);
3595 }
3596
3597 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3598 extent_map.insert(*le);
3599 if (spans_shard(logical_offset, length)) {
3600 request_reshard(logical_offset, logical_offset + length);
3601 }
3602 return le;
3603}
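// Hedged usage sketch: a total overwrite of blob b would look roughly like
//   set_lextent(c, loff, 0, b->get_blob().get_logical_length(), b, &old_extents);
// Because get_ref() runs before punch_hole(), b stays referenced even while
// the punched-out extents (possibly its only other users) sit on the
// old_extents list, which is the ordering the comment above requires.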
3604
3605BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3606 BlobRef lb,
3607 uint32_t blob_offset,
3608 uint32_t pos)
3609{
7c673cae
FG
3610 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3611 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3612 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3613 << dendl;
3614 BlobRef rb = onode->c->new_blob();
3615 lb->split(onode->c, blob_offset, rb.get());
3616
3617 for (auto ep = seek_lextent(pos);
3618 ep != extent_map.end() && ep->logical_offset < end_pos;
3619 ++ep) {
3620 if (ep->blob != lb) {
3621 continue;
3622 }
3623 if (ep->logical_offset < pos) {
3624 // split extent
3625 size_t left = pos - ep->logical_offset;
3626 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3627 extent_map.insert(*ne);
3628 ep->length = left;
3629 dout(30) << __func__ << " split " << *ep << dendl;
3630 dout(30) << __func__ << " to " << *ne << dendl;
3631 } else {
3632 // switch blob
11fdf7f2 3633 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3634
3635 ep->blob = rb;
3636 ep->blob_offset -= blob_offset;
3637 dout(30) << __func__ << " adjusted " << *ep << dendl;
3638 }
3639 }
3640 return rb;
3641}
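// Illustrative split (assumed numbers): with blob_offset 0x2000 and pos
// 0x12000, an lextent {loff 0x11000, boff 0x1000, len 0x2000} crossing pos
// is cut into
//   {loff 0x11000, boff 0x1000, len 0x1000}  -- stays on lb
//   {loff 0x12000, boff 0x0,    len 0x1000}  -- inserted for rb
// while extents wholly at or past pos simply switch blobs, with blob_offset
// reduced by 0x2000 since rb's data starts there in the old blob.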
3642
3643// Onode
3644
3645#undef dout_prefix
3646#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3647
20effc67
TL
3648const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
3649{
3650 if (bluestore_onode_t::is_pgmeta_omap(flags)) {
3651 return PREFIX_PGMETA_OMAP;
3652 }
3653 if (bluestore_onode_t::is_perpg_omap(flags)) {
3654 return PREFIX_PERPG_OMAP;
3655 }
3656 if (bluestore_onode_t::is_perpool_omap(flags)) {
3657 return PREFIX_PERPOOL_OMAP;
3658 }
3659 return PREFIX_OMAP;
3660}
3661
3662// '-' < '.' < '~'
3663void BlueStore::Onode::calc_omap_header(
3664 uint8_t flags,
3665 const Onode* o,
3666 std::string* out)
3667{
3668 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3669 if (bluestore_onode_t::is_perpg_omap(flags)) {
3670 _key_encode_u64(o->c->pool(), out);
3671 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3672 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3673 _key_encode_u64(o->c->pool(), out);
3674 }
3675 }
3676 _key_encode_u64(o->onode.nid, out);
3677 out->push_back('-');
3678}
3679
3680void BlueStore::Onode::calc_omap_key(uint8_t flags,
3681 const Onode* o,
3682 const std::string& key,
3683 std::string* out)
3684{
3685 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3686 if (bluestore_onode_t::is_perpg_omap(flags)) {
3687 _key_encode_u64(o->c->pool(), out);
3688 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3689 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3690 _key_encode_u64(o->c->pool(), out);
3691 }
3692 }
3693 _key_encode_u64(o->onode.nid, out);
3694 out->push_back('.');
3695 out->append(key);
3696}
3697
3698void BlueStore::Onode::calc_omap_tail(
3699 uint8_t flags,
3700 const Onode* o,
3701 std::string* out)
3702{
3703 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3704 if (bluestore_onode_t::is_perpg_omap(flags)) {
3705 _key_encode_u64(o->c->pool(), out);
3706 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3707 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3708 _key_encode_u64(o->c->pool(), out);
3709 }
3710 }
3711 _key_encode_u64(o->onode.nid, out);
3712 out->push_back('~');
3713}
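// Resulting key layout (sketch; the per-pg flavour is shown, per-pool drops
// the hash and the legacy flavour drops pool and hash):
//
//   [pool u64][hash u32][nid u64] '-'              <- omap header
//   [pool u64][hash u32][nid u64] '.' <user key>   <- each omap entry
//   [pool u64][hash u32][nid u64] '~'              <- tail sentinel
//
// Since '-' < '.' < '~' in ASCII, a forward scan from the header visits
// every entry of this onode and stops at the tail without leaking into a
// neighbouring onode's keyspace.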
3714
39ae355f
TL
3715void BlueStore::Onode::get()
3716{
3717 ++nref;
3718 ++pin_nref;
f6b5b4d7 3719}
39ae355f
TL
3720void BlueStore::Onode::put()
3721{
3722 if (--pin_nref == 1) {
3723 c->get_onode_cache()->maybe_unpin(this);
f6b5b4d7 3724 }
39ae355f 3725 if (--nref == 0) {
f6b5b4d7
TL
3726 delete this;
3727 }
3728}
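// Note (a reading of the code above, not authoritative): nref counts every
// holder, while pin_nref tracks the holders that keep the onode pinned in
// its cache shard. When put() drops pin_nref back to 1, only the cache's
// own reference is left and maybe_unpin() lets the shard consider the
// onode for trimming again.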
3729
39ae355f
TL
3730void BlueStore::Onode::decode_raw(
3731 BlueStore::Onode* on,
3732 const bufferlist& v,
3733 BlueStore::ExtentMap::ExtentDecoder& edecoder)
eafe8130 3734{
eafe8130
TL
3735 on->exists = true;
3736 auto p = v.front().begin_deep();
3737 on->onode.decode(p);
eafe8130
TL
3738
3739 // initialize extent_map
39ae355f 3740 edecoder.decode_spanning_blobs(p, on->c);
eafe8130
TL
3741 if (on->onode.extent_map_shards.empty()) {
3742 denc(on->extent_map.inline_bl, p);
39ae355f 3743 edecoder.decode_some(on->extent_map.inline_bl, on->c);
eafe8130 3744 }
39ae355f
TL
3745}
3746
3747BlueStore::Onode* BlueStore::Onode::create_decode(
3748 CollectionRef c,
3749 const ghobject_t& oid,
3750 const string& key,
3751 const bufferlist& v,
3752 bool allow_empty)
3753{
3754 ceph_assert(v.length() || allow_empty);
3755 Onode* on = new Onode(c.get(), oid, key);
3756
3757 if (v.length()) {
3758 ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
3759 decode_raw(on, v, edecoder);
3760
3761 for (auto& i : on->onode.attrs) {
3762 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3763 }
3764
3765 // initialize extent_map
3766 if (on->onode.extent_map_shards.empty()) {
3767 on->extent_map.inline_bl.reassign_to_mempool(
3768 mempool::mempool_bluestore_cache_data);
3769 } else {
3770 on->extent_map.init_shards(false, false);
3771 }
eafe8130
TL
3772 }
3773 return on;
3774}
3775
7c673cae
FG
3776void BlueStore::Onode::flush()
3777{
3778 if (flushing_count.load()) {
3779 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3780 waiting_count++;
11fdf7f2 3781 std::unique_lock l(flush_lock);
7c673cae
FG
3782 while (flushing_count.load()) {
3783 flush_cond.wait(l);
3784 }
9f95a23c 3785 waiting_count--;
7c673cae
FG
3786 }
3787 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3788}
3789
9f95a23c
TL
3790void BlueStore::Onode::dump(Formatter* f) const
3791{
3792 onode.dump(f);
3793 extent_map.dump(f);
3794}
3795
9f95a23c
TL
3796void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3797{
f67539c2
TL
3798 if (!onode.is_pgmeta_omap()) {
3799 if (onode.is_perpg_omap()) {
3800 _key_encode_u64(c->pool(), out);
3801 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3802 } else if (onode.is_perpool_omap()) {
3803 _key_encode_u64(c->pool(), out);
3804 }
9f95a23c
TL
3805 }
3806 _key_encode_u64(onode.nid, out);
3807 out->append(old.c_str() + out->length(), old.size() - out->length());
3808}
3809
9f95a23c
TL
3810void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3811{
f67539c2
TL
3812 size_t pos = sizeof(uint64_t) + 1;
3813 if (!onode.is_pgmeta_omap()) {
3814 if (onode.is_perpg_omap()) {
3815 pos += sizeof(uint64_t) + sizeof(uint32_t);
3816 } else if (onode.is_perpool_omap()) {
3817 pos += sizeof(uint64_t);
3818 }
9f95a23c 3819 }
f67539c2 3820 *user_key = key.substr(pos);
9f95a23c
TL
3821}
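// Offset arithmetic sketch: every key built above carries the encoded nid
// (8 bytes) plus a 1-byte separator, so the minimum prefix to skip is
// sizeof(uint64_t) + 1 = 9 bytes; per-pool keys add 8 more (pool id) and
// per-pg keys a further 4 (bitwise hash), i.e. prefixes of 9, 17, and 21
// bytes before the user key begins.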
3822
7c673cae
FG
3823// =======================================================
3824// WriteContext
3825
3826/// Checks for writes to the same pextent within a blob
3827bool BlueStore::WriteContext::has_conflict(
3828 BlobRef b,
3829 uint64_t loffs,
3830 uint64_t loffs_end,
3831 uint64_t min_alloc_size)
3832{
11fdf7f2
TL
3833 ceph_assert((loffs % min_alloc_size) == 0);
3834 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3835 for (auto w : writes) {
3836 if (b == w.b) {
11fdf7f2
TL
3837 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3838 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3839 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3840 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3841 return true;
3842 }
3843 }
3844 }
3845 return false;
3846}
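// Worked example (min_alloc_size = 0x1000, values illustrative): a queued
// write at logical_offset 0x1800 with length0 0x400 widens to the unit
// range [0x1000, 0x2000); a new write spanning [0x0, 0x2000) then conflicts
// because the two ranges share an allocation unit, which is precisely what
// the pair of half-open interval tests above detects.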
3847
3848// =======================================================
3849
3850// DeferredBatch
3851#undef dout_prefix
3852#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
9f95a23c
TL
3853#undef dout_context
3854#define dout_context cct
7c673cae
FG
3855
3856void BlueStore::DeferredBatch::prepare_write(
3857 CephContext *cct,
3858 uint64_t seq, uint64_t offset, uint64_t length,
3859 bufferlist::const_iterator& blp)
3860{
3861 _discard(cct, offset, length);
3862 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3863 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3864 i.first->second.seq = seq;
3865 blp.copy(length, i.first->second.bl);
31f18b77
FG
3866 i.first->second.bl.reassign_to_mempool(
3867 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3868 dout(20) << __func__ << " seq " << seq
3869 << " 0x" << std::hex << offset << "~" << length
3870 << " crc " << i.first->second.bl.crc32c(-1)
3871 << std::dec << dendl;
3872 seq_bytes[seq] += length;
3873#ifdef DEBUG_DEFERRED
3874 _audit(cct);
3875#endif
3876}
3877
3878void BlueStore::DeferredBatch::_discard(
3879 CephContext *cct, uint64_t offset, uint64_t length)
3880{
3881 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3882 << std::dec << dendl;
3883 auto p = iomap.lower_bound(offset);
3884 if (p != iomap.begin()) {
3885 --p;
3886 auto end = p->first + p->second.bl.length();
3887 if (end > offset) {
3888 bufferlist head;
3889 head.substr_of(p->second.bl, 0, offset - p->first);
3890 dout(20) << __func__ << " keep head " << p->second.seq
3891 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3892 << " -> 0x" << head.length() << std::dec << dendl;
3893 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3894 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3895 if (end > offset + length) {
3896 bufferlist tail;
3897 tail.substr_of(p->second.bl, offset + length - p->first,
3898 end - (offset + length));
3899 dout(20) << __func__ << " keep tail " << p->second.seq
3900 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3901 << " -> 0x" << tail.length() << std::dec << dendl;
3902 auto &n = iomap[offset + length];
3903 n.bl.swap(tail);
3904 n.seq = p->second.seq;
3905 i->second -= length;
3906 } else {
3907 i->second -= end - offset;
3908 }
11fdf7f2 3909 ceph_assert(i->second >= 0);
7c673cae
FG
3910 p->second.bl.swap(head);
3911 }
3912 ++p;
3913 }
3914 while (p != iomap.end()) {
3915 if (p->first >= offset + length) {
3916 break;
3917 }
3918 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3919 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3920 auto end = p->first + p->second.bl.length();
3921 if (end > offset + length) {
3922 unsigned drop_front = offset + length - p->first;
3923 unsigned keep_tail = end - (offset + length);
3924 dout(20) << __func__ << " truncate front " << p->second.seq
3925 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3926 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3927 << " to 0x" << (offset + length) << "~" << keep_tail
3928 << std::dec << dendl;
3929 auto &s = iomap[offset + length];
3930 s.seq = p->second.seq;
3931 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3932 i->second -= drop_front;
3933 } else {
3934 dout(20) << __func__ << " drop " << p->second.seq
3935 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3936 << std::dec << dendl;
3937 i->second -= p->second.bl.length();
3938 }
11fdf7f2 3939 ceph_assert(i->second >= 0);
7c673cae
FG
3940 p = iomap.erase(p);
3941 }
3942}
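// Trimming sketch (illustrative offsets): given a queued io at [0x0, 0x3000)
// and _discard(cct, 0x1000, 0x1000), the head branch above keeps a
// 0x1000-byte head at 0x0, re-inserts a 0x1000-byte tail at 0x2000 under the
// same seq, and debits seq_bytes by the 0x1000 bytes dropped, keeping the
// per-seq accounting that _audit() verifies exact.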
3943
3944void BlueStore::DeferredBatch::_audit(CephContext *cct)
3945{
3946 map<uint64_t,int> sb;
3947 for (auto p : seq_bytes) {
3948 sb[p.first] = 0; // make sure we have the same set of keys
3949 }
3950 uint64_t pos = 0;
3951 for (auto& p : iomap) {
11fdf7f2 3952 ceph_assert(p.first >= pos);
7c673cae
FG
3953 sb[p.second.seq] += p.second.bl.length();
3954 pos = p.first + p.second.bl.length();
3955 }
11fdf7f2 3956 ceph_assert(sb == seq_bytes);
7c673cae
FG
3957}
3958
3959
3960// Collection
3961
3962#undef dout_prefix
3963#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3964
9f95a23c
TL
3965BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3966 : CollectionImpl(store_->cct, cid),
11fdf7f2 3967 store(store_),
9f95a23c 3968 cache(bc),
7c673cae 3969 exists(true),
39ae355f 3970 onode_space(oc),
11fdf7f2
TL
3971 commit_queue(nullptr)
3972{
3973}
3974
3975bool BlueStore::Collection::flush_commit(Context *c)
3976{
3977 return osr->flush_commit(c);
3978}
3979
3980void BlueStore::Collection::flush()
3981{
3982 osr->flush();
3983}
3984
3985void BlueStore::Collection::flush_all_but_last()
7c673cae 3986{
11fdf7f2 3987 osr->flush_all_but_last();
7c673cae
FG
3988}
3989
3990void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3991{
11fdf7f2 3992 ceph_assert(!b->shared_blob);
7c673cae
FG
3993 const bluestore_blob_t& blob = b->get_blob();
3994 if (!blob.is_shared()) {
3995 b->shared_blob = new SharedBlob(this);
3996 return;
3997 }
3998
3999 b->shared_blob = shared_blob_set.lookup(sbid);
4000 if (b->shared_blob) {
4001 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4002 << std::dec << " had " << *b->shared_blob << dendl;
4003 } else {
4004 b->shared_blob = new SharedBlob(sbid, this);
4005 shared_blob_set.add(this, b->shared_blob.get());
4006 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4007 << std::dec << " opened " << *b->shared_blob
4008 << dendl;
4009 }
4010}
4011
4012void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
4013{
4014 if (!sb->is_loaded()) {
4015
4016 bufferlist v;
4017 string key;
4018 auto sbid = sb->get_sbid();
4019 get_shared_blob_key(sbid, &key);
4020 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
4021 if (r < 0) {
4022 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
4023 << std::dec << " not found at key "
4024 << pretty_binary_string(key) << dendl;
11fdf7f2 4025 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
4026 }
4027
4028 sb->loaded = true;
4029 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
4030 auto p = v.cbegin();
4031 decode(*(sb->persistent), p);
7c673cae
FG
4032 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
4033 << std::dec << " loaded shared_blob " << *sb << dendl;
4034 }
4035}
4036
4037void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
4038{
7c673cae 4039 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 4040 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
4041
4042 // update blob
31f18b77 4043 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 4044 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
4045
4046 // update shared blob
4047 b->shared_blob->loaded = true;
4048 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
4049 shared_blob_set.add(this, b->shared_blob.get());
4050 for (auto p : blob.get_extents()) {
4051 if (p.is_valid()) {
4052 b->shared_blob->get_ref(
4053 p.offset,
4054 p.length);
4055 }
4056 }
4057 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
4058}
4059
31f18b77
FG
4060uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
4061{
4062 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 4063 ceph_assert(sb->is_loaded());
31f18b77
FG
4064
4065 uint64_t sbid = sb->get_sbid();
4066 shared_blob_set.remove(sb);
4067 sb->loaded = false;
4068 delete sb->persistent;
4069 sb->sbid_unloaded = 0;
4070 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
4071 return sbid;
4072}
4073
7c673cae
FG
4074BlueStore::OnodeRef BlueStore::Collection::get_onode(
4075 const ghobject_t& oid,
9f95a23c
TL
4076 bool create,
4077 bool is_createop)
7c673cae 4078{
9f95a23c 4079 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
7c673cae
FG
4080
4081 spg_t pgid;
4082 if (cid.is_pg(&pgid)) {
4083 if (!oid.match(cnode.bits, pgid.ps())) {
4084 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
4085 << pgid << " bits " << cnode.bits << dendl;
4086 ceph_abort();
4087 }
4088 }
4089
39ae355f 4090 OnodeRef o = onode_space.lookup(oid);
7c673cae
FG
4091 if (o)
4092 return o;
4093
eafe8130 4094 string key;
7c673cae
FG
4095 get_object_key(store->cct, oid, &key);
4096
4097 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
4098 << pretty_binary_string(key) << dendl;
4099
4100 bufferlist v;
9f95a23c 4101 int r = -ENOENT;
7c673cae 4102 Onode *on;
9f95a23c
TL
4103 if (!is_createop) {
4104 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
4105 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4106 }
7c673cae 4107 if (v.length() == 0) {
11fdf7f2 4108 ceph_assert(r == -ENOENT);
f67539c2 4109 if (!create)
7c673cae 4110 return OnodeRef();
7c673cae 4111 } else {
11fdf7f2 4112 ceph_assert(r >= 0);
7c673cae 4113 }
39ae355f
TL
4114
4115 // new object, load onode if available
4116 on = Onode::create_decode(this, oid, key, v, true);
7c673cae 4117 o.reset(on);
39ae355f 4118 return onode_space.add_onode(oid, o);
7c673cae
FG
4119}
4120
4121void BlueStore::Collection::split_cache(
4122 Collection *dest)
4123{
4124 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4125
f67539c2
TL
4126 auto *ocache = get_onode_cache();
4127 auto *ocache_dest = dest->get_onode_cache();
4128
4129 // lock cache shards
4130 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4131 std::lock_guard l(ocache->lock, std::adopt_lock);
4132 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4133 std::lock_guard l3(cache->lock, std::adopt_lock);
4134 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
7c673cae
FG
4135
4136 int destbits = dest->cnode.bits;
4137 spg_t destpg;
4138 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 4139 ceph_assert(is_pg);
7c673cae 4140
39ae355f
TL
4141 auto p = onode_space.onode_map.begin();
4142 while (p != onode_space.onode_map.end()) {
11fdf7f2 4143 OnodeRef o = p->second;
7c673cae
FG
4144 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4145 // onode does not belong to this child
11fdf7f2
TL
4146 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4147 << dendl;
7c673cae
FG
4148 ++p;
4149 } else {
7c673cae
FG
4150 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4151 << dendl;
4152
39ae355f 4153 // ensuring that nref is always >= 2 and hence onode is pinned
f6b5b4d7 4154 OnodeRef o_pin = o;
f6b5b4d7 4155
39ae355f
TL
4156 p = onode_space.onode_map.erase(p);
4157 dest->onode_space.onode_map[o->oid] = o;
adb31ebb 4158 if (o->cached) {
39ae355f 4159 get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
9f95a23c 4160 }
f6b5b4d7 4161 o->c = dest;
7c673cae
FG
4162
4163 // move over shared blobs and buffers. cover shared blobs from
4164 // both extent map and spanning blob map (the full extent map
4165 // may not be faulted in)
4166 vector<SharedBlob*> sbvec;
4167 for (auto& e : o->extent_map.extent_map) {
4168 sbvec.push_back(e.blob->shared_blob.get());
4169 }
4170 for (auto& b : o->extent_map.spanning_blob_map) {
4171 sbvec.push_back(b.second->shared_blob.get());
4172 }
4173 for (auto sb : sbvec) {
4174 if (sb->coll == dest) {
4175 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4176 << dendl;
4177 continue;
4178 }
4179 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
4180 if (sb->get_sbid()) {
4181 ldout(store->cct, 20) << __func__
4182 << " moving registration " << *sb << dendl;
4183 shared_blob_set.remove(sb);
4184 dest->shared_blob_set.add(dest, sb);
4185 }
3efd9988 4186 sb->coll = dest;
7c673cae 4187 if (dest->cache != cache) {
7c673cae
FG
4188 for (auto& i : sb->bc.buffer_map) {
4189 if (!i.second->is_writing()) {
4190 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4191 << dendl;
9f95a23c 4192 dest->cache->_move(cache, i.second.get());
7c673cae
FG
4193 }
4194 }
4195 }
4196 }
7c673cae
FG
4197 }
4198 }
9f95a23c 4199 dest->cache->_trim();
7c673cae
FG
4200}
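// Locking note (sketch of the standard idiom used above): std::lock(...)
// acquires all four shard locks via a deadlock-avoidance algorithm and the
// std::adopt_lock guards only take ownership for RAII release, e.g.
//
//   std::lock(m1, m2);
//   std::lock_guard g1(m1, std::adopt_lock);
//   std::lock_guard g2(m2, std::adopt_lock);
//
// so two concurrent split_cache() calls over overlapping shards cannot
// deadlock regardless of argument order.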
4201
7c673cae
FG
4202// =======================================================
4203
91327a77
AA
4204// MempoolThread
4205
4206#undef dout_prefix
4207#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
9f95a23c
TL
4208#undef dout_context
4209#define dout_context store->cct
91327a77 4210
7c673cae
FG
4211void *BlueStore::MempoolThread::entry()
4212{
9f95a23c 4213 std::unique_lock l{lock};
11fdf7f2 4214
92f5a8d4 4215 uint32_t prev_config_change = store->config_changed.load();
eafe8130
TL
4216 uint64_t base = store->osd_memory_base;
4217 double fragmentation = store->osd_memory_expected_fragmentation;
4218 uint64_t target = store->osd_memory_target;
4219 uint64_t min = store->osd_memory_cache_min;
4220 uint64_t max = min;
4221
4222 // When setting the maximum amount of memory to use for cache, first
4223 // assume some base amount of memory for the OSD and then fudge in
4224 // some overhead for fragmentation that scales with cache usage.
4225 uint64_t ltarget = (1.0 - fragmentation) * target;
4226 if (ltarget > base + min) {
4227 max = ltarget - base;
11fdf7f2 4228 }
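// Illustrative numbers (not defaults): target = 4 GiB, fragmentation = 0.2,
// base = 0.75 GiB, min = 0.125 GiB gives
//   ltarget = 0.8 * 4 GiB = 3.2 GiB  > base + min = 0.875 GiB
//   max     = 3.2 GiB - 0.75 GiB     = 2.45 GiB
// i.e. the cache ceiling is what remains of the de-rated memory target
// after the fixed OSD base footprint is set aside.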
31f18b77 4229
eafe8130 4230 binned_kv_cache = store->db->get_priority_cache();
f67539c2 4231 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
eafe8130
TL
4232 if (store->cache_autotune && binned_kv_cache != nullptr) {
4233 pcm = std::make_shared<PriorityCache::Manager>(
f67539c2 4234 store->cct, min, max, target, true, "bluestore-pricache");
eafe8130
TL
4235 pcm->insert("kv", binned_kv_cache, true);
4236 pcm->insert("meta", meta_cache, true);
4237 pcm->insert("data", data_cache, true);
f67539c2
TL
4238 if (binned_kv_onode_cache != nullptr) {
4239 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4240 }
eafe8130 4241 }
91327a77
AA
4242
4243 utime_t next_balance = ceph_clock_now();
4244 utime_t next_resize = ceph_clock_now();
20effc67 4245 utime_t next_bin_rotation = ceph_clock_now();
9f95a23c
TL
4246 utime_t next_deferred_force_submit = ceph_clock_now();
4247 utime_t alloc_stats_dump_clock = ceph_clock_now();
31f18b77 4248
91327a77 4249 bool interval_stats_trim = false;
91327a77 4250 while (!stop) {
92f5a8d4
TL
4251 // Update pcm cache settings if related configuration was changed
4252 uint32_t cur_config_change = store->config_changed.load();
4253 if (cur_config_change != prev_config_change) {
4254 _update_cache_settings();
4255 prev_config_change = cur_config_change;
4256 }
4257
20effc67
TL
4258 // define various intervals for background work
4259 double age_bin_interval = store->cache_age_bin_interval;
91327a77
AA
4260 double autotune_interval = store->cache_autotune_interval;
4261 double resize_interval = store->osd_memory_cache_resize_interval;
9f95a23c 4262 double max_defer_interval = store->max_defer_interval;
9f95a23c
TL
4263 double alloc_stats_dump_interval =
4264 store->cct->_conf->bluestore_alloc_stats_dump_interval;
91327a77 4265
20effc67 4266 // alloc stats dump
9f95a23c
TL
4267 if (alloc_stats_dump_interval > 0 &&
4268 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4269 store->_record_allocation_stats();
4270 alloc_stats_dump_clock = ceph_clock_now();
4271 }
20effc67
TL
4272 // cache age binning
4273 if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
4274 if (binned_kv_cache != nullptr) {
4275 binned_kv_cache->import_bins(store->kv_bins);
4276 }
4277 if (binned_kv_onode_cache != nullptr) {
4278 binned_kv_onode_cache->import_bins(store->kv_onode_bins);
4279 }
4280 meta_cache->import_bins(store->meta_bins);
4281 data_cache->import_bins(store->data_bins);
4282
4283 if (pcm != nullptr) {
4284 pcm->shift_bins();
4285 }
4286 next_bin_rotation = ceph_clock_now();
4287 next_bin_rotation += age_bin_interval;
4288 }
4289 // cache balancing
91327a77 4290 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
20effc67
TL
4291 if (binned_kv_cache != nullptr) {
4292 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4293 }
4294 if (binned_kv_onode_cache != nullptr) {
4295 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4296 }
4297 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4298 data_cache->set_cache_ratio(store->cache_data_ratio);
11fdf7f2 4299
91327a77 4300 // Log events at 5 instead of 20 when balance happens.
91327a77 4301 interval_stats_trim = true;
eafe8130
TL
4302
4303 if (pcm != nullptr) {
4304 pcm->balance();
91327a77 4305 }
31f18b77 4306
91327a77
AA
4307 next_balance = ceph_clock_now();
4308 next_balance += autotune_interval;
4309 }
20effc67 4310 // memory resizing (i.e. autotuning)
91327a77 4311 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
eafe8130
TL
4312 if (ceph_using_tcmalloc() && pcm != nullptr) {
4313 pcm->tune_memory();
91327a77
AA
4314 }
4315 next_resize = ceph_clock_now();
4316 next_resize += resize_interval;
31f18b77 4317 }
20effc67 4318 // deferred force submit
9f95a23c
TL
4319 if (max_defer_interval > 0 &&
4320 next_deferred_force_submit < ceph_clock_now()) {
4321 if (store->get_deferred_last_submitted() + max_defer_interval <
4322 ceph_clock_now()) {
4323 store->deferred_try_submit();
4324 }
4325 next_deferred_force_submit = ceph_clock_now();
4326 next_deferred_force_submit += max_defer_interval/3;
4327 }
4328
4329 // Now resize the shards
4330 _resize_shards(interval_stats_trim);
91327a77 4331 interval_stats_trim = false;
31f18b77 4332
1e59de90 4333 store->_update_logger();
11fdf7f2
TL
4334 auto wait = ceph::make_timespan(
4335 store->cct->_conf->bluestore_cache_trim_interval);
4336 cond.wait_for(l, wait);
7c673cae 4337 }
9f95a23c
TL
4338 // do final dump
4339 store->_record_allocation_stats();
7c673cae 4340 stop = false;
f67539c2 4341 pcm = nullptr;
7c673cae
FG
4342 return NULL;
4343}
4344
9f95a23c 4345void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
91327a77 4346{
9f95a23c
TL
4347 size_t onode_shards = store->onode_cache_shards.size();
4348 size_t buffer_shards = store->buffer_cache_shards.size();
91327a77 4349 int64_t kv_used = store->db->get_cache_usage();
f67539c2 4350 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
11fdf7f2
TL
4351 int64_t meta_used = meta_cache->_get_used_bytes();
4352 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
4353
4354 uint64_t cache_size = store->cache_size;
4355 int64_t kv_alloc =
20effc67 4356 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
f67539c2
TL
4357 int64_t kv_onode_alloc =
4358 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
91327a77 4359 int64_t meta_alloc =
11fdf7f2 4360 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 4361 int64_t data_alloc =
11fdf7f2 4362 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 4363
eafe8130
TL
4364 if (pcm != nullptr && binned_kv_cache != nullptr) {
4365 cache_size = pcm->get_tuned_mem();
11fdf7f2
TL
4366 kv_alloc = binned_kv_cache->get_committed_size();
4367 meta_alloc = meta_cache->get_committed_size();
4368 data_alloc = data_cache->get_committed_size();
f67539c2
TL
4369 if (binned_kv_onode_cache != nullptr) {
4370 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4371 }
91327a77
AA
4372 }
4373
4374 if (interval_stats) {
9f95a23c 4375 dout(5) << __func__ << " cache_size: " << cache_size
91327a77
AA
4376 << " kv_alloc: " << kv_alloc
4377 << " kv_used: " << kv_used
f67539c2
TL
4378 << " kv_onode_alloc: " << kv_onode_alloc
4379 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4380 << " meta_alloc: " << meta_alloc
4381 << " meta_used: " << meta_used
4382 << " data_alloc: " << data_alloc
4383 << " data_used: " << data_used << dendl;
4384 } else {
9f95a23c 4385 dout(20) << __func__ << " cache_size: " << cache_size
91327a77
AA
4386 << " kv_alloc: " << kv_alloc
4387 << " kv_used: " << kv_used
f67539c2
TL
4388 << " kv_onode_alloc: " << kv_onode_alloc
4389 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4390 << " meta_alloc: " << meta_alloc
4391 << " meta_used: " << meta_used
4392 << " data_alloc: " << data_alloc
4393 << " data_used: " << data_used << dendl;
4394 }
4395
4396 uint64_t max_shard_onodes = static_cast<uint64_t>(
9f95a23c
TL
4397 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4398 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
91327a77 4399
9f95a23c 4400 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
91327a77
AA
4401 << " max_shard_buffer: " << max_shard_buffer << dendl;
4402
9f95a23c
TL
4403 for (auto i : store->onode_cache_shards) {
4404 i->set_max(max_shard_onodes);
4405 }
4406 for (auto i : store->buffer_cache_shards) {
4407 i->set_max(max_shard_buffer);
91327a77
AA
4408 }
4409}
4410
92f5a8d4
TL
4411void BlueStore::MempoolThread::_update_cache_settings()
4412{
4413 // Nothing to do if pcm is not used.
4414 if (pcm == nullptr) {
4415 return;
4416 }
4417
92f5a8d4
TL
4418 uint64_t target = store->osd_memory_target;
4419 uint64_t base = store->osd_memory_base;
4420 uint64_t min = store->osd_memory_cache_min;
4421 uint64_t max = min;
4422 double fragmentation = store->osd_memory_expected_fragmentation;
4423
4424 uint64_t ltarget = (1.0 - fragmentation) * target;
4425 if (ltarget > base + min) {
4426 max = ltarget - base;
4427 }
4428
4429 // set pcm cache levels
4430 pcm->set_target_memory(target);
4431 pcm->set_min_memory(min);
4432 pcm->set_max_memory(max);
4433
9f95a23c 4434 dout(5) << __func__ << " updated pcm target: " << target
92f5a8d4
TL
4435 << " pcm min: " << min
4436 << " pcm max: " << max
4437 << dendl;
4438}
4439
7c673cae
FG
4440// =======================================================
4441
31f18b77
FG
4442// OmapIteratorImpl
4443
4444#undef dout_prefix
4445#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4446
4447BlueStore::OmapIteratorImpl::OmapIteratorImpl(
1e59de90
TL
4448 PerfCounters* _logger, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it)
4449 : logger(_logger), c(c), o(o), it(it)
31f18b77 4450{
1e59de90 4451 logger->inc(l_bluestore_omap_iterator_count);
9f95a23c 4452 std::shared_lock l(c->lock);
31f18b77 4453 if (o->onode.has_omap()) {
9f95a23c
TL
4454 o->get_omap_key(string(), &head);
4455 o->get_omap_tail(&tail);
31f18b77
FG
4456 it->lower_bound(head);
4457 }
4458}
1e59de90
TL
4459BlueStore::OmapIteratorImpl::~OmapIteratorImpl()
4460{
4461 logger->dec(l_bluestore_omap_iterator_count);
4462}
31f18b77 4463
11fdf7f2
TL
4464string BlueStore::OmapIteratorImpl::_stringify() const
4465{
4466 stringstream s;
4467 s << " omap_iterator(cid = " << c->cid
4468 <<", oid = " << o->oid << ")";
4469 return s.str();
4470}
4471
31f18b77
FG
4472int BlueStore::OmapIteratorImpl::seek_to_first()
4473{
9f95a23c 4474 std::shared_lock l(c->lock);
11fdf7f2 4475 auto start1 = mono_clock::now();
31f18b77
FG
4476 if (o->onode.has_omap()) {
4477 it->lower_bound(head);
4478 } else {
4479 it = KeyValueDB::Iterator();
4480 }
494da23a
TL
4481 c->store->log_latency(
4482 __func__,
11fdf7f2
TL
4483 l_bluestore_omap_seek_to_first_lat,
4484 mono_clock::now() - start1,
494da23a 4485 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2 4486
31f18b77
FG
4487 return 0;
4488}
4489
4490int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4491{
9f95a23c 4492 std::shared_lock l(c->lock);
11fdf7f2 4493 auto start1 = mono_clock::now();
31f18b77
FG
4494 if (o->onode.has_omap()) {
4495 string key;
9f95a23c 4496 o->get_omap_key(after, &key);
31f18b77
FG
4497 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4498 << pretty_binary_string(key) << dendl;
4499 it->upper_bound(key);
4500 } else {
4501 it = KeyValueDB::Iterator();
4502 }
11fdf7f2 4503 c->store->log_latency_fn(
494da23a 4504 __func__,
11fdf7f2
TL
4505 l_bluestore_omap_upper_bound_lat,
4506 mono_clock::now() - start1,
494da23a 4507 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4508 [&] (const ceph::timespan& lat) {
494da23a 4509 return ", after = " + after +
11fdf7f2
TL
4510 _stringify();
4511 }
4512 );
31f18b77
FG
4513 return 0;
4514}
4515
4516int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4517{
9f95a23c 4518 std::shared_lock l(c->lock);
11fdf7f2 4519 auto start1 = mono_clock::now();
31f18b77
FG
4520 if (o->onode.has_omap()) {
4521 string key;
9f95a23c 4522 o->get_omap_key(to, &key);
31f18b77
FG
4523 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4524 << pretty_binary_string(key) << dendl;
4525 it->lower_bound(key);
4526 } else {
4527 it = KeyValueDB::Iterator();
4528 }
11fdf7f2 4529 c->store->log_latency_fn(
494da23a 4530 __func__,
11fdf7f2
TL
4531 l_bluestore_omap_lower_bound_lat,
4532 mono_clock::now() - start1,
494da23a 4533 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4534 [&] (const ceph::timespan& lat) {
494da23a 4535 return ", to = " + to +
11fdf7f2
TL
4536 _stringify();
4537 }
4538 );
31f18b77
FG
4539 return 0;
4540}
4541
4542bool BlueStore::OmapIteratorImpl::valid()
4543{
9f95a23c 4544 std::shared_lock l(c->lock);
31f18b77 4545 bool r = o->onode.has_omap() && it && it->valid() &&
494da23a 4546 it->raw_key().second < tail;
31f18b77
FG
4547 if (it && it->valid()) {
4548 ldout(c->store->cct,20) << __func__ << " is at "
4549 << pretty_binary_string(it->raw_key().second)
4550 << dendl;
4551 }
4552 return r;
4553}
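// This ties back to the omap key layout: head ends in '.' (empty user key)
// and tail in '~', so the raw_key().second < tail test is a cheap upper
// bound that stops iteration before it crosses into the next nid's range.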
4554
11fdf7f2 4555int BlueStore::OmapIteratorImpl::next()
31f18b77 4556{
11fdf7f2 4557 int r = -1;
9f95a23c 4558 std::shared_lock l(c->lock);
11fdf7f2 4559 auto start1 = mono_clock::now();
31f18b77
FG
4560 if (o->onode.has_omap()) {
4561 it->next();
11fdf7f2 4562 r = 0;
31f18b77 4563 }
494da23a
TL
4564 c->store->log_latency(
4565 __func__,
11fdf7f2
TL
4566 l_bluestore_omap_next_lat,
4567 mono_clock::now() - start1,
494da23a 4568 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2
TL
4569
4570 return r;
31f18b77
FG
4571}
4572
4573string BlueStore::OmapIteratorImpl::key()
4574{
9f95a23c 4575 std::shared_lock l(c->lock);
11fdf7f2 4576 ceph_assert(it->valid());
31f18b77
FG
4577 string db_key = it->raw_key().second;
4578 string user_key;
9f95a23c 4579 o->decode_omap_key(db_key, &user_key);
494da23a 4580
31f18b77
FG
4581 return user_key;
4582}
4583
4584bufferlist BlueStore::OmapIteratorImpl::value()
4585{
9f95a23c 4586 std::shared_lock l(c->lock);
11fdf7f2 4587 ceph_assert(it->valid());
31f18b77
FG
4588 return it->value();
4589}
4590
4591
4592// =====================================
4593
7c673cae
FG
4594#undef dout_prefix
4595#define dout_prefix *_dout << "bluestore(" << path << ") "
9f95a23c
TL
4596#undef dout_context
4597#define dout_context cct
7c673cae
FG
4598
4599
4600static void aio_cb(void *priv, void *priv2)
4601{
4602 BlueStore *store = static_cast<BlueStore*>(priv);
4603 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4604 c->aio_finish(store);
4605}
4606
11fdf7f2
TL
4607static void discard_cb(void *priv, void *priv2)
4608{
4609 BlueStore *store = static_cast<BlueStore*>(priv);
4610 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4611 store->handle_discard(*tmp);
4612}
4613
4614void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4615{
4616 dout(10) << __func__ << dendl;
20effc67
TL
4617 ceph_assert(alloc);
4618 alloc->release(to_release);
11fdf7f2
TL
4619}
4620
7c673cae 4621BlueStore::BlueStore(CephContext *cct, const string& path)
9f95a23c 4622 : BlueStore(cct, path, 0) {}
7c673cae
FG
4623
4624BlueStore::BlueStore(CephContext *cct,
4625 const string& path,
4626 uint64_t _min_alloc_size)
4627 : ObjectStore(cct, path),
9f95a23c 4628 throttle(cct),
11fdf7f2 4629 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4630 kv_sync_thread(this),
31f18b77 4631 kv_finalize_thread(this),
20effc67 4632#ifdef HAVE_LIBZBD
f67539c2 4633 zoned_cleaner_thread(this),
20effc67 4634#endif
7c673cae 4635 min_alloc_size(_min_alloc_size),
1e59de90 4636 min_alloc_size_order(std::countr_zero(_min_alloc_size)),
7c673cae
FG
4637 mempool_thread(this)
4638{
4639 _init_logger();
11fdf7f2 4640 cct->_conf.add_observer(this);
7c673cae 4641 set_cache_shards(1);
7c673cae
FG
4642}
4643
4644BlueStore::~BlueStore()
4645{
11fdf7f2 4646 cct->_conf.remove_observer(this);
7c673cae 4647 _shutdown_logger();
11fdf7f2
TL
4648 ceph_assert(!mounted);
4649 ceph_assert(db == NULL);
4650 ceph_assert(bluefs == NULL);
4651 ceph_assert(fsid_fd < 0);
4652 ceph_assert(path_fd < 0);
9f95a23c
TL
4653 for (auto i : onode_cache_shards) {
4654 delete i;
4655 }
4656 for (auto i : buffer_cache_shards) {
7c673cae
FG
4657 delete i;
4658 }
9f95a23c
TL
4659 onode_cache_shards.clear();
4660 buffer_cache_shards.clear();
7c673cae
FG
4661}
4662
4663const char **BlueStore::get_tracked_conf_keys() const
4664{
4665 static const char* KEYS[] = {
4666 "bluestore_csum_type",
4667 "bluestore_compression_mode",
4668 "bluestore_compression_algorithm",
4669 "bluestore_compression_min_blob_size",
4670 "bluestore_compression_min_blob_size_ssd",
4671 "bluestore_compression_min_blob_size_hdd",
4672 "bluestore_compression_max_blob_size",
4673 "bluestore_compression_max_blob_size_ssd",
4674 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4675 "bluestore_compression_required_ratio",
7c673cae
FG
4676 "bluestore_max_alloc_size",
4677 "bluestore_prefer_deferred_size",
181888fb
FG
4678 "bluestore_prefer_deferred_size_hdd",
4679 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4680 "bluestore_deferred_batch_ops",
4681 "bluestore_deferred_batch_ops_hdd",
4682 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4683 "bluestore_throttle_bytes",
4684 "bluestore_throttle_deferred_bytes",
4685 "bluestore_throttle_cost_per_io_hdd",
4686 "bluestore_throttle_cost_per_io_ssd",
4687 "bluestore_throttle_cost_per_io",
4688 "bluestore_max_blob_size",
4689 "bluestore_max_blob_size_ssd",
4690 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4691 "osd_memory_target",
4692 "osd_memory_target_cgroup_limit_ratio",
4693 "osd_memory_base",
4694 "osd_memory_cache_min",
92f5a8d4 4695 "osd_memory_expected_fragmentation",
11fdf7f2
TL
4696 "bluestore_cache_autotune",
4697 "bluestore_cache_autotune_interval",
20effc67
TL
4698 "bluestore_cache_age_bin_interval",
4699 "bluestore_cache_kv_age_bins",
4700 "bluestore_cache_kv_onode_age_bins",
4701 "bluestore_cache_meta_age_bins",
4702 "bluestore_cache_data_age_bins",
81eedcae 4703 "bluestore_warn_on_legacy_statfs",
9f95a23c 4704 "bluestore_warn_on_no_per_pool_omap",
20effc67 4705 "bluestore_warn_on_no_per_pg_omap",
9f95a23c 4706 "bluestore_max_defer_interval",
7c673cae
FG
4707 NULL
4708 };
4709 return KEYS;
4710}
4711
11fdf7f2 4712void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4713 const std::set<std::string> &changed)
4714{
eafe8130 4715 if (changed.count("bluestore_warn_on_legacy_statfs")) {
81eedcae
TL
4716 _check_legacy_statfs_alert();
4717 }
f67539c2
TL
4718 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4719 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4720 _check_no_per_pg_or_pool_omap_alert();
9f95a23c 4721 }
81eedcae 4722
7c673cae
FG
4723 if (changed.count("bluestore_csum_type")) {
4724 _set_csum();
4725 }
4726 if (changed.count("bluestore_compression_mode") ||
4727 changed.count("bluestore_compression_algorithm") ||
4728 changed.count("bluestore_compression_min_blob_size") ||
4729 changed.count("bluestore_compression_max_blob_size")) {
4730 if (bdev) {
4731 _set_compression();
4732 }
4733 }
4734 if (changed.count("bluestore_max_blob_size") ||
4735 changed.count("bluestore_max_blob_size_ssd") ||
4736 changed.count("bluestore_max_blob_size_hdd")) {
4737 if (bdev) {
4738 // only after startup
4739 _set_blob_size();
4740 }
4741 }
4742 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4743 changed.count("bluestore_prefer_deferred_size_hdd") ||
4744 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4745 changed.count("bluestore_max_alloc_size") ||
4746 changed.count("bluestore_deferred_batch_ops") ||
4747 changed.count("bluestore_deferred_batch_ops_hdd") ||
4748 changed.count("bluestore_deferred_batch_ops_ssd")) {
4749 if (bdev) {
4750 // only after startup
4751 _set_alloc_sizes();
4752 }
4753 }
4754 if (changed.count("bluestore_throttle_cost_per_io") ||
4755 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4756 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4757 if (bdev) {
4758 _set_throttle_params();
4759 }
4760 }
9f95a23c
TL
4761 if (changed.count("bluestore_throttle_bytes") ||
4762 changed.count("bluestore_throttle_deferred_bytes") ||
4763 changed.count("bluestore_throttle_trace_rate")) {
4764 throttle.reset_throttle(conf);
7c673cae 4765 }
9f95a23c
TL
4766 if (changed.count("bluestore_max_defer_interval")) {
4767 if (bdev) {
4768 _set_max_defer_interval();
4769 }
7c673cae 4770 }
92f5a8d4
TL
4771 if (changed.count("osd_memory_target") ||
4772 changed.count("osd_memory_base") ||
4773 changed.count("osd_memory_cache_min") ||
4774 changed.count("osd_memory_expected_fragmentation")) {
4775 _update_osd_memory_options();
4776 }
7c673cae
FG
4777}
4778
4779void BlueStore::_set_compression()
4780{
224ce89b
WB
4781 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4782 if (m) {
11fdf7f2 4783 _clear_compression_alert();
224ce89b
WB
4784 comp_mode = *m;
4785 } else {
4786 derr << __func__ << " unrecognized value '"
4787 << cct->_conf->bluestore_compression_mode
4788 << "' for bluestore_compression_mode, reverting to 'none'"
4789 << dendl;
4790 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4791 string s("unknown mode: ");
4792 s += cct->_conf->bluestore_compression_mode;
4793 _set_compression_alert(true, s.c_str());
224ce89b
WB
4794 }
4795
4796 compressor = nullptr;
4797
3efd9988
FG
4798 if (cct->_conf->bluestore_compression_min_blob_size) {
4799 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4800 } else {
11fdf7f2 4801 ceph_assert(bdev);
9f95a23c 4802 if (_use_rotational_settings()) {
7c673cae
FG
4803 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4804 } else {
4805 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4806 }
4807 }
4808
4809 if (cct->_conf->bluestore_compression_max_blob_size) {
4810 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4811 } else {
11fdf7f2 4812 ceph_assert(bdev);
9f95a23c 4813 if (_use_rotational_settings()) {
7c673cae
FG
4814 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4815 } else {
4816 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4817 }
4818 }
4819
7c673cae
FG
4820 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4821 if (!alg_name.empty()) {
4822 compressor = Compressor::create(cct, alg_name);
4823 if (!compressor) {
4824 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4825 << dendl;
11fdf7f2 4826 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4827 }
4828 }
4829
4830 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4831 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4832 << " min_blob " << comp_min_blob_size
4833 << " max_blob " << comp_max_blob_size
7c673cae
FG
4834 << dendl;
4835}
4836
4837void BlueStore::_set_csum()
4838{
4839 csum_type = Checksummer::CSUM_NONE;
4840 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4841 if (t > Checksummer::CSUM_NONE)
4842 csum_type = t;
4843
4844 dout(10) << __func__ << " csum_type "
4845 << Checksummer::get_csum_type_string(csum_type)
4846 << dendl;
4847}
4848
4849void BlueStore::_set_throttle_params()
4850{
4851 if (cct->_conf->bluestore_throttle_cost_per_io) {
4852 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4853 } else {
11fdf7f2 4854 ceph_assert(bdev);
9f95a23c 4855 if (_use_rotational_settings()) {
7c673cae
FG
4856 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4857 } else {
4858 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4859 }
4860 }
4861
4862 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4863 << dendl;
4864}
4865void BlueStore::_set_blob_size()
4866{
4867 if (cct->_conf->bluestore_max_blob_size) {
4868 max_blob_size = cct->_conf->bluestore_max_blob_size;
4869 } else {
11fdf7f2 4870 ceph_assert(bdev);
9f95a23c 4871 if (_use_rotational_settings()) {
7c673cae
FG
4872 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4873 } else {
4874 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4875 }
4876 }
4877 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4878 << std::dec << dendl;
4879}
4880
92f5a8d4
TL
4881void BlueStore::_update_osd_memory_options()
4882{
4883 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4884 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4885 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4886 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4887 config_changed++;
4888 dout(10) << __func__
4889 << " osd_memory_target " << osd_memory_target
4890 << " osd_memory_base " << osd_memory_base
4891 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4892 << " osd_memory_cache_min " << osd_memory_cache_min
4893 << dendl;
4894}
4895
11fdf7f2 4896int BlueStore::_set_cache_sizes()
1adf2230 4897{
11fdf7f2
TL
4898 ceph_assert(bdev);
4899 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4900 cache_autotune_interval =
11fdf7f2 4901 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
20effc67
TL
4902 cache_age_bin_interval =
4903 cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
4904 auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
4905 {
4906 std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
4907 std::istringstream interval_stream(intervals_str);
4908 std::copy(
4909 std::istream_iterator<uint64_t>(interval_stream),
4910 std::istream_iterator<uint64_t>(),
4911 std::back_inserter(*intervals));
4912 };
4913 _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
4914 _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
4915 _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
4916 _set_bin("bluestore_cache_age_bins_data", &data_bins);
4917
11fdf7f2
TL
4918 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4919 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4920 osd_memory_expected_fragmentation =
11fdf7f2
TL
4921 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4922 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4923 osd_memory_cache_resize_interval =
11fdf7f2 4924 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4925
224ce89b
WB
4926 if (cct->_conf->bluestore_cache_size) {
4927 cache_size = cct->_conf->bluestore_cache_size;
4928 } else {
4929 // choose global cache size based on backend type
9f95a23c 4930 if (_use_rotational_settings()) {
224ce89b
WB
4931 cache_size = cct->_conf->bluestore_cache_size_hdd;
4932 } else {
4933 cache_size = cct->_conf->bluestore_cache_size_ssd;
4934 }
4935 }
31f18b77 4936
f67539c2 4937 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
224ce89b 4938 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4939 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4940 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4941 return -EINVAL;
4942 }
91327a77 4943
f67539c2 4944 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
224ce89b 4945 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4946 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4947 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4948 return -EINVAL;
4949 }
91327a77 4950
f67539c2
TL
4951 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4952 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4953 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4954 << ") must be in range [0,1.0]" << dendl;
4955 return -EINVAL;
4956 }
4957
31f18b77 4958 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4959 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4960 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4961 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4962 << dendl;
31f18b77
FG
4963 return -EINVAL;
4964 }
91327a77 4965
f67539c2
TL
4966 cache_data_ratio = (double)1.0 -
4967 (double)cache_meta_ratio -
4968 (double)cache_kv_ratio -
4969 (double)cache_kv_onode_ratio;
31f18b77
FG
4970 if (cache_data_ratio < 0) {
4971 // deal with floating point imprecision
4972 cache_data_ratio = 0;
4973 }
91327a77 4974
224ce89b
WB
4975 dout(1) << __func__ << " cache_size " << cache_size
4976 << " meta " << cache_meta_ratio
31f18b77
FG
4977 << " kv " << cache_kv_ratio
4978 << " data " << cache_data_ratio
4979 << dendl;
4980 return 0;
4981}
4982
3efd9988
FG
4983int BlueStore::write_meta(const std::string& key, const std::string& value)
4984{
4985 bluestore_bdev_label_t label;
4986 string p = path + "/block";
4987 int r = _read_bdev_label(cct, p, &label);
4988 if (r < 0) {
4989 return ObjectStore::write_meta(key, value);
4990 }
4991 label.meta[key] = value;
4992 r = _write_bdev_label(cct, p, label);
11fdf7f2 4993 ceph_assert(r == 0);
3efd9988
FG
4994 return ObjectStore::write_meta(key, value);
4995}
4996
4997int BlueStore::read_meta(const std::string& key, std::string *value)
4998{
4999 bluestore_bdev_label_t label;
5000 string p = path + "/block";
5001 int r = _read_bdev_label(cct, p, &label);
5002 if (r < 0) {
5003 return ObjectStore::read_meta(key, value);
5004 }
5005 auto i = label.meta.find(key);
5006 if (i == label.meta.end()) {
5007 return ObjectStore::read_meta(key, value);
5008 }
5009 *value = i->second;
5010 return 0;
5011}
5012
7c673cae
FG
5013void BlueStore::_init_logger()
5014{
5015 PerfCountersBuilder b(cct, "bluestore",
5016 l_bluestore_first, l_bluestore_last);
20effc67
TL
5017
5018 // space utilization stats
5019 //****************************************
5020 b.add_u64(l_bluestore_allocated, "allocated",
5021 "Sum for allocated bytes",
5022 "al_b",
5023 PerfCountersBuilder::PRIO_CRITICAL,
5024 unit_t(UNIT_BYTES));
5025 b.add_u64(l_bluestore_stored, "stored",
5026 "Sum for stored bytes",
5027 "st_b",
5028 PerfCountersBuilder::PRIO_CRITICAL,
5029 unit_t(UNIT_BYTES));
5030 b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
5031 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
5032 b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
5033 "allocation unit size in bytes",
5034 "au_b",
5035 PerfCountersBuilder::PRIO_CRITICAL,
5036 unit_t(UNIT_BYTES));
5037 //****************************************
5038
5039 // Update op processing state latencies
5040 //****************************************
7c673cae 5041 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
20effc67
TL
5042 "Average prepare state latency",
5043 "sprl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
5044 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
5045 "Average aio_wait state latency",
20effc67 5046 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 5047 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
20effc67
TL
5048 "Average io_done state latency",
5049 "sidl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5050 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
20effc67
TL
5051 "Average kv_queued state latency",
5052 "skql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5053 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
20effc67
TL
5054 "Average kv_commiting state latency",
5055 "skcl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5056 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
20effc67
TL
5057 "Average kv_done state latency",
5058 "skdl", PerfCountersBuilder::PRIO_USEFUL);
5059 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
5060 "Average finishing state latency",
5061 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
5062 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
5063 "Average done state latency",
5064 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5065 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
20effc67
TL
5066 "Average deferred_queued state latency",
5067 "sdql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5068 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
20effc67
TL
5069 "Average aio_wait state latency",
5070 "sdal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5071 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
20effc67
TL
5072 "Average cleanup state latency",
5073 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
5074 //****************************************
5075
5076 // Update Transaction stats
5077 //****************************************
5078 b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
7c673cae
FG
5079 "Average submit throttle latency",
5080 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5081 b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
7c673cae
FG
5082 "Average submit latency",
5083 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5084 b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
7c673cae
FG
5085 "Average commit latency",
5086 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67
TL
5087 b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
5088 //****************************************
5089
5090 // Read op stats
5091 //****************************************
7c673cae 5092 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
20effc67
TL
5093 "Average read onode metadata latency",
5094 "roml", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5095 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
20effc67
TL
5096 "Average read I/O waiting latency",
5097 "rwal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5098 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
20effc67
TL
5099 "Average checksum latency",
5100 "csml", PerfCountersBuilder::PRIO_USEFUL);
5101 b.add_u64_counter(l_bluestore_read_eio, "read_eio",
5102 "Read EIO errors propagated to high level callers");
5103 b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
5104 "Read operations that required at least one retry due to failed checksum validation",
5105 "rd_r", PerfCountersBuilder::PRIO_USEFUL);
5106 b.add_time_avg(l_bluestore_read_lat, "read_lat",
5107 "Average read latency",
5108 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
5109 //****************************************
5110
5111 // kv_thread latencies
5112 //****************************************
5113 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
5114 "Average kv_thread flush latency",
5115 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
5116 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
5117 "Average kv_thread commit latency",
5118 "kcol", PerfCountersBuilder::PRIO_USEFUL);
5119 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
5120 "Average kv_sync thread latency",
5121 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
5122 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
5123 "Average kv_finalize thread latency",
5124 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
5125 //****************************************
5126
5127 // write op stats
5128 //****************************************
5129 b.add_u64_counter(l_bluestore_write_big, "write_big",
7c673cae 5130 "Large aligned writes into fresh blobs");
20effc67
TL
5131 b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
5132 "Large aligned writes into fresh blobs (bytes)",
5133 NULL,
5134 PerfCountersBuilder::PRIO_DEBUGONLY,
5135 unit_t(UNIT_BYTES));
5136 b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
7c673cae 5137 "Large aligned writes into fresh blobs (blobs)");
f67539c2 5138 b.add_u64_counter(l_bluestore_write_big_deferred,
20effc67 5139 "write_big_deferred",
f67539c2 5140 "Big overwrites using deferred");
20effc67
TL
5141
5142 b.add_u64_counter(l_bluestore_write_small, "write_small",
7c673cae 5143 "Small writes into existing or sparse small blobs");
20effc67
TL
5144 b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
5145 "Small writes into existing or sparse small blobs (bytes)",
5146 NULL,
5147 PerfCountersBuilder::PRIO_DEBUGONLY,
5148 unit_t(UNIT_BYTES));
7c673cae 5149 b.add_u64_counter(l_bluestore_write_small_unused,
20effc67 5150 "write_small_unused",
7c673cae 5151 "Small writes into unused portion of existing blob");
7c673cae 5152 b.add_u64_counter(l_bluestore_write_small_pre_read,
20effc67 5153 "write_small_pre_read",
7c673cae
FG
5154 "Small writes that required we read some data (possibly "
5155 "cached) to fill out the block");
7c673cae 5156
20effc67
TL
5157 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
5158 "Sum for write-op padded bytes",
5159 NULL,
5160 PerfCountersBuilder::PRIO_DEBUGONLY,
5161 unit_t(UNIT_BYTES));
5162 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
5163 "Sum for write penalty read ops");
5164 b.add_u64_counter(l_bluestore_write_new, "write_new",
5165 "Write into new blob");
5166
5167 b.add_u64_counter(l_bluestore_issued_deferred_writes,
5168 "issued_deferred_writes",
5169 "Total deferred writes issued");
5170 b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
5171 "issued_deferred_write_bytes",
5172 "Total bytes in issued deferred writes",
5173 NULL,
5174 PerfCountersBuilder::PRIO_DEBUGONLY,
5175 unit_t(UNIT_BYTES));
5176 b.add_u64_counter(l_bluestore_submitted_deferred_writes,
5177 "submitted_deferred_writes",
5178 "Total deferred writes submitted to disk");
5179 b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
5180 "submitted_deferred_write_bytes",
5181 "Total bytes submitted to disk by deferred writes",
5182 NULL,
5183 PerfCountersBuilder::PRIO_DEBUGONLY,
5184 unit_t(UNIT_BYTES));
5185
5186 b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
5187 "write_big_skipped_blobs",
5188 "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
5189 b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
5190 "write_big_skipped_bytes",
5191 "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
5192 b.add_u64_counter(l_bluestore_write_small_skipped,
5193 "write_small_skipped",
5194 "Small writes into existing or sparse small blobs skipped due to zero detection");
5195 b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
5196 "write_small_skipped_bytes",
5197 "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
5198 //****************************************
5199
5200 // compression stats
5201 //****************************************
5202 b.add_u64(l_bluestore_compressed, "compressed",
5203 "Sum for stored compressed bytes",
5204 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5205 b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
5206 "Sum for bytes allocated for compressed data",
5207 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5208 b.add_u64(l_bluestore_compressed_original, "compressed_original",
5209 "Sum for original bytes that were compressed",
5210 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5211 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
5212 "Average compress latency",
5213 "_cpl", PerfCountersBuilder::PRIO_USEFUL);
5214 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
5215 "Average decompress latency",
5216 "dcpl", PerfCountersBuilder::PRIO_USEFUL);
5217 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
5218 "Sum for beneficial compress ops");
5219 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
5220 "Sum for compress ops rejected due to low net gain of space");
5221 //****************************************
5222
5223 // onode cache stats
5224 //****************************************
5225 b.add_u64(l_bluestore_onodes, "onodes",
5226 "Number of onodes in cache");
5227 b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
5228 "Number of pinned onodes in cache");
5229 b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
5230 "Count of onode cache lookup hits",
5231 "o_ht", PerfCountersBuilder::PRIO_USEFUL);
5232 b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
5233 "Count of onode cache lookup misses",
5234 "o_ms", PerfCountersBuilder::PRIO_USEFUL);
5235 b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
5236 "Count of onode shard cache lookups hits");
5237 b.add_u64_counter(l_bluestore_onode_shard_misses,
5238 "onode_shard_misses",
5239 "Count of onode shard cache lookups misses");
5240 b.add_u64(l_bluestore_extents, "onode_extents",
5241 "Number of extents in cache");
5242 b.add_u64(l_bluestore_blobs, "onode_blobs",
5243 "Number of blobs in cache");
5244 //****************************************
5245
5246 // buffer cache stats
5247 //****************************************
5248 b.add_u64(l_bluestore_buffers, "buffers",
5249 "Number of buffers in cache");
5250 b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
5251 "Number of buffer bytes in cache",
5252 NULL,
5253 PerfCountersBuilder::PRIO_DEBUGONLY,
5254 unit_t(UNIT_BYTES));
5255 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
5256 "Sum for bytes of read hit in the cache",
5257 NULL,
5258 PerfCountersBuilder::PRIO_DEBUGONLY,
5259 unit_t(UNIT_BYTES));
5260 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
5261 "Sum for bytes of read missed in the cache",
5262 NULL,
5263 PerfCountersBuilder::PRIO_DEBUGONLY,
5264 unit_t(UNIT_BYTES));
5265 //****************************************
5266
5267 // internal stats
5268 //****************************************
5269 b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
5270 "Onode extent map reshard events");
5271 b.add_u64_counter(l_bluestore_blob_split, "blob_split",
7c673cae 5272 "Sum for blob splitting due to resharding");
20effc67 5273 b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
7c673cae 5274 "Sum for extents that have been removed due to compression");
20effc67 5275 b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
7c673cae
FG
5276 "Sum for extents that have been merged due to garbage "
5277 "collection");
20effc67 5278 //****************************************
1e59de90
TL
5279 // misc
5280 //****************************************
5281 b.add_u64_counter(l_bluestore_omap_iterator_count, "omap_iterator_count",
5282 "Open omap iterators count");
5283 b.add_u64_counter(l_bluestore_omap_rmkeys_count, "omap_rmkeys_count",
5284 "amount of omap keys removed via rmkeys");
5285 b.add_u64_counter(l_bluestore_omap_rmkey_ranges_count, "omap_rmkey_range_count",
5286 "amount of omap key ranges removed via rmkeys");
5287 //****************************************
20effc67
TL
5288 // other client ops latencies
5289 //****************************************
11fdf7f2 5290 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
20effc67
TL
5291 "Average omap iterator seek_to_first call latency",
5292 "osfl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5293 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
20effc67
TL
5294 "Average omap iterator upper_bound call latency",
5295 "oubl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5296 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
20effc67
TL
5297 "Average omap iterator lower_bound call latency",
5298 "olbl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5299 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
20effc67
TL
5300 "Average omap iterator next call latency",
5301 "onxl", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5302 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
20effc67
TL
5303 "Average omap get_keys call latency",
5304 "ogkl", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5305 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
20effc67
TL
5306 "Average omap get_values call latency",
5307 "ogvl", PerfCountersBuilder::PRIO_USEFUL);
5308 b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
5309 "Average omap clear call latency");
494da23a 5310 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
20effc67
TL
5311 "Average collection listing latency",
5312 "cl_l", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5313 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
20effc67
TL
5314 "Average removal latency",
5315 "rm_l", PerfCountersBuilder::PRIO_USEFUL);
5316 b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
5317 "Average truncate latency",
5318 "tr_l", PerfCountersBuilder::PRIO_USEFUL);
5319 //****************************************
5320
5321 // Resulting size axis configuration for op histograms, values are in bytes
5322 PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
5323 "Given size (bytes)",
5324 PerfHistogramCommon::SCALE_LOG2, ///< Given size in logarithmic scale
5325 0, ///< Start at 0
5326 4096, ///< Quantization unit
5327 13, ///< Enough to cover 4+M requests
5328 };
5329 // Request size axis configuration for op histograms, values are in bytes
5330 PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
5331 "Request size (bytes)",
5332 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
5333 0, ///< Start at 0
5334 4096, ///< Quantization unit
5335 13, ///< Enough to cover 4+M requests
5336 };
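// A worked example of the LOG2 axes above (the exact bucket boundaries are
// an editor's sketch, not taken from PerfHistogramCommon internals): with a
// 4096-byte quantization unit and 13 buckets the ranges roughly double per
// bucket, e.g. 4K, 8K, 16K, ... into the multi-megabyte range, which is why
// 13 buckets are "enough to cover 4+M requests". A ~1M request satisfied by
// a ~2M allocation lands in the (given~2M, requested~1M) cell of the 2D
// histogram.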
5337 b.add_u64_counter_histogram(
5338 l_bluestore_allocate_hist, "allocate_histogram",
5339 alloc_hist_x_axis_config, alloc_hist_y_axis_config,
5340 "Histogram of requested block allocations vs. given ones");
adb31ebb 5341
7c673cae
FG
5342 logger = b.create_perf_counters();
5343 cct->get_perfcounters_collection()->add(logger);
5344}
5345
5346int BlueStore::_reload_logger()
5347{
5348 struct store_statfs_t store_statfs;
7c673cae 5349 int r = statfs(&store_statfs);
11fdf7f2 5350 if (r >= 0) {
7c673cae 5351 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
5352 logger->set(l_bluestore_stored, store_statfs.data_stored);
5353 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5354 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5355 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
5356 }
5357 return r;
5358}
5359
5360void BlueStore::_shutdown_logger()
5361{
5362 cct->get_perfcounters_collection()->remove(logger);
5363 delete logger;
5364}
5365
5366int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5367 uuid_d *fsid)
5368{
5369 bluestore_bdev_label_t label;
5370 int r = _read_bdev_label(cct, path, &label);
5371 if (r < 0)
5372 return r;
5373 *fsid = label.osd_uuid;
5374 return 0;
5375}
5376
5377int BlueStore::_open_path()
5378{
b32b8144 5379 // sanity check(s)
11fdf7f2 5380 ceph_assert(path_fd < 0);
91327a77 5381 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
5382 if (path_fd < 0) {
5383 int r = -errno;
5384 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5385 << dendl;
5386 return r;
5387 }
5388 return 0;
5389}
5390
5391void BlueStore::_close_path()
5392{
5393 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5394 path_fd = -1;
5395}
5396
3efd9988 5397int BlueStore::_write_bdev_label(CephContext *cct,
20effc67 5398 const string &path, bluestore_bdev_label_t label)
7c673cae
FG
5399{
5400 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5401 bufferlist bl;
11fdf7f2 5402 encode(label, bl);
7c673cae 5403 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
5404 encode(crc, bl);
5405 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
5406 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5407 z.zero();
5408 bl.append(std::move(z));
5409
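// Resulting on-disk layout of the first BDEV_LABEL_BLOCK_SIZE bytes, as
// implied by the encode calls above:
//
// [ encoded bluestore_bdev_label_t ][ crc32c of encoded label ][ zero pad ]
//
// _read_bdev_label() below recomputes the crc over the decoded span and
// compares it against the stored value.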
1e59de90 5410 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT));
7c673cae
FG
5411 if (fd < 0) {
5412 fd = -errno;
5413 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5414 << dendl;
5415 return fd;
5416 }
1e59de90 5417 bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX);
7c673cae
FG
5418 int r = bl.write_fd(fd);
5419 if (r < 0) {
5420 derr << __func__ << " failed to write to " << path
5421 << ": " << cpp_strerror(r) << dendl;
11fdf7f2 5422 goto out;
7c673cae 5423 }
3efd9988
FG
5424 r = ::fsync(fd);
5425 if (r < 0) {
5426 derr << __func__ << " failed to fsync " << path
5427 << ": " << cpp_strerror(r) << dendl;
5428 }
11fdf7f2 5429out:
7c673cae
FG
5430 VOID_TEMP_FAILURE_RETRY(::close(fd));
5431 return r;
5432}
5433
20effc67 5434int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
7c673cae
FG
5435 bluestore_bdev_label_t *label)
5436{
5437 dout(10) << __func__ << dendl;
91327a77 5438 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
5439 if (fd < 0) {
5440 fd = -errno;
5441 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5442 << dendl;
5443 return fd;
5444 }
5445 bufferlist bl;
5446 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5447 VOID_TEMP_FAILURE_RETRY(::close(fd));
5448 if (r < 0) {
5449 derr << __func__ << " failed to read from " << path
5450 << ": " << cpp_strerror(r) << dendl;
5451 return r;
5452 }
5453
5454 uint32_t crc, expected_crc;
11fdf7f2 5455 auto p = bl.cbegin();
7c673cae 5456 try {
11fdf7f2 5457 decode(*label, p);
7c673cae
FG
5458 bufferlist t;
5459 t.substr_of(bl, 0, p.get_off());
5460 crc = t.crc32c(-1);
11fdf7f2 5461 decode(expected_crc, p);
7c673cae 5462 }
f67539c2 5463 catch (ceph::buffer::error& e) {
1e59de90 5464 derr << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
5465 << ": " << e.what()
5466 << dendl;
b32b8144 5467 return -ENOENT;
7c673cae
FG
5468 }
5469 if (crc != expected_crc) {
5470 derr << __func__ << " bad crc on label, expected " << expected_crc
5471 << " != actual " << crc << dendl;
5472 return -EIO;
5473 }
5474 dout(10) << __func__ << " got " << *label << dendl;
5475 return 0;
5476}
5477
5478int BlueStore::_check_or_set_bdev_label(
5479 string path, uint64_t size, string desc, bool create)
5480{
5481 bluestore_bdev_label_t label;
5482 if (create) {
5483 label.osd_uuid = fsid;
5484 label.size = size;
5485 label.btime = ceph_clock_now();
5486 label.description = desc;
3efd9988 5487 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
5488 if (r < 0)
5489 return r;
5490 } else {
5491 int r = _read_bdev_label(cct, path, &label);
5492 if (r < 0)
5493 return r;
31f18b77
FG
5494 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5495 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5496 << " and fsid " << fsid << " check bypassed" << dendl;
1911f103 5497 } else if (label.osd_uuid != fsid) {
7c673cae
FG
5498 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5499 << " does not match our fsid " << fsid << dendl;
5500 return -EIO;
5501 }
5502 }
5503 return 0;
5504}
5505
5506void BlueStore::_set_alloc_sizes(void)
5507{
7c673cae
FG
5508 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5509
20effc67
TL
5510#ifdef HAVE_LIBZBD
5511 ceph_assert(bdev);
5512 if (bdev->is_smr()) {
5513 prefer_deferred_size = 0;
5514 } else
5515#endif
7c673cae
FG
5516 if (cct->_conf->bluestore_prefer_deferred_size) {
5517 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5518 } else {
9f95a23c 5519 if (_use_rotational_settings()) {
7c673cae
FG
5520 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5521 } else {
5522 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5523 }
5524 }
5525
5526 if (cct->_conf->bluestore_deferred_batch_ops) {
5527 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5528 } else {
9f95a23c 5529 if (_use_rotational_settings()) {
7c673cae
FG
5530 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5531 } else {
5532 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5533 }
5534 }
5535
5536 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 5537 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
5538 << " max_alloc_size 0x" << std::hex << max_alloc_size
5539 << " prefer_deferred_size 0x" << prefer_deferred_size
5540 << std::dec
5541 << " deferred_batch_ops " << deferred_batch_ops
5542 << dendl;
5543}
5544
5545int BlueStore::_open_bdev(bool create)
5546{
11fdf7f2 5547 ceph_assert(bdev == NULL);
7c673cae 5548 string p = path + "/block";
11fdf7f2 5549 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
5550 int r = bdev->open(p);
5551 if (r < 0)
5552 goto fail;
5553
11fdf7f2 5554 if (create && cct->_conf->bdev_enable_discard) {
1e59de90
TL
5555 interval_set<uint64_t> whole_device;
5556 whole_device.insert(0, bdev->get_size());
5557 bdev->try_discard(whole_device, false);
11fdf7f2
TL
5558 }
5559
7c673cae
FG
5560 if (bdev->supported_bdev_label()) {
5561 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5562 if (r < 0)
5563 goto fail_close;
5564 }
5565
5566 // initialize global block parameters
5567 block_size = bdev->get_block_size();
5568 block_mask = ~(block_size - 1);
1e59de90 5569 block_size_order = std::countr_zero(block_size);
11fdf7f2 5570 ceph_assert(block_size == 1u << block_size_order);
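// e.g. for a 4096-byte block size: block_mask == ~0xfff and
// block_size_order == std::countr_zero(4096u) == 12, so the assert
// 4096 == 1u << 12 holds.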
9f95a23c 5571 _set_max_defer_interval();
224ce89b
WB
5572 // and set cache_size based on device type
5573 r = _set_cache_sizes();
5574 if (r < 0) {
5575 goto fail_close;
5576 }
20effc67
TL
5577 // get block dev optimal io size
5578 optimal_io_size = bdev->get_optimal_io_size();
f67539c2 5579
7c673cae
FG
5580 return 0;
5581
5582 fail_close:
5583 bdev->close();
5584 fail:
5585 delete bdev;
5586 bdev = NULL;
5587 return r;
5588}
5589
11fdf7f2
TL
5590void BlueStore::_validate_bdev()
5591{
5592 ceph_assert(bdev);
11fdf7f2 5593 uint64_t dev_size = bdev->get_size();
f67539c2 5594 ceph_assert(dev_size > _get_ondisk_reserved());
11fdf7f2
TL
5595}
5596
7c673cae
FG
5597void BlueStore::_close_bdev()
5598{
11fdf7f2 5599 ceph_assert(bdev);
7c673cae
FG
5600 bdev->close();
5601 delete bdev;
5602 bdev = NULL;
5603}
5604
39ae355f
TL
5605int BlueStore::_open_fm(KeyValueDB::Transaction t,
5606 bool read_only,
5607 bool db_avail,
5608 bool fm_restore)
7c673cae 5609{
1911f103 5610 int r;
1911f103 5611
20effc67 5612 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
11fdf7f2 5613 ceph_assert(fm == NULL);
20effc67
TL
5614 // fm_restore means we are transitioning from null-fm to bitmap-fm
5615 ceph_assert(!fm_restore || (freelist_type != "null"));
5616 // fm restore must pass in a valid transaction
5617 ceph_assert(!fm_restore || (t != nullptr));
5618
39ae355f
TL
5619 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
5620 bool can_have_null_fm = !is_db_rotational() &&
5621 !read_only &&
5622 db_avail &&
5623 cct->_conf->bluestore_allocation_from_file &&
5624 !bdev->is_smr();
5625
20effc67 5626 // When allocation-info is stored in a single file we set freelist_type to "null"
39ae355f
TL
5627 if (can_have_null_fm) {
5628 freelist_type = "null";
5629 need_to_destage_allocation_file = true;
20effc67 5630 }
11fdf7f2
TL
5631 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5632 ceph_assert(fm);
5633 if (t) {
5634 // create mode. initialize freespace
7c673cae 5635 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
5636 {
5637 bufferlist bl;
5638 bl.append(freelist_type);
5639 t->set(PREFIX_SUPER, "freelist_type", bl);
5640 }
b32b8144
FG
5641 // being able to allocate in units less than bdev block size
5642 // seems to be a bad idea.
20effc67 5643 ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
f67539c2
TL
5644
5645 uint64_t alloc_size = min_alloc_size;
39ae355f
TL
5646 if (bdev->is_smr() && freelist_type != "zoned") {
5647 derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
5648 << dendl;
5649 return -EINVAL;
5650 }
5651 if (!bdev->is_smr() && freelist_type == "zoned") {
20effc67
TL
5652 derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
5653 << dendl;
5654 return -EINVAL;
f67539c2
TL
5655 }
5656
20effc67
TL
5657 fm->create(bdev->get_size(), alloc_size,
5658 zone_size, first_sequential_zone,
5659 t);
7c673cae
FG
5660
5661 // allocate superblock reserved space. note that we do not mark
5662 // bluefs space as allocated in the freelist; we instead rely on
f67539c2 5663 // bluefs doing that itself.
11fdf7f2 5664 auto reserved = _get_ondisk_reserved();
20effc67
TL
5665 if (fm_restore) {
5666 // we need to allocate the full space in restore case
5667 // as later we will add free-space marked in the allocator file
5668 fm->allocate(0, bdev->get_size(), t);
5669 } else {
5670 // allocate superblock reserved space. note that we do not mark
5671 // bluefs space as allocated in the freelist; we instead rely on
5672 // bluefs doing that itself.
5673 fm->allocate(0, reserved, t);
5674 }
5675 // debug code - not needed for NULL FM
7c673cae
FG
5676 if (cct->_conf->bluestore_debug_prefill > 0) {
5677 uint64_t end = bdev->get_size() - reserved;
5678 dout(1) << __func__ << " pre-fragmenting freespace, using "
5679 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5680 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 5681 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
5682 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5683 float r = cct->_conf->bluestore_debug_prefill;
5684 r /= 1.0 - r;
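// Worked example: with bluestore_debug_prefill = 0.2 this gives
// r = 0.2 / (1 - 0.2) = 0.25, so each free run of length l is followed by
// roughly u = 0.25 * l marked as allocated; the used fraction is then
// u / (l + u) = r / (1 + r) = 0.2, matching the requested prefill.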
5685 bool stop = false;
5686
5687 while (!stop && start < end) {
5688 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5689 if (start + l > end) {
5690 l = end - start;
11fdf7f2 5691 l = p2align(l, min_alloc_size);
7c673cae 5692 }
11fdf7f2 5693 ceph_assert(start + l <= end);
7c673cae
FG
5694
5695 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 5696 u = p2roundup(u, min_alloc_size);
7c673cae
FG
5697 if (start + l + u > end) {
5698 u = end - (start + l);
5699 // trim to align so we don't overflow again
11fdf7f2 5700 u = p2align(u, min_alloc_size);
7c673cae
FG
5701 stop = true;
5702 }
11fdf7f2 5703 ceph_assert(start + l + u <= end);
7c673cae 5704
11fdf7f2 5705 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
5706 << " use 0x" << u << std::dec << dendl;
5707
5708 if (u == 0) {
5709 // break if u has been trimmed to nothing
5710 break;
5711 }
5712
5713 fm->allocate(start + l, u, t);
5714 start += l + u;
5715 }
5716 }
f67539c2 5717 r = _write_out_fm_meta(0);
1911f103
TL
5718 ceph_assert(r == 0);
5719 } else {
39ae355f
TL
5720 if (can_have_null_fm) {
5721 commit_to_null_manager();
5722 }
f67539c2
TL
5723 r = fm->init(db, read_only,
5724 [&](const std::string& key, std::string* result) {
5725 return read_meta(key, result);
5726 });
1911f103 5727 if (r < 0) {
39ae355f 5728 derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
1911f103
TL
5729 delete fm;
5730 fm = NULL;
5731 return r;
5732 }
7c673cae 5733 }
81eedcae
TL
5734 // If the space size tracked by the freelist manager is higher than the
5735 // actual device size, one can hit an out-of-space allocation which will
5736 // result in data loss and/or assertions.
5737 // Probably the user altered the device size somehow.
5738 // The only fix for now is to redeploy the OSD.
5739 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5740 ostringstream ss;
5741 ss << "slow device size mismatch detected, "
5742 << " fm size(" << fm->get_size()
5743 << ") > slow device size(" << bdev->get_size()
5744 << "), Please stop using this OSD as it might cause data loss.";
5745 _set_disk_size_mismatch_alert(ss.str());
5746 }
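// Note: the comparison above tolerates up to one min_alloc_size of slack,
// so sub-unit rounding differences between the tracked freelist size and
// the raw device size do not raise a false alert.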
7c673cae
FG
5747 return 0;
5748}
5749
5750void BlueStore::_close_fm()
5751{
5752 dout(10) << __func__ << dendl;
11fdf7f2 5753 ceph_assert(fm);
7c673cae
FG
5754 fm->shutdown();
5755 delete fm;
5756 fm = NULL;
5757}
5758
f67539c2 5759int BlueStore::_write_out_fm_meta(uint64_t target_size)
1911f103 5760{
f67539c2 5761 int r = 0;
1911f103
TL
5762 string p = path + "/block";
5763
5764 std::vector<std::pair<string, string>> fm_meta;
5765 fm->get_meta(target_size, &fm_meta);
5766
1911f103 5767 for (auto& m : fm_meta) {
f67539c2
TL
5768 r = write_meta(m.first, m.second);
5769 ceph_assert(r == 0);
1911f103 5770 }
1911f103
TL
5771 return r;
5772}
5773
f67539c2 5774int BlueStore::_create_alloc()
7c673cae 5775{
20effc67 5776 ceph_assert(alloc == NULL);
f67539c2 5777 ceph_assert(shared_alloc.a == NULL);
11fdf7f2
TL
5778 ceph_assert(bdev->get_size());
5779
f67539c2 5780 uint64_t alloc_size = min_alloc_size;
20effc67
TL
5781
5782 std::string allocator_type = cct->_conf->bluestore_allocator;
5783
5784#ifdef HAVE_LIBZBD
5785 if (freelist_type == "zoned") {
5786 allocator_type = "zoned";
11fdf7f2 5787 }
20effc67 5788#endif
11fdf7f2 5789
20effc67
TL
5790 alloc = Allocator::create(
5791 cct, allocator_type,
f67539c2 5792 bdev->get_size(),
20effc67
TL
5793 alloc_size,
5794 zone_size,
5795 first_sequential_zone,
5796 "block");
5797 if (!alloc) {
5798 lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
5799 << dendl;
7c673cae
FG
5800 return -EINVAL;
5801 }
20effc67
TL
5802
5803#ifdef HAVE_LIBZBD
5804 if (freelist_type == "zoned") {
5805 Allocator *a = Allocator::create(
5806 cct, cct->_conf->bluestore_allocator,
5807 bdev->get_conventional_region_size(),
5808 alloc_size,
5809 0, 0,
5810 "zoned_block");
5811 if (!a) {
5812 lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
5813 << " allocator" << dendl;
5814 delete alloc;
5815 return -EINVAL;
5816 }
39ae355f 5817 shared_alloc.set(a, alloc_size);
20effc67
TL
5818 } else
5819#endif
5820 {
5821 // BlueFS will share the same allocator
39ae355f 5822 shared_alloc.set(alloc, alloc_size);
20effc67
TL
5823 }
5824
f67539c2
TL
5825 return 0;
5826}
5827
20effc67 5828int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
f67539c2
TL
5829{
5830 int r = _create_alloc();
5831 if (r < 0) {
5832 return r;
5833 }
20effc67 5834 ceph_assert(alloc != NULL);
f67539c2 5835
20effc67 5836#ifdef HAVE_LIBZBD
f67539c2 5837 if (bdev->is_smr()) {
20effc67
TL
5838 auto a = dynamic_cast<ZonedAllocator*>(alloc);
5839 ceph_assert(a);
5840 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5841 ceph_assert(f);
5842 vector<uint64_t> wp = bdev->get_zones();
5843 vector<zone_state_t> zones = f->get_zone_states(db);
5844 ceph_assert(wp.size() == zones.size());
5845
5846 // reconcile zone state
5847 auto num_zones = bdev->get_size() / zone_size;
5848 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
5849 ceph_assert(wp[i] >= i * zone_size);
5850 ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
5851 uint64_t p = wp[i] - i * zone_size;
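// Illustration (the zone_size value is hypothetical): with 256M zones, a
// device write pointer wp[3] at absolute offset 3 * 256M + 4M yields an
// in-zone pointer p = 4M; if bluestore last recorded write_pointer = 3M,
// the 1M delta stems from an uncommitted in-flight write and is counted
// as dead bytes below.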
5852 if (zones[i].write_pointer > p) {
5853 derr << __func__ << " zone 0x" << std::hex << i
5854 << " bluestore write pointer 0x" << zones[i].write_pointer
5855 << " > device write pointer 0x" << p
5856 << std::dec << " -- VERY SUSPICIOUS!" << dendl;
5857 } else if (zones[i].write_pointer < p) {
5858 // this is "normal" in that it can happen after any crash (if we have a
5859 // write in flight but did not manage to commit the transaction)
5860 auto delta = p - zones[i].write_pointer;
5861 dout(1) << __func__ << " zone 0x" << std::hex << i
5862 << " device write pointer 0x" << p
5863 << " > bluestore pointer 0x" << zones[i].write_pointer
5864 << ", advancing 0x" << delta << std::dec << dendl;
5865 (*zone_adjustments)[zones[i].write_pointer] = delta;
5866 zones[i].num_dead_bytes += delta;
5867 zones[i].write_pointer = p;
5868 }
5869 }
5870
5871 // start with conventional zone "free" (bluefs may adjust this when it starts up)
5872 auto reserved = _get_ondisk_reserved();
5873 // for now we require a conventional zone
5874 ceph_assert(bdev->get_conventional_region_size());
5875 ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
5876 shared_alloc.a->init_add_free(
5877 reserved,
5878 p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
5879
5880 // init sequential zone based on the device's write pointers
5881 a->init_from_zone_pointers(std::move(zones));
5882 dout(1) << __func__
5883 << " loaded zone pointers: "
5884 << std::hex
5885 << ", allocator type " << alloc->get_type()
5886 << ", capacity 0x" << alloc->get_capacity()
5887 << ", block size 0x" << alloc->get_block_size()
5888 << ", free 0x" << alloc->get_free()
5889 << ", fragmentation " << alloc->get_fragmentation()
5890 << std::dec << dendl;
5891
5892 return 0;
f67539c2 5893 }
20effc67 5894#endif
7c673cae
FG
5895
5896 uint64_t num = 0, bytes = 0;
20effc67
TL
5897 utime_t start_time = ceph_clock_now();
5898 if (!fm->is_null_manager()) {
5899 // This is the original path - loading allocation map from RocksDB and feeding into the allocator
5900 dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
5901 // initialize from freelist
5902 fm->enumerate_reset();
5903 uint64_t offset, length;
5904 while (fm->enumerate_next(db, &offset, &length)) {
5905 alloc->init_add_free(offset, length);
5906 ++num;
5907 bytes += length;
5908 }
5909 fm->enumerate_reset();
5910
5911 utime_t duration = ceph_clock_now() - start_time;
5912 dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
5913 alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
5914 } else {
5915 // This is the new path reading the allocation map from a flat bluefs file and feeding them into the allocator
7c673cae 5916
20effc67
TL
5917 if (!cct->_conf->bluestore_allocation_from_file) {
5918 derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
5919 derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
5920 return -ENOTSUP; // Operation not supported
5921 }
20effc67
TL
5922 if (restore_allocator(alloc, &num, &bytes) == 0) {
5923 dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
5924 } else {
5925 // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
5926 dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
5927 // if failed must recover from on-disk ONode internal state
5928 if (read_allocation_from_drive_on_startup() != 0) {
5929 derr << __func__ << "::NCB::Failed Recovery" << dendl;
5930 derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
5931 derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
5932 return -ENOTRECOVERABLE;
5933 }
5934 }
5935 }
f67539c2
TL
5936 dout(1) << __func__
5937 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5938 << std::hex
20effc67
TL
5939 << ", allocator type " << alloc->get_type()
5940 << ", capacity 0x" << alloc->get_capacity()
5941 << ", block size 0x" << alloc->get_block_size()
5942 << ", free 0x" << alloc->get_free()
5943 << ", fragmentation " << alloc->get_fragmentation()
f67539c2 5944 << std::dec << dendl;
1911f103 5945
7c673cae
FG
5946 return 0;
5947}
5948
20effc67
TL
5949void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
5950{
39ae355f 5951 int r = 0;
20effc67 5952#ifdef HAVE_LIBZBD
39ae355f
TL
5953 if (bdev->is_smr()) {
5954 if (zone_adjustments.empty()) {
5955 return;
5956 }
5957 dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
5958 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5959 ceph_assert(f);
5960 KeyValueDB::Transaction t = db->get_transaction();
5961 for (auto& i : zone_adjustments) {
5962 // allocate AND release since this gap is now dead space
5963 // note that the offset is imprecise, but only need to select the zone
5964 f->allocate(i.first, i.second, t);
5965 f->release(i.first, i.second, t);
5966 }
5967 r = db->submit_transaction_sync(t);
5968 } else
20effc67 5969#endif
39ae355f
TL
5970 if (fm->is_null_manager()) {
5971 // Now that we have loaded the allocation map we need to invalidate the file, as new allocations won't be reflected.
5972 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount().
5973 // This means we must not reuse the existing file after a failure (unplanned shutdown) and must instead resort
5974 // to recovery from RocksDB::ONodes
5975 r = invalidate_allocation_file_on_bluefs();
5976 }
5977 ceph_assert(r >= 0);
20effc67
TL
5978}
5979
7c673cae
FG
5980void BlueStore::_close_alloc()
5981{
11fdf7f2
TL
5982 ceph_assert(bdev);
5983 bdev->discard_drain();
5984
20effc67
TL
5985 ceph_assert(alloc);
5986 alloc->shutdown();
5987 delete alloc;
5988
f67539c2 5989 ceph_assert(shared_alloc.a);
20effc67
TL
5990 if (alloc != shared_alloc.a) {
5991 shared_alloc.a->shutdown();
5992 delete shared_alloc.a;
5993 }
5994
f67539c2 5995 shared_alloc.reset();
20effc67 5996 alloc = nullptr;
7c673cae
FG
5997}
5998
5999int BlueStore::_open_fsid(bool create)
6000{
11fdf7f2 6001 ceph_assert(fsid_fd < 0);
91327a77 6002 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6003 if (create)
6004 flags |= O_CREAT;
6005 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
6006 if (fsid_fd < 0) {
6007 int err = -errno;
6008 derr << __func__ << " " << cpp_strerror(err) << dendl;
6009 return err;
6010 }
6011 return 0;
6012}
6013
6014int BlueStore::_read_fsid(uuid_d *uuid)
6015{
6016 char fsid_str[40];
6017 memset(fsid_str, 0, sizeof(fsid_str));
6018 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
6019 if (ret < 0) {
6020 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
6021 return ret;
6022 }
6023 if (ret > 36)
6024 fsid_str[36] = 0;
6025 else
6026 fsid_str[ret] = 0;
6027 if (!uuid->parse(fsid_str)) {
6028 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
6029 return -EINVAL;
6030 }
6031 return 0;
6032}
6033
6034int BlueStore::_write_fsid()
6035{
6036 int r = ::ftruncate(fsid_fd, 0);
6037 if (r < 0) {
6038 r = -errno;
6039 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
6040 return r;
6041 }
6042 string str = stringify(fsid) + "\n";
6043 r = safe_write(fsid_fd, str.c_str(), str.length());
6044 if (r < 0) {
6045 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
6046 return r;
6047 }
6048 r = ::fsync(fsid_fd);
6049 if (r < 0) {
6050 r = -errno;
6051 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
6052 return r;
6053 }
6054 return 0;
6055}
6056
6057void BlueStore::_close_fsid()
6058{
6059 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
6060 fsid_fd = -1;
6061}
6062
6063int BlueStore::_lock_fsid()
6064{
6065 struct flock l;
6066 memset(&l, 0, sizeof(l));
6067 l.l_type = F_WRLCK;
6068 l.l_whence = SEEK_SET;
6069 int r = ::fcntl(fsid_fd, F_SETLK, &l);
6070 if (r < 0) {
6071 int err = errno;
6072 derr << __func__ << " failed to lock " << path << "/fsid"
6073 << " (is another ceph-osd still running?)"
6074 << cpp_strerror(err) << dendl;
6075 return -err;
6076 }
6077 return 0;
6078}
6079
31f18b77
FG
6080bool BlueStore::is_rotational()
6081{
6082 if (bdev) {
6083 return bdev->is_rotational();
6084 }
6085
6086 bool rotational = true;
6087 int r = _open_path();
6088 if (r < 0)
6089 goto out;
6090 r = _open_fsid(false);
6091 if (r < 0)
6092 goto out_path;
6093 r = _read_fsid(&fsid);
6094 if (r < 0)
6095 goto out_fsid;
6096 r = _lock_fsid();
6097 if (r < 0)
6098 goto out_fsid;
6099 r = _open_bdev(false);
6100 if (r < 0)
6101 goto out_fsid;
6102 rotational = bdev->is_rotational();
6103 _close_bdev();
6104 out_fsid:
6105 _close_fsid();
6106 out_path:
6107 _close_path();
6108 out:
6109 return rotational;
6110}
6111
d2e6a577
FG
6112bool BlueStore::is_journal_rotational()
6113{
6114 if (!bluefs) {
6115 dout(5) << __func__ << " bluefs disabled, default to store media type"
6116 << dendl;
6117 return is_rotational();
6118 }
6119 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
6120 return bluefs->wal_is_rotational();
6121}
6122
1d09f67e
TL
6123bool BlueStore::is_db_rotational()
6124{
6125 if (!bluefs) {
6126 dout(5) << __func__ << " bluefs disabled, default to store media type"
6127 << dendl;
6128 return is_rotational();
6129 }
6130 dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
6131 return bluefs->db_is_rotational();
6132}
6133
9f95a23c
TL
6134bool BlueStore::_use_rotational_settings()
6135{
6136 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
6137 return true;
6138 }
6139 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
6140 return false;
6141 }
6142 return bdev->is_rotational();
6143}
6144
39ae355f
TL
6145bool BlueStore::is_statfs_recoverable() const
6146{
6147 // abuse fm for now
6148 return has_null_manager();
6149}
6150
7c673cae
FG
6151bool BlueStore::test_mount_in_use()
6152{
6153 // most error conditions mean the mount is not in use (e.g., because
6154 // it doesn't exist). only if we fail to lock do we conclude it is
6155 // in use.
6156 bool ret = false;
6157 int r = _open_path();
6158 if (r < 0)
6159 return false;
6160 r = _open_fsid(false);
6161 if (r < 0)
6162 goto out_path;
6163 r = _lock_fsid();
6164 if (r < 0)
6165 ret = true; // if we can't lock, it is in use
6166 _close_fsid();
6167 out_path:
6168 _close_path();
6169 return ret;
6170}
6171
11fdf7f2 6172int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
6173{
6174 int r;
11fdf7f2 6175 bluefs = new BlueFS(cct);
7c673cae 6176
11fdf7f2
TL
6177 string bfn;
6178 struct stat st;
6179
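// On-disk layout convention probed below: the OSD directory may contain
// the symlinks "block.db" (dedicated RocksDB device), "block" (the main,
// always-required device) and "block.wal" (dedicated WAL device); a
// missing symlink simply means that role falls back to the shared device.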
6180 bfn = path + "/block.db";
6181 if (::stat(bfn.c_str(), &st) == 0) {
eafe8130
TL
6182 r = bluefs->add_block_device(
6183 BlueFS::BDEV_DB, bfn,
f67539c2
TL
6184 create && cct->_conf->bdev_enable_discard,
6185 SUPER_RESERVED);
7c673cae 6186 if (r < 0) {
11fdf7f2
TL
6187 derr << __func__ << " add block device(" << bfn << ") returned: "
6188 << cpp_strerror(r) << dendl;
6189 goto free_bluefs;
7c673cae 6190 }
7c673cae 6191
11fdf7f2
TL
6192 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
6193 r = _check_or_set_bdev_label(
6194 bfn,
6195 bluefs->get_block_device_size(BlueFS::BDEV_DB),
6196 "bluefs db", create);
6197 if (r < 0) {
6198 derr << __func__
6199 << " check block device(" << bfn << ") label returned: "
6200 << cpp_strerror(r) << dendl;
6201 goto free_bluefs;
6202 }
7c673cae 6203 }
9f95a23c
TL
6204 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6205 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
6206 } else {
6207 r = -errno;
6208 if (::lstat(bfn.c_str(), &st) == -1) {
6209 r = 0;
9f95a23c 6210 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7c673cae 6211 } else {
11fdf7f2
TL
6212 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6213 << cpp_strerror(r) << dendl;
6214 goto free_bluefs;
7c673cae
FG
6215 }
6216 }
7c673cae 6217
11fdf7f2
TL
6218 // shared device
6219 bfn = path + "/block";
6220 // never trim here
9f95a23c 6221 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
f67539c2
TL
6222 0, // no need to provide valid 'reserved' for shared dev
6223 &shared_alloc);
11fdf7f2
TL
6224 if (r < 0) {
6225 derr << __func__ << " add block device(" << bfn << ") returned: "
6226 << cpp_strerror(r) << dendl;
6227 goto free_bluefs;
6228 }
11fdf7f2
TL
6229
6230 bfn = path + "/block.wal";
6231 if (::stat(bfn.c_str(), &st) == 0) {
6232 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
f67539c2
TL
6233 create && cct->_conf->bdev_enable_discard,
6234 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
6235 if (r < 0) {
6236 derr << __func__ << " add block device(" << bfn << ") returned: "
6237 << cpp_strerror(r) << dendl;
6238 goto free_bluefs;
6239 }
7c673cae 6240
11fdf7f2
TL
6241 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
6242 r = _check_or_set_bdev_label(
6243 bfn,
6244 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
6245 "bluefs wal", create);
7c673cae 6246 if (r < 0) {
11fdf7f2
TL
6247 derr << __func__ << " check block device(" << bfn
6248 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
6249 goto free_bluefs;
6250 }
7c673cae
FG
6251 }
6252
9f95a23c 6253 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
6254 } else {
6255 r = 0;
6256 if (::lstat(bfn.c_str(), &st) != -1) {
6257 r = -errno;
6258 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6259 << cpp_strerror(r) << dendl;
7c673cae
FG
6260 goto free_bluefs;
6261 }
11fdf7f2
TL
6262 }
6263 return 0;
7c673cae 6264
11fdf7f2
TL
6265free_bluefs:
6266 ceph_assert(bluefs);
6267 delete bluefs;
6268 bluefs = NULL;
6269 return r;
6270}
7c673cae 6271
f67539c2 6272int BlueStore::_open_bluefs(bool create, bool read_only)
11fdf7f2
TL
6273{
6274 int r = _minimal_open_bluefs(create);
6275 if (r < 0) {
6276 return r;
6277 }
f67539c2 6278 BlueFSVolumeSelector* vselector = nullptr;
1e59de90
TL
6279 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ||
6280 cct->_conf->bluestore_volume_selection_policy == "use_some_extra_enforced" ||
6281 cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
9f95a23c
TL
6282
6283 string options = cct->_conf->bluestore_rocksdb_options;
cd265ab1
TL
6284 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6285 if (!options_annex.empty()) {
6286 if (!options.empty() &&
6287 *options.rbegin() != ',') {
6288 options += ',';
6289 }
6290 options += options_annex;
6291 }
9f95a23c
TL
6292
6293 rocksdb::Options rocks_opts;
f67539c2 6294 r = RocksDBStore::ParseOptionsFromStringStatic(
9f95a23c
TL
6295 cct,
6296 options,
6297 rocks_opts,
6298 nullptr);
6299 if (r < 0) {
6300 return r;
6301 }
f67539c2
TL
6302 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
6303 vselector = new FitToFastVolumeSelector(
9f95a23c
TL
6304 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6305 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
f67539c2
TL
6306 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
6307 } else {
6308 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
6309 vselector =
6310 new RocksDBBlueFSVolumeSelector(
6311 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6312 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6313 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
6314 1024 * 1024 * 1024, //FIXME: set expected l0 size here
6315 rocks_opts.max_bytes_for_level_base,
6316 rocks_opts.max_bytes_for_level_multiplier,
6317 reserved_factor,
6318 cct->_conf->bluestore_volume_selection_reserved,
1e59de90
TL
6319 cct->_conf->bluestore_volume_selection_policy.find("use_some_extra")
6320 == 0);
f67539c2 6321 }
9f95a23c 6322 }
11fdf7f2 6323 if (create) {
9f95a23c 6324 bluefs->mkfs(fsid, bluefs_layout);
11fdf7f2 6325 }
9f95a23c 6326 bluefs->set_volume_selector(vselector);
11fdf7f2
TL
6327 r = bluefs->mount();
6328 if (r < 0) {
6329 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
6330 }
9f95a23c 6331 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
11fdf7f2
TL
6332 return r;
6333}
6334
20effc67 6335void BlueStore::_close_bluefs()
11fdf7f2 6336{
20effc67 6337 bluefs->umount(db_was_opened_read_only);
11fdf7f2
TL
6338 _minimal_close_bluefs();
6339}
6340
6341void BlueStore::_minimal_close_bluefs()
6342{
6343 delete bluefs;
6344 bluefs = NULL;
6345}
6346
6347int BlueStore::_is_bluefs(bool create, bool* ret)
6348{
6349 if (create) {
6350 *ret = cct->_conf->bluestore_bluefs;
6351 } else {
6352 string s;
6353 int r = read_meta("bluefs", &s);
6354 if (r < 0) {
6355 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
6356 return -EIO;
6357 }
6358 if (s == "1") {
6359 *ret = true;
6360 } else if (s == "0") {
6361 *ret = false;
31f18b77 6362 } else {
11fdf7f2
TL
6363 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
6364 << dendl;
6365 return -EIO;
6366 }
6367 }
6368 return 0;
6369}
6370
6371/*
6372* opens both DB and dependent super_meta, FreelistManager and allocator
6373* in the proper order
6374*/
f67539c2 6375int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
11fdf7f2 6376{
20effc67 6377 dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
f67539c2
TL
6378 {
6379 string type;
6380 int r = read_meta("type", &type);
6381 if (r < 0) {
6382 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6383 << dendl;
11fdf7f2 6384 return r;
f67539c2 6385 }
11fdf7f2 6386
f67539c2
TL
6387 if (type != "bluestore") {
6388 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6389 return -EIO;
11fdf7f2 6390 }
f67539c2 6391 }
11fdf7f2 6392
20effc67
TL
6393 // SMR devices may require a freelist adjustment, but that can only happen after
6394 // the db is read-write. we'll stash pending changes here.
6395 std::map<uint64_t, uint64_t> zone_adjustments;
6396
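// Open sequence below, in order: path, fsid (read + lock), bdev; then the
// DB is opened read-only once so super meta, freelist manager and
// allocator state can be loaded, and finally the DB is closed and
// re-opened in the caller-requested mode before stashed adjustments are
// applied.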
f67539c2
TL
6397 int r = _open_path();
6398 if (r < 0)
6399 return r;
6400 r = _open_fsid(false);
6401 if (r < 0)
6402 goto out_path;
11fdf7f2 6403
f67539c2
TL
6404 r = _read_fsid(&fsid);
6405 if (r < 0)
6406 goto out_fsid;
11fdf7f2 6407
f67539c2
TL
6408 r = _lock_fsid();
6409 if (r < 0)
6410 goto out_fsid;
11fdf7f2 6411
f67539c2
TL
6412 r = _open_bdev(false);
6413 if (r < 0)
6414 goto out_fsid;
7c673cae 6415
20effc67
TL
6416 // GBH: can probably skip open_db step in Read-Only mode when operating in NULL-FM mode
6417 // (might need to open if failed to restore from file)
6418
f67539c2
TL
6419 // open in read-only first to read FM list and init allocator
6420 // as they might be needed for some BlueFS procedures
6421 r = _open_db(false, false, true);
6422 if (r < 0)
6423 goto out_bdev;
11fdf7f2 6424
f67539c2
TL
6425 r = _open_super_meta();
6426 if (r < 0) {
6427 goto out_db;
6428 }
6429
39ae355f 6430 r = _open_fm(nullptr, true, false);
f67539c2
TL
6431 if (r < 0)
6432 goto out_db;
6433
20effc67 6434 r = _init_alloc(&zone_adjustments);
f67539c2
TL
6435 if (r < 0)
6436 goto out_fm;
6437
6438 // Re-open in the proper mode(s).
6439
6440 // We can't simply skip the second open in read-only mode: we still
6441 // need to load the extents allocated to bluefs into the allocator,
6442 // which happens here.
6443 //
20effc67 6444 _close_db();
f67539c2
TL
6445 r = _open_db(false, to_repair, read_only);
6446 if (r < 0) {
6447 goto out_alloc;
11fdf7f2 6448 }
20effc67 6449
39ae355f 6450 if (!read_only) {
20effc67
TL
6451 _post_init_alloc(zone_adjustments);
6452 }
6453
6454 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
6455 // we can't change bluestore allocation so there is no need to invalidate the allocation-file
6456 if (fm->is_null_manager() && !read_only && !to_repair) {
6457 // Now that we have loaded the allocation map we need to invalidate the file, as new allocations won't be reflected.
6458 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount().
6459 // This means we must not reuse the existing file after a failure (unplanned shutdown) and must instead resort
6460 // to recovery from RocksDB::ONodes
6461 r = invalidate_allocation_file_on_bluefs();
6462 if (r != 0) {
6463 derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
6464 goto out_alloc;
6465 }
6466 }
6467
6468 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
1d09f67e 6469 if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
20effc67
TL
6470#ifdef HAVE_LIBZBD
6471 && !bdev->is_smr()
6472#endif
6473 ) {
6474 dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
6475 commit_to_null_manager();
6476 need_to_destage_allocation_file = true;
6477 dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
6478 }
6479
11fdf7f2
TL
6480 return 0;
6481
f67539c2
TL
6482out_alloc:
6483 _close_alloc();
6484out_fm:
11fdf7f2
TL
6485 _close_fm();
6486 out_db:
20effc67 6487 _close_db();
f67539c2
TL
6488 out_bdev:
6489 _close_bdev();
6490 out_fsid:
6491 _close_fsid();
6492 out_path:
6493 _close_path();
11fdf7f2
TL
6494 return r;
6495}
6496
20effc67 6497void BlueStore::_close_db_and_around()
11fdf7f2 6498{
20effc67
TL
6499 if (db) {
6500 _close_db();
6501 }
39ae355f
TL
6502 _close_around_db();
6503}
6504
6505void BlueStore::_close_around_db()
6506{
20effc67
TL
6507 if (bluefs) {
6508 _close_bluefs();
6509 }
f67539c2
TL
6510 _close_fm();
6511 _close_alloc();
6512 _close_bdev();
6513 _close_fsid();
6514 _close_path();
6515}
6516
6517int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
6518{
6519 _kv_only = true;
6520 int r = _open_db_and_around(false, to_repair);
6521 if (r == 0) {
6522 *pdb = db;
11fdf7f2 6523 } else {
f67539c2 6524 *pdb = nullptr;
11fdf7f2 6525 }
f67539c2 6526 return r;
11fdf7f2
TL
6527}
6528
f67539c2 6529int BlueStore::close_db_environment()
11fdf7f2 6530{
39ae355f
TL
6531 if (db) {
6532 delete db;
6533 db = nullptr;
6534 }
6535 _close_around_db();
f67539c2 6536 return 0;
11fdf7f2
TL
6537}
6538
20effc67
TL
6539/* gets access to bluefs supporting RocksDB */
6540BlueFS* BlueStore::get_bluefs() {
6541 return bluefs;
6542}
6543
f67539c2
TL
6544int BlueStore::_prepare_db_environment(bool create, bool read_only,
6545 std::string* _fn, std::string* _kv_backend)
11fdf7f2
TL
6546{
6547 int r;
6548 ceph_assert(!db);
f67539c2
TL
6549 std::string& fn=*_fn;
6550 std::string& kv_backend=*_kv_backend;
6551 fn = path + "/db";
11fdf7f2
TL
6552 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
6553
11fdf7f2
TL
6554 if (create) {
6555 kv_backend = cct->_conf->bluestore_kvbackend;
6556 } else {
6557 r = read_meta("kv_backend", &kv_backend);
7c673cae 6558 if (r < 0) {
11fdf7f2
TL
6559 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
6560 return -EIO;
6561 }
6562 }
6563 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
6564
6565 bool do_bluefs;
6566 r = _is_bluefs(create, &do_bluefs);
6567 if (r < 0) {
6568 return r;
6569 }
6570 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
6571
6572 map<string,string> kv_options;
6573 // force separate wal dir for all new deployments.
6574 kv_options["separate_wal_dir"] = 1;
6575 rocksdb::Env *env = NULL;
6576 if (do_bluefs) {
6577 dout(10) << __func__ << " initializing bluefs" << dendl;
6578 if (kv_backend != "rocksdb") {
6579 derr << " backend must be rocksdb to use bluefs" << dendl;
6580 return -EINVAL;
7c673cae 6581 }
11fdf7f2 6582
f67539c2 6583 r = _open_bluefs(create, read_only);
11fdf7f2
TL
6584 if (r < 0) {
6585 return r;
6586 }
11fdf7f2 6587
7c673cae 6588 if (cct->_conf->bluestore_bluefs_env_mirror) {
9f95a23c
TL
6589 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6590 rocksdb::Env* b = rocksdb::Env::Default();
7c673cae 6591 if (create) {
9f95a23c
TL
6592 string cmd = "rm -rf " + path + "/db " +
6593 path + "/db.slow " +
6594 path + "/db.wal";
6595 int r = system(cmd.c_str());
6596 (void)r;
7c673cae
FG
6597 }
6598 env = new rocksdb::EnvMirror(b, a, false, true);
1911f103 6599 } else {
7c673cae
FG
6600 env = new BlueRocksEnv(bluefs);
6601
6602 // simplify the dir names, too, as "seen" by rocksdb
6603 fn = "db";
6604 }
9f95a23c
TL
6605 BlueFSVolumeSelector::paths paths;
6606 bluefs->get_vselector_paths(fn, paths);
7c673cae 6607
522d829b 6608 {
7c673cae 6609 ostringstream db_paths;
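// Builds the rocksdb db_paths option as a space-separated "path,size"
// list, e.g. (sizes illustrative only): "db,64424509440 db.slow,6001175126016"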
9f95a23c
TL
6610 bool first = true;
6611 for (auto& p : paths) {
6612 if (!first) {
6613 db_paths << " ";
6614 }
6615 first = false;
6616 db_paths << p.first << "," << p.second;
6617
6618 }
11fdf7f2 6619 kv_options["db_paths"] = db_paths.str();
9f95a23c 6620 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
6621 }
6622
6623 if (create) {
9f95a23c
TL
6624 for (auto& p : paths) {
6625 env->CreateDir(p.first);
6626 }
6627 // Selectors don't provide wal path so far hence create explicitly
11fdf7f2 6628 env->CreateDir(fn + ".wal");
11fdf7f2
TL
6629 } else {
6630 std::vector<std::string> res;
6631 // check for dir presence
6632 auto r = env->GetChildren(fn+".wal", &res);
6633 if (r.IsNotFound()) {
6634 kv_options.erase("separate_wal_dir");
6635 }
7c673cae 6636 }
11fdf7f2
TL
6637 } else {
6638 string walfn = path + "/db.wal";
7c673cae 6639
11fdf7f2
TL
6640 if (create) {
6641 int r = ::mkdir(fn.c_str(), 0755);
6642 if (r < 0)
6643 r = -errno;
6644 if (r < 0 && r != -EEXIST) {
6645 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6646 << dendl;
6647 return r;
6648 }
6649
6650 // wal_dir, too!
7c673cae
FG
6651 r = ::mkdir(walfn.c_str(), 0755);
6652 if (r < 0)
6653 r = -errno;
6654 if (r < 0 && r != -EEXIST) {
6655 derr << __func__ << " failed to create " << walfn
6656 << ": " << cpp_strerror(r)
6657 << dendl;
6658 return r;
6659 }
11fdf7f2
TL
6660 } else {
6661 struct stat st;
6662 r = ::stat(walfn.c_str(), &st);
6663 if (r < 0 && errno == ENOENT) {
6664 kv_options.erase("separate_wal_dir");
6665 }
7c673cae
FG
6666 }
6667 }
6668
91327a77 6669
7c673cae
FG
6670 db = KeyValueDB::create(cct,
6671 kv_backend,
6672 fn,
11fdf7f2 6673 kv_options,
7c673cae
FG
6674 static_cast<void*>(env));
6675 if (!db) {
6676 derr << __func__ << " error creating db" << dendl;
6677 if (bluefs) {
20effc67 6678 _close_bluefs();
7c673cae
FG
6679 }
6680 // delete env manually here since we can't depend on db to do this
6681 // under this case
6682 delete env;
6683 env = NULL;
6684 return -EIO;
6685 }
6686
f67539c2 6687 FreelistManager::setup_merge_operators(db, freelist_type);
7c673cae 6688 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 6689 db->set_cache_size(cache_kv_ratio * cache_size);
f67539c2
TL
6690 return 0;
6691}
31f18b77 6692
f67539c2
TL
6693int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6694{
6695 int r;
6696 ceph_assert(!(create && read_only));
6697 string options;
6698 string options_annex;
6699 stringstream err;
6700 string kv_dir_fn;
6701 string kv_backend;
6702 std::string sharding_def;
20effc67
TL
6703 // prevent write attempts to BlueFS in case we failed before BlueFS was opened
6704 db_was_opened_read_only = true;
f67539c2
TL
6705 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6706 if (r < 0) {
6707 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6708 return -EIO;
6709 }
20effc67
TL
6710 // if reached here then BlueFS is already opened
6711 db_was_opened_read_only = read_only;
6712 dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
11fdf7f2 6713 if (kv_backend == "rocksdb") {
7c673cae 6714 options = cct->_conf->bluestore_rocksdb_options;
cd265ab1
TL
6715 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6716 if (!options_annex.empty()) {
6717 if (!options.empty() &&
6718 *options.rbegin() != ',') {
6719 options += ',';
6720 }
6721 options += options_annex;
6722 }
11fdf7f2 6723
f67539c2
TL
6724 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6725 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
11fdf7f2
TL
6726 }
6727 }
6728
7c673cae 6729 db->init(options);
11fdf7f2
TL
6730 if (to_repair_db)
6731 return 0;
6732 if (create) {
f67539c2 6733 r = db->create_and_open(err, sharding_def);
11fdf7f2
TL
6734 } else {
6735 // we pass in cf list here, but it is only used if the db already has
6736 // column families created.
6737 r = read_only ?
f67539c2
TL
6738 db->open_read_only(err, sharding_def) :
6739 db->open(err, sharding_def);
11fdf7f2 6740 }
7c673cae
FG
6741 if (r) {
6742 derr << __func__ << " erroring opening db: " << err.str() << dendl;
20effc67 6743 _close_db();
7c673cae
FG
6744 return -EIO;
6745 }
6746 dout(1) << __func__ << " opened " << kv_backend
f67539c2 6747 << " path " << kv_dir_fn << " options " << options << dendl;
7c673cae 6748 return 0;
7c673cae
FG
6749}
6750
39ae355f 6751void BlueStore::_close_db()
7c673cae 6752{
39ae355f
TL
6753 dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
6754 << " fm=" << fm
6755 << " destage_alloc_file=" << need_to_destage_allocation_file
6756 << " per_pool=" << per_pool_stat_collection
6757 << " pool stats=" << osd_pools.size()
6758 << dendl;
6759 bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
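// Destaging (persisting allocator state and statfs back to disk) only
// makes sense when the DB was writable this session and the allocation
// file was invalidated at open time; read-only opens leave on-disk state
// untouched.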
6760 if (do_destage && is_statfs_recoverable()) {
6761 auto t = db->get_transaction();
6762 store_statfs_t s;
6763 if (per_pool_stat_collection) {
6764 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6765 uint64_t pool_id;
6766 for (it->upper_bound(string()); it->valid(); it->next()) {
6767 int r = get_key_pool_stat(it->key(), &pool_id);
6768 if (r >= 0) {
6769 dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl;
6770 } else {
6771 derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl;
6772 }
6773 t->rmkey(PREFIX_STAT, it->key());
6774 }
6775
6776 std::lock_guard l(vstatfs_lock);
6777 for(auto &p : osd_pools) {
6778 string key;
6779 get_pool_stat_key(p.first, &key);
6780 bufferlist bl;
6781 if (!p.second.is_empty()) {
6782 p.second.encode(bl);
6783 p.second.publish(&s);
6784 t->set(PREFIX_STAT, key, bl);
6785 dout(10) << __func__ << " persisting: "
6786 << p.first << "->" << s
6787 << dendl;
6788 }
6789 }
6790 } else {
6791 bufferlist bl;
6792 {
6793 std::lock_guard l(vstatfs_lock);
6794 vstatfs.encode(bl);
6795 vstatfs.publish(&s);
6796 }
6797 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
 6798 dout(10) << __func__ << " persisting: " << s << dendl;
6799 }
6800 int r = db->submit_transaction_sync(t);
6801 dout(10) << __func__ << " statfs persisted." << dendl;
6802 ceph_assert(r >= 0);
6803 }
11fdf7f2 6804 ceph_assert(db);
7c673cae 6805 delete db;
20effc67 6806 db = nullptr;
20effc67 6807
39ae355f 6808 if (do_destage && fm && fm->is_null_manager()) {
20effc67
TL
6809 int ret = store_allocator(alloc);
6810 if (ret != 0) {
6811 derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
6812 }
6813 }
6814
7c673cae 6815 if (bluefs) {
20effc67 6816 _close_bluefs();
7c673cae
FG
6817 }
6818}
6819
11fdf7f2 6820void BlueStore::_dump_alloc_on_failure()
7c673cae 6821{
11fdf7f2
TL
6822 auto dump_interval =
6823 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6824 if (dump_interval > 0 &&
6825 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
f67539c2 6826 shared_alloc.a->dump();
11fdf7f2
TL
6827 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6828 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 6829 }
11fdf7f2 6830}
7c673cae 6831
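// Illustrative timing for _dump_alloc_on_failure() above (the interval value
// is an assumption): with bluestore_bluefs_alloc_failure_dump_interval = 600,
// repeated BlueFS allocation failures emit at most one allocator dump per
// 600 seconds; an interval of 0 suppresses the dump entirely.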
eafe8130 6832int BlueStore::_open_collections()
7c673cae 6833{
20effc67
TL
6834 if (!coll_map.empty()) {
6835 // could be opened from another path
6836 dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
6837 return 0;
6838 }
6839
28e407b8 6840 dout(10) << __func__ << dendl;
eafe8130 6841 collections_had_errors = false;
7c673cae 6842 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
20effc67 6843 size_t load_cnt = 0;
7c673cae
FG
6844 for (it->upper_bound(string());
6845 it->valid();
6846 it->next()) {
6847 coll_t cid;
6848 if (cid.parse(it->key())) {
9f95a23c 6849 auto c = ceph::make_ref<Collection>(
7c673cae 6850 this,
9f95a23c
TL
6851 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6852 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6853 cid);
7c673cae 6854 bufferlist bl = it->value();
11fdf7f2 6855 auto p = bl.cbegin();
7c673cae 6856 try {
11fdf7f2 6857 decode(c->cnode, p);
f67539c2 6858 } catch (ceph::buffer::error& e) {
7c673cae
FG
6859 derr << __func__ << " failed to decode cnode, key:"
6860 << pretty_binary_string(it->key()) << dendl;
6861 return -EIO;
6862 }
28e407b8
AA
6863 dout(20) << __func__ << " opened " << cid << " " << c
6864 << " " << c->cnode << dendl;
11fdf7f2 6865 _osr_attach(c.get());
7c673cae 6866 coll_map[cid] = c;
20effc67 6867 load_cnt++;
7c673cae
FG
6868 } else {
6869 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 6870 collections_had_errors = true;
7c673cae
FG
6871 }
6872 }
20effc67
TL
6873 dout(10) << __func__ << " collections loaded: " << load_cnt
6874 << dendl;
7c673cae
FG
6875 return 0;
6876}
6877
eafe8130
TL
6878void BlueStore::_fsck_collections(int64_t* errors)
6879{
6880 if (collections_had_errors) {
6881 dout(10) << __func__ << dendl;
f67539c2 6882 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
6883 for (it->upper_bound(string());
6884 it->valid();
6885 it->next()) {
6886 coll_t cid;
6887 if (!cid.parse(it->key())) {
6888 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6889 if (errors) {
6890 (*errors)++;
6891 }
6892 }
6893 }
6894 }
6895}
6896
9f95a23c
TL
6897void BlueStore::_set_per_pool_omap()
6898{
f67539c2 6899 per_pool_omap = OMAP_BULK;
9f95a23c
TL
6900 bufferlist bl;
6901 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6902 if (bl.length()) {
f67539c2
TL
6903 auto s = bl.to_str();
6904 if (s == stringify(OMAP_PER_POOL)) {
6905 per_pool_omap = OMAP_PER_POOL;
a4b75251 6906 } else if (s == stringify(OMAP_PER_PG)) {
f67539c2 6907 per_pool_omap = OMAP_PER_PG;
a4b75251
TL
6908 } else {
6909 ceph_assert(s == stringify(OMAP_BULK));
f67539c2
TL
6910 }
6911 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
9f95a23c
TL
6912 } else {
6913 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6914 }
f67539c2 6915 _check_no_per_pg_or_pool_omap_alert();
9f95a23c
TL
6916}
6917
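// Illustrative round-trip for the per_pool_omap marker (a sketch built only
// from calls that appear elsewhere in this file; mkfs() performs the write
// half):
//
//   KeyValueDB::Transaction t = db->get_transaction();
//   bufferlist bl;
//   bl.append(stringify(OMAP_PER_PG));          // persist the mode as a string
//   t->set(PREFIX_SUPER, "per_pool_omap", bl);
//   db->submit_transaction_sync(t);
//   ...
//   db->get(PREFIX_SUPER, "per_pool_omap", &bl); // read back on open
//   bool per_pg = (bl.to_str() == stringify(OMAP_PER_PG));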
224ce89b 6918void BlueStore::_open_statfs()
31f18b77 6919{
11fdf7f2
TL
6920 osd_pools.clear();
6921 vstatfs.reset();
6922
31f18b77 6923 bufferlist bl;
11fdf7f2 6924 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6925 if (r >= 0) {
11fdf7f2 6926 per_pool_stat_collection = false;
31f18b77 6927 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6928 auto it = bl.cbegin();
31f18b77 6929 vstatfs.decode(it);
11fdf7f2 6930 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 6931 } else {
31f18b77
FG
6932 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6933 }
81eedcae 6934 _check_legacy_statfs_alert();
11fdf7f2
TL
6935 } else {
6936 per_pool_stat_collection = true;
6937 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
f67539c2 6938 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
6939 for (it->upper_bound(string());
6940 it->valid();
6941 it->next()) {
6942
6943 uint64_t pool_id;
6944 int r = get_key_pool_stat(it->key(), &pool_id);
6945 ceph_assert(r == 0);
6946
6947 bufferlist bl;
6948 bl = it->value();
6949 auto p = bl.cbegin();
6950 auto& st = osd_pools[pool_id];
6951 try {
6952 st.decode(p);
6953 vstatfs += st;
6954
39ae355f
TL
6955 dout(10) << __func__ << " pool " << std::hex << pool_id
6956 << " statfs(hex) " << st
6957 << std::dec << dendl;
f67539c2 6958 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
6959 derr << __func__ << " failed to decode pool stats, key:"
6960 << pretty_binary_string(it->key()) << dendl;
6961 }
6962 }
31f18b77 6963 }
39ae355f
TL
6964 dout(10) << __func__ << " statfs " << std::hex
6965 << vstatfs << std::dec << dendl;
11fdf7f2 6966
31f18b77
FG
6967}
6968
7c673cae
FG
6969int BlueStore::_setup_block_symlink_or_file(
6970 string name,
6971 string epath,
6972 uint64_t size,
6973 bool create)
6974{
6975 dout(20) << __func__ << " name " << name << " path " << epath
6976 << " size " << size << " create=" << (int)create << dendl;
6977 int r = 0;
91327a77 6978 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6979 if (create)
6980 flags |= O_CREAT;
6981 if (epath.length()) {
6982 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6983 if (r < 0) {
6984 r = -errno;
6985 derr << __func__ << " failed to create " << name << " symlink to "
6986 << epath << ": " << cpp_strerror(r) << dendl;
6987 return r;
6988 }
6989
6990 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6991 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6992 if (fd < 0) {
6993 r = -errno;
6994 derr << __func__ << " failed to open " << epath << " file: "
6995 << cpp_strerror(r) << dendl;
6996 return r;
6997 }
11fdf7f2 6998 // write the Transport ID of the NVMe device
1e59de90 6999 // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0"
11fdf7f2
TL
7000 // where "0000:02:00.0" is the selector of a PCI device, see
7001 // the first column of "lspci -mm -n -D"
1e59de90
TL
 7002 // a transport id for tcp looks like: "trtype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420"
7003 string trid = epath.substr(strlen(SPDK_PREFIX));
11fdf7f2
TL
7004 r = ::write(fd, trid.c_str(), trid.size());
7005 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
7006 dout(1) << __func__ << " created " << name << " symlink to "
7007 << epath << dendl;
7008 VOID_TEMP_FAILURE_RETRY(::close(fd));
7009 }
7010 }
7011 if (size) {
7012 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
7013 if (fd >= 0) {
7014 // block file is present
7015 struct stat st;
7016 int r = ::fstat(fd, &st);
7017 if (r == 0 &&
7018 S_ISREG(st.st_mode) && // if it is a regular file
7019 st.st_size == 0) { // and is 0 bytes
7020 r = ::ftruncate(fd, size);
7021 if (r < 0) {
7022 r = -errno;
7023 derr << __func__ << " failed to resize " << name << " file to "
7024 << size << ": " << cpp_strerror(r) << dendl;
7025 VOID_TEMP_FAILURE_RETRY(::close(fd));
7026 return r;
7027 }
7028
7029 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
7030 r = ::ceph_posix_fallocate(fd, 0, size);
7031 if (r > 0) {
7c673cae
FG
7032 derr << __func__ << " failed to prefallocate " << name << " file to "
7033 << size << ": " << cpp_strerror(r) << dendl;
7034 VOID_TEMP_FAILURE_RETRY(::close(fd));
7035 return -r;
7036 }
7c673cae
FG
7037 }
7038 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 7039 << byte_u_t(size) << dendl;
7c673cae
FG
7040 }
7041 VOID_TEMP_FAILURE_RETRY(::close(fd));
7042 } else {
7043 int r = -errno;
7044 if (r != -ENOENT) {
7045 derr << __func__ << " failed to open " << name << " file: "
7046 << cpp_strerror(r) << dendl;
7047 return r;
7048 }
7049 }
7050 }
7051 return 0;
7052}
7053
7054int BlueStore::mkfs()
7055{
7056 dout(1) << __func__ << " path " << path << dendl;
7057 int r;
7058 uuid_d old_fsid;
f67539c2 7059 uint64_t reserved;
eafe8130
TL
7060 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7061 derr << __func__ << " osd_max_object_size "
7062 << cct->_conf->osd_max_object_size << " > bluestore max "
7063 << OBJECT_MAX_SIZE << dendl;
7064 return -EINVAL;
7065 }
7066
7c673cae
FG
7067 {
7068 string done;
7069 r = read_meta("mkfs_done", &done);
7070 if (r == 0) {
7071 dout(1) << __func__ << " already created" << dendl;
7072 if (cct->_conf->bluestore_fsck_on_mkfs) {
7073 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7074 if (r < 0) {
7075 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
7076 << dendl;
7077 return r;
7078 }
7079 if (r > 0) {
7080 derr << __func__ << " fsck found " << r << " errors" << dendl;
7081 r = -EIO;
7082 }
7083 }
7084 return r; // idempotent
7085 }
7086 }
7087
7088 {
7089 string type;
7090 r = read_meta("type", &type);
7091 if (r == 0) {
7092 if (type != "bluestore") {
7093 derr << __func__ << " expected bluestore, but type is " << type << dendl;
7094 return -EIO;
7095 }
7096 } else {
7097 r = write_meta("type", "bluestore");
7098 if (r < 0)
7099 return r;
7100 }
7101 }
7102
7c673cae
FG
7103 r = _open_path();
7104 if (r < 0)
7105 return r;
7106
7107 r = _open_fsid(true);
7108 if (r < 0)
7109 goto out_path_fd;
7110
7111 r = _lock_fsid();
7112 if (r < 0)
7113 goto out_close_fsid;
7114
7115 r = _read_fsid(&old_fsid);
7116 if (r < 0 || old_fsid.is_zero()) {
7117 if (fsid.is_zero()) {
7118 fsid.generate_random();
7119 dout(1) << __func__ << " generated fsid " << fsid << dendl;
7120 } else {
7121 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
7122 }
7123 // we'll write it later.
7124 } else {
7125 if (!fsid.is_zero() && fsid != old_fsid) {
7126 derr << __func__ << " on-disk fsid " << old_fsid
7127 << " != provided " << fsid << dendl;
7128 r = -EINVAL;
7129 goto out_close_fsid;
7130 }
7131 fsid = old_fsid;
7132 }
7133
7134 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
7135 cct->_conf->bluestore_block_size,
7136 cct->_conf->bluestore_block_create);
7137 if (r < 0)
7138 goto out_close_fsid;
7139 if (cct->_conf->bluestore_bluefs) {
7140 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
7141 cct->_conf->bluestore_block_wal_size,
7142 cct->_conf->bluestore_block_wal_create);
7143 if (r < 0)
7144 goto out_close_fsid;
7145 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
7146 cct->_conf->bluestore_block_db_size,
7147 cct->_conf->bluestore_block_db_create);
7148 if (r < 0)
7149 goto out_close_fsid;
7150 }
7151
7152 r = _open_bdev(true);
7153 if (r < 0)
7154 goto out_close_fsid;
7155
20effc67
TL
7156 // choose freelist manager
7157#ifdef HAVE_LIBZBD
7158 if (bdev->is_smr()) {
7159 freelist_type = "zoned";
7160 zone_size = bdev->get_zone_size();
7161 first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
7162 bdev->reset_all_zones();
7163 } else
7164#endif
7165 {
7166 freelist_type = "bitmap";
7167 }
7168 dout(10) << " freelist_type " << freelist_type << dendl;
7169
3efd9988 7170 // choose min_alloc_size
20effc67
TL
7171 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7172 << " block_size: 0x" << block_size << std::dec << dendl;
7173 if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
7174 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7175 << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
7176 min_alloc_size = optimal_io_size;
7177 }
7178 else if (cct->_conf->bluestore_min_alloc_size) {
3efd9988
FG
7179 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
7180 } else {
11fdf7f2 7181 ceph_assert(bdev);
f67539c2 7182 if (_use_rotational_settings()) {
3efd9988
FG
7183 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
7184 } else {
7185 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
7186 }
7187 }
11fdf7f2 7188 _validate_bdev();
3efd9988
FG
7189
7190 // make sure min_alloc_size is power of 2 aligned.
1e59de90 7191 if (!std::has_single_bit(min_alloc_size)) {
3efd9988
FG
7192 derr << __func__ << " min_alloc_size 0x"
7193 << std::hex << min_alloc_size << std::dec
7194 << " is not power of 2 aligned!"
7195 << dendl;
7196 r = -EINVAL;
7197 goto out_close_bdev;
7198 }
7199
20effc67
TL
 7200 // make sure min_alloc_size is >= block_size and aligned with it
7201 if (min_alloc_size % block_size != 0) {
7202 derr << __func__ << " min_alloc_size 0x"
7203 << std::hex << min_alloc_size
 7204 << " is smaller than or not aligned with block_size: 0x"
7205 << block_size << std::dec << dendl;
7206 r = -EINVAL;
7207 goto out_close_bdev;
7208 }
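 // worked example (illustrative): with block_size = 0x1000,
 // min_alloc_size = 0x1000 passes both checks; 0x3000 is rejected by the
 // power-of-2 check above; 0x800 is rejected here since 0x800 % 0x1000 != 0.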
7209
f67539c2
TL
7210 r = _create_alloc();
7211 if (r < 0) {
7212 goto out_close_bdev;
7213 }
7214
7215 reserved = _get_ondisk_reserved();
20effc67 7216 alloc->init_add_free(reserved,
f67539c2 7217 p2align(bdev->get_size(), min_alloc_size) - reserved);
20effc67
TL
7218#ifdef HAVE_LIBZBD
7219 if (bdev->is_smr() && alloc != shared_alloc.a) {
7220 shared_alloc.a->init_add_free(reserved,
7221 p2align(bdev->get_conventional_region_size(),
7222 min_alloc_size) - reserved);
7223 }
7224#endif
f67539c2 7225
7c673cae
FG
7226 r = _open_db(true);
7227 if (r < 0)
f67539c2 7228 goto out_close_alloc;
7c673cae 7229
7c673cae
FG
7230 {
7231 KeyValueDB::Transaction t = db->get_transaction();
39ae355f 7232 r = _open_fm(t, false, true);
11fdf7f2
TL
7233 if (r < 0)
7234 goto out_close_db;
7c673cae
FG
7235 {
7236 bufferlist bl;
11fdf7f2 7237 encode((uint64_t)0, bl);
7c673cae
FG
7238 t->set(PREFIX_SUPER, "nid_max", bl);
7239 t->set(PREFIX_SUPER, "blobid_max", bl);
7240 }
7241
7c673cae
FG
7242 {
7243 bufferlist bl;
11fdf7f2 7244 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
7245 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7246 }
9f95a23c
TL
7247 {
7248 bufferlist bl;
a4b75251
TL
7249 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
7250 bl.append(stringify(OMAP_BULK));
7251 } else {
7252 bl.append(stringify(OMAP_PER_PG));
7253 }
9f95a23c
TL
7254 t->set(PREFIX_SUPER, "per_pool_omap", bl);
7255 }
20effc67
TL
7256
7257#ifdef HAVE_LIBZBD
7258 if (bdev->is_smr()) {
7259 {
7260 bufferlist bl;
7261 encode((uint64_t)zone_size, bl);
7262 t->set(PREFIX_SUPER, "zone_size", bl);
7263 }
7264 {
7265 bufferlist bl;
7266 encode((uint64_t)first_sequential_zone, bl);
7267 t->set(PREFIX_SUPER, "first_sequential_zone", bl);
7268 }
7269 }
7270#endif
7271
7c673cae
FG
7272 ondisk_format = latest_ondisk_format;
7273 _prepare_ondisk_format_super(t);
7274 db->submit_transaction_sync(t);
7275 }
7276
7c673cae
FG
7277 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
7278 if (r < 0)
224ce89b
WB
7279 goto out_close_fm;
7280
3efd9988 7281 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 7282 if (r < 0)
224ce89b 7283 goto out_close_fm;
7c673cae
FG
7284
7285 if (fsid != old_fsid) {
7286 r = _write_fsid();
7287 if (r < 0) {
7288 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 7289 goto out_close_fm;
7c673cae
FG
7290 }
7291 }
7292
7c673cae
FG
7293 out_close_fm:
7294 _close_fm();
7295 out_close_db:
20effc67 7296 _close_db();
f67539c2
TL
7297 out_close_alloc:
7298 _close_alloc();
7c673cae
FG
7299 out_close_bdev:
7300 _close_bdev();
7301 out_close_fsid:
7302 _close_fsid();
7303 out_path_fd:
7304 _close_path();
7305
7306 if (r == 0 &&
7307 cct->_conf->bluestore_fsck_on_mkfs) {
7308 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7309 if (rc < 0)
7310 return rc;
7311 if (rc > 0) {
7312 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7313 r = -EIO;
7314 }
11fdf7f2
TL
7315 }
7316
7317 if (r == 0) {
7318 // indicate success by writing the 'mkfs_done' file
7319 r = write_meta("mkfs_done", "yes");
7320 }
7321
7322 if (r < 0) {
7323 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7324 } else {
7325 dout(0) << __func__ << " success" << dendl;
7326 }
7327 return r;
7328}
7329
11fdf7f2
TL
7330int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
7331{
7332 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7333 int r;
7334 ceph_assert(path_fd < 0);
7335
7336 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7337
7338 if (!cct->_conf->bluestore_bluefs) {
7339 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7340 return -EIO;
7341 }
20effc67 7342 dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
f67539c2 7343 r = _open_db_and_around(true);
20effc67
TL
7344 if (r < 0) {
7345 return r;
7346 }
11fdf7f2 7347
11fdf7f2
TL
7348 if (id == BlueFS::BDEV_NEWWAL) {
7349 string p = path + "/block.wal";
7350 r = _setup_block_symlink_or_file("block.wal", dev_path,
7351 cct->_conf->bluestore_block_wal_size,
7352 true);
7353 ceph_assert(r == 0);
7354
7355 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
f67539c2
TL
7356 cct->_conf->bdev_enable_discard,
7357 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7358 ceph_assert(r == 0);
7359
7360 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7361 r = _check_or_set_bdev_label(
7362 p,
7363 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7364 "bluefs wal",
7365 true);
7366 ceph_assert(r == 0);
7367 }
7368
9f95a23c 7369 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7370 } else if (id == BlueFS::BDEV_NEWDB) {
7371 string p = path + "/block.db";
7372 r = _setup_block_symlink_or_file("block.db", dev_path,
7373 cct->_conf->bluestore_block_db_size,
7374 true);
7375 ceph_assert(r == 0);
7376
7377 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
f67539c2
TL
7378 cct->_conf->bdev_enable_discard,
7379 SUPER_RESERVED);
11fdf7f2
TL
7380 ceph_assert(r == 0);
7381
7382 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7383 r = _check_or_set_bdev_label(
7384 p,
7385 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7386 "bluefs db",
7387 true);
7388 ceph_assert(r == 0);
7389 }
9f95a23c
TL
7390 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7391 bluefs_layout.dedicated_db = true;
11fdf7f2 7392 }
11fdf7f2
TL
7393 bluefs->umount();
7394 bluefs->mount();
7395
9f95a23c 7396 r = bluefs->prepare_new_device(id, bluefs_layout);
11fdf7f2
TL
7397 ceph_assert(r == 0);
7398
7399 if (r < 0) {
7400 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7401 } else {
7402 dout(0) << __func__ << " success" << dendl;
7403 }
7404
20effc67 7405 _close_db_and_around();
11fdf7f2
TL
7406 return r;
7407}
7408
7409int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
7410 int id)
7411{
7412 dout(10) << __func__ << " id:" << id << dendl;
7413 ceph_assert(path_fd < 0);
7414
7415 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
7416
7417 if (!cct->_conf->bluestore_bluefs) {
7418 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7419 return -EIO;
7420 }
7421
f67539c2 7422 int r = _open_db_and_around(true);
20effc67
TL
7423 if (r < 0) {
7424 return r;
7425 }
7426 auto close_db = make_scope_guard([&] {
7427 _close_db_and_around();
7428 });
f67539c2 7429 uint64_t used_space = 0;
11fdf7f2 7430 for(auto src_id : devs_source) {
f67539c2 7431 used_space += bluefs->get_used(src_id);
11fdf7f2
TL
7432 }
7433 uint64_t target_free = bluefs->get_free(id);
f67539c2 7434 if (target_free < used_space) {
11fdf7f2
TL
7435 derr << __func__
7436 << " can't migrate, free space at target: " << target_free
7437 << " is less than required space: " << used_space
7438 << dendl;
20effc67 7439 return -ENOSPC;
11fdf7f2 7440 }
9f95a23c
TL
7441 if (devs_source.count(BlueFS::BDEV_DB)) {
7442 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7443 bluefs_layout.dedicated_db = false;
7444 }
7445 if (devs_source.count(BlueFS::BDEV_WAL)) {
7446 bluefs_layout.dedicated_wal = false;
7447 }
7448 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
11fdf7f2
TL
7449 if (r < 0) {
7450 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7451 return r;
11fdf7f2
TL
7452 }
7453
7454 if (devs_source.count(BlueFS::BDEV_DB)) {
7455 r = unlink(string(path + "/block.db").c_str());
7456 ceph_assert(r == 0);
7457 }
7458 if (devs_source.count(BlueFS::BDEV_WAL)) {
7459 r = unlink(string(path + "/block.wal").c_str());
7460 ceph_assert(r == 0);
7461 }
11fdf7f2
TL
7462 return r;
7463}
7464
7465int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7466 int id,
7467 const string& dev_path)
7468{
7469 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
11fdf7f2
TL
7470 ceph_assert(path_fd < 0);
7471
7472 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7473
7474 if (!cct->_conf->bluestore_bluefs) {
7475 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7476 return -EIO;
7477 }
7478
20effc67
TL
7479 int r = _open_db_and_around(true);
7480 if (r < 0) {
7481 return r;
7482 }
7483 auto close_db = make_scope_guard([&] {
7484 _close_db_and_around();
7485 });
11fdf7f2 7486
11fdf7f2
TL
7487 string link_db;
7488 string link_wal;
7489 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 7490 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 7491 link_db = path + "/block.db";
9f95a23c
TL
7492 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7493 bluefs_layout.dedicated_db = false;
11fdf7f2
TL
7494 }
7495 if (devs_source.count(BlueFS::BDEV_WAL)) {
7496 link_wal = path + "/block.wal";
9f95a23c 7497 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
7498 }
7499
20effc67 7500 size_t target_size = 0;
11fdf7f2
TL
7501 string target_name;
7502 if (id == BlueFS::BDEV_NEWWAL) {
7503 target_name = "block.wal";
7504 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 7505 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7506
7507 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
f67539c2
TL
7508 cct->_conf->bdev_enable_discard,
7509 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7510 ceph_assert(r == 0);
7511
7512 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7513 r = _check_or_set_bdev_label(
7514 dev_path,
7515 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7516 "bluefs wal",
7517 true);
7518 ceph_assert(r == 0);
7519 }
11fdf7f2
TL
7520 } else if (id == BlueFS::BDEV_NEWDB) {
7521 target_name = "block.db";
7522 target_size = cct->_conf->bluestore_block_db_size;
9f95a23c
TL
7523 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7524 bluefs_layout.dedicated_db = true;
31f18b77 7525
11fdf7f2 7526 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
f67539c2
TL
7527 cct->_conf->bdev_enable_discard,
7528 SUPER_RESERVED);
11fdf7f2
TL
7529 ceph_assert(r == 0);
7530
7531 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7532 r = _check_or_set_bdev_label(
7533 dev_path,
7534 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7535 "bluefs db",
7536 true);
7537 ceph_assert(r == 0);
7538 }
31f18b77
FG
7539 }
7540
11fdf7f2
TL
7541 bluefs->umount();
7542 bluefs->mount();
7543
9f95a23c 7544 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 7545
7c673cae 7546 if (r < 0) {
11fdf7f2 7547 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7548 return r;
11fdf7f2
TL
7549 }
7550
7551 if (!link_db.empty()) {
7552 r = unlink(link_db.c_str());
7553 ceph_assert(r == 0);
7554 }
7555 if (!link_wal.empty()) {
7556 r = unlink(link_wal.c_str());
7557 ceph_assert(r == 0);
7558 }
7559 r = _setup_block_symlink_or_file(
7560 target_name,
7561 dev_path,
7562 target_size,
7563 true);
7564 ceph_assert(r == 0);
7565 dout(0) << __func__ << " success" << dendl;
7566
11fdf7f2
TL
7567 return r;
7568}
7569
7570string BlueStore::get_device_path(unsigned id)
7571{
7572 string res;
7573 if (id < BlueFS::MAX_BDEV) {
7574 switch (id) {
7575 case BlueFS::BDEV_WAL:
7576 res = path + "/block.wal";
7577 break;
7578 case BlueFS::BDEV_DB:
9f95a23c 7579 if (id == bluefs_layout.shared_bdev) {
11fdf7f2
TL
7580 res = path + "/block";
7581 } else {
7582 res = path + "/block.db";
7583 }
7584 break;
7585 case BlueFS::BDEV_SLOW:
7586 res = path + "/block";
7587 break;
7588 }
7589 }
7590 return res;
7591}
7592
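// e.g. (illustrative): when bluefs_layout.shared_bdev == BDEV_DB the DB
// volume lives on the main device, so get_device_path(BDEV_DB) and
// get_device_path(BDEV_SLOW) both resolve to path + "/block"; with a
// dedicated DB device, BDEV_DB resolves to path + "/block.db" instead.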
f67539c2
TL
7593int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
7594{
7595 bluestore_bdev_label_t label;
7596 int r = _read_bdev_label(cct, path, &label);
7597 if (r < 0) {
7598 derr << "unable to read label for " << path << ": "
7599 << cpp_strerror(r) << dendl;
7600 } else {
7601 label.size = size;
7602 r = _write_bdev_label(cct, path, label);
7603 if (r < 0) {
7604 derr << "unable to write label for " << path << ": "
7605 << cpp_strerror(r) << dendl;
7606 }
7607 }
7608 return r;
7609}
7610
11fdf7f2
TL
7611int BlueStore::expand_devices(ostream& out)
7612{
f67539c2 7613 int r = _open_db_and_around(true);
11fdf7f2
TL
7614 ceph_assert(r == 0);
7615 bluefs->dump_block_extents(out);
1911f103 7616 out << "Expanding DB/WAL..." << std::endl;
11fdf7f2 7617 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 7618 if (devid == bluefs_layout.shared_bdev ) {
11fdf7f2
TL
7619 continue;
7620 }
7621 uint64_t size = bluefs->get_block_device_size(devid);
7622 if (size == 0) {
7623 // no bdev
7624 continue;
7625 }
7626
f67539c2
TL
7627 out << devid
 7628 << " : expanding to 0x" << std::hex << size << std::dec << std::endl;
7629 string p = get_device_path(devid);
7630 const char* path = p.c_str();
7631 if (path == nullptr) {
7632 derr << devid
7633 <<": can't find device path " << dendl;
7634 continue;
7635 }
7636 if (bluefs->bdev_support_label(devid)) {
7637 if (_set_bdev_label_size(p, size) >= 0) {
7638 out << devid
7639 << " : size label updated to " << size
7640 << std::endl;
11fdf7f2 7641 }
11fdf7f2
TL
7642 }
7643 }
7644 uint64_t size0 = fm->get_size();
7645 uint64_t size = bdev->get_size();
7646 if (size0 < size) {
9f95a23c 7647 out << bluefs_layout.shared_bdev
1911f103
TL
 7648 << " : expanding from 0x" << std::hex
7649 << size0 << " to 0x" << size << std::dec << std::endl;
f67539c2
TL
7650 _write_out_fm_meta(size);
7651 if (bdev->supported_bdev_label()) {
7652 if (_set_bdev_label_size(path, size) >= 0) {
7653 out << bluefs_layout.shared_bdev
7654 << " : size label updated to " << size
7655 << std::endl;
7656 }
7657 }
20effc67 7658
1d09f67e
TL
7659 if (fm && fm->is_null_manager()) {
7660 // we grow the allocation range, must reflect it in the allocation file
7661 alloc->init_add_free(size0, size - size0);
7662 need_to_destage_allocation_file = true;
7663 }
20effc67 7664 _close_db_and_around();
1911f103
TL
7665
7666 // mount in read/write to sync expansion changes
f67539c2 7667 r = _mount();
11fdf7f2 7668 ceph_assert(r == 0);
1911f103
TL
7669 umount();
7670 } else {
20effc67 7671 _close_db_and_around();
7c673cae 7672 }
1911f103
TL
7673 return r;
7674}
7675
7676int BlueStore::dump_bluefs_sizes(ostream& out)
7677{
f67539c2 7678 int r = _open_db_and_around(true);
1911f103
TL
7679 ceph_assert(r == 0);
7680 bluefs->dump_block_extents(out);
20effc67 7681 _close_db_and_around();
7c673cae
FG
7682 return r;
7683}
7684
7685void BlueStore::set_cache_shards(unsigned num)
7686{
7687 dout(10) << __func__ << " " << num << dendl;
9f95a23c
TL
7688 size_t oold = onode_cache_shards.size();
7689 size_t bold = buffer_cache_shards.size();
7690 ceph_assert(num >= oold && num >= bold);
7691 onode_cache_shards.resize(num);
7692 buffer_cache_shards.resize(num);
7693 for (unsigned i = oold; i < num; ++i) {
7694 onode_cache_shards[i] =
7695 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7696 logger);
7697 }
7698 for (unsigned i = bold; i < num; ++i) {
7699 buffer_cache_shards[i] =
7700 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7701 logger);
7c673cae
FG
7702 }
7703}
7704
1d09f67e 7705//---------------------------------------------
39ae355f 7706bool BlueStore::has_null_manager() const
1d09f67e
TL
7707{
7708 return (fm && fm->is_null_manager());
7709}
7710
f67539c2 7711int BlueStore::_mount()
7c673cae 7712{
20effc67 7713 dout(5) << __func__ << "::NCB:: path " << path << dendl;
1d09f67e 7714
f67539c2 7715 _kv_only = false;
7c673cae 7716 if (cct->_conf->bluestore_fsck_on_mount) {
20effc67 7717 dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
7c673cae
FG
7718 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7719 if (rc < 0)
7720 return rc;
7721 if (rc > 0) {
7722 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7723 return -EIO;
7724 }
7725 }
7726
eafe8130
TL
7727 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7728 derr << __func__ << " osd_max_object_size "
7729 << cct->_conf->osd_max_object_size << " > bluestore max "
7730 << OBJECT_MAX_SIZE << dendl;
7731 return -EINVAL;
7732 }
7733
20effc67 7734 dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
f67539c2 7735 int r = _open_db_and_around(false);
9f95a23c 7736 if (r < 0) {
f67539c2 7737 return r;
11fdf7f2 7738 }
20effc67
TL
7739 auto close_db = make_scope_guard([&] {
7740 if (!mounted) {
7741 _close_db_and_around();
7742 }
7743 });
7c673cae 7744
11fdf7f2
TL
7745 r = _upgrade_super();
7746 if (r < 0) {
20effc67 7747 return r;
11fdf7f2 7748 }
7c673cae 7749
20effc67 7750 // The recovery process for allocation-map needs to open collection early
7c673cae 7751 r = _open_collections();
20effc67
TL
7752 if (r < 0) {
7753 return r;
7754 }
7755 auto shutdown_cache = make_scope_guard([&] {
7756 if (!mounted) {
7757 _shutdown_cache();
7758 }
7759 });
7c673cae
FG
7760
7761 r = _reload_logger();
20effc67
TL
7762 if (r < 0) {
7763 return r;
7764 }
7c673cae 7765
31f18b77 7766 _kv_start();
20effc67
TL
7767 auto stop_kv = make_scope_guard([&] {
7768 if (!mounted) {
7769 _kv_stop();
7770 }
7771 });
7772
7773 r = _deferred_replay();
7774 if (r < 0) {
7775 return r;
7776 }
7c673cae 7777
20effc67 7778#ifdef HAVE_LIBZBD
f67539c2
TL
7779 if (bdev->is_smr()) {
7780 _zoned_cleaner_start();
7781 }
20effc67 7782#endif
7c673cae
FG
7783
7784 mempool_thread.init();
7785
f67539c2 7786 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
eafe8130 7787 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c 7788
f67539c2 7789 auto was_per_pool_omap = per_pool_omap;
9f95a23c 7790
eafe8130
TL
7791 dout(1) << __func__ << " quick-fix on mount" << dendl;
7792 _fsck_on_open(FSCK_SHALLOW, true);
7793
9f95a23c 7794 //set again as hopefully it has been fixed
f67539c2 7795 if (was_per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
7796 _set_per_pool_omap();
7797 }
eafe8130
TL
7798 }
7799
7c673cae
FG
7800 mounted = true;
7801 return 0;
7c673cae
FG
7802}
7803
7804int BlueStore::umount()
7805{
11fdf7f2 7806 ceph_assert(_kv_only || mounted);
7c673cae 7807 _osr_drain_all();
7c673cae 7808
7c673cae 7809 mounted = false;
20effc67
TL
7810
7811 ceph_assert(alloc);
7812
3efd9988
FG
7813 if (!_kv_only) {
7814 mempool_thread.shutdown();
20effc67 7815#ifdef HAVE_LIBZBD
f67539c2
TL
7816 if (bdev->is_smr()) {
7817 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7818 _zoned_cleaner_stop();
7819 }
20effc67 7820#endif
3efd9988
FG
7821 dout(20) << __func__ << " stopping kv thread" << dendl;
7822 _kv_stop();
1d09f67e
TL
7823 // skip cache cleanup step on fast shutdown
7824 if (likely(!m_fast_shutdown)) {
7825 _shutdown_cache();
7826 }
3efd9988 7827 dout(20) << __func__ << " closing" << dendl;
3efd9988 7828 }
20effc67 7829 _close_db_and_around();
1d09f67e
TL
7830 // disable fsck on fast-shutdown
7831 if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
7c673cae
FG
7832 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7833 if (rc < 0)
7834 return rc;
7835 if (rc > 0) {
7836 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7837 return -EIO;
7838 }
7839 }
7840 return 0;
7841}
7842
eafe8130
TL
7843int BlueStore::cold_open()
7844{
f67539c2 7845 return _open_db_and_around(true);
eafe8130 7846}
f67539c2 7847
eafe8130
TL
7848int BlueStore::cold_close()
7849{
20effc67 7850 _close_db_and_around();
eafe8130
TL
7851 return 0;
7852}
7853
9f95a23c
TL
7854// derr wrapper to limit enormous output and avoid log flooding.
7855// Of limited use where such output is expected for now
7856#define fsck_derr(err_cnt, threshold) \
7857 if (err_cnt <= threshold) { \
7858 bool need_skip_print = err_cnt == threshold; \
7859 derr
7860
7861#define fsck_dendl \
7862 dendl; \
7863 if (need_skip_print) \
7864 derr << "more error lines skipped..." << dendl; \
7c673cae 7865 }
7c673cae 7866
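// Illustrative usage of the macro pair above (err_cnt and the threshold name
// are assumptions); the two macros must bracket a single streaming statement:
//
//   fsck_derr(err_cnt, MAX_FSCK_ERROR_LINES)
//     << "fsck error: stray reference found" << fsck_dendl;
//
// This prints while err_cnt <= threshold, appends a one-time
// "more error lines skipped..." notice when the threshold is reached, and
// emits nothing afterwards.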
eafe8130
TL
7867int _fsck_sum_extents(
7868 const PExtentVector& extents,
7869 bool compressed,
7870 store_statfs_t& expected_statfs)
7871{
7872 for (auto e : extents) {
7873 if (!e.is_valid())
7874 continue;
7875 expected_statfs.allocated += e.length;
7876 if (compressed) {
7877 expected_statfs.data_compressed_allocated += e.length;
7878 }
7879 }
7880 return 0;
7881}
7882
7c673cae 7883int BlueStore::_fsck_check_extents(
20effc67 7884 std::string_view ctx_descr,
7c673cae
FG
7885 const PExtentVector& extents,
7886 bool compressed,
7887 mempool_dynamic_bitset &used_blocks,
b32b8144 7888 uint64_t granularity,
11fdf7f2 7889 BlueStoreRepairer* repairer,
eafe8130
TL
7890 store_statfs_t& expected_statfs,
7891 FSCKDepth depth)
7c673cae 7892{
20effc67 7893 dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
7c673cae
FG
7894 int errors = 0;
7895 for (auto e : extents) {
7896 if (!e.is_valid())
7897 continue;
7898 expected_statfs.allocated += e.length;
7899 if (compressed) {
11fdf7f2 7900 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7901 }
eafe8130
TL
7902 if (depth != FSCK_SHALLOW) {
7903 bool already = false;
9f95a23c 7904 apply_for_bitset_range(
eafe8130
TL
7905 e.offset, e.length, granularity, used_blocks,
7906 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7907 if (bs.test(pos)) {
7908 if (repairer) {
7909 repairer->note_misreference(
7910 pos * min_alloc_size, min_alloc_size, !already);
7911 }
7912 if (!already) {
20effc67 7913 derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7914 << " or a subset is already allocated (misreferenced)" << dendl;
7915 ++errors;
7916 already = true;
7917 }
11fdf7f2 7918 }
eafe8130
TL
7919 else
7920 bs.set(pos);
7921 });
11fdf7f2 7922
eafe8130 7923 if (e.end() > bdev->get_size()) {
20effc67 7924 derr << "fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7925 << " past end of block device" << dendl;
7926 ++errors;
7927 }
7c673cae
FG
7928 }
7929 }
7930 return errors;
7931}
7932
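// Illustrative view of the misreference check above (numbers are made up):
// with granularity = 0x10000, an extent 0x20000~0x20000 maps to bitset
// positions 2 and 3. If position 2 was already set by another extent, the
// extent is reported once as misreferenced (and handed to the repairer when
// present); otherwise both positions are simply marked as used.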
39ae355f
TL
7933void BlueStore::_fsck_check_statfs(
7934 const store_statfs_t& expected_statfs,
7935 const per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7936 int64_t& errors,
7937 int64_t& warnings,
11fdf7f2
TL
7938 BlueStoreRepairer* repairer)
7939{
39ae355f
TL
7940 string key;
7941 store_statfs_t actual_statfs;
7942 store_statfs_t s;
7943 {
7944 // make a copy
7945 per_pool_statfs my_expected_pool_statfs(expected_pool_statfs);
7946 auto op = osd_pools.begin();
7947 while (op != osd_pools.end()) {
7948 get_pool_stat_key(op->first, &key);
7949 op->second.publish(&s);
7950 auto it_expected = my_expected_pool_statfs.find(op->first);
7951 if (it_expected == my_expected_pool_statfs.end()) {
7952 auto op0 = op++;
7953 if (op0->second.is_empty()) {
7954 // It's OK to lack relevant empty statfs record
7955 continue;
7956 }
7957 derr << __func__ << "::fsck error: " << std::hex
7958 << "pool " << op0->first << " has got no statfs to match against: "
7959 << s
7960 << std::dec << dendl;
7961 ++errors;
11fdf7f2 7962 if (repairer) {
39ae355f 7963 osd_pools.erase(op0);
11fdf7f2
TL
7964 repairer->remove_key(db, PREFIX_STAT, key);
7965 }
39ae355f
TL
7966 } else {
7967 if (!(s == it_expected->second)) {
7968 derr << "fsck error: actual " << s
7969 << " != expected " << it_expected->second
7970 << " for pool "
7971 << std::hex << op->first << std::dec << dendl;
7972 ++errors;
11fdf7f2 7973 if (repairer) {
39ae355f
TL
7974 // repair in-memory in a hope this would be flushed properly on shutdown
7975 s = it_expected->second;
7976 op->second = it_expected->second;
7977 repairer->fix_statfs(db, key, it_expected->second);
11fdf7f2 7978 }
11fdf7f2 7979 }
39ae355f
TL
7980 actual_statfs.add(s);
7981 my_expected_pool_statfs.erase(it_expected);
7982 ++op;
11fdf7f2 7983 }
11fdf7f2 7984 }
39ae355f
TL
7985 // check stats that lack matching entities in osd_pools
7986 for (auto &p : my_expected_pool_statfs) {
7987 if (p.second.is_zero()) {
7988 // It's OK to lack relevant empty statfs record
7989 continue;
7990 }
7991 get_pool_stat_key(p.first, &key);
7992 derr << __func__ << "::fsck error: " << std::hex
7993 << "pool " << p.first << " has got no actual statfs: "
7994 << std::dec << p.second
7995 << dendl;
7996 ++errors;
7997 if (repairer) {
7998 osd_pools[p.first] = p.second;
7999 repairer->fix_statfs(db, key, p.second);
8000 actual_statfs.add(p.second);
8001 }
8002 }
8003 }
8004 // process global statfs
8005 if (repairer) {
8006 if (!per_pool_stat_collection) {
8007 // by virtue of running this method, we correct the top-level
8008 // error of having global stats
8009 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
8010 per_pool_stat_collection = true;
8011 }
8012 vstatfs = actual_statfs;
8013 dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl;
8014 } else if (!per_pool_stat_collection) {
8015 // check global stats only if fscking (not repairing) w/o per-pool stats
8016 vstatfs.publish(&s);
8017 if (!(s == expected_statfs)) {
8018 derr << "fsck error: actual " << s
8019 << " != expected " << expected_statfs << dendl;
8020 ++errors;
11fdf7f2 8021 }
eafe8130 8022 }
11fdf7f2
TL
8023}
8024
20effc67
TL
8025void BlueStore::_fsck_repair_shared_blobs(
8026 BlueStoreRepairer& repairer,
8027 shared_blob_2hash_tracker_t& sb_ref_counts,
8028 sb_info_space_efficient_map_t& sb_info)
8029{
8030 auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
8031 dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
8032 << sb_ref_mismatches << dendl;
8033 if (!sb_ref_mismatches) // not expected to succeed, just in case
8034 return;
8035
8036
8037 auto foreach_shared_blob = [&](std::function<
8038 void (coll_t,
8039 ghobject_t,
8040 uint64_t,
8041 const bluestore_blob_t&)> cb) {
8042 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8043 if (it) {
8044 CollectionRef c;
8045 spg_t pgid;
8046 for (it->lower_bound(string()); it->valid(); it->next()) {
8047 dout(30) << __func__ << " key "
8048 << pretty_binary_string(it->key())
8049 << dendl;
8050 if (is_extent_shard_key(it->key())) {
8051 continue;
8052 }
8053
8054 ghobject_t oid;
8055 int r = get_key_object(it->key(), &oid);
8056 if (r < 0) {
8057 continue;
8058 }
8059
8060 if (!c ||
8061 oid.shard_id != pgid.shard ||
8062 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8063 !c->contains(oid)) {
8064 c = nullptr;
8065 for (auto& p : coll_map) {
8066 if (p.second->contains(oid)) {
8067 c = p.second;
8068 break;
8069 }
8070 }
8071 if (!c) {
8072 continue;
8073 }
8074 }
8075 dout(20) << __func__
8076 << " inspecting shared blob refs for col:" << c->cid
8077 << " obj:" << oid
8078 << dendl;
8079
8080 OnodeRef o;
39ae355f 8081 o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
20effc67
TL
8082 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8083
8084 _dump_onode<30>(cct, *o);
8085
8086 mempool::bluestore_fsck::set<BlobRef> passed_sbs;
8087 for (auto& e : o->extent_map.extent_map) {
8088 auto& b = e.blob->get_blob();
8089 if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
8090 auto sbid = e.blob->shared_blob->get_sbid();
8091 cb(c->cid, oid, sbid, b);
8092 passed_sbs.emplace(e.blob);
8093 }
8094 } // for ... extent_map
8095 } // for ... it->valid
8096 } //if (it(PREFIX_OBJ))
8097 }; //foreach_shared_blob fn declaration
8098
8099 mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
8100
8101 // first iteration over objects to identify all the broken sbids
8102 foreach_shared_blob( [&](coll_t cid,
8103 ghobject_t oid,
8104 uint64_t sbid,
8105 const bluestore_blob_t& b) {
8106 auto it = refs_map.lower_bound(sbid);
8107 if(it != refs_map.end() && it->first == sbid) {
8108 return;
8109 }
8110 for (auto& p : b.get_extents()) {
8111 if (p.is_valid() &&
8112 !sb_ref_counts.test_all_zero_range(sbid,
8113 p.offset,
8114 p.length)) {
8115 refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
8116 dout(20) << __func__
8117 << " broken shared blob found for col:" << cid
8118 << " obj:" << oid
8119 << " sbid 0x " << std::hex << sbid << std::dec
8120 << dendl;
8121 break;
8122 }
8123 }
8124 });
8125
8126 // second iteration over objects to build new ref map for the broken sbids
8127 foreach_shared_blob( [&](coll_t cid,
8128 ghobject_t oid,
8129 uint64_t sbid,
8130 const bluestore_blob_t& b) {
8131 auto it = refs_map.find(sbid);
8132 if(it == refs_map.end()) {
8133 return;
8134 }
8135 for (auto& p : b.get_extents()) {
8136 if (p.is_valid()) {
8137 it->second.get(p.offset, p.length);
8138 break;
8139 }
8140 }
8141 });
8142
8143 // update shared blob records
8144 auto ref_it = refs_map.begin();
8145 while (ref_it != refs_map.end()) {
8146 size_t cnt = 0;
8147 const size_t max_transactions = 4096;
8148 KeyValueDB::Transaction txn = db->get_transaction();
8149 for (cnt = 0;
8150 cnt < max_transactions && ref_it != refs_map.end();
8151 ref_it++) {
8152 auto sbid = ref_it->first;
8153 dout(20) << __func__ << " repaired shared_blob 0x"
8154 << std::hex << sbid << std::dec
8155 << ref_it->second << dendl;
8156 repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
8157 cnt++;
8158 }
8159 if (cnt) {
8160 db->submit_transaction_sync(txn);
8161 cnt = 0;
8162 }
8163 }
8164 // remove stray shared blob records
8165 size_t cnt = 0;
8166 const size_t max_transactions = 4096;
8167 KeyValueDB::Transaction txn = db->get_transaction();
8168 sb_info.foreach_stray([&](const sb_info_t& sbi) {
8169 auto sbid = sbi.get_sbid();
8170 dout(20) << __func__ << " removing stray shared_blob 0x"
8171 << std::hex << sbid << std::dec
8172 << dendl;
8173 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
8174 cnt++;
 8175 if (cnt >= max_transactions) {
 8176 db->submit_transaction_sync(txn);
 8177 txn = db->get_transaction();
 8178 cnt = 0;
 }
8179 });
8180 if (cnt > 0) {
8181 db->submit_transaction_sync(txn);
8182 }
8183
8184 // amount of repairs to report to be equal to previously
8185 // determined error estimation, not the actual number of updated shared blobs
8186 repairer.inc_repaired(sb_ref_mismatches);
8187}
8188
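// Illustrative flow of the repair above (a summary, not extra behavior):
// pass 1 marks sbid S broken when any valid extent of a blob referencing S
// leaves a non-zero residual in sb_ref_counts; pass 2 rebuilds a fresh ref
// map for each broken S from the first valid extent of every referencing
// blob; the rebuilt maps are then written back in batches of up to 4096
// fixes per transaction, and stray shared-blob records are removed.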
eafe8130
TL
8189BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
8190 BlueStore::FSCKDepth depth,
8191 int64_t pool_id,
8192 BlueStore::CollectionRef c,
8193 const ghobject_t& oid,
8194 const string& key,
8195 const bufferlist& value,
9f95a23c 8196 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
8197 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
8198 const BlueStore::FSCK_ObjectCtx& ctx)
8199{
8200 auto& errors = ctx.errors;
8201 auto& num_objects = ctx.num_objects;
8202 auto& num_extents = ctx.num_extents;
8203 auto& num_blobs = ctx.num_blobs;
8204 auto& num_sharded_objects = ctx.num_sharded_objects;
8205 auto& num_spanning_blobs = ctx.num_spanning_blobs;
8206 auto used_blocks = ctx.used_blocks;
8207 auto sb_info_lock = ctx.sb_info_lock;
8208 auto& sb_info = ctx.sb_info;
20effc67 8209 auto& sb_ref_counts = ctx.sb_ref_counts;
eafe8130
TL
8210 auto repairer = ctx.repairer;
8211
8212 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
8213 &ctx.expected_pool_statfs[pool_id] :
8214 &ctx.expected_store_statfs;
8215
20effc67
TL
8216 map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
8217
eafe8130
TL
8218 dout(10) << __func__ << " " << oid << dendl;
8219 OnodeRef o;
39ae355f 8220 o.reset(Onode::create_decode(c, oid, key, value));
eafe8130 8221 ++num_objects;
7c673cae 8222
eafe8130 8223 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 8224
eafe8130
TL
8225 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8226 _dump_onode<30>(cct, *o);
8227 // shards
8228 if (!o->extent_map.shards.empty()) {
8229 ++num_sharded_objects;
8230 if (depth != FSCK_SHALLOW) {
9f95a23c 8231 ceph_assert(expecting_shards);
eafe8130
TL
8232 for (auto& s : o->extent_map.shards) {
8233 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 8234 expecting_shards->push_back(string());
eafe8130 8235 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 8236 &expecting_shards->back());
eafe8130
TL
8237 if (s.shard_info->offset >= o->onode.size) {
8238 derr << "fsck error: " << oid << " shard 0x" << std::hex
8239 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
8240 << std::dec << dendl;
8241 ++errors;
8242 }
8243 }
8244 }
8245 }
7c673cae 8246
eafe8130
TL
8247 // lextents
8248 uint64_t pos = 0;
8249 mempool::bluestore_fsck::map<BlobRef,
8250 bluestore_blob_use_tracker_t> ref_map;
8251 for (auto& l : o->extent_map.extent_map) {
8252 dout(20) << __func__ << " " << l << dendl;
8253 if (l.logical_offset < pos) {
8254 derr << "fsck error: " << oid << " lextent at 0x"
8255 << std::hex << l.logical_offset
8256 << " overlaps with the previous, which ends at 0x" << pos
8257 << std::dec << dendl;
8258 ++errors;
8259 }
8260 if (depth != FSCK_SHALLOW &&
8261 o->extent_map.spans_shard(l.logical_offset, l.length)) {
8262 derr << "fsck error: " << oid << " lextent at 0x"
8263 << std::hex << l.logical_offset << "~" << l.length
8264 << " spans a shard boundary"
8265 << std::dec << dendl;
8266 ++errors;
8267 }
8268 pos = l.logical_offset + l.length;
8269 res_statfs->data_stored += l.length;
8270 ceph_assert(l.blob);
8271 const bluestore_blob_t& blob = l.blob->get_blob();
8272
20effc67
TL
8273#ifdef HAVE_LIBZBD
8274 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8275 for (auto& e : blob.get_extents()) {
8276 if (e.is_valid()) {
8277 uint32_t zone = e.offset / zone_size;
8278 uint64_t offset = e.offset % zone_size;
8279 auto p = zone_first_offsets.find(zone);
8280 if (p == zone_first_offsets.end() || p->second > offset) {
 8281 // FIXME: use iterator for guided insert?
8282 zone_first_offsets[zone] = offset;
8283 }
8284 }
8285 }
8286 }
8287#endif
8288
8289 auto& ref = ref_map[l.blob];
8290 if (ref.is_empty()) {
8291 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
8292 uint32_t l = blob.get_logical_length();
8293 ref.init(l, min_release_size);
eafe8130
TL
8294 }
8295 ref.get(
8296 l.blob_offset,
8297 l.length);
8298 ++num_extents;
8299 if (depth != FSCK_SHALLOW &&
8300 blob.has_unused()) {
8301 ceph_assert(referenced);
8302 auto p = referenced->find(l.blob);
8303 bluestore_blob_t::unused_t* pu;
8304 if (p == referenced->end()) {
8305 pu = &(*referenced)[l.blob];
8306 }
8307 else {
8308 pu = &p->second;
8309 }
8310 uint64_t blob_len = blob.get_logical_length();
8311 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
8312 ceph_assert(l.blob_offset + l.length <= blob_len);
8313 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
8314 uint64_t start = l.blob_offset / chunk_size;
8315 uint64_t end =
8316 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
8317 for (auto i = start; i < end; ++i) {
8318 (*pu) |= (1u << i);
8319 }
8320 }
8321 } //for (auto& l : o->extent_map.extent_map)
8322
8323 for (auto& i : ref_map) {
8324 ++num_blobs;
8325 const bluestore_blob_t& blob = i.first->get_blob();
8326 bool equal =
8327 depth == FSCK_SHALLOW ? true :
8328 i.first->get_blob_use_tracker().equal(i.second);
8329 if (!equal) {
8330 derr << "fsck error: " << oid << " blob " << *i.first
8331 << " doesn't match expected ref_map " << i.second << dendl;
8332 ++errors;
8333 }
8334 if (blob.is_compressed()) {
8335 res_statfs->data_compressed += blob.get_compressed_payload_length();
8336 res_statfs->data_compressed_original +=
8337 i.first->get_referenced_bytes();
8338 }
20effc67
TL
8339 if (depth != FSCK_SHALLOW && repairer) {
8340 for (auto e : blob.get_extents()) {
8341 if (!e.is_valid())
8342 continue;
8343 repairer->set_space_used(e.offset, e.length, c->cid, oid);
8344 }
8345 }
eafe8130
TL
8346 if (blob.is_shared()) {
8347 if (i.first->shared_blob->get_sbid() > blobid_max) {
8348 derr << "fsck error: " << oid << " blob " << blob
8349 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
8350 << blobid_max << dendl;
8351 ++errors;
20effc67 8352 } else if (i.first->shared_blob->get_sbid() == 0) {
eafe8130
TL
8353 derr << "fsck error: " << oid << " blob " << blob
8354 << " marked as shared but has uninitialized sbid"
8355 << dendl;
8356 ++errors;
8357 }
8358 // the below lock is optional and provided in multithreading mode only
8359 if (sb_info_lock) {
8360 sb_info_lock->lock();
8361 }
20effc67
TL
8362 auto sbid = i.first->shared_blob->get_sbid();
8363 sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
8364 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
eafe8130 8365 sbi.pool_id == oid.hobj.get_logical_pool());
eafe8130 8366 sbi.pool_id = oid.hobj.get_logical_pool();
20effc67 8367 bool compressed = blob.is_compressed();
eafe8130
TL
8368 for (auto e : blob.get_extents()) {
8369 if (e.is_valid()) {
20effc67
TL
8370 if (compressed) {
8371 ceph_assert(sbi.allocated_chunks <= 0);
8372 sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
8373 } else {
8374 ceph_assert(sbi.allocated_chunks >= 0);
8375 sbi.allocated_chunks += (e.length >> min_alloc_size_order);
8376 }
8377 sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
eafe8130
TL
8378 }
8379 }
8380 if (sb_info_lock) {
8381 sb_info_lock->unlock();
8382 }
8383 } else if (depth != FSCK_SHALLOW) {
8384 ceph_assert(used_blocks);
20effc67
TL
8385 string ctx_descr = " oid " + stringify(oid);
8386 errors += _fsck_check_extents(ctx_descr,
8387 blob.get_extents(),
eafe8130
TL
8388 blob.is_compressed(),
8389 *used_blocks,
8390 fm->get_alloc_size(),
20effc67 8391 repairer,
eafe8130
TL
8392 *res_statfs,
8393 depth);
8394 } else {
8395 errors += _fsck_sum_extents(
8396 blob.get_extents(),
8397 blob.is_compressed(),
8398 *res_statfs);
8399 }
8400 } // for (auto& i : ref_map)
9f95a23c 8401
adb31ebb
TL
8402 {
8403 auto &sbm = o->extent_map.spanning_blob_map;
8404 size_t broken = 0;
8405 BlobRef first_broken;
8406 for (auto it = sbm.begin(); it != sbm.end();) {
8407 auto it1 = it++;
8408 if (ref_map.count(it1->second) == 0) {
8409 if (!broken) {
8410 first_broken = it1->second;
8411 ++errors;
39ae355f
TL
8412 derr << "fsck error:" << " stray spanning blob found:" << it1->first
8413 << dendl;
adb31ebb
TL
8414 }
8415 broken++;
8416 if (repairer) {
8417 sbm.erase(it1);
8418 }
8419 }
8420 }
20effc67
TL
8421
8422#ifdef HAVE_LIBZBD
8423 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8424 for (auto& [zone, first_offset] : zone_first_offsets) {
8425 auto p = (*ctx.zone_refs)[zone].find(oid);
8426 if (p != (*ctx.zone_refs)[zone].end()) {
8427 if (first_offset < p->second) {
8428 dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
8429 << " offset 0x" << p->second
8430 << " but first offset is 0x" << first_offset
8431 << "; this can happen due to clone_range"
8432 << dendl;
8433 } else {
8434 dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
8435 << " <= first offset 0x" << first_offset
8436 << std::dec << dendl;
8437 }
8438 (*ctx.zone_refs)[zone].erase(p);
8439 } else {
8440 derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
8441 << " but there is no zone ref" << std::dec << dendl;
8442 // FIXME: add repair
8443 ++errors;
8444 }
8445 }
8446 }
8447#endif
8448
adb31ebb
TL
8449 if (broken) {
8450 derr << "fsck error: " << oid << " - " << broken
8451 << " zombie spanning blob(s) found, the first one: "
8452 << *first_broken << dendl;
8453 if(repairer) {
b3b6e05e
TL
8454 repairer->fix_spanning_blobs(
8455 db,
8456 [&](KeyValueDB::Transaction txn) {
8457 _record_onode(o, txn);
8458 });
adb31ebb
TL
8459 }
8460 }
8461 }
8462
9f95a23c
TL
8463 if (o->onode.has_omap()) {
8464 _fsck_check_object_omap(depth, o, ctx);
8465 }
8466
eafe8130
TL
8467 return o;
8468}
8469
8470#include "common/WorkQueue.h"
8471
8472class ShallowFSCKThreadPool : public ThreadPool
8473{
8474public:
8475 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
8476 ThreadPool(cct_, nm, tn, n) {
8477 }
8478 void worker(ThreadPool::WorkThread* wt) override {
8479 int next_wq = 0;
8480 while (!_stop) {
8481 next_wq %= work_queues.size();
8482 WorkQueue_ *wq = work_queues[next_wq++];
8483
8484 void* item = wq->_void_dequeue();
8485 if (item) {
8486 processing++;
8487 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
8488 wq->_void_process(item, tp_handle);
8489 processing--;
8490 }
8491 }
8492 }
8493 template <size_t BatchLen>
8494 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
8495 {
8496 struct Entry {
8497 int64_t pool_id;
8498 BlueStore::CollectionRef c;
8499 ghobject_t oid;
8500 string key;
8501 bufferlist value;
8502 };
8503 struct Batch {
8504 std::atomic<size_t> running = { 0 };
8505 size_t entry_count = 0;
8506 std::array<Entry, BatchLen> entries;
8507
8508 int64_t errors = 0;
8509 int64_t warnings = 0;
8510 uint64_t num_objects = 0;
8511 uint64_t num_extents = 0;
8512 uint64_t num_blobs = 0;
8513 uint64_t num_sharded_objects = 0;
8514 uint64_t num_spanning_blobs = 0;
8515 store_statfs_t expected_store_statfs;
8516 BlueStore::per_pool_statfs expected_pool_statfs;
8517 };
8518
8519 size_t batchCount;
8520 BlueStore* store = nullptr;
8521
eafe8130 8522 ceph::mutex* sb_info_lock = nullptr;
20effc67
TL
8523 sb_info_space_efficient_map_t* sb_info = nullptr;
8524 shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
eafe8130
TL
8525 BlueStoreRepairer* repairer = nullptr;
8526
8527 Batch* batches = nullptr;
8528 size_t last_batch_pos = 0;
8529 bool batch_acquired = false;
8530
8531 FSCKWorkQueue(std::string n,
8532 size_t _batchCount,
8533 BlueStore* _store,
eafe8130 8534 ceph::mutex* _sb_info_lock,
20effc67
TL
8535 sb_info_space_efficient_map_t& _sb_info,
8536 shared_blob_2hash_tracker_t& _sb_ref_counts,
eafe8130 8537 BlueStoreRepairer* _repairer) :
f67539c2 8538 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
8539 batchCount(_batchCount),
8540 store(_store),
eafe8130
TL
8541 sb_info_lock(_sb_info_lock),
8542 sb_info(&_sb_info),
20effc67 8543 sb_ref_counts(&_sb_ref_counts),
eafe8130
TL
8544 repairer(_repairer)
8545 {
8546 batches = new Batch[batchCount];
8547 }
8548 ~FSCKWorkQueue() {
8549 delete[] batches;
8550 }
8551
8552 /// Remove all work items from the queue.
8553 void _clear() override {
8554 //do nothing
8555 }
8556 /// Check whether there is anything to do.
8557 bool _empty() override {
8558 ceph_assert(false);
8559 }
8560
8561 /// Get the next work item to process.
8562 void* _void_dequeue() override {
8563 size_t pos = rand() % batchCount;
8564 size_t pos0 = pos;
8565 do {
8566 auto& batch = batches[pos];
8567 if (batch.running.fetch_add(1) == 0) {
8568 if (batch.entry_count) {
8569 return &batch;
8570 }
8571 }
8572 batch.running--;
8573 pos++;
8574 pos %= batchCount;
8575 } while (pos != pos0);
8576 return nullptr;
8577 }
8578 /** @brief Process the work item.
8579 * This function will be called several times in parallel
8580 * and must therefore be thread-safe. */
8581 void _void_process(void* item, TPHandle& handle) override {
8582 Batch* batch = (Batch*)item;
8583
8584 BlueStore::FSCK_ObjectCtx ctx(
8585 batch->errors,
8586 batch->warnings,
8587 batch->num_objects,
8588 batch->num_extents,
8589 batch->num_blobs,
8590 batch->num_sharded_objects,
8591 batch->num_spanning_blobs,
8592 nullptr, // used_blocks
9f95a23c 8593 nullptr, //used_omap_head
20effc67 8594 nullptr,
8595 sb_info_lock,
8596 *sb_info,
20effc67 8597 *sb_ref_counts,
8598 batch->expected_store_statfs,
8599 batch->expected_pool_statfs,
8600 repairer);
8601
8602 for (size_t i = 0; i < batch->entry_count; i++) {
8603 auto& entry = batch->entries[i];
8604
8605 store->fsck_check_objects_shallow(
8606 BlueStore::FSCK_SHALLOW,
8607 entry.pool_id,
8608 entry.c,
8609 entry.oid,
8610 entry.key,
8611 entry.value,
9f95a23c 8612 nullptr, // expecting_shards - this will need protection if passed
8613 nullptr, // referenced
8614 ctx);
8615 }
8616 batch->entry_count = 0;
8617 batch->running--;
8618 }
8619 /** @brief Synchronously finish processing a work item.
8620 * This function is called after _void_process with the global thread pool lock held,
8621 * so at most one copy will execute simultaneously for a given thread pool.
8622 * It can be used for non-thread-safe finalization. */
8623 void _void_process_finish(void*) override {
8624 ceph_assert(false);
8625 }
8626
8627 bool queue(
8628 int64_t pool_id,
8629 BlueStore::CollectionRef c,
8630 const ghobject_t& oid,
8631 const string& key,
8632 const bufferlist& value) {
8633 bool res = false;
8634 size_t pos0 = last_batch_pos;
8635 if (!batch_acquired) {
8636 do {
8637 auto& batch = batches[last_batch_pos];
8638 if (batch.running.fetch_add(1) == 0) {
8639 if (batch.entry_count < BatchLen) {
8640 batch_acquired = true;
8641 break;
8642 }
8643 }
8644 batch.running.fetch_sub(1);
8645 last_batch_pos++;
8646 last_batch_pos %= batchCount;
8647 } while (last_batch_pos != pos0);
8648 }
8649 if (batch_acquired) {
8650 auto& batch = batches[last_batch_pos];
8651 ceph_assert(batch.running);
8652 ceph_assert(batch.entry_count < BatchLen);
8653
8654 auto& entry = batch.entries[batch.entry_count];
8655 entry.pool_id = pool_id;
8656 entry.c = c;
8657 entry.oid = oid;
8658 entry.key = key;
8659 entry.value = value;
8660
8661 ++batch.entry_count;
8662 if (batch.entry_count == BatchLen) {
8663 batch_acquired = false;
8664 batch.running.fetch_sub(1);
8665 last_batch_pos++;
8666 last_batch_pos %= batchCount;
8667 }
8668 res = true;
8669 }
8670 return res;
8671 }
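    // Note that queue() keeps the last partially filled batch "sticky":
    // while batch_acquired is set the batch's running counter stays
    // elevated, so workers in _void_dequeue() cannot steal it and
    // successive entries land in the same batch without re-contending.
    // Only when the batch fills up (or finalize() runs) is it released.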
8672
8673 void finalize(ThreadPool& tp,
8674 BlueStore::FSCK_ObjectCtx& ctx) {
8675 if (batch_acquired) {
8676 auto& batch = batches[last_batch_pos];
8677 ceph_assert(batch.running);
8678 batch.running.fetch_sub(1);
8679 }
8680 tp.stop();
8681
8682 for (size_t i = 0; i < batchCount; i++) {
8683 auto& batch = batches[i];
8684
8685 //process leftovers if any
8686 if (batch.entry_count) {
8687 TPHandle tp_handle(store->cct,
8688 nullptr,
8689 timeout_interval,
8690 suicide_interval);
8691 ceph_assert(batch.running == 0);
8692
8693 batch.running++; // just to be on par with the regular call
8694 _void_process(&batch, tp_handle);
8695 }
8696 ceph_assert(batch.entry_count == 0);
8697
8698 ctx.errors += batch.errors;
8699 ctx.warnings += batch.warnings;
8700 ctx.num_objects += batch.num_objects;
8701 ctx.num_extents += batch.num_extents;
8702 ctx.num_blobs += batch.num_blobs;
8703 ctx.num_sharded_objects += batch.num_sharded_objects;
8704 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 8705
8706 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8707
8708 for (auto it = batch.expected_pool_statfs.begin();
8709 it != batch.expected_pool_statfs.end();
8710 it++) {
8711 ctx.expected_pool_statfs[it->first].add(it->second);
8712 }
8713 }
8714 }
8715 };
8716};
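// A condensed view of how the helper above is driven (mirrors
// _fsck_check_objects() below; construction details and error handling
// omitted, 'n' being bluestore_fsck_quick_fix_threads):
//
//   typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   std::unique_ptr<WQ> wq(new WQ(/* store, sb_info, repairer, ... */));
//   tp.add_work_queue(wq.get());
//   tp.start();
//   // per object: wq->queue(pool_id, c, oid, key, value);
//   wq->finalize(tp, ctx);  // stops the pool, folds batch stats into ctx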
8717
8718void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8719 OnodeRef& o,
8720 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 8721{
8722 auto& errors = ctx.errors;
8723 auto& warnings = ctx.warnings;
8724 auto repairer = ctx.repairer;
8725
8726 ceph_assert(o->onode.has_omap());
8727 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 8728 if (per_pool_omap == OMAP_PER_POOL) {
8729 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8730 << "fsck error: " << o->oid
8731 << " has omap that is not per-pool or pgmeta"
8732 << fsck_dendl;
8733 ++errors;
8734 } else {
8735 const char* w;
8736 int64_t num;
8737 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8738 ++errors;
8739 num = errors;
8740 w = "error";
8741 } else {
8742 ++warnings;
8743 num = warnings;
8744 w = "warning";
8745 }
8746 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8747 << "fsck " << w << ": " << o->oid
8748 << " has omap that is not per-pool or pgmeta"
8749 << fsck_dendl;
8750 }
8751 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
8752 if (per_pool_omap == OMAP_PER_PG) {
8753 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8754 << "fsck error: " << o->oid
8755 << " has omap that is not per-pg or pgmeta"
8756 << fsck_dendl;
8757 ++errors;
8758 } else {
8759 const char* w;
8760 int64_t num;
8761 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
8762 ++errors;
8763 num = errors;
8764 w = "error";
8765 } else {
8766 ++warnings;
8767 num = warnings;
8768 w = "warning";
8769 }
8770 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8771 << "fsck " << w << ": " << o->oid
8772 << " has omap that is not per-pg or pgmeta"
8773 << fsck_dendl;
8774 }
8775 }
8776 if (repairer &&
f67539c2 8777 !o->onode.is_perpg_omap() &&
9f95a23c 8778 !o->onode.is_pgmeta_omap()) {
f67539c2 8779 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
522d829b 8780 bufferlist header;
9f95a23c 8781 map<string, bufferlist> kv;
8782 {
8783 KeyValueDB::Transaction txn = db->get_transaction();
8784 uint64_t txn_cost = 0;
8785 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
8786 uint8_t new_flags = o->onode.flags |
8787 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8788 bluestore_onode_t::FLAG_PERPG_OMAP;
8789 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
8790
8791 KeyValueDB::Iterator it = db->get_iterator(prefix);
8792 string head, tail;
8793 o->get_omap_header(&head);
8794 o->get_omap_tail(&tail);
8795 it->lower_bound(head);
8796 // head
8797 if (it->valid() && it->key() == head) {
8798 dout(30) << __func__ << " got header" << dendl;
8799 header = it->value();
8800 if (header.length()) {
8801 string new_head;
8802 Onode::calc_omap_header(new_flags, o.get(), &new_head);
8803 txn->set(new_omap_prefix, new_head, header);
8804 txn_cost += new_head.length() + header.length();
8805 }
a4b75251 8806 it->next();
8807 }
8808 // tail
8809 {
8810 string new_tail;
8811 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
8812 bufferlist empty;
8813 txn->set(new_omap_prefix, new_tail, empty);
8814 txn_cost += new_tail.length(); // the tail value is an empty bufferlist
8815 }
8816 // values
8817 string final_key;
8818 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
8819 size_t base_key_len = final_key.size();
8820 while (it->valid() && it->key() < tail) {
8821 string user_key;
8822 o->decode_omap_key(it->key(), &user_key);
8823 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
8824 << " -> " << user_key << dendl;
8825
8826 final_key.resize(base_key_len);
a4b75251 8827 final_key += user_key;
8828 auto v = it->value();
8829 txn->set(new_omap_prefix, final_key, v);
8830 txn_cost += final_key.length() + v.length();
8831
8832 // submit a portion if cost exceeds 16MB
8833 if (txn_cost >= 16 * (1 << 20) ) {
8834 db->submit_transaction_sync(txn);
8835 txn = db->get_transaction();
8836 txn_cost = 0;
8837 }
8838 it->next();
8839 }
8840 if (txn_cost > 0) {
8841 db->submit_transaction_sync(txn);
8842 }
8843 }
8844 // finalize: remove legacy data
8845 {
8846 KeyValueDB::Transaction txn = db->get_transaction();
8847 // remove old keys
8848 const string& old_omap_prefix = o->get_omap_prefix();
8849 string old_head, old_tail;
8850 o->get_omap_header(&old_head);
8851 o->get_omap_tail(&old_tail);
8852 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8853 txn->rmkey(old_omap_prefix, old_tail);
8854 // set flag
f67539c2 8855 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c 8856 _record_onode(o, txn);
8857 db->submit_transaction_sync(txn);
8858 repairer->inc_repaired();
522d829b 8859 repairer->request_compaction();
9f95a23c 8860 }
eafe8130 8861 }
9f95a23c 8862}
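// The conversion above relies on a cost-bounded batching pattern so that a
// huge omap never produces one oversized KV transaction. The core of it,
// reduced to a standalone sketch (prefix handling and onode bookkeeping
// omitted; only the KeyValueDB calls already used above are assumed):
//
//   KeyValueDB::Transaction txn = db->get_transaction();
//   uint64_t txn_cost = 0;
//   for (; it->valid(); it->next()) {
//     txn->set(new_prefix, it->key(), it->value());
//     txn_cost += it->key().length() + it->value().length();
//     if (txn_cost >= 16 * (1 << 20)) {  // flush a portion every ~16MB
//       db->submit_transaction_sync(txn);
//       txn = db->get_transaction();
//       txn_cost = 0;
//     }
//   }
//   if (txn_cost > 0) {
//     db->submit_transaction_sync(txn);
//   }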
eafe8130 8863
8864void BlueStore::_fsck_check_objects(
8865 FSCKDepth depth,
8866 BlueStore::FSCK_ObjectCtx& ctx)
8867{
eafe8130 8868 auto& errors = ctx.errors;
8869 auto sb_info_lock = ctx.sb_info_lock;
8870 auto& sb_info = ctx.sb_info;
20effc67 8871 auto& sb_ref_counts = ctx.sb_ref_counts;
8872 auto repairer = ctx.repairer;
8873
8874 uint64_t_btree_t used_nids;
8875
8876 size_t processed_myself = 0;
8877
f67539c2 8878 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
8879 mempool::bluestore_fsck::list<string> expecting_shards;
8880 if (it) {
8881 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8882 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8883 std::unique_ptr<WQ> wq(
8884 new WQ(
8885 "FSCKWorkQueue",
8886 (thread_count ? : 1) * 32,
8887 this,
8888 sb_info_lock,
8889 sb_info,
20effc67 8890 sb_ref_counts,
8891 repairer));
8892
8893 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8894
8895 thread_pool.add_work_queue(wq.get());
8896 if (depth == FSCK_SHALLOW && thread_count > 0) {
8897 // not the best place, but let's check anyway
8898 ceph_assert(sb_info_lock);
8899 thread_pool.start();
8900 }
8901
20effc67 8902 // fill global if not overridden below
8903 CollectionRef c;
8904 int64_t pool_id = -1;
8905 spg_t pgid;
8906 for (it->lower_bound(string()); it->valid(); it->next()) {
8907 dout(30) << __func__ << " key "
8908 << pretty_binary_string(it->key()) << dendl;
8909 if (is_extent_shard_key(it->key())) {
8910 if (depth == FSCK_SHALLOW) {
8911 continue;
8912 }
8913 while (!expecting_shards.empty() &&
8914 expecting_shards.front() < it->key()) {
8915 derr << "fsck error: missing shard key "
8916 << pretty_binary_string(expecting_shards.front())
8917 << dendl;
8918 ++errors;
8919 expecting_shards.pop_front();
8920 }
8921 if (!expecting_shards.empty() &&
8922 expecting_shards.front() == it->key()) {
8923 // all good
8924 expecting_shards.pop_front();
8925 continue;
8926 }
8927
8928 uint32_t offset;
8929 string okey;
8930 get_key_extent_shard(it->key(), &okey, &offset);
8931 derr << "fsck error: stray shard 0x" << std::hex << offset
8932 << std::dec << dendl;
8933 if (expecting_shards.empty()) {
8934 derr << "fsck error: " << pretty_binary_string(it->key())
8935 << " is unexpected" << dendl;
8936 ++errors;
8937 continue;
8938 }
8939 while (expecting_shards.front() > it->key()) {
8940 derr << "fsck error: saw " << pretty_binary_string(it->key())
8941 << dendl;
8942 derr << "fsck error: exp "
8943 << pretty_binary_string(expecting_shards.front()) << dendl;
8944 ++errors;
8945 expecting_shards.pop_front();
8946 if (expecting_shards.empty()) {
8947 break;
8948 }
8949 }
8950 continue;
8951 }
8952
8953 ghobject_t oid;
8954 int r = get_key_object(it->key(), &oid);
8955 if (r < 0) {
8956 derr << "fsck error: bad object key "
8957 << pretty_binary_string(it->key()) << dendl;
8958 ++errors;
8959 continue;
8960 }
8961 if (!c ||
8962 oid.shard_id != pgid.shard ||
8963 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8964 !c->contains(oid)) {
8965 c = nullptr;
8966 for (auto& p : coll_map) {
8967 if (p.second->contains(oid)) {
8968 c = p.second;
8969 break;
8970 }
8971 }
8972 if (!c) {
8973 derr << "fsck error: stray object " << oid
8974 << " not owned by any collection" << dendl;
8975 ++errors;
8976 continue;
8977 }
8978 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8979 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8980 << dendl;
8981 }
8982
8983 if (depth != FSCK_SHALLOW &&
8984 !expecting_shards.empty()) {
8985 for (auto& k : expecting_shards) {
8986 derr << "fsck error: missing shard key "
8987 << pretty_binary_string(k) << dendl;
8988 }
8989 ++errors;
8990 expecting_shards.clear();
8991 }
8992
8993 bool queued = false;
8994 if (depth == FSCK_SHALLOW && thread_count > 0) {
8995 queued = wq->queue(
8996 pool_id,
8997 c,
8998 oid,
8999 it->key(),
9000 it->value());
9001 }
9002 OnodeRef o;
9003 map<BlobRef, bluestore_blob_t::unused_t> referenced;
9004
9005 if (!queued) {
9006 ++processed_myself;
9007 o = fsck_check_objects_shallow(
9008 depth,
9009 pool_id,
9010 c,
9011 oid,
9012 it->key(),
9013 it->value(),
9f95a23c 9014 &expecting_shards,
9015 &referenced,
9016 ctx);
9017 }
9018
9019 if (depth != FSCK_SHALLOW) {
9020 ceph_assert(o != nullptr);
9021 if (o->onode.nid) {
9022 if (o->onode.nid > nid_max) {
9023 derr << "fsck error: " << oid << " nid " << o->onode.nid
9024 << " > nid_max " << nid_max << dendl;
9025 ++errors;
9026 }
9027 if (used_nids.count(o->onode.nid)) {
9028 derr << "fsck error: " << oid << " nid " << o->onode.nid
9029 << " already in use" << dendl;
9030 ++errors;
9031 continue; // go for next object
9032 }
9033 used_nids.insert(o->onode.nid);
9034 }
9035 for (auto& i : referenced) {
9036 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
9037 << std::dec << " for " << *i.first << dendl;
9038 const bluestore_blob_t& blob = i.first->get_blob();
9039 if (i.second & blob.unused) {
9040 derr << "fsck error: " << oid << " blob claims unused 0x"
9041 << std::hex << blob.unused
9042 << " but extents reference 0x" << i.second << std::dec
9043 << " on blob " << *i.first << dendl;
9044 ++errors;
9045 }
9046 if (blob.has_csum()) {
9047 uint64_t blob_len = blob.get_logical_length();
9048 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
9049 unsigned csum_count = blob.get_csum_count();
9050 unsigned csum_chunk_size = blob.get_csum_chunk_size();
9051 for (unsigned p = 0; p < csum_count; ++p) {
9052 unsigned pos = p * csum_chunk_size;
9053 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
9054 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
9055 unsigned mask = 1u << firstbit;
9056 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
9057 mask |= 1u << b;
9058 }
9059 if ((blob.unused & mask) == mask) {
9060 // this csum chunk region is marked unused
9061 if (blob.get_csum_item(p) != 0) {
9062 derr << "fsck error: " << oid
9063 << " blob claims csum chunk 0x" << std::hex << pos
9064 << "~" << csum_chunk_size
9065 << " is unused (mask 0x" << mask << " of unused 0x"
9066 << blob.unused << ") but csum is non-zero 0x"
9067 << blob.get_csum_item(p) << std::dec << " on blob "
9068 << *i.first << dendl;
9069 ++errors;
9070 }
9071 }
9072 }
9073 }
9074 }
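      // Worked example of the bit math above: with a 16-bit unused bitmap
      // and blob_len = 64 KiB, each unused bit covers 4 KiB
      // (unused_chunk_size). An 8 KiB csum chunk at pos = 16 KiB then maps
      // to firstbit = 4 and lastbit = 5, i.e. mask = 0x30; only when both
      // bits are set in blob.unused is the chunk considered fully unused,
      // and only then must its stored checksum be zero.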
9075 // omap
9076 if (o->onode.has_omap()) {
9077 ceph_assert(ctx.used_omap_head);
9078 if (ctx.used_omap_head->count(o->onode.nid)) {
9079 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
9080 << " already in use" << dendl;
9081 ++errors;
9082 } else {
9f95a23c 9083 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 9084 }
9f95a23c 9085 } // if (o->onode.has_omap())
9086 if (depth == FSCK_DEEP) {
9087 bufferlist bl;
9088 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
9089 uint64_t offset = 0;
9090 do {
9091 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
9092 int r = _do_read(c.get(), o, offset, l, bl,
9093 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
9094 if (r < 0) {
9095 ++errors;
9096 derr << "fsck error: " << oid << std::hex
9097 << " error during read: "
9098 << " " << offset << "~" << l
9099 << " " << cpp_strerror(r) << std::dec
9100 << dendl;
9101 break;
9102 }
9103 offset += l;
9104 } while (offset < o->onode.size);
9105 } // deep
9106 } //if (depth != FSCK_SHALLOW)
9107 } // for (it->lower_bound(string()); it->valid(); it->next())
9108 if (depth == FSCK_SHALLOW && thread_count > 0) {
9109 wq->finalize(thread_pool, ctx);
9110 if (processed_myself) {
9111 // maybe it needs more threads?
9112 dout(0) << __func__ << " partial offload"
9113 << ", done myself " << processed_myself
9114 << " of " << ctx.num_objects
9115 << "objects, threads " << thread_count
9116 << dendl;
9117 }
9118 }
9119 } // if (it)
9120}
9121/**
9122An overview of the currently implemented repair logic,
9123performed by fsck in two stages: detection (+ preparation) and commit.
9124Detection stage (in processing order):
9125 (Issue -> Repair action to schedule)
9126 - Detect undecodable keys for Shared Blobs -> Remove
9127 - Detect undecodable records for Shared Blobs -> Remove
9128 (might trigger missed Shared Blob detection below)
9129 - Detect stray records for Shared Blobs -> Remove
9130 - Detect misreferenced pextents -> Fix
9131 Prepare Bloom-like filter to track cid/oid -> pextent
9132 Prepare list of extents that are improperly referenced
9133 Enumerate Onode records that might use 'misreferenced' pextents
9134 (Bloom-like filter applied to reduce computation)
9135 For each questionable Onode enumerate all blobs and identify broken ones
9136 (i.e. blobs having 'misreferences')
9137 Rewrite each broken blob data by allocating another extents and
9138 copying data there
9139 If blob is shared - unshare it and mark corresponding Shared Blob
9140 for removal
9141 Release previously allocated space
9142 Update Extent Map
9143 - Detect missed Shared Blobs -> Recreate
9144 - Detect undecodable deferred transaction -> Remove
9145 - Detect Freelist Manager's 'false free' entries -> Mark as used
9146 - Detect Freelist Manager's leaked entries -> Mark as free
9147 - Detect statfs inconsistency - Update
9148 Commit stage (separate DB commit per each step):
9149 - Apply leaked FM entries fix
9150 - Apply 'false free' FM entries fix
9151 - Apply 'Remove' actions
9152 - Apply fix for misreference pextents
9153 - Apply Shared Blob recreate
9154 (can be merged with the step above if misreferences were detected)
9155 - Apply StatFS update
9156*/
9157int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
9158{
20effc67 9159 dout(5) << __func__
9160 << (repair ? " repair" : " check")
9161 << (depth == FSCK_DEEP ? " (deep)" :
9162 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9163 << dendl;
9164
9165 // in deep mode we need R/W write access to be able to replay deferred ops
20effc67 9166 const bool read_only = !(repair || depth == FSCK_DEEP);
f67539c2 9167 int r = _open_db_and_around(read_only);
20effc67 9168 if (r < 0) {
eafe8130 9169 return r;
9170 }
9171 auto close_db = make_scope_guard([&] {
9172 _close_db_and_around();
9173 });
7c673cae 9174
9175 if (!read_only) {
9176 r = _upgrade_super();
9177 if (r < 0) {
20effc67 9178 return r;
9179 }
9180 }
7c673cae 9181
20effc67 9182 // NullFreelistManager needs to open collection early
eafe8130 9183 r = _open_collections();
9184 if (r < 0) {
9185 return r;
9186 }
9187
9188 mempool_thread.init();
9189 auto stop_mempool = make_scope_guard([&] {
9190 mempool_thread.shutdown();
9191 _shutdown_cache();
9192 });
9193 // we need finisher and kv_{sync,finalize}_thread *just* for replay
9194 // enable in repair or deep mode modes only
9195 if (!read_only) {
9196 _kv_start();
9197 r = _deferred_replay();
9198 _kv_stop();
9199 }
eafe8130 9200
9201 if (r < 0) {
9202 return r;
9203 }
9204 return _fsck_on_open(depth, repair);
9205}
9206
9207int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
9208{
9209 uint64_t sb_hash_size = uint64_t(
9210 cct->_conf.get_val<Option::size_t>("osd_memory_target") *
9211 cct->_conf.get_val<double>(
9212 "bluestore_fsck_shared_blob_tracker_size"));
9213
9214 dout(1) << __func__
9215 << " <<<START>>>"
9216 << (repair ? " repair" : " check")
9217 << (depth == FSCK_DEEP ? " (deep)" :
9218 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9219 << " start sb_tracker_hash_size:" << sb_hash_size
9220 << dendl;
9221 int64_t errors = 0;
9222 int64_t warnings = 0;
9223 unsigned repaired = 0;
9224
9225 uint64_t_btree_t used_omap_head;
9226 uint64_t_btree_t used_sbids;
9227
f67539c2 9228 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130 9229 KeyValueDB::Iterator it;
39ae355f 9230 store_statfs_t expected_store_statfs;
9231 per_pool_statfs expected_pool_statfs;
9232
9233 sb_info_space_efficient_map_t sb_info;
9234 shared_blob_2hash_tracker_t sb_ref_counts(
9235 sb_hash_size,
9236 min_alloc_size);
9237 size_t sb_ref_mismatches = 0;
9238
9239 /// map of oid -> (first_)offset for each zone
9240 std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
9241
9242 uint64_t num_objects = 0;
9243 uint64_t num_extents = 0;
9244 uint64_t num_blobs = 0;
9245 uint64_t num_spanning_blobs = 0;
9246 uint64_t num_shared_blobs = 0;
9247 uint64_t num_sharded_objects = 0;
9248 BlueStoreRepairer repairer;
9249
9250 auto alloc_size = fm->get_alloc_size();
9251
9252 utime_t start = ceph_clock_now();
9253
9254 _fsck_collections(&errors);
b32b8144 9255 used_blocks.resize(fm->get_alloc_units());
9256
9257 if (bluefs) {
f67539c2 9258 interval_set<uint64_t> bluefs_extents;
11fdf7f2 9259
9260 bluefs->foreach_block_extents(
9261 bluefs_layout.shared_bdev,
9262 [&](uint64_t start, uint32_t len) {
9263 apply_for_bitset_range(start, len, alloc_size, used_blocks,
9264 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
9265 ceph_assert(pos < bs.size());
9266 bs.set(pos);
9267 }
9268 );
9269 }
9270 );
9271 }
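  // apply_for_bitset_range() invokes the lambda once per allocation-unit
  // bit in [offset, offset+length); here every AU owned by BlueFS on the
  // shared device is pre-marked as used, so the cross-checks below can
  // flag any object extent or freelist entry that collides with BlueFS
  // space.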
9272
9273 bluefs_used_blocks = used_blocks;
9274
9275 apply_for_bitset_range(
9276 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
9277 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9278 bs.set(pos);
7c673cae 9279 }
9280 );
9281
9282
9283 if (repair) {
b3b6e05e 9284 repairer.init_space_usage_tracker(
9285 bdev->get_size(),
9286 min_alloc_size);
9287 }
9288
9289 if (bluefs) {
eafe8130 9290 int r = bluefs->fsck();
7c673cae 9291 if (r < 0) {
eafe8130 9292 return r;
9293 }
9294 if (r > 0)
9295 errors += r;
9296 }
9297
9298 if (!per_pool_stat_collection) {
9299 const char *w;
9300 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
9301 w = "error";
9302 ++errors;
9303 } else {
9304 w = "warning";
9305 ++warnings;
9306 }
9307 derr << "fsck " << w << ": store not yet converted to per-pool stats"
9308 << dendl;
9309 }
f67539c2 9310 if (per_pool_omap != OMAP_PER_PG) {
9311 const char *w;
9312 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
9313 w = "error";
9314 ++errors;
9315 } else {
9316 w = "warning";
9317 ++warnings;
9318 }
f67539c2 9319 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9320 << dendl;
9321 }
9322
9323 if (g_conf()->bluestore_debug_fsck_abort) {
9324 dout(1) << __func__ << " debug abort" << dendl;
9325 goto out_scan;
9326 }
9327
9328#ifdef HAVE_LIBZBD
9329 if (bdev->is_smr()) {
9330 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9331 ceph_assert(a);
9332 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
9333 ceph_assert(f);
9334 vector<uint64_t> wp = bdev->get_zones();
9335 vector<zone_state_t> zones = f->get_zone_states(db);
9336 ceph_assert(wp.size() == zones.size());
9337 auto num_zones = bdev->get_size() / zone_size;
9338 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
9339 uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
9340 if (zones[i].write_pointer > p &&
9341 zones[i].num_dead_bytes < zones[i].write_pointer) {
9342 derr << "fsck error: zone 0x" << std::hex << i
9343 << " bluestore write pointer 0x" << zones[i].write_pointer
9344 << " > device write pointer 0x" << p
9345 << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
9346 << std::dec << dendl;
9347 ++errors;
9348 }
9349 }
9350
9351 if (depth != FSCK_SHALLOW) {
9352 // load zone refs
9353 zone_refs.resize(bdev->get_size() / zone_size);
9354 it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
9355 if (it) {
9356 for (it->lower_bound(string());
9357 it->valid();
9358 it->next()) {
9359 uint32_t zone = 0;
9360 uint64_t offset = 0;
9361 ghobject_t oid;
9362 string key = it->key();
9363 int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
9364 if (r < 0) {
9365 derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
9366 << dendl;
9367 if (repair) {
9368 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9369 }
9370 ++errors;
9371 continue;
9372 }
9373 dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
9374 << " -> " << std::dec << oid << dendl;
9375 if (zone_refs[zone].count(oid)) {
9376 derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
9377 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
9378 if (repair) {
9379 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9380 }
9381 ++errors;
9382 continue;
9383 }
9384 zone_refs[zone][oid] = offset;
9385 }
9386 }
9387 }
9388 }
9389#endif
9390
9391 dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
9392 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9393 if (it) {
9394 for (it->lower_bound(string()); it->valid(); it->next()) {
9395 string key = it->key();
9396 uint64_t sbid;
9397 if (get_key_shared_blob(key, &sbid) < 0) {
9398 // Failed to parse the key.
9399 // This is going to be handled at the second stage
9400 continue;
9401 }
9402 bluestore_shared_blob_t shared_blob(sbid);
9403 bufferlist bl = it->value();
9404 auto blp = bl.cbegin();
9405 try {
9406 decode(shared_blob, blp);
9407 }
9408 catch (ceph::buffer::error& e) {
9409 // this is going to be handled at the second stage
9410 continue;
9411 }
9412 dout(20) << __func__ << " " << shared_blob << dendl;
9413 auto& sbi = sb_info.add_maybe_stray(sbid);
9414
9415 // primarily to silence the 'unused' warning
9416 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
9417
9418 for (auto& r : shared_blob.ref_map.ref_map) {
9419 sb_ref_counts.inc_range(
9420 sbid,
9421 r.first,
9422 r.second.length,
9423 -r.second.refs);
9424 }
9425 }
9426 } // if (it) //checking shared_blobs (phase1)
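  // Accounting note: the loop above *decrements* sb_ref_counts by every
  // reference recorded in the persisted ref_maps, while the object walk
  // below *increments* it for each extent that actually points at a
  // shared blob. A tracker that nets out to zero means the on-disk
  // ref_maps match reality; count_non_zero() after the walk yields the
  // number of mismatching entries.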
9427
7c673cae 9428 // walk PREFIX_OBJ
9429 {
9430 dout(1) << __func__ << " walking object keyspace" << dendl;
9431 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
9432 BlueStore::FSCK_ObjectCtx ctx(
9433 errors,
9434 warnings,
9435 num_objects,
9436 num_extents,
9437 num_blobs,
9438 num_sharded_objects,
9439 num_spanning_blobs,
9440 &used_blocks,
9441 &used_omap_head,
20effc67 9442 &zone_refs,
9443 // no need for the below lock when in non-shallow mode as
9444 // there is no multithreading in this case
9445 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130 9446 sb_info,
20effc67 9447 sb_ref_counts,
9448 expected_store_statfs,
9449 expected_pool_statfs,
9450 repair ? &repairer : nullptr);
9451
9452 _fsck_check_objects(depth, ctx);
eafe8130 9453 }
11fdf7f2 9454
9455#ifdef HAVE_LIBZBD
9456 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
9457 dout(1) << __func__ << " checking for leaked zone refs" << dendl;
9458 for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
9459 for (auto& [oid, offset] : zone_refs[zone]) {
9460 derr << "fsck error: stray zone ref 0x" << std::hex << zone
9461 << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
9462 // FIXME: add repair
9463 ++errors;
9464 }
9465 }
9466 }
9467#endif
9468
9469 sb_ref_mismatches = sb_ref_counts.count_non_zero();
9470 if (sb_ref_mismatches != 0) {
9471 derr << "fsck error: " << sb_ref_mismatches
9472 << " shared blob references aren't matching, at least "
9473 << sb_ref_mismatches << " found" << dendl;
9474 errors += sb_ref_mismatches;
9475 }
9476
9477 if (depth != FSCK_SHALLOW && repair) {
9478 _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
9479 }
9480 dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
f67539c2 9481 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 9482 if (it) {
9483 // FIXME minor: perhaps simplify for shallow mode?
9484 // fill global if not overridden below
9485 auto expected_statfs = &expected_store_statfs;
9486 for (it->lower_bound(string()); it->valid(); it->next()) {
9487 string key = it->key();
9488 uint64_t sbid;
9489 if (get_key_shared_blob(key, &sbid)) {
3efd9988 9490 derr << "fsck error: bad key '" << key
20effc67 9491 << "' in shared blob namespace" << dendl;
9492 if (repair) {
9493 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9494 }
9495 ++errors;
9496 continue;
9497 }
9498 auto p = sb_info.find(sbid);
9499 if (p == sb_info.end()) {
9500 if (sb_ref_mismatches > 0) {
9501 // highly likely this has already been reported, ignoring...
9502 dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
9503 << std::hex << sbid << std::dec << dendl;
9504 } else {
9505 derr << "fsck error: found stray shared blob data for sbid 0x"
9506 << std::hex << sbid << std::dec << dendl;
9507 ++errors;
9508 if (repair) {
9509 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9510 }
11fdf7f2 9511 }
7c673cae
FG
9512 } else {
9513 ++num_shared_blobs;
20effc67 9514 sb_info_t& sbi = *p;
7c673cae
FG
9515 bluestore_shared_blob_t shared_blob(sbid);
9516 bufferlist bl = it->value();
9517 auto blp = bl.cbegin();
9518 try {
9519 decode(shared_blob, blp);
9520 }
9521 catch (ceph::buffer::error& e) {
7c673cae 9522 ++errors;
9523
9524 derr << "fsck error: failed to decode Shared Blob "
9525 << pretty_binary_string(key) << dendl;
9526 if (repair) {
9527 dout(20) << __func__ << " undecodable Shared Blob, key:'"
9528 << pretty_binary_string(key)
9529 << "', removing" << dendl;
9530 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9531 }
9532 continue;
7c673cae 9533 }
20effc67 9534 dout(20) << __func__ << " " << shared_blob << dendl;
7c673cae 9535 PExtentVector extents;
20effc67 9536 for (auto& r : shared_blob.ref_map.ref_map) {
9537 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
9538 }
9539 if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
9540 (per_pool_stat_collection || repair)) {
9541 expected_statfs = &expected_pool_statfs[sbi.pool_id];
9542 }
9543 std::stringstream ss;
9544 ss << "sbid 0x" << std::hex << sbid << std::dec;
9545 errors += _fsck_check_extents(ss.str(),
9546 extents,
9547 sbi.allocated_chunks < 0,
9548 used_blocks,
9549 fm->get_alloc_size(),
9550 repair ? &repairer : nullptr,
9551 *expected_statfs,
9552 depth);
9553 }
9554 }
20effc67 9555 } // if (it) /* checking shared_blobs (phase 2)*/
9556
9557 if (repair && repairer.preprocess_misreference(db)) {
9558
9559 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
9560 auto& misref_extents = repairer.get_misreferences();
9561 interval_set<uint64_t> to_release;
f67539c2 9562 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 9563 if (it) {
9564 // fill global if not overridden below
9565 auto expected_statfs = &expected_store_statfs;
9566
9567 CollectionRef c;
9568 spg_t pgid;
9569 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
9570 bool bypass_rest = false;
9571 for (it->lower_bound(string()); it->valid() && !bypass_rest;
9572 it->next()) {
9573 dout(30) << __func__ << " key "
9574 << pretty_binary_string(it->key()) << dendl;
9575 if (is_extent_shard_key(it->key())) {
9576 continue;
9577 }
9578
9579 ghobject_t oid;
9580 int r = get_key_object(it->key(), &oid);
b3b6e05e 9581 if (r < 0 || !repairer.is_used(oid)) {
9582 continue;
9583 }
9584
9585 if (!c ||
9586 oid.shard_id != pgid.shard ||
9587 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
9588 !c->contains(oid)) {
9589 c = nullptr;
9590 for (auto& p : coll_map) {
9591 if (p.second->contains(oid)) {
9592 c = p.second;
9593 break;
9594 }
9595 }
9596 if (!c) {
9597 continue;
9598 }
9599 if (per_pool_stat_collection || repair) {
9600 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
9601 expected_statfs = &expected_pool_statfs[pool_id];
9602 }
9603 }
b3b6e05e 9604 if (!repairer.is_used(c->cid)) {
9605 continue;
9606 }
9607
9608 dout(20) << __func__ << " check misreference for col:" << c->cid
9609 << " obj:" << oid << dendl;
9610
eafe8130 9611 OnodeRef o;
39ae355f 9612 o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
9613 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9614 mempool::bluestore_fsck::set<BlobRef> blobs;
9615
9616 for (auto& e : o->extent_map.extent_map) {
9617 blobs.insert(e.blob);
9618 }
9619 bool need_onode_update = false;
9620 bool first_dump = true;
9621 for(auto b : blobs) {
9622 bool broken_blob = false;
9623 auto& pextents = b->dirty_blob().dirty_extents();
9624 for (auto& e : pextents) {
9625 if (!e.is_valid()) {
9626 continue;
9627 }
9628 // for the sake of simplicity and proper shared blob handling
9629 // always rewrite the whole blob even when it's partially
9630 // misreferenced.
9631 if (misref_extents.intersects(e.offset, e.length)) {
9632 if (first_dump) {
9633 first_dump = false;
81eedcae 9634 _dump_onode<10>(cct, *o);
9635 }
9636 broken_blob = true;
9637 break;
9638 }
9639 }
9640 if (!broken_blob)
9641 continue;
9642 bool compressed = b->get_blob().is_compressed();
9643 need_onode_update = true;
9644 dout(10) << __func__
9645 << " fix misreferences in oid:" << oid
9646 << " " << *b << dendl;
9647 uint64_t b_off = 0;
9648 PExtentVector pext_to_release;
9649 pext_to_release.reserve(pextents.size());
9650 // rewriting all valid pextents
9651 for (auto e = pextents.begin(); e != pextents.end();
9652 e++) {
9653 auto b_off_cur = b_off;
9654 b_off += e->length;
9655 if (!e->is_valid()) {
9656 continue;
9657 }
9658 PExtentVector exts;
20effc67 9659 dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
f67539c2 9660 int64_t alloc_len =
20effc67 9661 alloc->allocate(e->length, min_alloc_size,
f67539c2 9662 0, 0, &exts);
eafe8130 9663 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
9664 derr << __func__
9665 << " failed to allocate 0x" << std::hex << e->length
eafe8130 9666 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 9667 << " min_alloc_size 0x" << min_alloc_size
20effc67 9668 << " available 0x " << alloc->get_free()
9669 << std::dec << dendl;
9670 if (alloc_len > 0) {
20effc67 9671 alloc->release(exts);
9672 }
9673 bypass_rest = true;
9674 break;
9675 }
9676 expected_statfs->allocated += e->length;
9677 if (compressed) {
9678 expected_statfs->data_compressed_allocated += e->length;
9679 }
9680
9681 bufferlist bl;
20effc67 9682 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9683 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
9684 if (r < 0) {
9685 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
9686 <<"~" << e->length << std::dec << dendl;
9687 ceph_abort_msg("read failed, wtf");
9688 }
9689 pext_to_release.push_back(*e);
9690 e = pextents.erase(e);
9691 e = pextents.insert(e, exts.begin(), exts.end());
9692 b->get_blob().map_bl(
20effc67 9693 b_off_cur, bl,
9694 [&](uint64_t offset, bufferlist& t) {
9695 int r = bdev->write(offset, t, false);
9696 ceph_assert(r == 0);
9697 });
9698 e += exts.size() - 1;
9699 for (auto& p : exts) {
9700 fm->allocate(p.offset, p.length, txn);
9701 }
9702 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
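          // Iterator bookkeeping above: the old pextent is erased and the
          // freshly allocated extents are spliced in at the same position,
          // then 'e' is advanced past them so the loop resumes at the next
          // original extent; b_off stays consistent because the blob's
          // logical length is unchanged by the rewrite.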
9703
9704 if (b->get_blob().is_shared()) {
9705 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
9706
9707 auto sbid = b->shared_blob->get_sbid();
9708 auto sb_it = sb_info.find(sbid);
11fdf7f2 9709 ceph_assert(sb_it != sb_info.end());
9710 sb_info_t& sbi = *sb_it;
9711
9712 if (sbi.allocated_chunks < 0) {
9713 // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
9714 // as we originally used that value while accumulating
9715 // expected_statfs
9716 expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9717 expected_statfs->data_compressed_allocated -=
9718 uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9719 } else {
9720 expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
11fdf7f2 9721 }
9722 sbi.allocated_chunks = 0;
9723 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
9724
9725 // relying on blob's pextents to decide what to release.
9726 for (auto& p : pext_to_release) {
9727 to_release.union_insert(p.offset, p.length);
9728 }
9729 } else {
9730 for (auto& p : pext_to_release) {
9731 expected_statfs->allocated -= p.length;
9732 if (compressed) {
9733 expected_statfs->data_compressed_allocated -= p.length;
9734 }
9735 to_release.union_insert(p.offset, p.length);
9736 }
9737 }
9738 if (bypass_rest) {
9739 break;
9740 }
9741 } // for(auto b : blobs)
9742 if (need_onode_update) {
9743 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
9744 _record_onode(o, txn);
9745 }
9746 } // for (it->lower_bound(string()); it->valid(); it->next())
9747
9748 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
9749 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9750 << "~" << it.get_len() << std::dec << dendl;
9751 fm->release(it.get_start(), it.get_len(), txn);
9752 }
20effc67 9753 alloc->release(to_release);
9754 to_release.clear();
9755 } // if (it) {
9756 } //if (repair && repairer.preprocess_misreference()) {
11fdf7f2 9757 sb_info.clear();
20effc67 9758 sb_ref_counts.reset();
11fdf7f2 9759
eafe8130 9760 dout(1) << __func__ << " checking pool_statfs" << dendl;
9761 _fsck_check_statfs(expected_store_statfs, expected_pool_statfs,
9762 errors, warnings, repair ? &repairer : nullptr);
eafe8130 9763 if (depth != FSCK_SHALLOW) {
9f95a23c 9764 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 9765 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9766 if (it) {
9f95a23c 9767 uint64_t last_omap_head = 0;
9768 for (it->lower_bound(string()); it->valid(); it->next()) {
9769 uint64_t omap_head;
f67539c2 9770
eafe8130 9771 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 9772
9f95a23c 9773 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9774 omap_head != last_omap_head) {
20effc67 9775 pair<string,string> rk = it->raw_key();
9776 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9777 << "fsck error: found stray omap data on omap_head "
9778 << omap_head << " " << last_omap_head
9779 << " prefix/key: " << url_escape(rk.first)
9780 << " " << url_escape(rk.second)
9781 << fsck_dendl;
9782 ++errors;
9783 last_omap_head = omap_head;
eafe8130 9784 }
9785 }
9786 }
f67539c2 9787 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9788 if (it) {
9f95a23c 9789 uint64_t last_omap_head = 0;
9790 for (it->lower_bound(string()); it->valid(); it->next()) {
9791 uint64_t omap_head;
9792 _key_decode_u64(it->key().c_str(), &omap_head);
9793 if (used_omap_head.count(omap_head) == 0 &&
9794 omap_head != last_omap_head) {
20effc67 9795 pair<string,string> rk = it->raw_key();
9796 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9797 << "fsck error: found stray (pgmeta) omap data on omap_head "
9798 << omap_head << " " << last_omap_head
9799 << " prefix/key: " << url_escape(rk.first)
9800 << " " << url_escape(rk.second)
9801 << fsck_dendl;
9f95a23c 9802 last_omap_head = omap_head;
9803 ++errors;
9804 }
9805 }
9806 }
f67539c2 9807 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9808 if (it) {
9809 uint64_t last_omap_head = 0;
9810 for (it->lower_bound(string()); it->valid(); it->next()) {
9811 uint64_t pool;
9812 uint64_t omap_head;
9813 string k = it->key();
9814 const char *c = k.c_str();
9815 c = _key_decode_u64(c, &pool);
9816 c = _key_decode_u64(c, &omap_head);
9817 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9818 omap_head != last_omap_head) {
20effc67 9819 pair<string,string> rk = it->raw_key();
9820 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9821 << "fsck error: found stray (per-pool) omap data on omap_head "
9822 << omap_head << " " << last_omap_head
9823 << " prefix/key: " << url_escape(rk.first)
9824 << " " << url_escape(rk.second)
9825 << fsck_dendl;
9f95a23c 9826 ++errors;
9827 last_omap_head = omap_head;
9828 }
9829 }
9830 }
9831 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9832 if (it) {
9833 uint64_t last_omap_head = 0;
9834 for (it->lower_bound(string()); it->valid(); it->next()) {
9835 uint64_t pool;
9836 uint32_t hash;
9837 uint64_t omap_head;
9838 string k = it->key();
9839 const char* c = k.c_str();
9840 c = _key_decode_u64(c, &pool);
9841 c = _key_decode_u32(c, &hash);
9842 c = _key_decode_u64(c, &omap_head);
9843 if (used_omap_head.count(omap_head) == 0 &&
9844 omap_head != last_omap_head) {
9845 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9846 << "fsck error: found stray (per-pg) omap data on omap_head "
20effc67 9847 << " key " << pretty_binary_string(it->key())
9848 << " " << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9849 ++errors;
9850 last_omap_head = omap_head;
9851 }
9852 }
9853 }
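    // The four scans above differ only in key layout: bulk and pgmeta omap
    // keys start with the u64 omap_head, per-pool keys prepend a u64 pool
    // id, and per-pg keys prepend a u64 pool id plus a u32 hash. In every
    // case the decoded head must appear in used_omap_head for the data to
    // be considered live.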
eafe8130 9854 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 9855 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
9856 if (it) {
9857 for (it->lower_bound(string()); it->valid(); it->next()) {
9858 bufferlist bl = it->value();
9859 auto p = bl.cbegin();
9860 bluestore_deferred_transaction_t wt;
9861 try {
9862 decode(wt, p);
f67539c2 9863 } catch (ceph::buffer::error& e) {
9864 derr << "fsck error: failed to decode deferred txn "
9865 << pretty_binary_string(it->key()) << dendl;
9866 if (repair) {
9867 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9868 << pretty_binary_string(it->key())
9869 << "', removing" << dendl;
9870 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9871 }
9872 continue;
9873 }
9874 dout(20) << __func__ << " deferred " << wt.seq
9875 << " ops " << wt.ops.size()
9876 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9877 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 9878 apply_for_bitset_range(
f67539c2 9879 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 9880 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9881 bs.set(pos);
9882 }
9883 );
9884 }
7c673cae 9885 }
9886 }
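    // Extents listed as released by still-pending deferred transactions
    // remain allocated on disk until the deferred writes replay, so they
    // are marked used here; otherwise the freelist cross-check below would
    // misreport them as leaked.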
9887
9888 // skip freelist vs allocated compare when we have Null fm
9889 if (!fm->is_null_manager()) {
9890 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9891#ifdef HAVE_LIBZBD
9892 if (freelist_type == "zoned") {
9893 // verify per-zone state
9894 // - verify no allocations beyond write pointer
9895 // - verify num_dead_bytes count (neither allocated nor
9896 // free space past the write pointer)
9897 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9898 auto num_zones = bdev->get_size() / zone_size;
9899
9900 // mark the free space past the write pointer
9901 for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
9902 auto wp = a->get_write_pointer(zone);
9903 uint64_t offset = zone_size * zone + wp;
9904 uint64_t length = zone_size - wp;
9905 if (!length) {
9906 continue;
9907 }
9908 bool intersects = false;
9909 dout(10) << " marking zone 0x" << std::hex << zone
9910 << " region after wp 0x" << offset << "~" << length
9911 << std::dec << dendl;
9912 apply_for_bitset_range(
9913 offset, length, alloc_size, used_blocks,
9914 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9915 if (bs.test(pos)) {
9916 derr << "fsck error: zone 0x" << std::hex << zone
9917 << " has used space at 0x" << pos * alloc_size
9918 << " beyond write pointer 0x" << wp
9919 << std::dec << dendl;
9920 intersects = true;
eafe8130 9921 } else {
20effc67 9922 bs.set(pos);
11fdf7f2 9923 }
9924 }
9925 );
9926 if (intersects) {
9927 ++errors;
9928 }
9929 }
9930
9931 used_blocks.flip();
9932
9933 // skip conventional zones
9934 uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
9935 pos = used_blocks.find_next(pos);
9936
9937 uint64_t zone_dead = 0;
9938 for (uint32_t zone = first_sequential_zone;
9939 zone < num_zones;
9940 ++zone, zone_dead = 0) {
9941 while (pos != decltype(used_blocks)::npos &&
9942 (pos * min_alloc_size) / zone_size == zone) {
9943 dout(40) << " zone 0x" << std::hex << zone
9944 << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
9945 << std::dec << dendl;
9946 zone_dead += min_alloc_size;
9947 pos = used_blocks.find_next(pos);
9948 }
9949 dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
9950 << std::dec << dendl;
9951 // cross-check dead bytes against zone state
9952 if (a->get_dead_bytes(zone) != zone_dead) {
9953 derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
9954 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
9955 << dendl;
9956 ++errors;
9957 // TODO: repair
9958 }
9959 }
9960 used_blocks.flip();
9961 } else
9962#endif
9963 {
9964 fm->enumerate_reset();
9965 uint64_t offset, length;
9966 while (fm->enumerate_next(db, &offset, &length)) {
9967 bool intersects = false;
9968 apply_for_bitset_range(
9969 offset, length, alloc_size, used_blocks,
9970 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9971 ceph_assert(pos < bs.size());
9972 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
9973 if (offset == SUPER_RESERVED &&
9974 length == min_alloc_size - SUPER_RESERVED) {
9975 // this is due to the change just after luminous to min_alloc_size
9976 // granularity allocations, and our baked in assumption at the top
9977 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9978 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9979 // since we will never allocate this region below min_alloc_size.
9980 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9981 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9982 << length << std::dec << dendl;
9983 } else {
9984 intersects = true;
9985 if (repair) {
9986 repairer.fix_false_free(db, fm,
9987 pos * min_alloc_size,
9988 min_alloc_size);
9989 }
9990 }
9991 } else {
9992 bs.set(pos);
eafe8130 9993 }
11fdf7f2 9994 }
9995 );
9996 if (intersects) {
9997 derr << "fsck error: free extent 0x" << std::hex << offset
9998 << "~" << length << std::dec
9999 << " intersects allocated blocks" << dendl;
10000 ++errors;
b5b8bbf5 10001 }
10002 }
10003 fm->enumerate_reset();
10004
10005 // check for leaked extents
10006 size_t count = used_blocks.count();
10007 if (used_blocks.size() != count) {
10008 ceph_assert(used_blocks.size() > count);
10009 used_blocks.flip();
10010 size_t start = used_blocks.find_first();
10011 while (start != decltype(used_blocks)::npos) {
10012 size_t cur = start;
10013 while (true) {
10014 size_t next = used_blocks.find_next(cur);
10015 if (next != cur + 1) {
10016 ++errors;
10017 derr << "fsck error: leaked extent 0x" << std::hex
10018 << ((uint64_t)start * fm->get_alloc_size()) << "~"
10019 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
10020 << dendl;
10021 if (repair) {
10022 repairer.fix_leaked(db,
10023 fm,
10024 start * min_alloc_size,
10025 (cur + 1 - start) * min_alloc_size);
10026 }
10027 start = next;
10028 break;
10029 }
10030 cur = next;
10031 }
10032 }
10033 used_blocks.flip();
10034 }
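      // The flip() bracketing above: once every legitimately used or
      // free-listed AU has been marked, flipping the bitset turns
      // "unaccounted for" into set bits, so find_first()/find_next()
      // enumerate leaked space as maximal runs; the second flip restores
      // the map for subsequent checks.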
b5b8bbf5 10035 }
10036 }
10037 }
11fdf7f2 10038 if (repair) {
10039 if (per_pool_omap != OMAP_PER_PG) {
10040 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
10041 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
10042 }
10043
10044 dout(5) << __func__ << " applying repair results" << dendl;
10045 repaired = repairer.apply(db);
10046 dout(5) << __func__ << " repair applied" << dendl;
10047 }
7c673cae 10048
eafe8130 10049out_scan:
10050 dout(2) << __func__ << " " << num_objects << " objects, "
10051 << num_sharded_objects << " of them sharded. "
10052 << dendl;
10053 dout(2) << __func__ << " " << num_extents << " extents to "
10054 << num_blobs << " blobs, "
10055 << num_spanning_blobs << " spanning, "
10056 << num_shared_blobs << " shared."
10057 << dendl;
10058
10059 utime_t duration = ceph_clock_now() - start;
10060 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
10061 << warnings << " warnings, "
10062 << repaired << " repaired, "
10063 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 10064 << duration << " seconds" << dendl;
10065
10066 // In non-repair mode we return only the error count, as
10067 // that alone indicates whether the store status is OK.
10068 // In repair mode both errors and warnings are taken into account
10069 // since the repaired counter relates to them both.
10070 return repair ? errors + warnings - (int)repaired : errors;
10071}
10072
10073/// methods to inject various errors fsck can repair
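// (Each injector below deliberately writes an inconsistent state through a
// raw KV transaction or allocator call; the matching _fsck_on_open() pass
// is expected to detect it and, with repair enabled, undo it. They exist
// for fsck testing, not for normal operation.)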
10074void BlueStore::inject_broken_shared_blob_key(const string& key,
10075 const bufferlist& bl)
10076{
10077 KeyValueDB::Transaction txn;
10078 txn = db->get_transaction();
10079 txn->set(PREFIX_SHARED_BLOB, key, bl);
10080 db->submit_transaction_sync(txn);
10081};
10082
10083void BlueStore::inject_no_shared_blob_key()
10084{
10085 KeyValueDB::Transaction txn;
10086 txn = db->get_transaction();
10087 ceph_assert(blobid_last > 0);
10088 // kill the last used sbid; this can be broken by blobid preallocation
10089 // in rare cases, but is left as-is for the sake of simplicity
10090 uint64_t sbid = blobid_last;
10091
10092 string key;
10093 dout(5) << __func__<< " " << sbid << dendl;
10094 get_shared_blob_key(sbid, &key);
10095 txn->rmkey(PREFIX_SHARED_BLOB, key);
10096 db->submit_transaction_sync(txn);
10097};
10098
20effc67 10099void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
10100{
10101 KeyValueDB::Transaction txn;
10102 txn = db->get_transaction();
10103
10104 dout(5) << __func__ << " " << sbid << dendl;
10105
10106 string key;
10107 get_shared_blob_key(sbid, &key);
10108 bluestore_shared_blob_t persistent(sbid);
39ae355f 10109 persistent.ref_map.get(0xdead0000, min_alloc_size);
10110 bufferlist bl;
10111 encode(persistent, bl);
10112 dout(20) << __func__ << " sbid " << sbid
10113 << " takes " << bl.length() << " bytes, updating"
10114 << dendl;
10115
10116 txn->set(PREFIX_SHARED_BLOB, key, bl);
10117 db->submit_transaction_sync(txn);
10118};
10119
10120
10121void BlueStore::inject_leaked(uint64_t len)
10122{
11fdf7f2 10123 PExtentVector exts;
20effc67 10124 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
11fdf7f2 10125 min_alloc_size * 256, 0, &exts);
10126
10127 if (fm->is_null_manager()) {
10128 return;
10129 }
10130
10131 KeyValueDB::Transaction txn;
10132 txn = db->get_transaction();
10133
10134 ceph_assert(alloc_len >= (int64_t)len);
10135 for (auto& p : exts) {
10136 fm->allocate(p.offset, p.length, txn);
10137 }
10138 db->submit_transaction_sync(txn);
10139}
10140
10141void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
10142{
10143 ceph_assert(!fm->is_null_manager());
10144
10145 KeyValueDB::Transaction txn;
10146 OnodeRef o;
10147 CollectionRef c = _get_collection(cid);
10148 ceph_assert(c);
10149 {
9f95a23c 10150 std::unique_lock l{c->lock}; // just to avoid internal asserts
10151 o = c->get_onode(oid, false);
10152 ceph_assert(o);
10153 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10154 }
10155
10156 bool injected = false;
10157 txn = db->get_transaction();
10158 auto& em = o->extent_map.extent_map;
10159 std::vector<const PExtentVector*> v;
10160 if (em.size()) {
10161 v.push_back(&em.begin()->blob->get_blob().get_extents());
10162 }
10163 if (em.size() > 1) {
10164 auto it = em.end();
10165 --it;
10166 v.push_back(&(it->blob->get_blob().get_extents()));
10167 }
10168 for (auto pext : v) {
10169 if (pext->size()) {
10170 auto p = pext->begin();
10171 while (p != pext->end()) {
10172 if (p->is_valid()) {
10173 dout(20) << __func__ << " release 0x" << std::hex << p->offset
10174 << "~" << p->length << std::dec << dendl;
10175 fm->release(p->offset, p->length, txn);
10176 injected = true;
10177 break;
10178 }
10179 ++p;
10180 }
10181 }
10182 }
10183 ceph_assert(injected);
10184 db->submit_transaction_sync(txn);
10185}
10186
10187void BlueStore::inject_legacy_omap()
10188{
10189 dout(1) << __func__ << dendl;
f67539c2 10190 per_pool_omap = OMAP_BULK;
10191 KeyValueDB::Transaction txn;
10192 txn = db->get_transaction();
10193 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
10194 db->submit_transaction_sync(txn);
10195}
10196
10197void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
10198{
10199 dout(1) << __func__ << " "
10200 << cid << " " << oid
10201 <<dendl;
10202 KeyValueDB::Transaction txn;
10203 OnodeRef o;
10204 CollectionRef c = _get_collection(cid);
10205 ceph_assert(c);
10206 {
10207 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10208 o = c->get_onode(oid, false);
10209 ceph_assert(o);
10210 }
10211 o->onode.clear_flag(
10212 bluestore_onode_t::FLAG_PERPG_OMAP |
10213 bluestore_onode_t::FLAG_PERPOOL_OMAP |
10214 bluestore_onode_t::FLAG_PGMETA_OMAP);
10215 txn = db->get_transaction();
10216 _record_onode(o, txn);
10217 db->submit_transaction_sync(txn);
10218}
10219
10220void BlueStore::inject_stray_omap(uint64_t head, const string& name)
10221{
10222 dout(1) << __func__ << dendl;
10223 KeyValueDB::Transaction txn = db->get_transaction();
10224
10225 string key;
10226 bufferlist bl;
10227 _key_encode_u64(head, &key);
10228 key.append(name);
10229 txn->set(PREFIX_OMAP, key, bl);
10230
10231 db->submit_transaction_sync(txn);
10232}
9f95a23c 10233
10234void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
10235{
10236 BlueStoreRepairer repairer;
10237 repairer.fix_statfs(db, key, new_statfs);
10238 repairer.apply(db);
10239}
10240
10241void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
10242{
10243 KeyValueDB::Transaction t = db->get_transaction();
10244 volatile_statfs v;
10245 v = new_statfs;
10246 bufferlist bl;
10247 v.encode(bl);
10248 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
10249 db->submit_transaction_sync(t);
10250}
10251
10252void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
10253 coll_t cid2, ghobject_t oid2,
10254 uint64_t offset)
10255{
10256 OnodeRef o1;
10257 CollectionRef c1 = _get_collection(cid1);
10258 ceph_assert(c1);
10259 {
9f95a23c 10260 std::unique_lock l{c1->lock}; // just to avoid internal asserts
10261 o1 = c1->get_onode(oid1, false);
10262 ceph_assert(o1);
10263 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10264 }
10265 OnodeRef o2;
10266 CollectionRef c2 = _get_collection(cid2);
10267 ceph_assert(c2);
10268 {
9f95a23c 10269 std::unique_lock l{c2->lock}; // just to avoid internal asserts
10270 o2 = c2->get_onode(oid2, false);
10271 ceph_assert(o2);
10272 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10273 }
10274 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
10275 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
10276
10277 // require onode/extent layout to be the same (and simple)
10278 // to make things easier
10279 ceph_assert(o1->onode.extent_map_shards.empty());
10280 ceph_assert(o2->onode.extent_map_shards.empty());
10281 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
10282 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
10283 ceph_assert(e1.logical_offset == e2.logical_offset);
10284 ceph_assert(e1.length == e2.length);
10285 ceph_assert(e1.blob_offset == e2.blob_offset);
10286
10287 KeyValueDB::Transaction txn;
10288 txn = db->get_transaction();
10289
10290	  // along with the misreference error this will also create space leak errors
10291 e2.blob->dirty_blob() = e1.blob->get_blob();
10292 o2->extent_map.dirty_range(offset, e2.length);
10293 o2->extent_map.update(txn, false);
10294
10295 _record_onode(o2, txn);
10296 db->submit_transaction_sync(txn);
7c673cae
FG
10297}
10298
adb31ebb
TL
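// Error-injection helper (for fsck/testing): attach an empty spanning
// blob with the given id to the onode so it is persisted without any
// extent referencing it (a "zombie" spanning blob).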
10299void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
10300 int16_t blob_id)
10301{
10302 OnodeRef o;
10303 CollectionRef c = _get_collection(cid);
10304 ceph_assert(c);
10305 {
10306 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10307 o = c->get_onode(oid, false);
10308 ceph_assert(o);
10309 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10310 }
10311
10312 BlobRef b = c->new_blob();
10313 b->id = blob_id;
10314 o->extent_map.spanning_blob_map[blob_id] = b;
10315
10316 KeyValueDB::Transaction txn;
10317 txn = db->get_transaction();
10318
10319 _record_onode(o, txn);
10320 db->submit_transaction_sync(txn);
10321}
10322
a4b75251
TL
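// Test helper: create (or overwrite) a BlueFS file dir/name of
// new_size bytes, filled with '0' characters.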
10323void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
10324{
10325 ceph_assert(bluefs);
10326
10327 BlueFS::FileWriter* p_handle = nullptr;
10328 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
10329 ceph_assert(ret == 0);
10330
10331	  std::string s(new_size, '0');
10332 bufferlist bl;
10333 bl.append(s);
10334 p_handle->append(bl);
10335
10336 bluefs->fsync(p_handle);
10337 bluefs->close_writer(p_handle);
10338}
10339
7c673cae
FG
10340void BlueStore::collect_metadata(map<string,string> *pm)
10341{
10342 dout(10) << __func__ << dendl;
10343 bdev->collect_metadata("bluestore_bdev_", pm);
10344 if (bluefs) {
10345 (*pm)["bluefs"] = "1";
9f95a23c
TL
10346 // this value is for backward compatibility only
10347 (*pm)["bluefs_single_shared_device"] = \
10348 stringify((int)bluefs_layout.single_shared_device());
10349 (*pm)["bluefs_dedicated_db"] = \
10350 stringify((int)bluefs_layout.dedicated_db);
10351 (*pm)["bluefs_dedicated_wal"] = \
10352 stringify((int)bluefs_layout.dedicated_wal);
10353 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
10354 } else {
10355 (*pm)["bluefs"] = "0";
10356 }
11fdf7f2
TL
10357
10358 // report numa mapping for underlying devices
10359 int node = -1;
10360 set<int> nodes;
10361 set<string> failed;
10362 int r = get_numa_node(&node, &nodes, &failed);
10363 if (r >= 0) {
10364 if (!failed.empty()) {
10365 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
10366 }
10367 if (!nodes.empty()) {
10368 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
10369 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
10370 }
10371 if (node >= 0) {
10372 (*pm)["objectstore_numa_node"] = stringify(node);
10373 }
10374 }
05a536ef 10375 (*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size);
11fdf7f2
TL
10376}
10377
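// Report the numa node backing the store's devices. *final_node is set
// only when every device maps to the same node; devices whose node
// cannot be detected are returned via *out_failed. Always returns 0.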
10378int BlueStore::get_numa_node(
10379 int *final_node,
10380 set<int> *out_nodes,
10381 set<string> *out_failed)
10382{
10383 int node = -1;
10384 set<string> devices;
10385 get_devices(&devices);
10386 set<int> nodes;
10387 set<string> failed;
10388 for (auto& devname : devices) {
10389 int n;
10390 BlkDev bdev(devname);
10391 int r = bdev.get_numa_node(&n);
10392 if (r < 0) {
10393 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
10394 << dendl;
10395 failed.insert(devname);
10396 continue;
10397 }
10398 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
10399 << dendl;
10400 nodes.insert(n);
10401 if (node < 0) {
10402 node = n;
10403 }
10404 }
10405 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
10406 *final_node = node;
10407 }
10408 if (out_nodes) {
10409 *out_nodes = nodes;
10410 }
10411 if (out_failed) {
10412 *out_failed = failed;
10413 }
10414 return 0;
10415}
10416
1d09f67e
TL
10417void BlueStore::prepare_for_fast_shutdown()
10418{
10419 m_fast_shutdown = true;
10420}
10421
11fdf7f2
TL
10422int BlueStore::get_devices(set<string> *ls)
10423{
10424 if (bdev) {
10425 bdev->get_devices(ls);
10426 if (bluefs) {
10427 bluefs->get_devices(ls);
10428 }
10429 return 0;
10430 }
20effc67 10431
11fdf7f2 10432 // grumble, we haven't started up yet.
20effc67
TL
10433 if (int r = _open_path(); r < 0) {
10434 return r;
10435 }
10436 auto close_path = make_scope_guard([&] {
10437 _close_path();
10438 });
10439 if (int r = _open_fsid(false); r < 0) {
10440 return r;
10441 }
10442 auto close_fsid = make_scope_guard([&] {
10443 _close_fsid();
10444 });
10445 if (int r = _read_fsid(&fsid); r < 0) {
10446 return r;
10447 }
10448 if (int r = _lock_fsid(); r < 0) {
10449 return r;
10450 }
10451 if (int r = _open_bdev(false); r < 0) {
10452 return r;
10453 }
10454 auto close_bdev = make_scope_guard([&] {
10455 _close_bdev();
10456 });
10457 if (int r = _minimal_open_bluefs(false); r < 0) {
10458 return r;
10459 }
11fdf7f2
TL
10460 bdev->get_devices(ls);
10461 if (bluefs) {
10462 bluefs->get_devices(ls);
10463 }
11fdf7f2 10464 _minimal_close_bluefs();
20effc67 10465 return 0;
7c673cae
FG
10466}
10467
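// Fill *buf with store-wide usage: omap space is estimated from the
// active omap prefix, non-omap BlueFS usage is reported as internal
// metadata, and free space is clamped to the physical availability
// when the block device exposes extended (ebd) state.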
11fdf7f2 10468void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
10469{
10470 buf->reset();
11fdf7f2 10471
f67539c2
TL
10472 auto prefix = per_pool_omap == OMAP_BULK ?
10473 PREFIX_OMAP :
10474 per_pool_omap == OMAP_PER_POOL ?
10475 PREFIX_PERPOOL_OMAP :
10476 PREFIX_PERPG_OMAP;
9f95a23c 10477 buf->omap_allocated =
f67539c2 10478 db->estimate_prefix_size(prefix, string());
11fdf7f2 10479
20effc67 10480 uint64_t bfree = alloc->get_free();
7c673cae
FG
10481
10482 if (bluefs) {
f67539c2 10483 buf->internally_reserved = 0;
11fdf7f2 10484 // include dedicated db, too, if that isn't the shared device.
9f95a23c 10485 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 10486 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 10487 }
11fdf7f2
TL
10488 // call any non-omap bluefs space "internal metadata"
10489 buf->internal_metadata =
f67539c2 10490 bluefs->get_used()
11fdf7f2 10491 - buf->omap_allocated;
7c673cae
FG
10492 }
10493
1e59de90
TL
10494 ExtBlkDevState ebd_state;
10495 int rc = bdev->get_ebd_state(ebd_state);
10496 if (rc == 0) {
10497 buf->total += ebd_state.get_physical_total();
11fdf7f2
TL
10498
10499 // we are limited by both the size of the virtual device and the
10500 // underlying physical device.
1e59de90 10501 bfree = std::min(bfree, ebd_state.get_physical_avail());
11fdf7f2 10502
1e59de90 10503    buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();
11fdf7f2
TL
10504 } else {
10505 buf->total += bdev->get_size();
10506 }
10507 buf->available = bfree;
10508}
10509
10510int BlueStore::statfs(struct store_statfs_t *buf,
10511 osd_alert_list_t* alerts)
10512{
10513 if (alerts) {
10514 alerts->clear();
10515 _log_alerts(*alerts);
10516 }
10517 _get_statfs_overall(buf);
31f18b77 10518 {
11fdf7f2 10519 std::lock_guard l(vstatfs_lock);
31f18b77 10520 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
10521 buf->data_stored = vstatfs.stored();
10522 buf->data_compressed = vstatfs.compressed();
10523 buf->data_compressed_original = vstatfs.compressed_original();
10524 buf->data_compressed_allocated = vstatfs.compressed_allocated();
10525 }
10526
10527 dout(20) << __func__ << " " << *buf << dendl;
10528 return 0;
10529}
10530
9f95a23c
TL
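// Per-pool statfs; only supported once per-pool stat collection is
// enabled (-ENOTSUP in legacy mode). Omap usage is estimated from the
// pool's key prefix while the db is still open.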
10531int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
10532 bool *out_per_pool_omap)
11fdf7f2
TL
10533{
10534	  dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 10535
11fdf7f2
TL
10536 if (!per_pool_stat_collection) {
10537 dout(20) << __func__ << " not supported in legacy mode " << dendl;
10538 return -ENOTSUP;
7c673cae 10539 }
11fdf7f2 10540 buf->reset();
7c673cae 10541
11fdf7f2
TL
10542 {
10543 std::lock_guard l(vstatfs_lock);
10544 osd_pools[pool_id].publish(buf);
10545 }
9f95a23c
TL
10546
10547 string key_prefix;
10548 _key_encode_u64(pool_id, &key_prefix);
f67539c2 10549 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
1d09f67e
TL
10550	  // guard against calls after the db has been closed
10551 if (*out_per_pool_omap && db) {
f67539c2
TL
10552 auto prefix = per_pool_omap == OMAP_PER_POOL ?
10553 PREFIX_PERPOOL_OMAP :
10554 PREFIX_PERPG_OMAP;
10555 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
10556 }
9f95a23c 10557
11fdf7f2 10558  dout(10) << __func__ << " " << *buf << dendl;
7c673cae
FG
10559 return 0;
10560}
10561
81eedcae
TL
10562void BlueStore::_check_legacy_statfs_alert()
10563{
10564 string s;
10565 if (!per_pool_stat_collection &&
eafe8130 10566 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
10567 s = "legacy statfs reporting detected, "
10568 "suggest to run store repair to get consistent statistic reports";
10569 }
10570 std::lock_guard l(qlock);
10571 legacy_statfs_alert = s;
10572}
10573
f67539c2 10574void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9f95a23c 10575{
f67539c2
TL
10576 string per_pg, per_pool;
10577 if (per_pool_omap != OMAP_PER_PG) {
10578 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
10579 per_pg = "legacy (not per-pg) omap detected, "
10580 "suggest to run store repair to benefit from faster PG removal";
10581 }
10582 if (per_pool_omap != OMAP_PER_POOL) {
10583 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
10584 per_pool = "legacy (not per-pool) omap detected, "
10585 "suggest to run store repair to benefit from per-pool omap usage statistics";
10586 }
10587 }
9f95a23c
TL
10588 }
10589 std::lock_guard l(qlock);
f67539c2
TL
10590 no_per_pg_omap_alert = per_pg;
10591 no_per_pool_omap_alert = per_pool;
9f95a23c
TL
10592}
10593
7c673cae
FG
10594// ---------------
10595// cache
10596
10597BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
10598{
9f95a23c 10599 std::shared_lock l(coll_lock);
7c673cae
FG
10600 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
10601 if (cp == coll_map.end())
10602 return CollectionRef();
10603 return cp->second;
10604}
10605
20effc67
TL
10606BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
10607{
10608 std::shared_lock l(coll_lock);
10609
10610 // FIXME: we must replace this with something more efficient
10611
10612 for (auto& i : coll_map) {
10613 spg_t spgid;
10614 if (i.first.is_pg(&spgid) &&
10615 i.second->contains(oid)) {
10616 return i.second;
10617 }
10618 }
10619 return CollectionRef();
10620}
10621
7c673cae
FG
10622void BlueStore::_queue_reap_collection(CollectionRef& c)
10623{
10624 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
10625	  // _reap_collections and this run in the same thread,
10626	  // so no lock is needed.
7c673cae
FG
10627 removed_collections.push_back(c);
10628}
10629
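// Reap collections queued by _queue_reap_collection: a collection is
// dropped only once none of its onodes are still flushing; otherwise
// it is requeued for the next pass.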
10630void BlueStore::_reap_collections()
10631{
94b18763 10632
7c673cae
FG
10633 list<CollectionRef> removed_colls;
10634 {
94b18763
FG
10635	    // _queue_reap_collection and this run in the same thread,
10636	    // so no lock is needed.
10637 if (!removed_collections.empty())
10638 removed_colls.swap(removed_collections);
10639 else
10640 return;
7c673cae
FG
10641 }
10642
94b18763
FG
10643 list<CollectionRef>::iterator p = removed_colls.begin();
10644 while (p != removed_colls.end()) {
7c673cae
FG
10645 CollectionRef c = *p;
10646 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
39ae355f 10647 if (c->onode_space.map_any([&](Onode* o) {
11fdf7f2 10648 ceph_assert(!o->exists);
7c673cae
FG
10649 if (o->flushing_count.load()) {
10650 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
10651 << " flush_txns " << o->flushing_count << dendl;
94b18763 10652 return true;
7c673cae 10653 }
94b18763 10654 return false;
7c673cae 10655 })) {
94b18763 10656 ++p;
7c673cae
FG
10657 continue;
10658 }
39ae355f 10659 c->onode_space.clear();
94b18763 10660 p = removed_colls.erase(p);
7c673cae
FG
10661 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
10662 }
94b18763 10663 if (removed_colls.empty()) {
7c673cae 10664 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
10665 } else {
10666 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
10667 }
10668}
10669
1e59de90 10670void BlueStore::_update_logger()
7c673cae
FG
10671{
10672 uint64_t num_onodes = 0;
9f95a23c 10673 uint64_t num_pinned_onodes = 0;
7c673cae
FG
10674 uint64_t num_extents = 0;
10675 uint64_t num_blobs = 0;
10676 uint64_t num_buffers = 0;
10677 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
10678 for (auto c : onode_cache_shards) {
10679 c->add_stats(&num_onodes, &num_pinned_onodes);
10680 }
10681 for (auto c : buffer_cache_shards) {
10682 c->add_stats(&num_extents, &num_blobs,
10683 &num_buffers, &num_buffer_bytes);
7c673cae
FG
10684 }
10685 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 10686 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
10687 logger->set(l_bluestore_extents, num_extents);
10688 logger->set(l_bluestore_blobs, num_blobs);
10689 logger->set(l_bluestore_buffers, num_buffers);
10690 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
10691}
10692
10693// ---------------
10694// read operations
10695
10696ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
10697{
10698 return _get_collection(cid);
10699}
10700
11fdf7f2
TL
10701ObjectStore::CollectionHandle BlueStore::create_new_collection(
10702 const coll_t& cid)
7c673cae 10703{
9f95a23c
TL
10704 std::unique_lock l{coll_lock};
10705 auto c = ceph::make_ref<Collection>(
11fdf7f2 10706 this,
9f95a23c
TL
10707 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
10708 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
10709 cid);
10710 new_coll_map[cid] = c;
9f95a23c 10711 _osr_attach(c.get());
11fdf7f2
TL
10712 return c;
10713}
10714
10715void BlueStore::set_collection_commit_queue(
10716 const coll_t& cid,
10717 ContextQueue *commit_queue)
10718{
10719 if (commit_queue) {
9f95a23c 10720 std::shared_lock l(coll_lock);
11fdf7f2
TL
10721 if (coll_map.count(cid)) {
10722 coll_map[cid]->commit_queue = commit_queue;
10723 } else if (new_coll_map.count(cid)) {
10724 new_coll_map[cid]->commit_queue = commit_queue;
10725 }
10726 }
7c673cae
FG
10727}
10728
11fdf7f2 10729
7c673cae
FG
10730bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
10731{
10732 Collection *c = static_cast<Collection *>(c_.get());
10733 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
10734 if (!c->exists)
10735 return false;
10736
10737 bool r = true;
10738
10739 {
9f95a23c 10740 std::shared_lock l(c->lock);
7c673cae
FG
10741 OnodeRef o = c->get_onode(oid, false);
10742 if (!o || !o->exists)
10743 r = false;
10744 }
10745
7c673cae
FG
10746 return r;
10747}
10748
7c673cae
FG
10749int BlueStore::stat(
10750 CollectionHandle &c_,
10751 const ghobject_t& oid,
10752 struct stat *st,
10753 bool allow_eio)
10754{
10755 Collection *c = static_cast<Collection *>(c_.get());
10756 if (!c->exists)
10757 return -ENOENT;
10758 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10759
10760 {
9f95a23c 10761 std::shared_lock l(c->lock);
7c673cae
FG
10762 OnodeRef o = c->get_onode(oid, false);
10763 if (!o || !o->exists)
10764 return -ENOENT;
10765 st->st_size = o->onode.size;
10766 st->st_blksize = 4096;
10767 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
10768 st->st_nlink = 1;
10769 }
10770
7c673cae
FG
10771 int r = 0;
10772 if (_debug_mdata_eio(oid)) {
10773 r = -EIO;
10774 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10775 }
10776 return r;
10777}
10778int BlueStore::set_collection_opts(
11fdf7f2 10779 CollectionHandle& ch,
7c673cae
FG
10780 const pool_opts_t& opts)
10781{
7c673cae 10782 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 10783 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
10784 if (!c->exists)
10785 return -ENOENT;
9f95a23c 10786 std::unique_lock l{c->lock};
7c673cae
FG
10787 c->pool_opts = opts;
10788 return 0;
10789}
10790
7c673cae
FG
10791int BlueStore::read(
10792 CollectionHandle &c_,
10793 const ghobject_t& oid,
10794 uint64_t offset,
10795 size_t length,
10796 bufferlist& bl,
224ce89b 10797 uint32_t op_flags)
7c673cae 10798{
11fdf7f2 10799 auto start = mono_clock::now();
7c673cae
FG
10800 Collection *c = static_cast<Collection *>(c_.get());
10801 const coll_t &cid = c->get_cid();
10802 dout(15) << __func__ << " " << cid << " " << oid
10803 << " 0x" << std::hex << offset << "~" << length << std::dec
10804 << dendl;
10805 if (!c->exists)
10806 return -ENOENT;
10807
10808 bl.clear();
10809 int r;
10810 {
9f95a23c 10811 std::shared_lock l(c->lock);
11fdf7f2 10812 auto start1 = mono_clock::now();
7c673cae 10813 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
10814 log_latency("get_onode@read",
10815 l_bluestore_read_onode_meta_lat,
10816 mono_clock::now() - start1,
10817 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10818 if (!o || !o->exists) {
10819 r = -ENOENT;
10820 goto out;
10821 }
10822
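    // offset == length == 0 is shorthand for "read the whole object"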
10823 if (offset == length && offset == 0)
10824 length = o->onode.size;
10825
10826 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
10827 if (r == -EIO) {
10828 logger->inc(l_bluestore_read_eio);
10829 }
7c673cae
FG
10830 }
10831
10832 out:
28e407b8 10833 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
10834 r = -EIO;
10835 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
10836 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10837 cct->_conf->bluestore_debug_random_read_err &&
10838 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10839 100.0)) == 0) {
224ce89b
WB
10840 dout(0) << __func__ << ": inject random EIO" << dendl;
10841 r = -EIO;
7c673cae
FG
10842 }
10843 dout(10) << __func__ << " " << cid << " " << oid
10844 << " 0x" << std::hex << offset << "~" << length << std::dec
10845 << " = " << r << dendl;
494da23a
TL
10846 log_latency(__func__,
10847 l_bluestore_read_lat,
10848 mono_clock::now() - start,
10849 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10850 return r;
10851}
10852
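// Split the logical range [offset, offset+length) into data already in
// the buffer cache (moved into ready_regions) and uncached pieces,
// which are rounded out to chunk boundaries and queued per blob in
// blobs2read for the aio pass.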
9f95a23c 10853void BlueStore::_read_cache(
39ae355f 10854 OnodeRef& o,
7c673cae
FG
10855 uint64_t offset,
10856 size_t length,
9f95a23c
TL
10857 int read_cache_policy,
10858 ready_regions_t& ready_regions,
10859 blobs2read_t& blobs2read)
7c673cae 10860{
7c673cae 10861  // build a blob-wise list of the data to read (that isn't cached)
7c673cae
FG
10862 unsigned left = length;
10863 uint64_t pos = offset;
7c673cae
FG
10864 auto lp = o->extent_map.seek_lextent(offset);
10865 while (left > 0 && lp != o->extent_map.extent_map.end()) {
10866 if (pos < lp->logical_offset) {
10867 unsigned hole = lp->logical_offset - pos;
10868 if (hole >= left) {
9f95a23c 10869 break;
7c673cae
FG
10870 }
10871 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 10872 << std::dec << dendl;
7c673cae
FG
10873 pos += hole;
10874 left -= hole;
10875 }
94b18763 10876 BlobRef& bptr = lp->blob;
7c673cae
FG
10877 unsigned l_off = pos - lp->logical_offset;
10878 unsigned b_off = l_off + lp->blob_offset;
10879 unsigned b_len = std::min(left, lp->length - l_off);
10880
10881 ready_regions_t cache_res;
10882 interval_set<uint32_t> cache_interval;
10883 bptr->shared_blob->bc.read(
91327a77
AA
10884 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
10885 read_cache_policy);
7c673cae 10886 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
10887 << " need 0x" << b_off << "~" << b_len
10888 << " cache has 0x" << cache_interval
10889 << std::dec << dendl;
7c673cae
FG
10890
10891 auto pc = cache_res.begin();
11fdf7f2 10892 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
10893 while (b_len > 0) {
10894 unsigned l;
10895 if (pc != cache_res.end() &&
9f95a23c
TL
10896 pc->first == b_off) {
10897 l = pc->second.length();
f67539c2 10898 ready_regions[pos] = std::move(pc->second);
9f95a23c
TL
10899 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
10900 << b_off << "~" << l << std::dec << dendl;
10901 ++pc;
7c673cae 10902 } else {
9f95a23c
TL
10903 l = b_len;
10904 if (pc != cache_res.end()) {
10905 ceph_assert(pc->first > b_off);
10906 l = pc->first - b_off;
10907 }
10908 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
10909 << b_off << "~" << l << std::dec << dendl;
10910 // merge regions
10911 {
10912 uint64_t r_off = b_off;
10913 uint64_t r_len = l;
10914 uint64_t front = r_off % chunk_size;
10915 if (front) {
10916 r_off -= front;
10917 r_len += front;
10918 }
10919 unsigned tail = r_len % chunk_size;
10920 if (tail) {
10921 r_len += chunk_size - tail;
10922 }
10923 bool merged = false;
10924 regions2read_t& r2r = blobs2read[bptr];
10925 if (r2r.size()) {
10926 read_req_t& pre = r2r.back();
10927 if (r_off <= (pre.r_off + pre.r_len)) {
10928 front += (r_off - pre.r_off);
10929 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
10930 pre.regs.emplace_back(region_t(pos, b_off, l, front));
10931 merged = true;
10932 }
10933 }
10934 if (!merged) {
10935 read_req_t req(r_off, r_len);
10936 req.regs.emplace_back(region_t(pos, b_off, l, front));
10937 r2r.emplace_back(std::move(req));
10938 }
10939 }
7c673cae
FG
10940 }
10941 pos += l;
10942 b_off += l;
10943 left -= l;
10944 b_len -= l;
10945 }
10946 ++lp;
10947 }
9f95a23c 10948}
7c673cae 10949
9f95a23c
TL
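// Queue aio reads for everything _read_cache left in blobs2read:
// compressed blobs are read in full into compressed_blob_bls, others
// piecewise into each request's bufferlist. Only -EIO is propagated;
// any other aio error asserts.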
10950int BlueStore::_prepare_read_ioc(
10951 blobs2read_t& blobs2read,
10952 vector<bufferlist>* compressed_blob_bls,
10953 IOContext* ioc)
10954{
7c673cae 10955 for (auto& p : blobs2read) {
94b18763 10956 const BlobRef& bptr = p.first;
11fdf7f2 10957 regions2read_t& r2r = p.second;
20effc67
TL
10958 dout(20) << __func__ << " blob " << *bptr << " need "
10959 << r2r << dendl;
7c673cae
FG
10960 if (bptr->get_blob().is_compressed()) {
10961 // read the whole thing
9f95a23c
TL
10962 if (compressed_blob_bls->empty()) {
10963 // ensure we avoid any reallocation on subsequent blobs
10964 compressed_blob_bls->reserve(blobs2read.size());
10965 }
10966 compressed_blob_bls->push_back(bufferlist());
10967 bufferlist& bl = compressed_blob_bls->back();
10968 auto r = bptr->get_blob().map(
10969 0, bptr->get_blob().get_ondisk_length(),
10970 [&](uint64_t offset, uint64_t length) {
10971 int r = bdev->aio_read(offset, length, &bl, ioc);
10972 if (r < 0)
7c673cae
FG
10973 return r;
10974 return 0;
9f95a23c 10975 });
b32b8144
FG
10976 if (r < 0) {
10977 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10978 if (r == -EIO) {
10979 // propagate EIO to caller
10980 return r;
10981 }
11fdf7f2 10982 ceph_assert(r == 0);
b32b8144 10983 }
7c673cae
FG
10984 } else {
10985 // read the pieces
11fdf7f2 10986 for (auto& req : r2r) {
9f95a23c
TL
10987 dout(20) << __func__ << " region 0x" << std::hex
10988 << req.regs.front().logical_offset
10989 << ": 0x" << req.regs.front().blob_xoffset
10990 << " reading 0x" << req.r_off
10991 << "~" << req.r_len << std::dec
10992 << dendl;
7c673cae 10993
9f95a23c
TL
10994 // read it
10995 auto r = bptr->get_blob().map(
10996 req.r_off, req.r_len,
10997 [&](uint64_t offset, uint64_t length) {
10998 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10999 if (r < 0)
7c673cae
FG
11000 return r;
11001 return 0;
9f95a23c 11002 });
b32b8144
FG
11003 if (r < 0) {
11004 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
11005 << dendl;
11006 if (r == -EIO) {
11007 // propagate EIO to caller
11008 return r;
11009 }
11fdf7f2 11010 ceph_assert(r == 0);
b32b8144 11011 }
9f95a23c 11012 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
11013 }
11014 }
11015 }
9f95a23c
TL
11016 return 0;
11017}
11fdf7f2 11018
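// Verify checksums, decompress where needed, optionally warm the
// buffer cache, then assemble ready_regions into bl, zero-filling any
// holes, so that bl is exactly `length` bytes.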
9f95a23c 11019int BlueStore::_generate_read_result_bl(
39ae355f 11020 OnodeRef& o,
9f95a23c
TL
11021 uint64_t offset,
11022 size_t length,
11023 ready_regions_t& ready_regions,
11024 vector<bufferlist>& compressed_blob_bls,
11025 blobs2read_t& blobs2read,
11026 bool buffered,
11027 bool* csum_error,
11028 bufferlist& bl)
11029{
11030 // enumerate and decompress desired blobs
7c673cae
FG
11031 auto p = compressed_blob_bls.begin();
11032 blobs2read_t::iterator b2r_it = blobs2read.begin();
11033 while (b2r_it != blobs2read.end()) {
94b18763 11034 const BlobRef& bptr = b2r_it->first;
11fdf7f2 11035 regions2read_t& r2r = b2r_it->second;
20effc67
TL
11036 dout(20) << __func__ << " blob " << *bptr << " need "
11037 << r2r << dendl;
7c673cae 11038 if (bptr->get_blob().is_compressed()) {
11fdf7f2 11039 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
11040 bufferlist& compressed_bl = *p++;
11041 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
11042 r2r.front().regs.front().logical_offset) < 0) {
11043 *csum_error = true;
11044 return -EIO;
7c673cae
FG
11045 }
11046 bufferlist raw_bl;
9f95a23c 11047 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 11048 if (r < 0)
9f95a23c 11049 return r;
7c673cae 11050 if (buffered) {
9f95a23c
TL
11051 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
11052 raw_bl);
7c673cae 11053 }
11fdf7f2
TL
11054 for (auto& req : r2r) {
11055 for (auto& r : req.regs) {
11056 ready_regions[r.logical_offset].substr_of(
11057 raw_bl, r.blob_xoffset, r.length);
11058 }
7c673cae
FG
11059 }
11060 } else {
11fdf7f2 11061 for (auto& req : r2r) {
9f95a23c
TL
11062 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
11063 req.regs.front().logical_offset) < 0) {
11064 *csum_error = true;
11065 return -EIO;
11066 }
11067 if (buffered) {
11068 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
11069 req.r_off, req.bl);
11070 }
7c673cae 11071
9f95a23c
TL
11072 // prune and keep result
11073 for (const auto& r : req.regs) {
11074 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 11075 }
7c673cae
FG
11076 }
11077 }
11078 ++b2r_it;
11079 }
11080
11081 // generate a resulting buffer
11082 auto pr = ready_regions.begin();
11083 auto pr_end = ready_regions.end();
9f95a23c 11084 uint64_t pos = 0;
7c673cae
FG
11085 while (pos < length) {
11086 if (pr != pr_end && pr->first == pos + offset) {
11087 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
11088 << ": data from 0x" << pr->first << "~" << pr->second.length()
11089 << std::dec << dendl;
7c673cae
FG
11090 pos += pr->second.length();
11091 bl.claim_append(pr->second);
11092 ++pr;
11093 } else {
11094 uint64_t l = length - pos;
11095 if (pr != pr_end) {
11fdf7f2 11096 ceph_assert(pr->first > pos + offset);
9f95a23c 11097 l = pr->first - (pos + offset);
7c673cae
FG
11098 }
11099 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
11100 << ": zeros for 0x" << (pos + offset) << "~" << l
11101 << std::dec << dendl;
7c673cae
FG
11102 bl.append_zero(l);
11103 pos += l;
11104 }
11105 }
11fdf7f2
TL
11106 ceph_assert(bl.length() == length);
11107 ceph_assert(pos == length);
11108 ceph_assert(pr == pr_end);
9f95a23c
TL
11109 return 0;
11110}
11111
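// Core read path: fault in extent metadata, satisfy what we can from
// cache, aio-read the rest, and assemble the result. On a checksum
// error the whole read is retried up to bluestore_retry_disk_reads
// times (see the spurious-read-error note below).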
11112int BlueStore::_do_read(
11113 Collection *c,
39ae355f 11114 OnodeRef& o,
9f95a23c
TL
11115 uint64_t offset,
11116 size_t length,
11117 bufferlist& bl,
11118 uint32_t op_flags,
11119 uint64_t retry_count)
11120{
11121 FUNCTRACE(cct);
11122 int r = 0;
11123 int read_cache_policy = 0; // do not bypass clean or dirty cache
11124
11125 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11126 << " size 0x" << o->onode.size << " (" << std::dec
11127 << o->onode.size << ")" << dendl;
11128 bl.clear();
11129
11130 if (offset >= o->onode.size) {
11131 return r;
11132 }
11133
11134 // generally, don't buffer anything, unless the client explicitly requests
11135 // it.
11136 bool buffered = false;
11137 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11138 dout(20) << __func__ << " will do buffered read" << dendl;
11139 buffered = true;
11140 } else if (cct->_conf->bluestore_default_buffered_read &&
11141 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11142 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11143 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11144 buffered = true;
11145 }
11146
11147 if (offset + length > o->onode.size) {
11148 length = o->onode.size - offset;
11149 }
11150
11151 auto start = mono_clock::now();
11152 o->extent_map.fault_range(db, offset, length);
11153 log_latency(__func__,
11154 l_bluestore_read_onode_meta_lat,
11155 mono_clock::now() - start,
11156 cct->_conf->bluestore_log_op_age);
11157 _dump_onode<30>(cct, *o);
11158
11159 // for deep-scrub, we only read dirty cache and bypass clean cache in
11160 // order to read underlying block device in case there are silent disk errors.
11161 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
11162 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
11163 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
11164 }
11165
11166	  // build a blob-wise list of the data to read (that isn't cached)
11167 ready_regions_t ready_regions;
11168 blobs2read_t blobs2read;
11169 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
11170
11171
11172 // read raw blob data.
11173 start = mono_clock::now(); // for the sake of simplicity
11174 // measure the whole block below.
11175	                              // the added measurement error is negligible.
11176 vector<bufferlist> compressed_blob_bls;
20effc67 11177 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9f95a23c
TL
11178 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
11179 // we always issue aio for reading, so errors other than EIO are not allowed
11180 if (r < 0)
11181 return r;
11182
f67539c2 11183 int64_t num_ios = blobs2read.size();
9f95a23c 11184 if (ioc.has_pending_aios()) {
f67539c2 11185 num_ios = ioc.get_num_ios();
9f95a23c
TL
11186 bdev->aio_submit(&ioc);
11187 dout(20) << __func__ << " waiting for aio" << dendl;
11188 ioc.aio_wait();
11189 r = ioc.get_return_value();
11190 if (r < 0) {
11191 ceph_assert(r == -EIO); // no other errors allowed
11192 return -EIO;
11193 }
11194 }
11195 log_latency_fn(__func__,
11196 l_bluestore_read_wait_aio_lat,
11197 mono_clock::now() - start,
11198 cct->_conf->bluestore_log_op_age,
11199 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11200 );
11201
11202 bool csum_error = false;
11203 r = _generate_read_result_bl(o, offset, length, ready_regions,
11204 compressed_blob_bls, blobs2read,
20effc67
TL
11205 buffered && !ioc.skip_cache(),
11206 &csum_error, bl);
9f95a23c
TL
11207 if (csum_error) {
11208 // Handles spurious read errors caused by a kernel bug.
11209 // We sometimes get all-zero pages as a result of the read under
11210 // high memory pressure. Retrying the failing read succeeds in most
11211 // cases.
11212 // See also: http://tracker.ceph.com/issues/22464
11213 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11214 return -EIO;
11215 }
11216 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
11217 }
7c673cae 11218 r = bl.length();
f64942e4
AA
11219 if (retry_count) {
11220 logger->inc(l_bluestore_reads_with_retries);
11221 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
11222 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
f67539c2
TL
11223 stringstream s;
11224 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
11225 _set_spurious_read_errors_alert(s.str());
f64942e4 11226 }
7c673cae
FG
11227 return r;
11228}
11229
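// Verify blob checksums covering bl at blob_xoffset; on a mismatch,
// log the bad chunk's device location and logical extent. Returns 0
// unconditionally when bluestore_ignore_data_csum is set.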
11230int BlueStore::_verify_csum(OnodeRef& o,
11231 const bluestore_blob_t* blob, uint64_t blob_xoffset,
11232 const bufferlist& bl,
11233 uint64_t logical_offset) const
11234{
11235 int bad;
11236 uint64_t bad_csum;
11fdf7f2 11237 auto start = mono_clock::now();
7c673cae 11238 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
11239 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
11240 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
11241	    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
11242 bad = blob_xoffset;
11243 r = -1;
11244 bad_csum = 0xDEADBEEF;
11245 }
7c673cae
FG
11246 if (r < 0) {
11247 if (r == -1) {
11248 PExtentVector pex;
11249 blob->map(
11250 bad,
11251 blob->get_csum_chunk_size(),
11252 [&](uint64_t offset, uint64_t length) {
11253 pex.emplace_back(bluestore_pextent_t(offset, length));
11254 return 0;
11255 });
11256 derr << __func__ << " bad "
11257 << Checksummer::get_csum_type_string(blob->csum_type)
11258 << "/0x" << std::hex << blob->get_csum_chunk_size()
11259 << " checksum at blob offset 0x" << bad
11260 << ", got 0x" << bad_csum << ", expected 0x"
11261 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
11262 << ", device location " << pex
11263 << ", logical extent 0x" << std::hex
11264 << (logical_offset + bad - blob_xoffset) << "~"
11265 << blob->get_csum_chunk_size() << std::dec
11266 << ", object " << o->oid
11267 << dendl;
11268 } else {
11269 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
11270 }
11271 }
494da23a
TL
11272 log_latency(__func__,
11273 l_bluestore_csum_lat,
11274 mono_clock::now() - start,
11275 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
11276 if (cct->_conf->bluestore_ignore_data_csum) {
11277 return 0;
11278 }
7c673cae
FG
11279 return r;
11280}
11281
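// Decode the compression header from `source` and decompress the
// payload with a matching compressor; a missing compressor plugin or a
// decompression failure is reported as -EIO.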
11282int BlueStore::_decompress(bufferlist& source, bufferlist* result)
11283{
11284 int r = 0;
11fdf7f2
TL
11285 auto start = mono_clock::now();
11286 auto i = source.cbegin();
7c673cae 11287 bluestore_compression_header_t chdr;
11fdf7f2 11288 decode(chdr, i);
7c673cae
FG
11289 int alg = int(chdr.type);
11290 CompressorRef cp = compressor;
11291 if (!cp || (int)cp->get_type() != alg) {
11292 cp = Compressor::create(cct, alg);
11293 }
11294
11295 if (!cp.get()) {
11296 // if compressor isn't available - error, because cannot return
11297 // decompressed data?
11fdf7f2
TL
11298
11299 const char* alg_name = Compressor::get_comp_alg_name(alg);
11300 derr << __func__ << " can't load decompressor " << alg_name << dendl;
11301 _set_compression_alert(false, alg_name);
7c673cae
FG
11302 r = -EIO;
11303 } else {
f67539c2 11304 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
7c673cae
FG
11305 if (r < 0) {
11306 derr << __func__ << " decompression failed with exit code " << r << dendl;
11307 r = -EIO;
11308 }
11309 }
494da23a
TL
11310 log_latency(__func__,
11311 l_bluestore_decompress_lat,
11312 mono_clock::now() - start,
11313 cct->_conf->bluestore_log_op_age);
7c673cae
FG
11314 return r;
11315}
11316
11317// this stores fiemap into interval_set, other variations
11318// use it internally
11319int BlueStore::_fiemap(
11320 CollectionHandle &c_,
11321 const ghobject_t& oid,
11322 uint64_t offset,
11323 size_t length,
11324 interval_set<uint64_t>& destset)
11325{
11326 Collection *c = static_cast<Collection *>(c_.get());
11327 if (!c->exists)
11328 return -ENOENT;
11329 {
9f95a23c 11330 std::shared_lock l(c->lock);
7c673cae
FG
11331
11332 OnodeRef o = c->get_onode(oid, false);
11333 if (!o || !o->exists) {
11334 return -ENOENT;
11335 }
81eedcae 11336 _dump_onode<30>(cct, *o);
7c673cae
FG
11337
11338 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11339 << " size 0x" << o->onode.size << std::dec << dendl;
11340
11341 boost::intrusive::set<Extent>::iterator ep, eend;
11342 if (offset >= o->onode.size)
11343 goto out;
11344
11345 if (offset + length > o->onode.size) {
11346 length = o->onode.size - offset;
11347 }
11348
11349 o->extent_map.fault_range(db, offset, length);
11350 eend = o->extent_map.extent_map.end();
11351 ep = o->extent_map.seek_lextent(offset);
11352 while (length > 0) {
11353 dout(20) << __func__ << " offset " << offset << dendl;
11354 if (ep != eend && ep->logical_offset + ep->length <= offset) {
11355 ++ep;
11356 continue;
11357 }
11358
11359 uint64_t x_len = length;
11360 if (ep != eend && ep->logical_offset <= offset) {
11361 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 11362 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
11363 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
11364 << x_len << std::dec << " blob " << ep->blob << dendl;
11365 destset.insert(offset, x_len);
11366 length -= x_len;
11367 offset += x_len;
11368 if (x_off + x_len == ep->length)
11369 ++ep;
11370 continue;
11371 }
11372 if (ep != eend &&
11373 ep->logical_offset > offset &&
11374 ep->logical_offset - offset < x_len) {
11375 x_len = ep->logical_offset - offset;
11376 }
11377 offset += x_len;
11378 length -= x_len;
11379 }
11380 }
9f95a23c
TL
11381
11382 out:
11383 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11384 << " size = 0x(" << destset << ")" << std::dec << dendl;
11385 return 0;
11386}
11387
11388int BlueStore::fiemap(
11389 CollectionHandle &c_,
11390 const ghobject_t& oid,
11391 uint64_t offset,
11392 size_t length,
11393 bufferlist& bl)
11394{
11395 interval_set<uint64_t> m;
11396 int r = _fiemap(c_, oid, offset, length, m);
11397 if (r >= 0) {
11398 encode(m, bl);
11399 }
11400 return r;
11401}
11402
11403int BlueStore::fiemap(
11404 CollectionHandle &c_,
11405 const ghobject_t& oid,
11406 uint64_t offset,
11407 size_t length,
11408 map<uint64_t, uint64_t>& destmap)
11409{
11410 interval_set<uint64_t> m;
11411 int r = _fiemap(c_, oid, offset, length, m);
11412 if (r >= 0) {
11413 destmap = std::move(m).detach();
11414 }
11415 return r;
11416}
11417
11418int BlueStore::readv(
11419 CollectionHandle &c_,
11420 const ghobject_t& oid,
11421 interval_set<uint64_t>& m,
11422 bufferlist& bl,
11423 uint32_t op_flags)
11424{
11425 auto start = mono_clock::now();
11426 Collection *c = static_cast<Collection *>(c_.get());
11427 const coll_t &cid = c->get_cid();
11428 dout(15) << __func__ << " " << cid << " " << oid
11429 << " fiemap " << m
11430 << dendl;
11431 if (!c->exists)
11432 return -ENOENT;
11433
11434 bl.clear();
11435 int r;
11436 {
11437 std::shared_lock l(c->lock);
11438 auto start1 = mono_clock::now();
11439 OnodeRef o = c->get_onode(oid, false);
11440 log_latency("get_onode@read",
11441 l_bluestore_read_onode_meta_lat,
11442 mono_clock::now() - start1,
11443 cct->_conf->bluestore_log_op_age);
11444 if (!o || !o->exists) {
11445 r = -ENOENT;
11446 goto out;
11447 }
11448
11449 if (m.empty()) {
11450 r = 0;
11451 goto out;
11452 }
11453
11454 r = _do_readv(c, o, m, bl, op_flags);
11455 if (r == -EIO) {
11456 logger->inc(l_bluestore_read_eio);
11457 }
11458 }
11459
11460 out:
11461 if (r >= 0 && _debug_data_eio(oid)) {
11462 r = -EIO;
11463 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11464 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
11465 cct->_conf->bluestore_debug_random_read_err &&
11466 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
11467 100.0)) == 0) {
11468 dout(0) << __func__ << ": inject random EIO" << dendl;
11469 r = -EIO;
11470 }
11471 dout(10) << __func__ << " " << cid << " " << oid
11472 << " fiemap " << m << std::dec
11473 << " = " << r << dendl;
11474 log_latency(__func__,
11475 l_bluestore_read_lat,
11476 mono_clock::now() - start,
11477 cct->_conf->bluestore_log_op_age);
11478 return r;
11479}
11480
11481int BlueStore::_do_readv(
11482 Collection *c,
39ae355f 11483 OnodeRef& o,
9f95a23c
TL
11484 const interval_set<uint64_t>& m,
11485 bufferlist& bl,
11486 uint32_t op_flags,
11487 uint64_t retry_count)
11488{
11489 FUNCTRACE(cct);
11490 int r = 0;
11491 int read_cache_policy = 0; // do not bypass clean or dirty cache
11492
11493 dout(20) << __func__ << " fiemap " << m << std::hex
11494 << " size 0x" << o->onode.size << " (" << std::dec
11495 << o->onode.size << ")" << dendl;
11496
11497 // generally, don't buffer anything, unless the client explicitly requests
11498 // it.
11499 bool buffered = false;
11500 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11501 dout(20) << __func__ << " will do buffered read" << dendl;
11502 buffered = true;
11503 } else if (cct->_conf->bluestore_default_buffered_read &&
11504 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11505 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11506 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11507 buffered = true;
11508 }
11509 // this method must be idempotent since we may call it several times
11510 // before we finally read the expected result.
11511 bl.clear();
11512
11513 // call fiemap first!
11514 ceph_assert(m.range_start() <= o->onode.size);
11515 ceph_assert(m.range_end() <= o->onode.size);
11516 auto start = mono_clock::now();
11517 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
11518 log_latency(__func__,
11519 l_bluestore_read_onode_meta_lat,
11520 mono_clock::now() - start,
11521 cct->_conf->bluestore_log_op_age);
11522 _dump_onode<30>(cct, *o);
11523
20effc67 11524 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9f95a23c
TL
11525 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
11526 raw_results.reserve(m.num_intervals());
11527 int i = 0;
11528 for (auto p = m.begin(); p != m.end(); p++, i++) {
11529 raw_results.push_back({});
11530 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
11531 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
11532 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
11533 // we always issue aio for reading, so errors other than EIO are not allowed
11534 if (r < 0)
11535 return r;
11536 }
11537
11538 auto num_ios = m.size();
11539 if (ioc.has_pending_aios()) {
11540 num_ios = ioc.get_num_ios();
11541 bdev->aio_submit(&ioc);
11542 dout(20) << __func__ << " waiting for aio" << dendl;
11543 ioc.aio_wait();
11544 r = ioc.get_return_value();
11545 if (r < 0) {
11546 ceph_assert(r == -EIO); // no other errors allowed
11547 return -EIO;
11548 }
11549 }
11550 log_latency_fn(__func__,
11551 l_bluestore_read_wait_aio_lat,
11552 mono_clock::now() - start,
11553 cct->_conf->bluestore_log_op_age,
11554 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11555 );
11556
11557 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
11558 i = 0;
11559 for (auto p = m.begin(); p != m.end(); p++, i++) {
11560 bool csum_error = false;
11561 bufferlist t;
11562 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
11563 std::get<0>(raw_results[i]),
11564 std::get<1>(raw_results[i]),
11565 std::get<2>(raw_results[i]),
11566 buffered, &csum_error, t);
11567 if (csum_error) {
11568 // Handles spurious read errors caused by a kernel bug.
11569 // We sometimes get all-zero pages as a result of the read under
11570 // high memory pressure. Retrying the failing read succeeds in most
11571 // cases.
11572 // See also: http://tracker.ceph.com/issues/22464
11573 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11574 return -EIO;
11575 }
11576 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
11577 }
11578 bl.claim_append(t);
11579 }
11580 if (retry_count) {
11581 logger->inc(l_bluestore_reads_with_retries);
11582 dout(5) << __func__ << " read fiemap " << m
11583 << " failed " << retry_count << " times before succeeding"
11584 << dendl;
11585 }
11586 return bl.length();
7c673cae
FG
11587}
11588
9f95a23c 11589int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 11590 const ghobject_t& oid,
9f95a23c
TL
11591 const string& section_name,
11592 Formatter *f)
7c673cae 11593{
9f95a23c
TL
11594 Collection *c = static_cast<Collection *>(c_.get());
11595 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11596 if (!c->exists)
11597 return -ENOENT;
7c673cae 11598
9f95a23c
TL
11599 int r;
11600 {
11601 std::shared_lock l(c->lock);
11602
11603 OnodeRef o = c->get_onode(oid, false);
11604 if (!o || !o->exists) {
11605 r = -ENOENT;
11606 goto out;
11607 }
11608 // FIXME minor: actually the next line isn't enough to
11609	    // load shared blobs. Leaving as-is for now.
11610 //
11611 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
11612
11613 _dump_onode<0>(cct, *o);
11614 f->open_object_section(section_name.c_str());
11615 o->dump(f);
11616 f->close_section();
11617 r = 0;
7c673cae 11618 }
9f95a23c
TL
11619 out:
11620 dout(10) << __func__ << " " << c->cid << " " << oid
11621 << " = " << r << dendl;
7c673cae
FG
11622 return r;
11623}
11624
7c673cae
FG
11625int BlueStore::getattr(
11626 CollectionHandle &c_,
11627 const ghobject_t& oid,
11628 const char *name,
11629 bufferptr& value)
11630{
11631 Collection *c = static_cast<Collection *>(c_.get());
11632 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
11633 if (!c->exists)
11634 return -ENOENT;
11635
11636 int r;
11637 {
9f95a23c 11638 std::shared_lock l(c->lock);
f91f0fd5 11639 mempool::bluestore_cache_meta::string k(name);
7c673cae
FG
11640
11641 OnodeRef o = c->get_onode(oid, false);
11642 if (!o || !o->exists) {
11643 r = -ENOENT;
11644 goto out;
11645 }
11646
11647 if (!o->onode.attrs.count(k)) {
11648 r = -ENODATA;
11649 goto out;
11650 }
11651 value = o->onode.attrs[k];
11652 r = 0;
11653 }
11654 out:
7c673cae
FG
11655 if (r == 0 && _debug_mdata_eio(oid)) {
11656 r = -EIO;
11657 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11658 }
11659 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
11660 << " = " << r << dendl;
11661 return r;
11662}
11663
7c673cae
FG
11664int BlueStore::getattrs(
11665 CollectionHandle &c_,
11666 const ghobject_t& oid,
20effc67 11667 map<string,bufferptr,less<>>& aset)
7c673cae
FG
11668{
11669 Collection *c = static_cast<Collection *>(c_.get());
11670 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11671 if (!c->exists)
11672 return -ENOENT;
11673
11674 int r;
11675 {
9f95a23c 11676 std::shared_lock l(c->lock);
7c673cae
FG
11677
11678 OnodeRef o = c->get_onode(oid, false);
11679 if (!o || !o->exists) {
11680 r = -ENOENT;
11681 goto out;
11682 }
11683 for (auto& i : o->onode.attrs) {
11684 aset.emplace(i.first.c_str(), i.second);
11685 }
11686 r = 0;
11687 }
11688
11689 out:
7c673cae
FG
11690 if (r == 0 && _debug_mdata_eio(oid)) {
11691 r = -EIO;
11692 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11693 }
11694 dout(10) << __func__ << " " << c->cid << " " << oid
11695 << " = " << r << dendl;
11696 return r;
11697}
11698
11699int BlueStore::list_collections(vector<coll_t>& ls)
11700{
9f95a23c 11701 std::shared_lock l(coll_lock);
11fdf7f2 11702 ls.reserve(coll_map.size());
7c673cae
FG
11703 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
11704 p != coll_map.end();
11705 ++p)
11706 ls.push_back(p->first);
11707 return 0;
11708}
11709
11710bool BlueStore::collection_exists(const coll_t& c)
11711{
9f95a23c 11712 std::shared_lock l(coll_lock);
7c673cae
FG
11713 return coll_map.count(c);
11714}
11715
11fdf7f2 11716int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 11717{
11fdf7f2 11718 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
11719 vector<ghobject_t> ls;
11720 ghobject_t next;
11fdf7f2 11721 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
11722 &ls, &next);
11723 if (r < 0) {
11724 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
11725 << dendl;
11726 return r;
11727 }
11728 *empty = ls.empty();
11fdf7f2 11729 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
11730 return 0;
11731}
11732
11fdf7f2 11733int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 11734{
11fdf7f2
TL
11735 dout(15) << __func__ << " " << ch->cid << dendl;
11736 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 11737 std::shared_lock l(c->lock);
11fdf7f2 11738 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
11739 return c->cnode.bits;
11740}
11741
7c673cae
FG
11742int BlueStore::collection_list(
11743 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11744 vector<ghobject_t> *ls, ghobject_t *pnext)
11745{
11746 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 11747 c->flush();
7c673cae
FG
11748 dout(15) << __func__ << " " << c->cid
11749 << " start " << start << " end " << end << " max " << max << dendl;
11750 int r;
11751 {
9f95a23c 11752 std::shared_lock l(c->lock);
f91f0fd5
TL
11753 r = _collection_list(c, start, end, max, false, ls, pnext);
11754 }
11755
11756 dout(10) << __func__ << " " << c->cid
11757 << " start " << start << " end " << end << " max " << max
11758 << " = " << r << ", ls.size() = " << ls->size()
11759 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11760 return r;
11761}
11762
11763int BlueStore::collection_list_legacy(
11764 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11765 vector<ghobject_t> *ls, ghobject_t *pnext)
11766{
11767 Collection *c = static_cast<Collection *>(c_.get());
11768 c->flush();
11769 dout(15) << __func__ << " " << c->cid
11770 << " start " << start << " end " << end << " max " << max << dendl;
11771 int r;
11772 {
11773 std::shared_lock l(c->lock);
11774 r = _collection_list(c, start, end, max, true, ls, pnext);
7c673cae
FG
11775 }
11776
7c673cae
FG
11777 dout(10) << __func__ << " " << c->cid
11778 << " start " << start << " end " << end << " max " << max
11779 << " = " << r << ", ls.size() = " << ls->size()
11780 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11781 return r;
11782}
11783
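// Shared implementation behind collection_list{,_legacy}: iterate
// PREFIX_OBJ over the collection's temp namespace first and then the
// main one, stopping at `end`, the collection range boundary, or after
// `max` entries; *pnext is set to the resume point, or to
// ghobject_t::get_max() when the listing is complete.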
11784int BlueStore::_collection_list(
11785 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
f91f0fd5 11786 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
7c673cae
FG
11787{
11788
11789 if (!c->exists)
11790 return -ENOENT;
11791
7c673cae 11792 ghobject_t static_next;
f91f0fd5
TL
11793 std::unique_ptr<CollectionListIterator> it;
11794 ghobject_t coll_range_temp_start, coll_range_temp_end;
11795 ghobject_t coll_range_start, coll_range_end;
f91f0fd5 11796 ghobject_t pend;
7c673cae
FG
11797 bool temp;
11798
11799 if (!pnext)
11800 pnext = &static_next;
11801
a4b75251
TL
11802 auto log_latency = make_scope_guard(
11803 [&, start_time = mono_clock::now(), func_name = __func__] {
11804 log_latency_fn(
11805 func_name,
aee94f69 11806 l_bluestore_clist_lat,
a4b75251
TL
11807 mono_clock::now() - start_time,
11808 cct->_conf->bluestore_log_collection_list_age,
11809 [&](const ceph::timespan& lat) {
11810 ostringstream ostr;
11811 ostr << ", lat = " << timespan_str(lat)
11812 << " cid =" << c->cid
11813 << " start " << start << " end " << end
11814 << " max " << max;
11815 return ostr.str();
11816 });
11817 });
11818
11fdf7f2 11819 if (start.is_max() || start.hobj.is_max()) {
a4b75251
TL
11820 *pnext = ghobject_t::get_max();
11821 return 0;
7c673cae 11822 }
f91f0fd5 11823 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
a4b75251 11824 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
7c673cae 11825 dout(20) << __func__
f91f0fd5
TL
11826 << " range " << coll_range_temp_start
11827 << " to " << coll_range_temp_end
11828 << " and " << coll_range_start
11829 << " to " << coll_range_end
7c673cae 11830 << " start " << start << dendl;
f91f0fd5
TL
11831 if (legacy) {
11832 it = std::make_unique<SimpleCollectionListIterator>(
11833 cct, db->get_iterator(PREFIX_OBJ));
11834 } else {
11835 it = std::make_unique<SortedCollectionListIterator>(
11836 db->get_iterator(PREFIX_OBJ));
11837 }
7c673cae
FG
11838 if (start == ghobject_t() ||
11839 start.hobj == hobject_t() ||
11840 start == c->cid.get_min_hobj()) {
f91f0fd5 11841 it->upper_bound(coll_range_temp_start);
7c673cae
FG
11842 temp = true;
11843 } else {
7c673cae
FG
11844 if (start.hobj.is_temp()) {
11845 temp = true;
f91f0fd5 11846 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
7c673cae
FG
11847 } else {
11848 temp = false;
f91f0fd5 11849 ceph_assert(start >= coll_range_start && start < coll_range_end);
7c673cae 11850 }
f91f0fd5
TL
11851 dout(20) << __func__ << " temp=" << (int)temp << dendl;
11852 it->lower_bound(start);
7c673cae
FG
11853 }
11854 if (end.hobj.is_max()) {
f91f0fd5 11855 pend = temp ? coll_range_temp_end : coll_range_end;
7c673cae 11856 } else {
7c673cae 11857 if (end.hobj.is_temp()) {
a4b75251 11858 if (temp) {
f91f0fd5 11859 pend = end;
a4b75251
TL
11860 } else {
11861 *pnext = ghobject_t::get_max();
11862 return 0;
11863 }
7c673cae 11864 } else {
f91f0fd5 11865 pend = temp ? coll_range_temp_end : end;
7c673cae
FG
11866 }
11867 }
f91f0fd5 11868 dout(20) << __func__ << " pend " << pend << dendl;
7c673cae 11869 while (true) {
adb31ebb 11870 if (!it->valid() || it->is_ge(pend)) {
7c673cae
FG
11871 if (!it->valid())
11872 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
11873 else
f91f0fd5 11874 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
7c673cae
FG
11875 if (temp) {
11876 if (end.hobj.is_temp()) {
adb31ebb 11877 if (it->valid() && it->is_lt(coll_range_temp_end)) {
f91f0fd5 11878 *pnext = it->oid();
a4b75251 11879 return 0;
f91f0fd5 11880 }
7c673cae
FG
11881 break;
11882 }
11883 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
11884 temp = false;
f91f0fd5
TL
11885 it->upper_bound(coll_range_start);
11886 if (end.hobj.is_max())
11887 pend = coll_range_end;
11888 else
11889 pend = end;
11890 dout(30) << __func__ << " pend " << pend << dendl;
7c673cae
FG
11891 continue;
11892 }
adb31ebb 11893 if (it->valid() && it->is_lt(coll_range_end)) {
f91f0fd5 11894 *pnext = it->oid();
a4b75251 11895 return 0;
f91f0fd5 11896 }
7c673cae
FG
11897 break;
11898 }
f91f0fd5 11899 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
7c673cae
FG
11900 if (ls->size() >= (unsigned)max) {
11901 dout(20) << __func__ << " reached max " << max << dendl;
f91f0fd5 11902 *pnext = it->oid();
a4b75251 11903 return 0;
7c673cae 11904 }
f91f0fd5 11905 ls->push_back(it->oid());
7c673cae
FG
11906 it->next();
11907 }
a4b75251
TL
11908 *pnext = ghobject_t::get_max();
11909 return 0;
7c673cae
FG
11910}
11911
7c673cae
FG
11912int BlueStore::omap_get(
11913 CollectionHandle &c_, ///< [in] Collection containing oid
11914 const ghobject_t &oid, ///< [in] Object containing omap
11915 bufferlist *header, ///< [out] omap header
11916 map<string, bufferlist> *out /// < [out] Key to value map
11917 )
11918{
11919 Collection *c = static_cast<Collection *>(c_.get());
9f95a23c
TL
11920 return _omap_get(c, oid, header, out);
11921}
11922
11923int BlueStore::_omap_get(
11924 Collection *c, ///< [in] Collection containing oid
11925 const ghobject_t &oid, ///< [in] Object containing omap
11926 bufferlist *header, ///< [out] omap header
11927 map<string, bufferlist> *out /// < [out] Key to value map
11928 )
11929{
7c673cae
FG
11930 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11931 if (!c->exists)
11932 return -ENOENT;
9f95a23c 11933 std::shared_lock l(c->lock);
7c673cae
FG
11934 int r = 0;
11935 OnodeRef o = c->get_onode(oid, false);
11936 if (!o || !o->exists) {
11937 r = -ENOENT;
11938 goto out;
11939 }
9f95a23c
TL
11940 r = _onode_omap_get(o, header, out);
11941 out:
11942 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11943 << dendl;
11944 return r;
11945}
11946
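// Read an onode's entire omap: scan its key range in the appropriate
// omap prefix, returning the header (if present) via *header and each
// decoded user key/value via *out.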
11947int BlueStore::_onode_omap_get(
11948 const OnodeRef &o, ///< [in] Object containing omap
11949 bufferlist *header, ///< [out] omap header
11950	  map<string, bufferlist> *out ///< [out] Key to value map
11951)
11952{
11953 int r = 0;
11954 if (!o || !o->exists) {
11955 r = -ENOENT;
11956 goto out;
11957 }
7c673cae
FG
11958 if (!o->onode.has_omap())
11959 goto out;
11960 o->flush();
11961 {
9f95a23c 11962 const string& prefix = o->get_omap_prefix();
7c673cae 11963 string head, tail;
9f95a23c
TL
11964 o->get_omap_header(&head);
11965 o->get_omap_tail(&tail);
33c7a0ef 11966 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
7c673cae
FG
11967 it->lower_bound(head);
11968 while (it->valid()) {
11969 if (it->key() == head) {
9f95a23c
TL
11970 dout(30) << __func__ << " got header" << dendl;
11971 *header = it->value();
7c673cae 11972 } else if (it->key() >= tail) {
9f95a23c
TL
11973 dout(30) << __func__ << " reached tail" << dendl;
11974 break;
7c673cae 11975 } else {
9f95a23c
TL
11976 string user_key;
11977 o->decode_omap_key(it->key(), &user_key);
11978 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11979 << " -> " << user_key << dendl;
11980 (*out)[user_key] = it->value();
7c673cae
FG
11981 }
11982 it->next();
11983 }
11984 }
9f95a23c 11985out:
7c673cae
FG
11986 return r;
11987}
11988
7c673cae
FG
11989int BlueStore::omap_get_header(
11990 CollectionHandle &c_, ///< [in] Collection containing oid
11991 const ghobject_t &oid, ///< [in] Object containing omap
11992 bufferlist *header, ///< [out] omap header
11993 bool allow_eio ///< [in] don't assert on eio
11994 )
11995{
11996 Collection *c = static_cast<Collection *>(c_.get());
11997 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11998 if (!c->exists)
11999 return -ENOENT;
9f95a23c 12000 std::shared_lock l(c->lock);
7c673cae
FG
12001 int r = 0;
12002 OnodeRef o = c->get_onode(oid, false);
12003 if (!o || !o->exists) {
12004 r = -ENOENT;
12005 goto out;
12006 }
12007 if (!o->onode.has_omap())
12008 goto out;
12009 o->flush();
12010 {
12011 string head;
9f95a23c
TL
12012 o->get_omap_header(&head);
12013 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
7c673cae
FG
12014 dout(30) << __func__ << " got header" << dendl;
12015 } else {
12016 dout(30) << __func__ << " no header" << dendl;
12017 }
12018 }
12019 out:
12020 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12021 << dendl;
12022 return r;
12023}
12024
7c673cae
FG
12025int BlueStore::omap_get_keys(
12026 CollectionHandle &c_, ///< [in] Collection containing oid
12027 const ghobject_t &oid, ///< [in] Object containing omap
12028 set<string> *keys ///< [out] Keys defined on oid
12029 )
12030{
12031 Collection *c = static_cast<Collection *>(c_.get());
12032 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12033 if (!c->exists)
12034 return -ENOENT;
adb31ebb 12035 auto start1 = mono_clock::now();
9f95a23c 12036 std::shared_lock l(c->lock);
7c673cae
FG
12037 int r = 0;
12038 OnodeRef o = c->get_onode(oid, false);
12039 if (!o || !o->exists) {
12040 r = -ENOENT;
12041 goto out;
12042 }
12043 if (!o->onode.has_omap())
12044 goto out;
12045 o->flush();
12046 {
9f95a23c 12047 const string& prefix = o->get_omap_prefix();
7c673cae 12048 string head, tail;
9f95a23c
TL
12049 o->get_omap_key(string(), &head);
12050 o->get_omap_tail(&tail);
33c7a0ef 12051 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
7c673cae
FG
12052 it->lower_bound(head);
12053 while (it->valid()) {
12054 if (it->key() >= tail) {
12055 dout(30) << __func__ << " reached tail" << dendl;
12056 break;
12057 }
12058 string user_key;
9f95a23c 12059 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 12060 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
12061 << " -> " << user_key << dendl;
12062 keys->insert(user_key);
12063 it->next();
11fdf7f2
TL
12064 }
12065 }
12066 out:
adb31ebb
TL
12067 c->store->log_latency(
12068 __func__,
12069 l_bluestore_omap_get_keys_lat,
12070 mono_clock::now() - start1,
12071 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12072
11fdf7f2
TL
12073 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12074 << dendl;
12075 return r;
7c673cae
FG
12076}
12077
12078int BlueStore::omap_get_values(
12079 CollectionHandle &c_, ///< [in] Collection containing oid
12080 const ghobject_t &oid, ///< [in] Object containing omap
12081 const set<string> &keys, ///< [in] Keys to get
12082 map<string, bufferlist> *out ///< [out] Returned keys and values
12083 )
12084{
12085 Collection *c = static_cast<Collection *>(c_.get());
12086 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12087 if (!c->exists)
12088 return -ENOENT;
9f95a23c 12089 std::shared_lock l(c->lock);
adb31ebb 12090 auto start1 = mono_clock::now();
7c673cae
FG
12091 int r = 0;
12092 string final_key;
12093 OnodeRef o = c->get_onode(oid, false);
12094 if (!o || !o->exists) {
12095 r = -ENOENT;
12096 goto out;
12097 }
9f95a23c 12098 if (!o->onode.has_omap()) {
7c673cae 12099 goto out;
9f95a23c
TL
12100 }
12101 o->flush();
11fdf7f2 12102 {
9f95a23c
TL
12103 const string& prefix = o->get_omap_prefix();
12104 o->get_omap_key(string(), &final_key);
12105 size_t base_key_len = final_key.size();
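    // final_key currently holds just this onode's omap key prefix; each
    // iteration below truncates back to base_key_len and appends the user
    // key, so a single buffer is reused for every point lookup.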
11fdf7f2 12106 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 12107 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
12108 final_key += *p;
12109 bufferlist val;
12110 if (db->get(prefix, final_key, &val) >= 0) {
12111 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
12112 << " -> " << *p << dendl;
12113 out->insert(make_pair(*p, val));
12114 }
7c673cae
FG
12115 }
12116 }
12117 out:
adb31ebb
TL
12118 c->store->log_latency(
12119 __func__,
12120 l_bluestore_omap_get_values_lat,
12121 mono_clock::now() - start1,
12122 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12123
7c673cae
FG
12124 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12125 << dendl;
12126 return r;
12127}
12128
9f95a23c
TL
12129#ifdef WITH_SEASTAR
12130int BlueStore::omap_get_values(
12131 CollectionHandle &c_, ///< [in] Collection containing oid
12132 const ghobject_t &oid, ///< [in] Object containing omap
12133 const std::optional<string> &start_after, ///< [in] list keys after this one, if set
12134 map<string, bufferlist> *output ///< [out] Returned keys and values
12135 )
12136{
12137 Collection *c = static_cast<Collection *>(c_.get());
12138 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12139 if (!c->exists)
12140 return -ENOENT;
12141 std::shared_lock l(c->lock);
12142 int r = 0;
12143 OnodeRef o = c->get_onode(oid, false);
12144 if (!o || !o->exists) {
12145 r = -ENOENT;
12146 goto out;
12147 }
12148 if (!o->onode.has_omap()) {
12149 goto out;
12150 }
12151 o->flush();
12152 {
12153 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
12154 if (!iter) {
12155 r = -ENOENT;
12156 goto out;
12157 }
12158 iter->upper_bound(*start_after);
12159 for (; iter->valid(); iter->next()) {
12160 output->insert(make_pair(iter->key(), iter->value()));
12161 }
12162 }
12163
12164out:
12165 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12166 << dendl;
12167 return r;
12168}
12169#endif
12170
7c673cae
FG
12171int BlueStore::omap_check_keys(
12172 CollectionHandle &c_, ///< [in] Collection containing oid
12173 const ghobject_t &oid, ///< [in] Object containing omap
12174 const set<string> &keys, ///< [in] Keys to check
12175 set<string> *out ///< [out] Subset of keys defined on oid
12176 )
12177{
12178 Collection *c = static_cast<Collection *>(c_.get());
12179 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12180 if (!c->exists)
12181 return -ENOENT;
9f95a23c 12182 std::shared_lock l(c->lock);
7c673cae
FG
12183 int r = 0;
12184 string final_key;
12185 OnodeRef o = c->get_onode(oid, false);
12186 if (!o || !o->exists) {
12187 r = -ENOENT;
12188 goto out;
12189 }
9f95a23c 12190 if (!o->onode.has_omap()) {
7c673cae 12191 goto out;
9f95a23c
TL
12192 }
12193 o->flush();
11fdf7f2 12194 {
9f95a23c
TL
12195 const string& prefix = o->get_omap_prefix();
12196 o->get_omap_key(string(), &final_key);
12197 size_t base_key_len = final_key.size();
11fdf7f2 12198 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 12199 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
12200 final_key += *p;
12201 bufferlist val;
12202 if (db->get(prefix, final_key, &val) >= 0) {
12203 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
12204 << " -> " << *p << dendl;
12205 out->insert(*p);
12206 } else {
12207 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
12208 << " -> " << *p << dendl;
12209 }
7c673cae
FG
12210 }
12211 }
12212 out:
12213 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12214 << dendl;
12215 return r;
12216}
12217
7c673cae
FG
12218ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
12219 CollectionHandle &c_, ///< [in] collection
12220 const ghobject_t &oid ///< [in] object
12221 )
12222{
12223 Collection *c = static_cast<Collection *>(c_.get());
12224 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
12225 if (!c->exists) {
12226 return ObjectMap::ObjectMapIterator();
12227 }
9f95a23c 12228 std::shared_lock l(c->lock);
7c673cae
FG
12229 OnodeRef o = c->get_onode(oid, false);
12230 if (!o || !o->exists) {
12231 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
12232 return ObjectMap::ObjectMapIterator();
12233 }
12234 o->flush();
12235 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
33c7a0ef
TL
12236 auto bounds = KeyValueDB::IteratorBounds();
12237 if (o->onode.has_omap()) {
12238 std::string lower_bound, upper_bound;
12239 o->get_omap_key(string(), &lower_bound);
12240 o->get_omap_tail(&upper_bound);
12241 bounds.lower_bound = std::move(lower_bound);
12242 bounds.upper_bound = std::move(upper_bound);
12243 }
12244 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
1e59de90 12245 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger, c, o, it));
7c673cae
FG
12246}
12247
12248// -----------------
12249// write helpers
12250
11fdf7f2 12251uint64_t BlueStore::_get_ondisk_reserved() const {
f67539c2 12252 ceph_assert(min_alloc_size);
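 // Worked example (assuming SUPER_RESERVED is 8192): with a 4 KiB
 // min_alloc_size the reserved region is round_up_to(max(8192, 4096), 4096)
 // = 8192 bytes; with a 64 KiB min_alloc_size it grows to 65536.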
11fdf7f2
TL
12253 return round_up_to(
12254 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
12255}
12256
7c673cae
FG
12257void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
12258{
12259 dout(10) << __func__ << " ondisk_format " << ondisk_format
12260 << " min_compat_ondisk_format " << min_compat_ondisk_format
12261 << dendl;
11fdf7f2 12262 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
12263 {
12264 bufferlist bl;
11fdf7f2 12265 encode(ondisk_format, bl);
7c673cae
FG
12266 t->set(PREFIX_SUPER, "ondisk_format", bl);
12267 }
12268 {
12269 bufferlist bl;
11fdf7f2 12270 encode(min_compat_ondisk_format, bl);
7c673cae
FG
12271 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
12272 }
12273}
12274
12275int BlueStore::_open_super_meta()
12276{
12277 // nid
12278 {
12279 nid_max = 0;
12280 bufferlist bl;
12281 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 12282 auto p = bl.cbegin();
7c673cae
FG
12283 try {
12284 uint64_t v;
11fdf7f2 12285 decode(v, p);
7c673cae 12286 nid_max = v;
f67539c2 12287 } catch (ceph::buffer::error& e) {
7c673cae
FG
12288 derr << __func__ << " unable to read nid_max" << dendl;
12289 return -EIO;
12290 }
f67539c2 12291 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
7c673cae
FG
12292 nid_last = nid_max.load();
12293 }
12294
12295 // blobid
12296 {
12297 blobid_max = 0;
12298 bufferlist bl;
12299 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 12300 auto p = bl.cbegin();
7c673cae
FG
12301 try {
12302 uint64_t v;
11fdf7f2 12303 decode(v, p);
7c673cae 12304 blobid_max = v;
f67539c2 12305 } catch (ceph::buffer::error& e) {
7c673cae
FG
12306 derr << __func__ << " unable to read blobid_max" << dendl;
12307 return -EIO;
12308 }
f67539c2 12309 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
7c673cae
FG
12310 blobid_last = blobid_max.load();
12311 }
12312
12313 // freelist
12314 {
12315 bufferlist bl;
12316 db->get(PREFIX_SUPER, "freelist_type", &bl);
12317 if (bl.length()) {
12318 freelist_type = std::string(bl.c_str(), bl.length());
7c673cae 12319 } else {
11fdf7f2 12320 ceph_abort_msg("unsupported extent freelist manager");
7c673cae 12321 }
20effc67 12322 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
7c673cae 12323 }
7c673cae
FG
12324 // ondisk format
12325 int32_t compat_ondisk_format = 0;
12326 {
12327 bufferlist bl;
12328 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
12329 if (r < 0) {
12330 // base case: kraken bluestore is v1 and readable by v1
12331 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
12332 << dendl;
12333 ondisk_format = 1;
12334 compat_ondisk_format = 1;
12335 } else {
11fdf7f2 12336 auto p = bl.cbegin();
7c673cae 12337 try {
11fdf7f2 12338 decode(ondisk_format, p);
f67539c2 12339 } catch (ceph::buffer::error& e) {
7c673cae
FG
12340 derr << __func__ << " unable to read ondisk_format" << dendl;
12341 return -EIO;
12342 }
12343 bl.clear();
12344 {
12345 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
12346 ceph_assert(!r);
12347 auto p = bl.cbegin();
7c673cae 12348 try {
11fdf7f2 12349 decode(compat_ondisk_format, p);
f67539c2 12350 } catch (ceph::buffer::error& e) {
7c673cae
FG
12351 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
12352 return -EIO;
12353 }
12354 }
12355 }
f67539c2 12356 dout(1) << __func__ << " ondisk_format " << ondisk_format
7c673cae
FG
12357 << " compat_ondisk_format " << compat_ondisk_format
12358 << dendl;
12359 }
12360
12361 if (latest_ondisk_format < compat_ondisk_format) {
12362 derr << __func__ << " compat_ondisk_format is "
12363 << compat_ondisk_format << " but we only understand version "
12364 << latest_ondisk_format << dendl;
12365 return -EPERM;
12366 }
7c673cae
FG
12367
12368 {
12369 bufferlist bl;
12370 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 12371 auto p = bl.cbegin();
7c673cae
FG
12372 try {
12373 uint64_t val;
11fdf7f2 12374 decode(val, p);
7c673cae 12375 min_alloc_size = val;
1e59de90 12376 min_alloc_size_order = std::countr_zero(val);
20effc67
TL
12377 min_alloc_size_mask = min_alloc_size - 1;
12378
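      // e.g. a stored min_alloc_size of 0x1000 yields order 12 and mask
      // 0xfff; the assert below verifies the value is a power of two.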
11fdf7f2 12379 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
f67539c2 12380 } catch (ceph::buffer::error& e) {
7c673cae
FG
12381 derr << __func__ << " unable to read min_alloc_size" << dendl;
12382 return -EIO;
12383 }
f67539c2 12384 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7c673cae 12385 << std::dec << dendl;
20effc67
TL
12386 logger->set(l_bluestore_alloc_unit, min_alloc_size);
12387 }
12388
12389 // smr fields
12390 {
12391 bufferlist bl;
12392 int r = db->get(PREFIX_SUPER, "zone_size", &bl);
12393 if (r >= 0) {
12394 auto p = bl.cbegin();
12395 decode(zone_size, p);
12396 dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
12397 ceph_assert(bdev->is_smr());
12398 } else {
12399 ceph_assert(!bdev->is_smr());
12400 }
12401 }
12402 {
12403 bufferlist bl;
12404 int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
12405 if (r >= 0) {
12406 auto p = bl.cbegin();
12407 decode(first_sequential_zone, p);
12408 dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
12409 << first_sequential_zone << std::dec << dendl;
12410 ceph_assert(bdev->is_smr());
12411 } else {
12412 ceph_assert(!bdev->is_smr());
12413 }
7c673cae 12414 }
9f95a23c
TL
12415
12416 _set_per_pool_omap();
12417
224ce89b 12418 _open_statfs();
7c673cae
FG
12419 _set_alloc_sizes();
12420 _set_throttle_params();
12421
12422 _set_csum();
12423 _set_compression();
12424 _set_blob_size();
12425
11fdf7f2 12426 _validate_bdev();
7c673cae
FG
12427 return 0;
12428}
12429
12430int BlueStore::_upgrade_super()
12431{
12432 dout(1) << __func__ << " from " << ondisk_format << ", latest "
12433 << latest_ondisk_format << dendl;
11fdf7f2
TL
12434 if (ondisk_format < latest_ondisk_format) {
12435 ceph_assert(ondisk_format > 0);
12436 ceph_assert(ondisk_format < latest_ondisk_format);
12437
1911f103 12438 KeyValueDB::Transaction t = db->get_transaction();
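    // Upgrades are applied stepwise: each block below migrates one ondisk
    // format version and then falls into the next check, so a store several
    // versions behind is brought fully up to date in a single pass.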
11fdf7f2
TL
12439 if (ondisk_format == 1) {
12440 // changes:
12441 // - super: added ondisk_format
12442 // - super: added min_readable_ondisk_format
12443 // - super: added min_compat_ondisk_format
12444 // - super: added min_alloc_size
12445 // - super: removed min_min_alloc_size
11fdf7f2
TL
12446 {
12447 bufferlist bl;
12448 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
12449 auto p = bl.cbegin();
12450 try {
12451 uint64_t val;
12452 decode(val, p);
12453 min_alloc_size = val;
f67539c2 12454 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
12455 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
12456 return -EIO;
12457 }
12458 t->set(PREFIX_SUPER, "min_alloc_size", bl);
12459 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 12460 }
11fdf7f2 12461 ondisk_format = 2;
7c673cae 12462 }
9f95a23c
TL
12463 if (ondisk_format == 2) {
12464 // changes:
f67539c2
TL
12465 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
12466 // onodes are using the per-pool prefix until a repair is run; at that
9f95a23c
TL
12467 // point the per_pool_omap=1 key will be set.
12468 // - super: added per_pool_omap key, which indicates that *all* objects
12469 // are using the new prefix and key format
12470 ondisk_format = 3;
1911f103
TL
12471 }
12472 if (ondisk_format == 3) {
12473 // changes:
12474 // - FreelistManager keeps meta within bdev label
12475 int r = _write_out_fm_meta(0);
9f95a23c 12476 ceph_assert(r == 0);
1911f103 12477 ondisk_format = 4;
9f95a23c 12478 }
1911f103
TL
12479 // This needs to be the last operation
12480 _prepare_ondisk_format_super(t);
12481 int r = db->submit_transaction_sync(t);
12482 ceph_assert(r == 0);
7c673cae 12483 }
7c673cae
FG
12484 // done
12485 dout(1) << __func__ << " done" << dendl;
12486 return 0;
12487}
12488
39ae355f 12489void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o)
7c673cae 12490{
224ce89b 12491 if (o->onode.nid) {
11fdf7f2 12492 ceph_assert(o->exists);
7c673cae 12493 return;
224ce89b 12494 }
7c673cae
FG
12495 uint64_t nid = ++nid_last;
12496 dout(20) << __func__ << " " << nid << dendl;
12497 o->onode.nid = nid;
12498 txc->last_nid = nid;
224ce89b 12499 o->exists = true;
7c673cae
FG
12500}
12501
12502uint64_t BlueStore::_assign_blobid(TransContext *txc)
12503{
12504 uint64_t bid = ++blobid_last;
12505 dout(20) << __func__ << " " << bid << dendl;
12506 txc->last_blobid = bid;
12507 return bid;
12508}
12509
12510void BlueStore::get_db_statistics(Formatter *f)
12511{
12512 db->get_statistics(f);
12513}
12514
11fdf7f2
TL
12515BlueStore::TransContext *BlueStore::_txc_create(
12516 Collection *c, OpSequencer *osr,
f67539c2
TL
12517 list<Context*> *on_commits,
12518 TrackedOpRef osd_op)
7c673cae 12519{
11fdf7f2 12520 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae 12521 txc->t = db->get_transaction();
f67539c2
TL
12522
12523#ifdef WITH_BLKIN
12524 if (osd_op && osd_op->pg_trace) {
12525 txc->trace.init("TransContext", &trace_endpoint,
12526 &osd_op->pg_trace);
12527 txc->trace.event("txc create");
12528 txc->trace.keyval("txc seq", txc->seq);
12529 }
12530#endif
12531
7c673cae
FG
12532 osr->queue_new(txc);
12533 dout(20) << __func__ << " osr " << osr << " = " << txc
12534 << " seq " << txc->seq << dendl;
12535 return txc;
12536}
12537
12538void BlueStore::_txc_calc_cost(TransContext *txc)
12539{
11fdf7f2
TL
12540 // one "io" for the kv commit
12541 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
12542 auto cost = throttle_cost_per_io.load();
12543 txc->cost = ios * cost + txc->bytes;
9f95a23c 12544 txc->ios = ios;
7c673cae
FG
12545 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
12546 << ios << " ios * " << cost << " + " << txc->bytes
12547 << " bytes)" << dendl;
12548}
12549
12550void BlueStore::_txc_update_store_statfs(TransContext *txc)
12551{
12552 if (txc->statfs_delta.is_empty())
12553 return;
12554
12555 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
12556 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
12557 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
12558 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
12559 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
12560
11fdf7f2 12561 if (per_pool_stat_collection) {
39ae355f
TL
12562 if (!is_statfs_recoverable()) {
12563 bufferlist bl;
12564 txc->statfs_delta.encode(bl);
12565 string key;
12566 get_pool_stat_key(txc->osd_pool_id, &key);
12567 txc->t->merge(PREFIX_STAT, key, bl);
12568 }
11fdf7f2
TL
12569
12570 std::lock_guard l(vstatfs_lock);
12571 auto& stats = osd_pools[txc->osd_pool_id];
12572 stats += txc->statfs_delta;
12573
12574 vstatfs += txc->statfs_delta; //non-persistent in this mode
12575
12576 } else {
39ae355f
TL
12577 if (!is_statfs_recoverable()) {
12578 bufferlist bl;
12579 txc->statfs_delta.encode(bl);
12580 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
12581 }
7c673cae 12582
11fdf7f2
TL
12583 std::lock_guard l(vstatfs_lock);
12584 vstatfs += txc->statfs_delta;
12585 }
7c673cae
FG
12586 txc->statfs_delta.reset();
12587}
12588
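// Transaction state pipeline (see the switch below):
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE
// Each call advances the txc as far as it can without blocking; stages that
// must wait (aio completion, kv commit) return here and re-enter from the
// corresponding completion path.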
12589void BlueStore::_txc_state_proc(TransContext *txc)
12590{
12591 while (true) {
12592 dout(10) << __func__ << " txc " << txc
12593 << " " << txc->get_state_name() << dendl;
f67539c2 12594 switch (txc->get_state()) {
7c673cae 12595 case TransContext::STATE_PREPARE:
9f95a23c 12596 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae 12597 if (txc->ioc.has_pending_aios()) {
f67539c2
TL
12598 txc->set_state(TransContext::STATE_AIO_WAIT);
12599#ifdef WITH_BLKIN
12600 if (txc->trace) {
12601 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
12602 }
12603#endif
7c673cae
FG
12604 txc->had_ios = true;
12605 _txc_aio_submit(txc);
12606 return;
12607 }
12608 // ** fall-thru **
12609
12610 case TransContext::STATE_AIO_WAIT:
11fdf7f2 12611 {
9f95a23c
TL
12612 mono_clock::duration lat = throttle.log_state_latency(
12613 *txc, logger, l_bluestore_state_aio_wait_lat);
12614 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11fdf7f2
TL
12615 dout(0) << __func__ << " slow aio_wait, txc = " << txc
12616 << ", latency = " << lat
12617 << dendl;
12618 }
12619 }
12620
7c673cae
FG
12621 _txc_finish_io(txc); // may trigger blocked txc's too
12622 return;
12623
12624 case TransContext::STATE_IO_DONE:
11fdf7f2 12625 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
12626 if (txc->had_ios) {
12627 ++txc->osr->txc_with_unstable_io;
12628 }
9f95a23c 12629 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
f67539c2 12630 txc->set_state(TransContext::STATE_KV_QUEUED);
7c673cae
FG
12631 if (cct->_conf->bluestore_sync_submit_transaction) {
12632 if (txc->last_nid >= nid_max ||
12633 txc->last_blobid >= blobid_max) {
12634 dout(20) << __func__
12635 << " last_{nid,blobid} exceeds max, submit via kv thread"
12636 << dendl;
12637 } else if (txc->osr->kv_committing_serially) {
12638 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
12639 << dendl;
12640 // note: this is starvation-prone. once we have a txc in a busy
12641 // sequencer that is committing serially it is possible to keep
12642 // submitting new transactions fast enough that we get stuck doing
12643 // so. the alternative is to block here... fixme?
12644 } else if (txc->osr->txc_with_unstable_io) {
12645 dout(20) << __func__ << " prior txc(s) with unstable ios "
12646 << txc->osr->txc_with_unstable_io.load() << dendl;
12647 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
12648 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
12649 == 0) {
12650 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
12651 << dendl;
12652 } else {
9f95a23c 12653 _txc_apply_kv(txc, true);
7c673cae
FG
12654 }
12655 }
12656 {
11fdf7f2 12657 std::lock_guard l(kv_lock);
7c673cae 12658 kv_queue.push_back(txc);
9f95a23c
TL
12659 if (!kv_sync_in_progress) {
12660 kv_sync_in_progress = true;
12661 kv_cond.notify_one();
12662 }
f67539c2 12663 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
7c673cae
FG
12664 kv_queue_unsubmitted.push_back(txc);
12665 ++txc->osr->kv_committing_serially;
12666 }
31f18b77
FG
12667 if (txc->had_ios)
12668 kv_ios++;
12669 kv_throttle_costs += txc->cost;
7c673cae
FG
12670 }
12671 return;
12672 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
12673 _txc_committed_kv(txc);
12674 // ** fall-thru **
12675
12676 case TransContext::STATE_KV_DONE:
9f95a23c 12677 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae 12678 if (txc->deferred_txn) {
f67539c2 12679 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
7c673cae
FG
12680 _deferred_queue(txc);
12681 return;
12682 }
f67539c2 12683 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
12684 break;
12685
12686 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 12687 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
f67539c2 12688 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
12689 // ** fall-thru **
12690
12691 case TransContext::STATE_FINISHING:
9f95a23c 12692 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
7c673cae
FG
12693 _txc_finish(txc);
12694 return;
12695
12696 default:
12697 derr << __func__ << " unexpected txc " << txc
12698 << " state " << txc->get_state_name() << dendl;
11fdf7f2 12699 ceph_abort_msg("unexpected txc state");
7c673cae
FG
12700 return;
12701 }
12702 }
12703}
12704
12705void BlueStore::_txc_finish_io(TransContext *txc)
12706{
12707 dout(20) << __func__ << " " << txc << dendl;
12708
12709 /*
12710 * we need to preserve the order of kv transactions,
12711 * even though aio will complete in any order.
12712 */
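  // Walk backwards through the sequencer's queue: if any earlier txc is
  // still waiting on its IO we must not advance past it; otherwise step
  // every consecutive IO_DONE txc (this one included) through the state
  // machine in order.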
12713
12714 OpSequencer *osr = txc->osr.get();
11fdf7f2 12715 std::lock_guard l(osr->qlock);
f67539c2 12716 txc->set_state(TransContext::STATE_IO_DONE);
11fdf7f2 12717 txc->ioc.release_running_aios();
7c673cae
FG
12718 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
12719 while (p != osr->q.begin()) {
12720 --p;
f67539c2 12721 if (p->get_state() < TransContext::STATE_IO_DONE) {
7c673cae
FG
12722 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
12723 << p->get_state_name() << dendl;
12724 return;
12725 }
f67539c2 12726 if (p->get_state() > TransContext::STATE_IO_DONE) {
7c673cae
FG
12727 ++p;
12728 break;
12729 }
12730 }
12731 do {
12732 _txc_state_proc(&*p++);
12733 } while (p != osr->q.end() &&
f67539c2 12734 p->get_state() == TransContext::STATE_IO_DONE);
7c673cae 12735
11fdf7f2 12736 if (osr->kv_submitted_waiters) {
7c673cae
FG
12737 osr->qcond.notify_all();
12738 }
12739}
12740
12741void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
12742{
12743 dout(20) << __func__ << " txc " << txc
12744 << " onodes " << txc->onodes
12745 << " shared_blobs " << txc->shared_blobs
12746 << dendl;
12747
12748 // finalize onodes
12749 for (auto o : txc->onodes) {
11fdf7f2 12750 _record_onode(o, t);
7c673cae
FG
12751 o->flushing_count++;
12752 }
12753
12754 // objects we modified but didn't affect the onode
12755 auto p = txc->modified_objects.begin();
12756 while (p != txc->modified_objects.end()) {
12757 if (txc->onodes.count(*p) == 0) {
12758 (*p)->flushing_count++;
12759 ++p;
12760 } else {
12761 // remove dups with onodes list to avoid problems in _txc_finish
12762 p = txc->modified_objects.erase(p);
12763 }
12764 }
12765
12766 // finalize shared_blobs
12767 for (auto sb : txc->shared_blobs) {
12768 string key;
12769 auto sbid = sb->get_sbid();
12770 get_shared_blob_key(sbid, &key);
12771 if (sb->persistent->empty()) {
11fdf7f2
TL
12772 dout(20) << __func__ << " shared_blob 0x"
12773 << std::hex << sbid << std::dec
7c673cae
FG
12774 << " is empty" << dendl;
12775 t->rmkey(PREFIX_SHARED_BLOB, key);
12776 } else {
12777 bufferlist bl;
11fdf7f2
TL
12778 encode(*(sb->persistent), bl);
12779 dout(20) << __func__ << " shared_blob 0x"
12780 << std::hex << sbid << std::dec
31f18b77 12781 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
12782 t->set(PREFIX_SHARED_BLOB, key, bl);
12783 }
12784 }
12785}
12786
12787void BlueStore::BSPerfTracker::update_from_perfcounters(
12788 PerfCounters &logger)
12789{
11fdf7f2
TL
12790 os_commit_latency_ns.consume_next(
12791 logger.get_tavg_ns(
7c673cae 12792 l_bluestore_commit_lat));
11fdf7f2
TL
12793 os_apply_latency_ns.consume_next(
12794 logger.get_tavg_ns(
7c673cae
FG
12795 l_bluestore_commit_lat));
12796}
12797
12798void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
12799{
12800 dout(20) << __func__ << " txc " << txc << std::hex
12801 << " allocated 0x" << txc->allocated
12802 << " released 0x" << txc->released
12803 << std::dec << dendl;
12804
20effc67
TL
12805 if (!fm->is_null_manager())
12806 {
12807 // We have to handle the case where we allocate *and* deallocate the
12808 // same region in this transaction. The freelist doesn't like that.
12809 // (Actually, the only thing that cares is the BitmapFreelistManager
12810 // debug check. But that's important.)
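    // Illustrative example: if this txc allocated 0x10000~0x8000 and also
    // released 0x14000~0x8000, the overlapping 0x14000~0x4000 is subtracted
    // from both sets before they are applied to the freelist below.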
12811 interval_set<uint64_t> tmp_allocated, tmp_released;
12812 interval_set<uint64_t> *pallocated = &txc->allocated;
12813 interval_set<uint64_t> *preleased = &txc->released;
12814 if (!txc->allocated.empty() && !txc->released.empty()) {
12815 interval_set<uint64_t> overlap;
12816 overlap.intersection_of(txc->allocated, txc->released);
12817 if (!overlap.empty()) {
12818 tmp_allocated = txc->allocated;
12819 tmp_allocated.subtract(overlap);
12820 tmp_released = txc->released;
12821 tmp_released.subtract(overlap);
12822 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
12823 << ", new allocated 0x" << tmp_allocated
12824 << " released 0x" << tmp_released << std::dec
12825 << dendl;
12826 pallocated = &tmp_allocated;
12827 preleased = &tmp_released;
12828 }
7c673cae 12829 }
7c673cae 12830
20effc67
TL
12831 // update freelist with non-overlap sets
12832 for (interval_set<uint64_t>::iterator p = pallocated->begin();
12833 p != pallocated->end();
12834 ++p) {
12835 fm->allocate(p.get_start(), p.get_len(), t);
12836 }
12837 for (interval_set<uint64_t>::iterator p = preleased->begin();
12838 p != preleased->end();
12839 ++p) {
12840 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
12841 << "~" << p.get_len() << std::dec << dendl;
12842 fm->release(p.get_start(), p.get_len(), t);
12843 }
7c673cae
FG
12844 }
12845
20effc67 12846#ifdef HAVE_LIBZBD
f67539c2 12847 if (bdev->is_smr()) {
20effc67
TL
12848 for (auto& i : txc->old_zone_offset_refs) {
12849 dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
12850 << " offset 0x" << i.second << std::dec
12851 << " -> " << i.first.first->oid << dendl;
12852 string key;
12853 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12854 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
12855 }
12856 for (auto& i : txc->new_zone_offset_refs) {
12857 // (zone, offset) -> oid
12858 dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
12859 << " offset 0x" << i.second << std::dec
12860 << " -> " << i.first.first->oid << dendl;
12861 string key;
12862 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12863 bufferlist v;
12864 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
12865 }
f67539c2 12866 }
20effc67 12867#endif
f67539c2 12868
7c673cae
FG
12869 _txc_update_store_statfs(txc);
12870}
12871
9f95a23c 12872void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 12873{
f67539c2 12874 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
9f95a23c
TL
12875 {
12876#if defined(WITH_LTTNG)
12877 auto start = mono_clock::now();
12878#endif
12879
f67539c2
TL
12880#ifdef WITH_BLKIN
12881 if (txc->trace) {
12882 txc->trace.event("db async submit");
12883 }
12884#endif
12885
9f95a23c
TL
12886 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
12887 ceph_assert(r == 0);
f67539c2 12888 txc->set_state(TransContext::STATE_KV_SUBMITTED);
9f95a23c
TL
12889 if (txc->osr->kv_submitted_waiters) {
12890 std::lock_guard l(txc->osr->qlock);
12891 txc->osr->qcond.notify_all();
12892 }
12893
12894#if defined(WITH_LTTNG)
12895 if (txc->tracing) {
12896 tracepoint(
12897 bluestore,
12898 transaction_kv_submit_latency,
12899 txc->osr->get_sequencer_id(),
12900 txc->seq,
12901 sync_submit_transaction,
12902 ceph::to_seconds<double>(mono_clock::now() - start));
12903 }
12904#endif
12905 }
12906
7c673cae
FG
12907 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
12908 for (auto& o : *ls) {
12909 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
12910 << dendl;
9f95a23c 12911 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 12912 std::lock_guard l(o->flush_lock);
7c673cae
FG
12913 o->flush_cond.notify_all();
12914 }
12915 }
12916 }
12917}
12918
12919void BlueStore::_txc_committed_kv(TransContext *txc)
12920{
12921 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 12922 throttle.complete_kv(*txc);
1adf2230 12923 {
11fdf7f2 12924 std::lock_guard l(txc->osr->qlock);
f67539c2 12925 txc->set_state(TransContext::STATE_KV_DONE);
11fdf7f2
TL
12926 if (txc->ch->commit_queue) {
12927 txc->ch->commit_queue->queue(txc->oncommits);
12928 } else {
12929 finisher.queue(txc->oncommits);
1adf2230 12930 }
7c673cae 12931 }
9f95a23c 12932 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
12933 log_latency_fn(
12934 __func__,
12935 l_bluestore_commit_lat,
9f95a23c 12936 mono_clock::now() - txc->start,
494da23a
TL
12937 cct->_conf->bluestore_log_op_age,
12938 [&](auto lat) {
12939 return ", txc = " + stringify(txc);
12940 }
11fdf7f2 12941 );
7c673cae
FG
12942}
12943
12944void BlueStore::_txc_finish(TransContext *txc)
12945{
12946 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
f67539c2 12947 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
7c673cae
FG
12948
12949 for (auto& sb : txc->shared_blobs_written) {
f64942e4 12950 sb->finish_write(txc->seq);
7c673cae
FG
12951 }
12952 txc->shared_blobs_written.clear();
12953
12954 while (!txc->removed_collections.empty()) {
12955 _queue_reap_collection(txc->removed_collections.front());
12956 txc->removed_collections.pop_front();
12957 }
12958
12959 OpSequencerRef osr = txc->osr;
7c673cae 12960 bool empty = false;
31f18b77 12961 bool submit_deferred = false;
7c673cae
FG
12962 OpSequencer::q_list_t releasing_txc;
12963 {
11fdf7f2 12964 std::lock_guard l(osr->qlock);
f67539c2 12965 txc->set_state(TransContext::STATE_DONE);
7c673cae
FG
12966 bool notify = false;
12967 while (!osr->q.empty()) {
12968 TransContext *txc = &osr->q.front();
12969 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
12970 << dendl;
f67539c2
TL
12971 if (txc->get_state() != TransContext::STATE_DONE) {
12972 if (txc->get_state() == TransContext::STATE_PREPARE &&
7c673cae
FG
12973 deferred_aggressive) {
12974 // for _osr_drain_preceding()
12975 notify = true;
12976 }
f67539c2 12977 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 12978 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
12979 submit_deferred = true;
12980 }
7c673cae
FG
12981 break;
12982 }
12983
7c673cae
FG
12984 osr->q.pop_front();
12985 releasing_txc.push_back(*txc);
7c673cae 12986 }
9f95a23c 12987
7c673cae
FG
12988 if (osr->q.empty()) {
12989 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
12990 empty = true;
12991 }
9f95a23c
TL
12992
12993 // only drain()/drain_preceding() need wakeup,
12994 // other cases use kv_submitted_waiters
12995 if (notify || empty) {
12996 osr->qcond.notify_all();
12997 }
7c673cae 12998 }
9f95a23c 12999
7c673cae
FG
13000 while (!releasing_txc.empty()) {
13001 // release to allocator only after all preceding txc's have also
13002 // finished any deferred writes that potentially land in these
13003 // blocks
13004 auto txc = &releasing_txc.front();
13005 _txc_release_alloc(txc);
13006 releasing_txc.pop_front();
9f95a23c
TL
13007 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
13008 throttle.complete(*txc);
7c673cae
FG
13009 delete txc;
13010 }
13011
31f18b77
FG
13012 if (submit_deferred) {
13013 // we're pinning memory; flush! we could be more fine-grained here but
13014 // i'm not sure it's worth the bother.
13015 deferred_try_submit();
7c673cae
FG
13016 }
13017
7c673cae 13018 if (empty && osr->zombie) {
11fdf7f2
TL
13019 std::lock_guard l(zombie_osr_lock);
13020 if (zombie_osr_set.erase(osr->cid)) {
13021 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13022 } else {
13023 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
13024 << dendl;
13025 }
7c673cae 13026 }
9f95a23c 13027}
7c673cae
FG
13028
13029void BlueStore::_txc_release_alloc(TransContext *txc)
13030{
1e59de90 13031 bool discard_queued = false;
a8e16298 13032 // it's expected we're called with lazy_release_lock already taken!
1e59de90
TL
13033 if (unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) {
13034 goto out;
13035 }
13036 discard_queued = bdev->try_discard(txc->released);
13037 // if async discard succeeded, will do alloc->release when discard callback
13038 // else we should release here
13039 if (!discard_queued) {
13040 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
13041 << txc->released << std::dec << dendl;
13042 alloc->release(txc->released);
7c673cae
FG
13043 }
13044
11fdf7f2 13045out:
7c673cae
FG
13046 txc->allocated.clear();
13047 txc->released.clear();
13048}
13049
11fdf7f2
TL
13050void BlueStore::_osr_attach(Collection *c)
13051{
20effc67 13052 // note: caller has coll_lock
11fdf7f2
TL
13053 auto q = coll_map.find(c->cid);
13054 if (q != coll_map.end()) {
13055 c->osr = q->second->osr;
13056 ldout(cct, 10) << __func__ << " " << c->cid
13057 << " reusing osr " << c->osr << " from existing coll "
13058 << q->second << dendl;
13059 } else {
13060 std::lock_guard l(zombie_osr_lock);
13061 auto p = zombie_osr_set.find(c->cid);
13062 if (p == zombie_osr_set.end()) {
9f95a23c 13063 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11fdf7f2
TL
13064 ldout(cct, 10) << __func__ << " " << c->cid
13065 << " fresh osr " << c->osr << dendl;
13066 } else {
13067 c->osr = p->second;
13068 zombie_osr_set.erase(p);
13069 ldout(cct, 10) << __func__ << " " << c->cid
13070 << " resurrecting zombie osr " << c->osr << dendl;
13071 c->osr->zombie = false;
13072 }
13073 }
13074}
13075
13076void BlueStore::_osr_register_zombie(OpSequencer *osr)
13077{
13078 std::lock_guard l(zombie_osr_lock);
13079 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
13080 osr->zombie = true;
13081 auto i = zombie_osr_set.emplace(osr->cid, osr);
13082 // this is either a new insertion or the same osr is already there
13083 ceph_assert(i.second || i.first->second == osr);
13084}
13085
7c673cae
FG
13086void BlueStore::_osr_drain_preceding(TransContext *txc)
13087{
13088 OpSequencer *osr = txc->osr.get();
13089 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
13090 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13091 {
13092 // submit anything pending
f67539c2 13093 osr->deferred_lock.lock();
11fdf7f2 13094 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
13095 _deferred_submit_unlock(osr);
13096 } else {
f67539c2 13097 osr->deferred_lock.unlock();
7c673cae
FG
13098 }
13099 }
13100 {
13101 // wake up any previously finished deferred events
11fdf7f2 13102 std::lock_guard l(kv_lock);
9f95a23c
TL
13103 if (!kv_sync_in_progress) {
13104 kv_sync_in_progress = true;
13105 kv_cond.notify_one();
13106 }
7c673cae
FG
13107 }
13108 osr->drain_preceding(txc);
13109 --deferred_aggressive;
13110 dout(10) << __func__ << " " << osr << " done" << dendl;
13111}
13112
11fdf7f2
TL
13113void BlueStore::_osr_drain(OpSequencer *osr)
13114{
13115 dout(10) << __func__ << " " << osr << dendl;
13116 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13117 {
13118 // submit anything pending
f67539c2 13119 osr->deferred_lock.lock();
11fdf7f2
TL
13120 if (osr->deferred_pending && !osr->deferred_running) {
13121 _deferred_submit_unlock(osr);
13122 } else {
f67539c2 13123 osr->deferred_lock.unlock();
11fdf7f2
TL
13124 }
13125 }
13126 {
13127 // wake up any previously finished deferred events
13128 std::lock_guard l(kv_lock);
9f95a23c
TL
13129 if (!kv_sync_in_progress) {
13130 kv_sync_in_progress = true;
13131 kv_cond.notify_one();
13132 }
11fdf7f2
TL
13133 }
13134 osr->drain();
13135 --deferred_aggressive;
13136 dout(10) << __func__ << " " << osr << " done" << dendl;
13137}
13138
7c673cae
FG
13139void BlueStore::_osr_drain_all()
13140{
13141 dout(10) << __func__ << dendl;
13142
13143 set<OpSequencerRef> s;
11fdf7f2
TL
13144 vector<OpSequencerRef> zombies;
13145 {
9f95a23c 13146 std::shared_lock l(coll_lock);
11fdf7f2
TL
13147 for (auto& i : coll_map) {
13148 s.insert(i.second->osr);
13149 }
13150 }
7c673cae 13151 {
11fdf7f2
TL
13152 std::lock_guard l(zombie_osr_lock);
13153 for (auto& i : zombie_osr_set) {
13154 s.insert(i.second);
13155 zombies.push_back(i.second);
13156 }
7c673cae
FG
13157 }
13158 dout(20) << __func__ << " osr_set " << s << dendl;
13159
13160 ++deferred_aggressive;
13161 {
13162 // submit anything pending
224ce89b 13163 deferred_try_submit();
7c673cae
FG
13164 }
13165 {
13166 // wake up any previously finished deferred events
11fdf7f2 13167 std::lock_guard l(kv_lock);
7c673cae
FG
13168 kv_cond.notify_one();
13169 }
31f18b77 13170 {
11fdf7f2 13171 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
13172 kv_finalize_cond.notify_one();
13173 }
7c673cae
FG
13174 for (auto osr : s) {
13175 dout(20) << __func__ << " drain " << osr << dendl;
13176 osr->drain();
13177 }
13178 --deferred_aggressive;
13179
7c673cae 13180 {
11fdf7f2
TL
13181 std::lock_guard l(zombie_osr_lock);
13182 for (auto& osr : zombies) {
13183 if (zombie_osr_set.erase(osr->cid)) {
13184 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13185 ceph_assert(osr->q.empty());
13186 } else if (osr->zombie) {
13187 dout(10) << __func__ << " empty zombie osr " << osr
13188 << " already reaped" << dendl;
13189 ceph_assert(osr->q.empty());
13190 } else {
13191 dout(10) << __func__ << " empty zombie osr " << osr
13192 << " resurrected" << dendl;
13193 }
7c673cae
FG
13194 }
13195 }
11fdf7f2
TL
13196
13197 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
13198}
13199
11fdf7f2 13200
31f18b77
FG
13201void BlueStore::_kv_start()
13202{
13203 dout(10) << __func__ << dendl;
13204
11fdf7f2 13205 finisher.start();
31f18b77
FG
13206 kv_sync_thread.create("bstore_kv_sync");
13207 kv_finalize_thread.create("bstore_kv_final");
13208}
13209
13210void BlueStore::_kv_stop()
13211{
13212 dout(10) << __func__ << dendl;
13213 {
9f95a23c 13214 std::unique_lock l{kv_lock};
31f18b77
FG
13215 while (!kv_sync_started) {
13216 kv_cond.wait(l);
13217 }
13218 kv_stop = true;
13219 kv_cond.notify_all();
13220 }
13221 {
9f95a23c 13222 std::unique_lock l{kv_finalize_lock};
31f18b77
FG
13223 while (!kv_finalize_started) {
13224 kv_finalize_cond.wait(l);
13225 }
13226 kv_finalize_stop = true;
13227 kv_finalize_cond.notify_all();
13228 }
13229 kv_sync_thread.join();
13230 kv_finalize_thread.join();
11fdf7f2 13231 ceph_assert(removed_collections.empty());
31f18b77 13232 {
11fdf7f2 13233 std::lock_guard l(kv_lock);
31f18b77
FG
13234 kv_stop = false;
13235 }
13236 {
11fdf7f2 13237 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
13238 kv_finalize_stop = false;
13239 }
13240 dout(10) << __func__ << " stopping finishers" << dendl;
11fdf7f2
TL
13241 finisher.wait_for_empty();
13242 finisher.stop();
31f18b77
FG
13243 dout(10) << __func__ << " stopped" << dendl;
13244}
13245
7c673cae
FG
13246void BlueStore::_kv_sync_thread()
13247{
13248 dout(10) << __func__ << " start" << dendl;
11fdf7f2 13249 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 13250 std::unique_lock l{kv_lock};
11fdf7f2 13251 ceph_assert(!kv_sync_started);
31f18b77
FG
13252 kv_sync_started = true;
13253 kv_cond.notify_all();
adb31ebb
TL
13254
13255 auto t0 = mono_clock::now();
13256 timespan twait = ceph::make_timespan(0);
13257 size_t kv_submitted = 0;
13258
7c673cae 13259 while (true) {
adb31ebb
TL
13260 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
13261 auto observation_period =
13262 ceph::make_timespan(period);
13263 auto elapsed = mono_clock::now() - t0;
13264 if (period && elapsed >= observation_period) {
13265 dout(5) << __func__ << " utilization: idle "
13266 << twait << " of " << elapsed
13267 << ", submitted: " << kv_submitted
13268 <<dendl;
13269 t0 = mono_clock::now();
13270 twait = ceph::make_timespan(0);
13271 kv_submitted = 0;
13272 }
11fdf7f2 13273 ceph_assert(kv_committing.empty());
7c673cae
FG
13274 if (kv_queue.empty() &&
13275 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 13276 !deferred_aggressive)) {
7c673cae
FG
13277 if (kv_stop)
13278 break;
13279 dout(20) << __func__ << " sleep" << dendl;
adb31ebb 13280 auto t = mono_clock::now();
9f95a23c 13281 kv_sync_in_progress = false;
11fdf7f2 13282 kv_cond.wait(l);
adb31ebb
TL
13283 twait += mono_clock::now() - t;
13284
7c673cae
FG
13285 dout(20) << __func__ << " wake" << dendl;
13286 } else {
13287 deque<TransContext*> kv_submitting;
13288 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
13289 uint64_t aios = 0, costs = 0;
13290
7c673cae
FG
13291 dout(20) << __func__ << " committing " << kv_queue.size()
13292 << " submitting " << kv_queue_unsubmitted.size()
13293 << " deferred done " << deferred_done_queue.size()
13294 << " stable " << deferred_stable_queue.size()
13295 << dendl;
13296 kv_committing.swap(kv_queue);
13297 kv_submitting.swap(kv_queue_unsubmitted);
13298 deferred_done.swap(deferred_done_queue);
13299 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
13300 aios = kv_ios;
13301 costs = kv_throttle_costs;
13302 kv_ios = 0;
13303 kv_throttle_costs = 0;
7c673cae
FG
13304 l.unlock();
13305
13306 dout(30) << __func__ << " committing " << kv_committing << dendl;
13307 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
13308 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
13309 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
13310
11fdf7f2
TL
13311 auto start = mono_clock::now();
13312
7c673cae
FG
13313 bool force_flush = false;
13314 // if bluefs is sharing the same device as data (only), then we
13315 // can rely on the bluefs commit to flush the device and make
13316 // deferred aios stable. that means that if we do have done deferred
13317 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 13318 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 13319 if (aios) {
7c673cae 13320 force_flush = true;
11fdf7f2 13321 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
13322 force_flush = true; // there's nothing else to commit!
13323 } else if (deferred_aggressive) {
13324 force_flush = true;
13325 }
11fdf7f2
TL
13326 } else {
13327 if (aios || !deferred_done.empty()) {
13328 force_flush = true;
13329 } else {
13330 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
13331 }
13332 }
7c673cae
FG
13333
13334 if (force_flush) {
31f18b77 13335 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
13336 << " force_flush=" << (int)force_flush
13337 << ", flushing, deferred done->stable" << dendl;
13338 // flush/barrier on block device
13339 bdev->flush();
13340
1e59de90
TL
13341 // if we flush then deferred done are now deferred stable
13342 if (deferred_stable.empty()) {
13343 deferred_stable.swap(deferred_done);
13344 } else {
13345 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
13346 deferred_done.end());
13347 deferred_done.clear();
13348 }
7c673cae 13349 }
11fdf7f2 13350 auto after_flush = mono_clock::now();
7c673cae
FG
13351
13352 // we will use one final transaction to force a sync
13353 KeyValueDB::Transaction synct = db->get_transaction();
13354
13355 // increase {nid,blobid}_max? note that this covers both the
13356 // case where we are approaching the max and the case we passed
13357 // it. in either case, we increase the max in the earlier txn
13358 // we submit.
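      // Example (assuming the default bluestore_nid_prealloc of 1024): once
      // nid_last crosses nid_max - 512 we persist nid_max = nid_last + 1024,
      // so nid assignment can continue without waiting on a future commit.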
13359 uint64_t new_nid_max = 0, new_blobid_max = 0;
13360 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
13361 KeyValueDB::Transaction t =
13362 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13363 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
13364 bufferlist bl;
11fdf7f2 13365 encode(new_nid_max, bl);
7c673cae
FG
13366 t->set(PREFIX_SUPER, "nid_max", bl);
13367 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
13368 }
13369 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
13370 KeyValueDB::Transaction t =
13371 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13372 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
13373 bufferlist bl;
11fdf7f2 13374 encode(new_blobid_max, bl);
7c673cae
FG
13375 t->set(PREFIX_SUPER, "blobid_max", bl);
13376 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
13377 }
c07f9fc5
FG
13378
13379 for (auto txc : kv_committing) {
9f95a23c 13380 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
f67539c2 13381 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
adb31ebb 13382 ++kv_submitted;
9f95a23c 13383 _txc_apply_kv(txc, false);
c07f9fc5 13384 --txc->osr->kv_committing_serially;
c07f9fc5 13385 } else {
f67539c2 13386 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 13387 }
7c673cae
FG
13388 if (txc->had_ios) {
13389 --txc->osr->txc_with_unstable_io;
13390 }
7c673cae
FG
13391 }
13392
31f18b77
FG
13393 // release throttle *before* we commit. this allows new ops
13394 // to be prepared and enter pipeline while we are waiting on
13395 // the kv commit sync/flush. then hopefully on the next
13396 // iteration there will already be ops awake. otherwise, we
13397 // end up going to sleep, and then wake up when the very first
13398 // transaction is ready for commit.
9f95a23c 13399 throttle.release_kv_throttle(costs);
31f18b77 13400
7c673cae
FG
13401 // cleanup sync deferred keys
13402 for (auto b : deferred_stable) {
13403 for (auto& txc : b->txcs) {
13404 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 13405 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
13406 string key;
13407 get_deferred_key(wt.seq, &key);
13408 synct->rm_single_key(PREFIX_DEFERRED, key);
13409 }
13410 }
13411
9f95a23c
TL
13412#if defined(WITH_LTTNG)
13413 auto sync_start = mono_clock::now();
13414#endif
7c673cae 13415 // submit synct synchronously (block and wait for it to commit)
31f18b77 13416 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
13417 ceph_assert(r == 0);
13418
f67539c2
TL
13419#ifdef WITH_BLKIN
13420 for (auto txc : kv_committing) {
13421 if (txc->trace) {
13422 txc->trace.event("db sync submit");
13423 txc->trace.keyval("kv_committing size", kv_committing.size());
13424 }
13425 }
13426#endif
13427
9f95a23c
TL
13428 int committing_size = kv_committing.size();
13429 int deferred_size = deferred_stable.size();
13430
13431#if defined(WITH_LTTNG)
13432 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
13433 for (auto txc: kv_committing) {
13434 if (txc->tracing) {
13435 tracepoint(
13436 bluestore,
13437 transaction_kv_sync_latency,
13438 txc->osr->get_sequencer_id(),
13439 txc->seq,
13440 kv_committing.size(),
13441 deferred_done.size(),
13442 deferred_stable.size(),
13443 sync_latency);
13444 }
13445 }
13446#endif
13447
11fdf7f2 13448 {
9f95a23c 13449 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
13450 if (kv_committing_to_finalize.empty()) {
13451 kv_committing_to_finalize.swap(kv_committing);
13452 } else {
13453 kv_committing_to_finalize.insert(
13454 kv_committing_to_finalize.end(),
13455 kv_committing.begin(),
13456 kv_committing.end());
13457 kv_committing.clear();
13458 }
13459 if (deferred_stable_to_finalize.empty()) {
13460 deferred_stable_to_finalize.swap(deferred_stable);
13461 } else {
13462 deferred_stable_to_finalize.insert(
13463 deferred_stable_to_finalize.end(),
13464 deferred_stable.begin(),
13465 deferred_stable.end());
13466 deferred_stable.clear();
13467 }
9f95a23c
TL
13468 if (!kv_finalize_in_progress) {
13469 kv_finalize_in_progress = true;
13470 kv_finalize_cond.notify_one();
13471 }
11fdf7f2 13472 }
7c673cae
FG
13473
13474 if (new_nid_max) {
13475 nid_max = new_nid_max;
13476 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
13477 }
13478 if (new_blobid_max) {
13479 blobid_max = new_blobid_max;
13480 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
13481 }
13482
224ce89b 13483 {
11fdf7f2
TL
13484 auto finish = mono_clock::now();
13485 ceph::timespan dur_flush = after_flush - start;
13486 ceph::timespan dur_kv = finish - after_flush;
13487 ceph::timespan dur = finish - start;
9f95a23c
TL
13488 dout(20) << __func__ << " committed " << committing_size
13489 << " cleaned " << deferred_size
224ce89b
WB
13490 << " in " << dur
13491 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
13492 << dendl;
494da23a
TL
13493 log_latency("kv_flush",
13494 l_bluestore_kv_flush_lat,
13495 dur_flush,
13496 cct->_conf->bluestore_log_op_age);
13497 log_latency("kv_commit",
13498 l_bluestore_kv_commit_lat,
13499 dur_kv,
13500 cct->_conf->bluestore_log_op_age);
13501 log_latency("kv_sync",
13502 l_bluestore_kv_sync_lat,
13503 dur,
13504 cct->_conf->bluestore_log_op_age);
7c673cae 13505 }
31f18b77 13506
31f18b77
FG
13507 l.lock();
13508 // previously deferred "done" are now "stable" by virtue of this
13509 // commit cycle.
13510 deferred_stable_queue.swap(deferred_done);
13511 }
13512 }
13513 dout(10) << __func__ << " finish" << dendl;
13514 kv_sync_started = false;
13515}
13516
13517void BlueStore::_kv_finalize_thread()
13518{
13519 deque<TransContext*> kv_committed;
13520 deque<DeferredBatch*> deferred_stable;
13521 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
13522 std::unique_lock l(kv_finalize_lock);
13523 ceph_assert(!kv_finalize_started);
31f18b77
FG
13524 kv_finalize_started = true;
13525 kv_finalize_cond.notify_all();
13526 while (true) {
11fdf7f2
TL
13527 ceph_assert(kv_committed.empty());
13528 ceph_assert(deferred_stable.empty());
31f18b77
FG
13529 if (kv_committing_to_finalize.empty() &&
13530 deferred_stable_to_finalize.empty()) {
13531 if (kv_finalize_stop)
13532 break;
13533 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 13534 kv_finalize_in_progress = false;
31f18b77
FG
13535 kv_finalize_cond.wait(l);
13536 dout(20) << __func__ << " wake" << dendl;
13537 } else {
13538 kv_committed.swap(kv_committing_to_finalize);
13539 deferred_stable.swap(deferred_stable_to_finalize);
13540 l.unlock();
13541 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
13542 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
13543
11fdf7f2
TL
13544 auto start = mono_clock::now();
13545
31f18b77
FG
13546 while (!kv_committed.empty()) {
13547 TransContext *txc = kv_committed.front();
f67539c2 13548 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 13549 _txc_state_proc(txc);
31f18b77 13550 kv_committed.pop_front();
7c673cae 13551 }
31f18b77 13552
7c673cae
FG
13553 for (auto b : deferred_stable) {
13554 auto p = b->txcs.begin();
13555 while (p != b->txcs.end()) {
13556 TransContext *txc = &*p;
13557 p = b->txcs.erase(p); // unlink here because
13558 _txc_state_proc(txc); // this may destroy txc
13559 }
13560 delete b;
13561 }
31f18b77 13562 deferred_stable.clear();
7c673cae
FG
13563
13564 if (!deferred_aggressive) {
31f18b77 13565 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 13566 throttle.should_submit_deferred()) {
224ce89b 13567 deferred_try_submit();
7c673cae
FG
13568 }
13569 }
13570
13571 // this is as good a place as any ...
13572 _reap_collections();
13573
11fdf7f2 13574 logger->set(l_bluestore_fragmentation,
20effc67 13575 (uint64_t)(alloc->get_fragmentation() * 1000));
11fdf7f2 13576
494da23a
TL
13577 log_latency("kv_final",
13578 l_bluestore_kv_final_lat,
13579 mono_clock::now() - start,
13580 cct->_conf->bluestore_log_op_age);
11fdf7f2 13581
7c673cae 13582 l.lock();
7c673cae
FG
13583 }
13584 }
13585 dout(10) << __func__ << " finish" << dendl;
31f18b77 13586 kv_finalize_started = false;
7c673cae
FG
13587}
13588
20effc67
TL
13589#ifdef HAVE_LIBZBD
13590void BlueStore::_zoned_cleaner_start()
13591{
f67539c2 13592 dout(10) << __func__ << dendl;
f67539c2
TL
13593 zoned_cleaner_thread.create("bstore_zcleaner");
13594}
13595
20effc67
TL
13596void BlueStore::_zoned_cleaner_stop()
13597{
f67539c2
TL
13598 dout(10) << __func__ << dendl;
13599 {
13600 std::unique_lock l{zoned_cleaner_lock};
13601 while (!zoned_cleaner_started) {
13602 zoned_cleaner_cond.wait(l);
13603 }
13604 zoned_cleaner_stop = true;
13605 zoned_cleaner_cond.notify_all();
13606 }
13607 zoned_cleaner_thread.join();
13608 {
13609 std::lock_guard l{zoned_cleaner_lock};
13610 zoned_cleaner_stop = false;
13611 }
13612 dout(10) << __func__ << " done" << dendl;
13613}

void BlueStore::_zoned_cleaner_thread()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  auto a = dynamic_cast<ZonedAllocator*>(alloc);
  ceph_assert(a);
  auto f = dynamic_cast<ZonedFreelistManager*>(fm);
  ceph_assert(f);
  while (true) {
    // thresholds to trigger cleaning
    // FIXME
    float min_score = .05;                // score: bytes saved / bytes moved
    uint64_t min_saved = zone_size / 32;  // min bytes saved to consider cleaning
    auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
    if (zone_to_clean < 0) {
      if (zoned_cleaner_stop) {
        break;
      }
      auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
      dout(20) << __func__ << " sleep for " << period << dendl;
      zoned_cleaner_cond.wait_for(l, period);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      l.unlock();
      a->set_cleaning_zone(zone_to_clean);
      _zoned_clean_zone(zone_to_clean, a, f);
      a->clear_cleaning_zone(zone_to_clean);
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}

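// _zoned_clean_zone (below) walks the per-zone object refs recorded under
// PREFIX_ZONED_CL_INFO, rewrites every object that still has data in the
// victim zone, drains in-flight transactions, and only then resets the
// device zone and marks it free for new allocations.
//
// For a rough feel of the score threshold above: a zone that is 5% live
// moves 1 unit of data to reclaim 19, a score of 19 -- an easy pick; a
// zone that is 96% live scores roughly 0.04 and is left alone.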
void BlueStore::_zoned_clean_zone(
  uint64_t zone,
  ZonedAllocator *a,
  ZonedFreelistManager *f
  )
{
  dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;

  KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
  std::string zone_start;
  get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
  for (it->lower_bound(zone_start); it->valid(); it->next()) {
    uint32_t z;
    uint64_t offset;
    ghobject_t oid;
    string k = it->key();
    int r = get_key_zone_offset_object(k, &z, &offset, &oid);
    if (r < 0) {
      derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
           << dendl;
      continue;
    }
    if (zone != z) {
      dout(10) << __func__ << " reached end of zone refs" << dendl;
      break;
    }
    dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
             << std::dec << " " << oid << dendl;
    _clean_some(oid, zone);
  }

  if (a->get_live_bytes(zone) > 0) {
    derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
         << " live bytes" << std::dec << dendl;
    // should we do something else here to avoid a live-lock in the event of a problem?
    return;
  }

  // make sure transactions flush/drain/commit (and data is all rewritten
  // safely elsewhere) before we blow away the cleaned zone
  _osr_drain_all();

  // reset the device zone
  dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
  bdev->reset_zone(zone);

  // record that we can now write there
  f->mark_zone_to_clean_free(zone, db);
  bdev->flush();

  // then allow ourselves to start allocating there
  dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
           << dendl;
  a->reset_zone(zone);
}

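// _clean_some rewrites a single object's extents out of the zone being
// cleaned: it faults in the whole extent map, collects every logical range
// whose blob has a physical extent in the victim zone, and re-reads and
// re-writes those ranges through a fresh TransContext so the allocator
// places the data elsewhere.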
void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
{
  dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
           << dendl;

  CollectionRef cref = _get_collection_by_oid(oid);
  if (!cref) {
    dout(10) << __func__ << " can't find collection for " << oid << dendl;
    return;
  }
  Collection *c = cref.get();

  // serialize io dispatch vs other transactions
  std::lock_guard l(atomic_alloc_and_submit_lock);
  std::unique_lock l2(c->lock);

  auto o = c->get_onode(oid, false);
  if (!o) {
    dout(10) << __func__ << " can't find " << oid << dendl;
    return;
  }

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);

  // NOTE: This is a naive rewrite strategy.  If any blobs are
  // shared, they will be duplicated for each object that references
  // them.  That means any cloned/snapshotted objects will explode
  // their utilization.  This won't matter for RGW workloads, but
  // for RBD and CephFS it is completely unacceptable, and it's
  // entirely reasonable to have "archival" data workloads on SMR
  // for CephFS and (possibly/probably) RBD.
  //
  // At some point we need to replace this with something more
  // sophisticated that ensures that a shared blob gets moved once
  // and all referencing objects get updated to point to the new
  // location.

  map<uint32_t, uint32_t> to_move;
  for (auto& e : o->extent_map.extent_map) {
    bool touches_zone = false;
    for (auto& be : e.blob->get_blob().get_extents()) {
      if (be.is_valid()) {
        uint32_t z = be.offset / zone_size;
        if (z == zone) {
          touches_zone = true;
          break;
        }
      }
    }
    if (touches_zone) {
      to_move[e.logical_offset] = e.length;
    }
  }
  if (to_move.empty()) {
    dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
             << std::dec << " from " << oid << dendl;
    return;
  }

  dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
           << std::dec << dendl;
  OpSequencer *osr = c->osr.get();
  TransContext *txc = _txc_create(c, osr, nullptr);

  spg_t pgid;
  if (c->cid.is_pg(&pgid)) {
    txc->osd_pool_id = pgid.pool();
  }

  for (auto& [offset, length] : to_move) {
    bufferlist bl;
    int r = _do_read(c, o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    r = _do_write(txc, cref, o, offset, length, bl, 0);
    ceph_assert(r >= 0);
  }
  txc->write_onode(o);

  _txc_write_nodes(txc, txc->t);
  _txc_finalize_kv(txc, txc->t);
  _txc_state_proc(txc);
}
#endif

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, uint64_t len)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  logger->inc(l_bluestore_issued_deferred_writes);
  logger->inc(l_bluestore_issued_deferred_write_bytes, len);
  return &txc->deferred_txn->ops.back();
}

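// Deferred-write plumbing: a small overwrite is journaled in RocksDB as
// part of its txc's KV transaction, then queued here per-OpSequencer in a
// DeferredBatch.  _deferred_submit_unlock later replays the batch to the
// block device; once those aios complete, the batch is handed back to the
// kv thread as "done" and becomes "stable" (i.e. its journal entries can
// be trimmed) after the following commit cycle.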
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;

  DeferredBatch *tmp;
  txc->osr->deferred_lock.lock();
  {
    if (!txc->osr->deferred_pending) {
      tmp = new DeferredBatch(cct, txc->osr.get());
    } else {
      tmp = txc->osr->deferred_pending;
    }
  }

  tmp->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
    }
  }

  {
    ++deferred_queue_size;
    txc->osr->deferred_pending = tmp;
    // the condition "tmp->txcs.size() == 1" means deferred_pending was
    // originally empty, so we should add this osr to deferred_queue.
    if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
      deferred_lock.lock();
      deferred_queue.push_back(*txc->osr);
      deferred_lock.unlock();
    }

    if (deferred_aggressive &&
        !txc->osr->deferred_running) {
      _deferred_submit_unlock(txc->osr.get());
    } else {
      txc->osr->deferred_lock.unlock();
    }
  }
}

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  vector<OpSequencerRef> osrs;

  {
    std::lock_guard l(deferred_lock);
    osrs.reserve(deferred_queue.size());
    for (auto& osr : deferred_queue) {
      osrs.push_back(&osr);
    }
  }

  for (auto& osr : osrs) {
    osr->deferred_lock.lock();
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
      } else {
        osr->deferred_lock.unlock();
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      osr->deferred_lock.unlock();
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  {
    std::lock_guard l(deferred_lock);
    deferred_last_submitted = ceph_clock_now();
  }
}

void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  osr->deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
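  // b->iomap is keyed by disk offset; the loop below coalesces runs of
  // physically contiguous chunks into a single bufferlist and issues one
  // aio_write per run.  A gap in the offsets (i->first != pos) flushes the
  // accumulated run before the next one is started.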
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_submitted_deferred_writes);
          logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};

void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    osr->deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      {
        deferred_lock.lock();
        auto q = deferred_queue.iterator_to(*osr);
        deferred_queue.erase(q);
        deferred_lock.unlock();
      }
      osr->deferred_lock.unlock();
    } else {
      osr->deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
        costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}

int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  interval_set<uint64_t> bluefs_extents;
  if (bluefs) {
    bluefs->foreach_block_extents(
      bluefs_layout.shared_bdev,
      [&] (uint64_t start, uint32_t len) {
        bluefs_extents.insert(start, len);
      }
    );
  }
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
    if (has_some) {
      TransContext *txc = _txc_create(ch.get(), osr, nullptr);
      txc->deferred_txn = deferred_txn;
      txc->set_state(TransContext::STATE_KV_DONE);
      _txc_state_proc(txc);
    } else {
      delete deferred_txn;
    }
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}

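// Journaled deferred ops can outlive the allocations they were written
// against: an extent may have been freed and handed to BlueFS before the
// journal entry was trimmed.  Replaying such an op verbatim would stomp on
// live BlueFS data, so the helper below prunes any part of a deferred op
// that now falls inside a BlueFS-owned extent.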
bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
                                             interval_set<uint64_t>& bluefs_extents)
{
  bool has_some = false;
  dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
  auto it = deferred_txn->ops.begin();
  while (it != deferred_txn->ops.end()) {
    // We process a pair of _data_/_extents_ (here: it->data/it->extents)
    // by eliminating _extents_ that belong to bluefs and removing the
    // relevant parts of _data_.
    // example:
    // +------------+---------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
    // | in bluefs? | no            | yes           | no            |
    // +------------+---------------+---------------+---------------+
    // result:
    // +------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 58000 - 60000 |
    // +------------+---------------+---------------+
    PExtentVector new_extents;
    ceph::buffer::list new_data;
    uint32_t data_offset = 0; // this tracks the location of extent 'e' inside it->data
    dout(30) << __func__ << " input extents: " << it->extents << dendl;
    for (auto& e: it->extents) {
      interval_set<uint64_t> region;
      region.insert(e.offset, e.length);

      auto mi = bluefs_extents.lower_bound(e.offset);
      if (mi != bluefs_extents.begin()) {
        --mi;
        if (mi.get_end() <= e.offset) {
          ++mi;
        }
      }
      while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
        // The interval_set does not like (asserts) when we erase an interval
        // that does not exist.  Hence we implement (region - mi) as
        // ((region + mi) - mi).
        region.union_insert(mi.get_start(), mi.get_len());
        region.erase(mi.get_start(), mi.get_len());
        ++mi;
      }
      // 'region' is now a subset of e, without the parts used by bluefs.
      // We trim the corresponding parts from it->data (actually constructing
      // new_data / new_extents).
      for (auto ki = region.begin(); ki != region.end(); ki++) {
        ceph::buffer::list chunk;
        // A chunk from it->data; data_offset is the offset where 'e' was
        // located; 'ki.get_start() - e.offset' is the offset of ki inside 'e'.
        chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
        new_data.claim_append(chunk);
        new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
      }
      data_offset += e.length;
    }
    dout(30) << __func__ << " output extents: " << new_extents << dendl;
    if (it->data.length() != new_data.length()) {
      dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
    }
    if (new_extents.size() == 0) {
      it = deferred_txn->ops.erase(it);
    } else {
      has_some = true;
      std::swap(it->extents, new_extents);
      std::swap(it->data, new_data);
      ++it;
    }
  }
  return has_some;
}

// ---------------------------
// transactions

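// queue_transactions is the write entry point: it builds a TransContext
// from the supplied Transactions, encodes onode/blob metadata and any
// deferred payloads into a single KV transaction, waits on the BlueStore
// throttle, and then drives the txc through its state machine via
// _txc_state_proc.  Reads see the result immediately; commit callbacks
// fire once the KV transaction is durable.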
int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
  // submission to happen atomically because if I/O submission happens in a
  // different order than I/O allocation, we end up issuing non-sequential
  // writes to the drive.  This is a temporary solution until ZONE APPEND
  // support matures in the kernel.  For more information please see:
  // https://www.usenix.org/conference/vault20/presentation/bjorling
  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.lock();
  }

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit, op);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc encode finished");
  }
#endif

  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.unlock();
  }

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc applied");
  }
#endif

  log_latency("submit_transact",
              l_bluestore_submit_lat,
              mono_clock::now() - start,
              cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
              l_bluestore_throttle_lat,
              tend - tstart,
              cct->_conf->bluestore_log_op_age);
  return 0;
}

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

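  // Each op is decoded and dispatched below: collection ops first, object
  // ops after the implicit-create check.  Errors are fatal (ceph_abort)
  // except for the ENOENT/ENODATA cases explicitly tolerated near the end
  // of the loop; a partially applied transaction would be worse than
  // crashing.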
  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->hint);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}


// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
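// _pad_zeros expands a sub-chunk write to chunk alignment by zero-filling
// the head and/or tail of the buffer.  For illustration: with chunk_size
// 0x1000, a write of 0x100 bytes at offset 0x1234 becomes a single
// zero-padded chunk covering 0x1000~0x1000, with the payload at internal
// offset 0x234 and 0xccc bytes of trailing zeros.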
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}

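// _do_write_small handles writes smaller than min_alloc_size.  In rough
// order of preference it will: (1) write directly into unused,
// already-allocated space of a nearby mutable blob; (2) read-fill to chunk
// alignment and overwrite in place via a deferred write; (3) reuse an
// existing blob for a fresh min_alloc_size unit; or (4) fall through to a
// brand new blob at the end.  All-zero payloads can be skipped entirely
// when bluestore_zero_block_detection is enabled.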
void BlueStore::_do_write_small(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef& o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  ceph_assert(length < min_alloc_size);

  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = p2align<uint64_t>(offset, alloc_len);

  bool any_change;

  // search for a suitable extent in both the forward and reverse directions
  // within the [offset - target_max_blob_size, offset + target_max_blob_size]
  // range; then check if the blob can be reused via can_reuse_blob, or apply
  // a direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);

#ifdef HAVE_LIBZBD
  // On zoned devices, the first goal is to support non-overwrite workloads,
  // such as RGW, with large, aligned objects.  Therefore, for user writes
  // _do_write_small should not trigger.  OSDs, however, write and update a
  // tiny amount of metadata, such as OSD maps, to disk.  For those cases, we
  // temporarily just pad them to min_alloc_size and write them to a new place
  // on every update.
  if (bdev->is_smr()) {
    uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
    uint64_t b_off0 = b_off;
    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

    // Zero detection -- small block
    if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
      BlobRef b = c->new_blob();
      _pad_zeros(&bl, &b_off0, min_alloc_size);
      wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
    } else { // if (bl.is_zero())
      dout(20) << __func__ << " skip small zero block " << std::hex
               << " (0x" << b_off0 << "~" << bl.length() << ")"
               << " (0x" << b_off << "~" << length << ")"
               << std::dec << dendl;
      logger->inc(l_bluestore_write_small_skipped);
      logger->inc(l_bluestore_write_small_skipped_bytes, length);
    }

    return;
  }
#endif

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = end;
  if (ep != begin) {
    prev_ep = ep;
    --prev_ep;
  }

  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
  // We don't want to have more blobs than min alloc units fit
  // into 2 max blobs
  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
  bool above_blob_threshold = false;

  inspected_blobs.reserve(blob_threshold);

  uint64_t max_off = 0;
  auto start_ep = ep;
  auto end_ep = ep; // exclusively
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = p2phase(offset, chunk_size);
        tail_pad = p2nphase(end_offs, chunk_size);
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;

        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len < prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred" << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << " lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
                                                 b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);

          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;

        } else {
          head_read = tail_read = 0;
        }

        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
                             head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
                             tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          b->dirty_blob().calc_csum(b_off, bl);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data = std::move(bl);
            dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
                     << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
                                                 b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                    // fit into reused blob
          // Need to check for pending writes desiring to reuse the same
          // pextent.  The rationale is that during GC two chunks from
          // garbage blobs (compressed?) can share logical space within the
          // same AU.  That in turn might be caused by an unaligned len in
          // clone_range2.  Hence the second write would fail in an attempt
          // to reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existent extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

            // Zero detection -- small block
            if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
              _pad_zeros(&bl, &b_off0, chunk_size);

              dout(20) << __func__ << " reuse blob " << *b << std::hex
                       << " (0x" << b_off0 << "~" << bl.length() << ")"
                       << " (0x" << b_off << "~" << length << ")"
                       << std::dec << dendl;

              wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                          false, false);
              logger->inc(l_bluestore_write_small_unused);
            } else { // if (bl.is_zero())
              dout(20) << __func__ << " skip small zero block " << std::hex
                       << " (0x" << b_off0 << "~" << bl.length() << ")"
                       << " (0x" << b_off << "~" << length << ")"
                       << std::dec << dendl;
              logger->inc(l_bluestore_write_small_skipped);
              logger->inc(l_bluestore_write_small_skipped_bytes, length);
            }

            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                  // fit into reused blob
        // Need to check for pending writes desiring to reuse the same
        // pextent.  The rationale is that during GC two chunks from
        // garbage blobs (compressed?) can share logical space within the
        // same AU.  That in turn might be caused by an unaligned len in
        // clone_range2.  Hence the second write would fail in an attempt
        // to reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

          // Zero detection -- small block
          if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
            uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
          } else { // if (bl.is_zero())
            dout(20) << __func__ << " skip small zero block " << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;
            logger->inc(l_bluestore_write_small_skipped);
            logger->inc(l_bluestore_write_small_skipped_bytes, length);
          }

          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

  // Zero detection -- small block
  if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
    // new blob.
    BlobRef b = c->new_blob();
    _pad_zeros(&bl, &b_off0, block_size);
    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
                                              // doesn't match disk one only
                true);
  } else { // if (bl.is_zero())
    dout(20) << __func__ << " skip small zero block " << std::hex
             << " (0x" << b_off0 << "~" << bl.length() << ")"
             << " (0x" << b_off << "~" << length << ")"
             << std::dec << dendl;
    logger->inc(l_bluestore_write_small_skipped);
    logger->inc(l_bluestore_write_small_skipped_bytes, length);
  }

  return;
}
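// BigDeferredWriteContext captures the state needed to turn a big,
// partially unaligned overwrite of an existing mutable blob into a
// deferred write: how much head/tail data must be read in to reach chunk
// alignment, and which blob/physical extents the aligned span maps to.
// can_defer() only succeeds when the aligned span is fully allocated and
// smaller than prefer_deferred_size.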
bool BlueStore::BigDeferredWriteContext::can_defer(
    BlueStore::extent_map_t::iterator ep,
    uint64_t prefer_deferred_size,
    uint64_t block_size,
    uint64_t offset,
    uint64_t l)
{
  bool res = false;
  auto& blob = ep->blob->get_blob();
  if (offset >= ep->blob_start() &&
      blob.is_mutable()) {
    off = offset;
    b_off = offset - ep->blob_start();
    uint64_t chunk_size = blob.get_chunk_size(block_size);
    uint64_t ondisk = blob.get_ondisk_length();
    used = std::min(l, ondisk - b_off);

    // will read some data to fill out the chunk?
    head_read = p2phase<uint64_t>(b_off, chunk_size);
    tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
    b_off -= head_read;

    ceph_assert(b_off % chunk_size == 0);
    ceph_assert(blob_aligned_len() % chunk_size == 0);

    res = blob_aligned_len() < prefer_deferred_size &&
          blob_aligned_len() <= ondisk &&
          blob.is_allocated(b_off, blob_aligned_len());
    if (res) {
      blob_ref = ep->blob;
      blob_start = ep->blob_start();
    }
  }
  return res;
}

bool BlueStore::BigDeferredWriteContext::apply_defer()
{
  int r = blob_ref->get_blob().map(
    b_off, blob_aligned_len(),
    [&](const bluestore_pextent_t& pext,
        uint64_t offset,
        uint64_t length) {
      // Apply deferred only if the overwrite breaks blob continuity;
      // if it totally overlaps some pextent, fall back to a regular write.
      if (pext.offset < offset ||
          pext.end() > offset + length) {
        res_extents.emplace_back(bluestore_pextent_t(offset, length));
        return 0;
      }
      return -1;
    });
  return r >= 0;
}

void BlueStore::_do_write_big_apply_deferred(
    TransContext* txc,
    CollectionRef& c,
    OnodeRef& o,
    BlueStore::BigDeferredWriteContext& dctx,
    bufferlist::iterator& blp,
    WriteContext* wctx)
{
  bufferlist bl;
  dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
           << " and tail 0x" << dctx.tail_read << std::dec << dendl;
  if (dctx.head_read) {
    int r = _do_read(c.get(), o,
                     dctx.off - dctx.head_read,
                     dctx.head_read,
                     bl,
                     0);
    ceph_assert(r >= 0 && r <= (int)dctx.head_read);
    size_t zlen = dctx.head_read - r;
    if (zlen) {
      bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  blp.copy(dctx.used, bl);

  if (dctx.tail_read) {
    bufferlist tail_bl;
    int r = _do_read(c.get(), o,
                     dctx.off + dctx.used, dctx.tail_read,
                     tail_bl, 0);
    ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
    size_t zlen = dctx.tail_read - r;
    if (zlen) {
      tail_bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    bl.claim_append(tail_bl);
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  auto& b0 = dctx.blob_ref;
  _buffer_cache_write(txc, b0, dctx.b_off, bl,
                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

  b0->dirty_blob().calc_csum(dctx.b_off, bl);

  Extent* le = o->extent_map.set_lextent(c, dctx.off,
    dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);

  // in fact this is a no-op for big writes, but it is left here to maintain
  // uniformity and to avoid it being missed in some future refactor.
  b0->dirty_blob().mark_used(le->blob_offset, le->length);
  txc->statfs_delta.stored() += le->length;

  if (!g_conf()->bluestore_debug_omit_block_device_write) {
    bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
    op->op = bluestore_deferred_op_t::OP_WRITE;
    op->extents.swap(dctx.res_extents);
    op->data = std::move(bl);
  }
}
15262
7c673cae
FG
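// Write out min_alloc_size-aligned chunks of up to target_blob_size.
// For uncompressed writes this first tries to defer the overwrite into
// at most two existing mutable blobs (see BigDeferredWriteContext
// above), then falls back to searching nearby extents for a reusable
// blob before finally allocating a fresh one; all-zero chunks may be
// skipped entirely when bluestore_zero_block_detection is enabled.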
void BlueStore::_do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef& o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
           << " compress " << (int)wctx->compress
           << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
  while (length > 0) {
    bool new_blob = false;
    BlobRef b;
    uint32_t b_off = 0;
    uint32_t l = 0;

    // attempt to reuse an existing blob
    if (!wctx->compress) {
      // enforce target blob alignment with max_bsize
      l = max_bsize - p2phase(offset, max_bsize);
      l = std::min(uint64_t(l), length);

      auto end = o->extent_map.extent_map.end();

      dout(20) << __func__ << " may be deferred: 0x" << std::hex
               << offset << "~" << l
               << std::dec << dendl;

      if (prefer_deferred_size_snapshot &&
          l <= prefer_deferred_size_snapshot * 2) {
        // A single write that spans two adjacent existing blobs can
        // result in up to two deferred blocks of 'prefer_deferred_size',
        // so we try to minimize the number of resulting blobs and
        // preserve 2 blobs rather than inserting one more in between.
        // E.g. for a write 0x10000~20000 over existing blobs
        // (0x0~20000 and 0x20000~20000) it is better (from the
        // subsequent read performance point of view) to issue two
        // deferred writes to the existing blobs than to end up with
        // 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000

        // look for an existing mutable blob we can write into
        auto ep = o->extent_map.seek_lextent(offset);
        auto ep_next = end;
        BigDeferredWriteContext head_info, tail_info;

        bool will_defer = ep != end ?
          head_info.can_defer(ep,
                              prefer_deferred_size_snapshot,
                              block_size,
                              offset,
                              l) :
          false;
        auto offset_next = offset + head_info.used;
        auto remaining = l - head_info.used;
        if (will_defer && remaining) {
          will_defer = false;
          if (remaining <= prefer_deferred_size_snapshot) {
            ep_next = o->extent_map.seek_lextent(offset_next);
            // check if we can defer the remainder entirely
            will_defer = ep_next == end ?
              false :
              tail_info.can_defer(ep_next,
                                  prefer_deferred_size_snapshot,
                                  block_size,
                                  offset_next,
                                  remaining);
            will_defer = will_defer && remaining == tail_info.used;
          }
        }
        if (will_defer) {
          dout(20) << __func__ << " " << *(head_info.blob_ref)
                   << " deferring big " << std::hex
                   << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
                   << std::dec << " write via deferred"
                   << dendl;
          if (remaining) {
            dout(20) << __func__ << " " << *(tail_info.blob_ref)
                     << " deferring big " << std::hex
                     << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
                     << std::dec << " write via deferred"
                     << dendl;
          }

          will_defer = head_info.apply_defer();
          if (!will_defer) {
            dout(20) << __func__
                     << " deferring big fell back, head isn't contiguous"
                     << dendl;
          } else if (remaining) {
            will_defer = tail_info.apply_defer();
            if (!will_defer) {
              dout(20) << __func__
                       << " deferring big fell back, tail isn't contiguous"
                       << dendl;
            }
          }
        }
        if (will_defer) {
          _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
          if (remaining) {
            _do_write_big_apply_deferred(txc, c, o, tail_info,
                                         blp, wctx);
          }
          dout(20) << __func__ << " defer big: 0x" << std::hex
                   << offset << "~" << l
                   << std::dec << dendl;
          offset += l;
          length -= l;
          logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
          logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
          continue;
        }
      }
      dout(20) << __func__ << " looking for blocks to reuse..." << dendl;

      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);

      // seek again as punch_hole could invalidate ep
      auto ep = o->extent_map.seek_lextent(offset);
      auto begin = o->extent_map.extent_map.begin();
      auto prev_ep = end;
      if (ep != begin) {
        prev_ep = ep;
        --prev_ep;
      }

      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search for a suitable extent in both the forward and reverse
      // directions within the
      // [offset - target_max_blob_size, offset + target_max_blob_size]
      // range, then check if the blob can be reused via can_reuse_blob.
      bool any_change;
      do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
          dout(20) << __func__ << " considering " << *ep
                   << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;

          if (offset >= ep->blob_start() &&
              ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
            b_off = offset - ep->blob_start();
            prev_ep = end; // to avoid the check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else {
            ++ep;
            any_change = true;
          }
        }

        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
          dout(20) << __func__ << " considering rev " << *prev_ep
                   << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
            b_off = offset - prev_ep->blob_start();
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else if (prev_ep != begin) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = end; // to avoid a useless first-extent re-check
          }
        }
      } while (b == nullptr && any_change);
    } else {
      // try to utilize as long a chunk as permitted in case of compression.
      l = std::min(max_bsize, length);
      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
    } // if (!wctx->compress)

    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }
    bufferlist t;
    blp.copy(l, t);

    // Zero detection -- big block
    if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) {
      wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);

      dout(20) << __func__ << " schedule write big: 0x"
               << std::hex << offset << "~" << l << std::dec
               << (new_blob ? " new " : " reuse ")
               << *b << dendl;

      logger->inc(l_bluestore_write_big_blobs);
    } else { // if (!t.is_zero())
      dout(20) << __func__ << " skip big zero block " << std::hex
               << " (0x" << b_off << "~" << t.length() << ")"
               << " (0x" << b_off << "~" << l << ")"
               << std::dec << dendl;
      logger->inc(l_bluestore_write_big_skipped_blobs);
      logger->inc(l_bluestore_write_big_skipped_bytes, l);
    }

    offset += l;
    length -= l;
  }
}

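// Turn the blob list accumulated in wctx->writes into disk state:
// optionally compress each blob and keep the result only if it meets
// the pool's required ratio, allocate space for all blobs in a single
// allocator call, initialize checksums, and queue the io, choosing
// between the deferred path (small total data size) and direct aio
// once for all blobs.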
int BlueStore::_do_alloc_write(
    TransContext *txc,
    CollectionRef coll,
    OnodeRef& o,
    WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
        string val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
          CompressorRef cp = compressor;
          if (!cp || cp->get_type_name() != val) {
            cp = Compressor::create(cct, val);
            if (!cp) {
              if (_set_compression_alert(false, val.c_str())) {
                derr << __func__ << " unable to initialize " << val.c_str()
                     << " compressor" << dendl;
              }
            }
          }
          return std::optional<CompressorRef>(cp);
        }
        return std::optional<CompressorRef>();
      }
    );

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
        double val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
          return std::optional<double>(val);
        }
        return std::optional<double>();
      }
    );
  }

  // checksum
  int64_t csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int64_t val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
        return std::optional<int64_t>(val);
      }
      return std::optional<int64_t>();
    }
  );

  // compress (as needed) and calculate the needed space
  uint64_t need = 0;
  uint64_t data_size = 0;
  // 'need' is the amount of space that must be provided by the allocator.
  // 'data_size' is the size of the data that will be transferred to disk.
  // Note that data_size is always <= need. This comes from:
  // - the write to the blob was unaligned, and there is free space
  // - the data has been compressed
  //
  // We make one decision and apply it to all blobs.
  // All blobs will be deferred or none will.
  // We assume that the allocator does its best to provide contiguous
  // space, and the condition is: (data_size < deferred).

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      auto start = mono_clock::now();

      // compress
      ceph_assert(wi.b_off == 0);
      ceph_assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      std::optional<int32_t> compressor_message;
      int r = c->compress(wi.bl, t, compressor_message);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
      bool rejected = false;
      uint64_t compressed_len = t.length();
      // do an approximate (fast) estimation of the resulting blob size
      // that doesn't take the header overhead into account
      uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
        bluestore_compression_header_t chdr;
        chdr.type = c->get_type();
        chdr.length = t.length();
        chdr.compressor_message = compressor_message;
        encode(chdr, wi.compressed_bl);
        wi.compressed_bl.claim_append(t);

        compressed_len = wi.compressed_bl.length();
        result_len = p2roundup(compressed_len, min_alloc_size);
        if (result_len <= want_len && result_len < wi.blob_length) {
          // Cool. We compressed at least as much as we were hoping to.
          // pad out to min_alloc_size
          wi.compressed_bl.append_zero(result_len - compressed_len);
          wi.compressed_len = compressed_len;
          wi.compressed = true;
          logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
          dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
                   << " -> 0x" << compressed_len << " => 0x" << result_len
                   << " with " << c->get_type()
                   << std::dec << dendl;
          txc->statfs_delta.compressed() += compressed_len;
          txc->statfs_delta.compressed_original() += wi.blob_length;
          txc->statfs_delta.compressed_allocated() += result_len;
          logger->inc(l_bluestore_compress_success_count);
          need += result_len;
          data_size += result_len;
        } else {
          rejected = true;
        }
      } else if (r != 0) {
        dout(5) << __func__ << std::hex << " compressing 0x" << wi.blob_length
                << " bytes using " << c->get_type_name()
                << std::dec
                << " failed with errcode = " << r
                << ", leaving uncompressed"
                << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
        data_size += wi.bl.length();
      } else {
        rejected = true;
      }

      if (rejected) {
        dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
                 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
                 << " with " << c->get_type()
                 << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
                 << ", leaving uncompressed"
                 << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
        data_size += wi.bl.length();
      }
      log_latency("compress@_do_alloc_write",
        l_bluestore_compress_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);
    } else {
      need += wi.blob_length;
      data_size += wi.bl.length();
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int64_t prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need
         << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
         << " min_alloc_size 0x" << min_alloc_size
         << " available 0x" << alloc->get_free()
         << std::dec << dendl;
    if (prealloc.size()) {
      alloc->release(prealloc);
    }
    return -ENOSPC;
  }
  _collect_allocation_stats(need, min_alloc_size, prealloc);

  dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
           << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();
  ceph_assert(prealloc_pos != prealloc.end());

  for (auto& wi : wctx->writes) {
    bluestore_blob_t& dblob = wi.b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      unsigned csum_order = std::countr_zero(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for compressed blob " << *wi.b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length
                 << " blob_length 0x" << wi.blob_length
                 << " compressed_length 0x" << wi.compressed_len << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    } else if (wi.new_blob) {
      unsigned csum_order;
      // initialize newly created blob only
      ceph_assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
        // hrm, maybe we could do better here, but let's not bother.
        dout(20) << __func__ << " forcing csum_order to block_size_order "
                 << block_size_order << dendl;
        csum_order = block_size_order;
      } else {
        csum_order = std::min<unsigned>(wctx->csum_order, std::countr_zero(l->length()));
      }
      // try to align the blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
        (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
          suggested_boff + final_length <= max_bsize &&
          suggested_boff > b_off) {
        dout(20) << __func__ << " forcing blob_offset to 0x"
                 << std::hex << suggested_boff << std::dec << dendl;
        ceph_assert(suggested_boff >= b_off);
        csum_length += suggested_boff - b_off;
        b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for new blob " << *wi.b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    PExtentVector extents;
    int64_t left = final_length;
    auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
    while (left > 0) {
      ceph_assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
        prealloc_left -= prealloc_pos->length;
        left -= prealloc_pos->length;
        txc->statfs_delta.allocated() += prealloc_pos->length;
        extents.push_back(*prealloc_pos);
        ++prealloc_pos;
      } else {
        extents.emplace_back(prealloc_pos->offset, left);
        prealloc_pos->offset += left;
        prealloc_pos->length -= left;
        prealloc_left -= left;
        txc->statfs_delta.allocated() += left;
        left = 0;
        break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *wi.b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      ceph_assert(!dblob.is_compressed());
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
        dblob.add_unused(0, b_off);
      }
      uint64_t llen = dblob.get_logical_length();
      if (b_end < llen) {
        dblob.add_unused(b_end, llen - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
                                           b_off + (wi.b_off0 - wi.b_off),
                                           wi.length0,
                                           wi.b,
                                           nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << " lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf()->bluestore_debug_omit_block_device_write) {
      if (data_size < prefer_deferred_size_snapshot) {
        dout(20) << __func__ << " deferring 0x" << std::hex
                 << l->length() << std::dec << " write via deferred" << dendl;
        bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
        op->op = bluestore_deferred_op_t::OP_WRITE;
        int r = wi.b->get_blob().map(
          b_off, l->length(),
          [&](uint64_t offset, uint64_t length) {
            op->extents.emplace_back(bluestore_pextent_t(offset, length));
            return 0;
          });
        ceph_assert(r == 0);
        op->data = *l;
      } else {
        wi.b->get_blob().map_bl(
          b_off, *l,
          [&](uint64_t offset, bufferlist& t) {
            bdev->aio_write(offset, t, &txc->ioc, false);
          });
        logger->inc(l_bluestore_write_new);
      }
    }
  }
  ceph_assert(prealloc_pos == prealloc.end());
  ceph_assert(prealloc_left == 0);
  return 0;
}

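// Finalize a write: release the space of the punched-out old extents,
// updating shared-blob refcounts (and collecting blobs that may have
// become unshareable for the caller), dropping empty spanning blobs,
// and, on SMR devices, maintaining the per-zone offset refs.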
void BlueStore::_wctx_finish(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    WriteContext *wctx,
    set<SharedBlob*> *maybe_unshared_blobs)
{
#ifdef HAVE_LIBZBD
  if (bdev->is_smr()) {
    for (auto& w : wctx->writes) {
      for (auto& e : w.b->get_blob().get_extents()) {
        if (!e.is_valid()) {
          continue;
        }
        uint32_t zone = e.offset / zone_size;
        if (!o->onode.zone_offset_refs.count(zone)) {
          uint64_t zoff = e.offset % zone_size;
          dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
                   << " offset 0x" << zoff << std::dec << dendl;
          txc->note_write_zone_offset(o, zone, zoff);
        }
      }
    }
  }
  set<uint32_t> zones_with_releases;
#endif

  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
        txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
      if (blob.is_shared()) {
        PExtentVector final;
        c->load_shared_blob(b->shared_blob);
        bool unshare = false;
        bool* unshare_ptr =
          !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
        for (auto e : r) {
          b->shared_blob->put_ref(
            e.offset, e.length, &final,
            unshare_ptr);
#ifdef HAVE_LIBZBD
          // we also drop the zone ref for shared blob extents
          if (bdev->is_smr() && e.is_valid()) {
            zones_with_releases.insert(e.offset / zone_size);
          }
#endif
        }
        if (unshare) {
          ceph_assert(maybe_unshared_blobs);
          maybe_unshared_blobs->insert(b->shared_blob.get());
        }
        dout(20) << __func__ << " shared_blob release " << final
                 << " from " << *b->shared_blob << dendl;
        txc->write_shared_blob(b->shared_blob);
        r.clear();
        r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them. but we can throw out anything that is no
    // longer allocated. Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << " release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
        txc->statfs_delta.compressed_allocated() -= e.length;
      }
#ifdef HAVE_LIBZBD
      if (bdev->is_smr() && e.is_valid()) {
        zones_with_releases.insert(e.offset / zone_size);
      }
#endif
    }

    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
      dout(20) << __func__ << " spanning_blob_map removing empty " << *b
               << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
    delete &lo;
  }

#ifdef HAVE_LIBZBD
  if (!zones_with_releases.empty()) {
    // we need to fault the entire extent range in here to determine if
    // we've dropped all refs to a zone.
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
    for (auto& b : o->extent_map.extent_map) {
      for (auto& e : b.blob->get_blob().get_extents()) {
        if (e.is_valid()) {
          zones_with_releases.erase(e.offset / zone_size);
        }
      }
    }
    for (auto zone : zones_with_releases) {
      auto p = o->onode.zone_offset_refs.find(zone);
      if (p != o->onode.zone_offset_refs.end()) {
        dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
                 << " offset 0x" << p->second << std::dec << dendl;
        txc->note_release_zone_offset(o, zone, p->second);
      }
    }
  }
#endif
}

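// Split an arbitrary write into an unaligned head, a
// min_alloc_size-aligned middle and an unaligned tail. For example,
// with min_alloc_size = 0x10000 a write of 0x3000~22000 becomes head
// 0x3000~d000 (small-write path), middle 0x10000~10000 (big-write
// path) and tail 0x20000~5000 (small-write path); a write confined to
// a single min_alloc_size block goes entirely through the small path.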
void BlueStore::_do_write_data(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t offset,
    uint64_t length,
    bufferlist& bl,
    WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = p2nphase(offset, min_alloc_size);

    tail_offset = p2align(end, min_alloc_size);
    tail_length = p2phase(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}

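// Derive the per-write policy: buffered vs direct caching from the
// fadvise flags, whether to compress from the pool/store compression
// mode and the client's alloc hints, and the checksum order and
// target blob size (hints indicating sequential, immutable data favor
// larger blobs and csum chunks).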
void BlueStore::_choose_write_options(
    CollectionRef& c,
    OnodeRef& o,
    uint32_t fadvise_flags,
    WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
             (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
        return std::optional<Compressor::CompressionMode>(
          Compressor::get_comp_mode_type(val));
      }
      return std::optional<Compressor::CompressionMode>();
    }
  );

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));

  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
                      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
                                  (uint8_t)std::countr_zero(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_max_blob_size",
        comp_max_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
            return std::optional<uint64_t>((uint64_t)val);
          }
          return std::optional<uint64_t>();
        }
      );
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_min_blob_size",
        comp_min_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
            return std::optional<uint64_t>((uint64_t)val);
          }
          return std::optional<uint64_t>();
        }
      );
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
           << " target_blob_size 0x" << std::hex << wctx->target_blob_size
           << " compress=" << (int)wctx->compress
           << " buffered=" << (int)wctx->buffered
           << std::dec << dendl;
}

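// Rewrite the ranges selected for garbage collection: read each extent
// back, funnel it through the regular write path into a forked
// WriteContext so it merges with the current write, and widen the
// caller's dirty range accordingly.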
int BlueStore::_do_gc(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    const WriteContext& wctx,
    uint64_t *dirty_start,
    uint64_t *dirty_end)
{
  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  auto & extents_to_collect = wctx.extents_to_gc;
  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    auto offset = (*it).first;
    auto length = (*it).second;
    dout(20) << __func__ << " processing " << std::hex
             << offset << "~" << length << std::dec
             << dendl;
    int r = _do_read(c.get(), o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, length);

    if (*dirty_start > offset) {
      *dirty_start = offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < offset + length) {
      *dirty_end = offset + length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}

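// Top-level write path: choose write options, fault in the affected
// extent map range, split the data into small/big writes, allocate and
// queue the io, then estimate whether rewriting (garbage-collecting)
// neighboring compressed extents would pay off and do so if the
// expected benefit crosses bluestore_gc_enable_total_threshold.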
int BlueStore::_do_write(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t offset,
    uint64_t length,
    bufferlist& bl,
    uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes" << std::hex
           << " fadvise_flags 0x" << fadvise_flags
           << " alloc_hint 0x" << o->onode.alloc_hint_flags
           << " expected_object_size " << o->onode.expected_object_size
           << " expected_write_size " << o->onode.expected_write_size
           << std::dec
           << dendl;
  _dump_onode<30>(cct, *o);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit = 0;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    goto out;
  }

  if (wctx.extents_to_gc.empty() ||
      wctx.extents_to_gc.range_start() > offset ||
      wctx.extents_to_gc.range_end() < offset + length) {
    benefit = gc.estimate(offset,
                          length,
                          o->extent_map,
                          wctx.old_extents,
                          min_alloc_size);
  }

  // NB: _wctx_finish() will empty old_extents,
  // so we must do the gc estimation before that
  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
             << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
    wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
    dout(20) << __func__
             << " perform garbage collection for compressed extents, "
             << "expected benefit = " << benefit << " AUs" << dendl;
  }
  if (!wctx.extents_to_gc.empty()) {
    dout(20) << __func__ << " perform garbage collection" << dendl;

    r = _do_gc(txc, c, o,
               wctx,
               &dirty_start, &dirty_end);
    if (r < 0) {
      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
             << "~" << dirty_end - dirty_start << std::dec << dendl;
  }
  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}

int BlueStore::_write(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      uint64_t offset, size_t length,
                      bufferlist& bl,
                      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_zero(TransContext *txc,
                     CollectionRef& c,
                     OnodeRef& o,
                     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_do_zero(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;

  _dump_onode<30>(cct, *o);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
             << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

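// Set the object size to 'offset'. When shrinking, punch out all
// extents past the new size, release the freed space via
// _wctx_finish(), and request a reshard if extent-map shards now
// start past EOF; when extending, only the onode size changes.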
void BlueStore::_do_truncate(
    TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset,
    set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode<30>(cct, *o);

  if (offset == o->onode.size)
    return;

  WriteContext wctx;
  if (offset < o->onode.size) {
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);

    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
        o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << " request reshard past EOF" << dendl;
      if (offset) {
        o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
        o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}

int BlueStore::_truncate(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;

  auto start_time = mono_clock::now();
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  log_latency_fn(
    __func__,
    l_bluestore_truncate_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_op_age,
    [&](const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " oid =" << o->oid;
      return ostr.str();
    }
  );
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << " = " << r << dendl;
  return r;
}

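// Remove an object: truncate to zero, clear omap and extent-map
// shards, and drop the onode key. When removing a clone (an oid with
// a generation), additionally check whether shared blobs it released
// are now referenced solely by the head object and, if so, unshare
// them so that subsequent head writes need not touch shared-blob
// metadata.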
int BlueStore::_do_remove(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << " removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
           << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->get_onode(nogen, false);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
           << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
        expect[sb].get(0, b.get_ondisk_length());
      } else {
        b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
          expect[sb].get(off, len);
          return 0;
        });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << " unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        std::find(unshared_blobs.begin(), unshared_blobs.end(),
                  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << " unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}

int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " onode " << o.get()
           << " txc " << txc << dendl;
  auto start_time = mono_clock::now();
  int r = _do_remove(txc, c, o);

  log_latency_fn(
    __func__,
    l_bluestore_remove_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_op_age,
    [&](const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " oid =" << o->oid;
      return ostr.str();
    }
  );

  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;
  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
                                                       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}

int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;
  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}

int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}

int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

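// Drop every omap key of this onode: the header, all user keys and
// the trailing tail sentinel span a contiguous range in the omap
// column family, so one rm_range_keys() plus an explicit delete of
// the tail key suffices; the onode's omap flags are cleared as well.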
void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
{
  const string& omap_prefix = o->get_omap_prefix();
  string prefix, tail;
  o->get_omap_header(&prefix);
  o->get_omap_tail(&tail);
  txc->t->rm_range_keys(omap_prefix, prefix, tail);
  txc->t->rmkey(omap_prefix, tail);
  o->onode.clear_omap_flag();
  dout(20) << __func__ << " remove range start: "
           << pretty_binary_string(prefix) << " end: "
           << pretty_binary_string(tail) << dendl;
}

int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  auto t0 = mono_clock::now();

  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
    txc->write_onode(o);
  }
  logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);

  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

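// The incoming bufferlist is expected to hold an encoded __u32 count
// followed by that many (key, value) pairs; each key is prepended with
// the onode's omap prefix before being written to the KV store. On the
// first omap use the omap flags are set and the tail sentinel key is
// created so that range operations have a well-defined upper bound.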
int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  auto p = bl.cbegin();
  __u32 num;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  string final_key;
  o->get_omap_key(string(), &final_key);
  size_t base_key_len = final_key.size();
  decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    decode(key, p);
    decode(value, p);
    final_key.resize(base_key_len); // keep prefix
    final_key += key;
    dout(20) << __func__ << " " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(prefix, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef& o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  string key;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  o->get_omap_header(&key);
  txc->t->set(prefix, key, bl);
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  auto p = bl.cbegin();
  __u32 num;
  string final_key;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    decode(num, p);
    logger->inc(l_bluestore_omap_rmkeys_count, num);
    while (num--) {
      string key;
      decode(key, p);
      final_key.resize(base_key_len); // keep prefix
      final_key += key;
      dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
               << " <- " << key << dendl;
      txc->t->rmkey(prefix, final_key);
    }
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->flush();
    o->get_omap_key(first, &key_first);
    o->get_omap_key(last, &key_last);
    logger->inc(l_bluestore_omap_rmkey_ranges_count);
    txc->t->rm_range_keys(prefix, key_first, key_last);
    dout(20) << __func__ << " remove range start: "
             << pretty_binary_string(key_first) << " end: "
             << pretty_binary_string(key_last) << dendl;
  }
  txc->note_modified_object(o);

 out:
  return r;
}

int BlueStore::_set_alloc_hint(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t expected_object_size,
    uint64_t expected_write_size,
    uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}

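// Clone oldo into newo within the same collection: data is cloned
// either via extent-map dup (COW) or, with bluestore_clone_cow off,
// by a full read/write cycle; attrs are copied wholesale, and omap
// entries are re-keyed for the new onode while preserving the same
// prefix layout.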
int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo);
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (newo->oid.is_pgmeta()) {
      newo->onode.set_omap_flags_pgmeta();
    } else {
      newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
    }
    // check that the omap key prefix is exactly the same size for both
    // objects; otherwise rewrite_omap_key would corrupt the data
    ceph_assert(oldo->onode.flags == newo->onode.flags);
    const string& prefix = newo->get_omap_prefix();
    string head, tail;
    oldo->get_omap_header(&head);
    oldo->get_omap_tail(&tail);
    KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << " got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        newo->rewrite_omap_key(it->key(), &key);
        txc->t->set(prefix, key, it->value());
      }
      it->next();
    }
    string new_tail;
    bufferlist new_tail_value;
    newo->get_omap_tail(&new_tail);
    txc->t->set(prefix, new_tail, new_tail_value);
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}

16884int BlueStore::_do_clone_range(
16885 TransContext *txc,
16886 CollectionRef& c,
16887 OnodeRef& oldo,
16888 OnodeRef& newo,
224ce89b
WB
16889 uint64_t srcoff,
16890 uint64_t length,
16891 uint64_t dstoff)
7c673cae
FG
16892{
16893 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16894 << newo->oid
16895 << " 0x" << std::hex << srcoff << "~" << length << " -> "
16896 << " 0x" << dstoff << "~" << length << std::dec << dendl;
16897 oldo->extent_map.fault_range(db, srcoff, length);
16898 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
16899 _dump_onode<30>(cct, *oldo);
16900 _dump_onode<30>(cct, *newo);
7c673cae 16901
11fdf7f2 16902 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
7c673cae 16903
20effc67
TL
16904#ifdef HAVE_LIBZBD
16905 if (bdev->is_smr()) {
16906 // duplicate the refs for the shared region.
16907 Extent dummy(dstoff);
16908 for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
16909 e != newo->extent_map.extent_map.end();
16910 ++e) {
16911 if (e->logical_offset >= dstoff + length) {
16912 break;
16913 }
16914 for (auto& ex : e->blob->get_blob().get_extents()) {
16915 // note that we may introduce a new extent reference that is
16916 // earlier than the first zone ref. we allow this since it is
16917 // a lot of work to avoid and has marginal impact on cleaning
16918 // performance.
16919 if (!ex.is_valid()) {
16920 continue;
16921 }
16922 uint32_t zone = ex.offset / zone_size;
16923 if (!newo->onode.zone_offset_refs.count(zone)) {
16924 uint64_t zoff = ex.offset % zone_size;
16925 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16926 << " offset 0x" << zoff << std::dec
16927 << " -> " << newo->oid << dendl;
16928 txc->note_write_zone_offset(newo, zone, zoff);
16929 }
16930 }
16931 }
16932 }
16933#endif
16934
16935 _dump_onode<30>(cct, *oldo);
16936 _dump_onode<30>(cct, *newo);
16937 return 0;
16938}
16939
16940int BlueStore::_clone_range(TransContext *txc,
16941 CollectionRef& c,
16942 OnodeRef& oldo,
16943 OnodeRef& newo,
16944 uint64_t srcoff, uint64_t length, uint64_t dstoff)
16945{
16946 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
7c673cae
FG
16947 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16948 << " to offset 0x" << dstoff << std::dec << dendl;
16949 int r = 0;
16950
35e4c445
FG
16951 if (srcoff + length >= OBJECT_MAX_SIZE ||
16952 dstoff + length >= OBJECT_MAX_SIZE) {
16953 r = -E2BIG;
16954 goto out;
16955 }
7c673cae
FG
16956 if (srcoff + length > oldo->onode.size) {
16957 r = -EINVAL;
16958 goto out;
16959 }
16960
7c673cae
FG
16961 _assign_nid(txc, newo);
16962
16963 if (length > 0) {
16964 if (cct->_conf->bluestore_clone_cow) {
16965 _do_zero(txc, c, newo, dstoff, length);
16966 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
16967 } else {
16968 bufferlist bl;
16969 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
16970 if (r < 0)
16971 goto out;
16972 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
16973 if (r < 0)
16974 goto out;
16975 }
16976 }
16977
16978 txc->write_onode(newo);
16979 r = 0;
16980
16981 out:
16982 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16983 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16984 << " to offset 0x" << dstoff << std::dec
16985 << " = " << r << dendl;
16986 return r;
16987}
16988
16989int BlueStore::_rename(TransContext *txc,
16990 CollectionRef& c,
16991 OnodeRef& oldo,
16992 OnodeRef& newo,
16993 const ghobject_t& new_oid)
16994{
16995 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16996 << new_oid << dendl;
16997 int r;
16998 ghobject_t old_oid = oldo->oid;
f91f0fd5 16999 mempool::bluestore_cache_meta::string new_okey;
7c673cae
FG
17000
17001 if (newo) {
17002 if (newo->exists) {
17003 r = -EEXIST;
17004 goto out;
17005 }
11fdf7f2 17006 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
17007 }
17008
17009 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
17010
17011 // rewrite shards
17012 {
17013 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
17014 get_object_key(cct, new_oid, &new_okey);
17015 string key;
17016 for (auto &s : oldo->extent_map.shards) {
17017 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
17018 [&](const string& final_key) {
17019 txc->t->rmkey(PREFIX_OBJ, final_key);
17020 }
17021 );
17022 s.dirty = true;
17023 }
17024 }
17025
17026 newo = oldo;
17027 txc->write_onode(newo);
17028
17029 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
17030 // Onode in the old slot
39ae355f 17031 c->onode_space.rename(oldo, old_oid, new_oid, new_okey);
7c673cae
FG
17032 r = 0;
17033
f64942e4
AA
17034 // hold a ref to new Onode in old name position, to ensure we don't drop
17035 // it from the cache before this txc commits (or else someone may come along
17036 // and read newo's metadata via the old name).
17037 txc->note_modified_object(oldo);
17038
20effc67
TL
17039#ifdef HAVE_LIBZBD
17040 if (bdev->is_smr()) {
17041 // adjust zone refs
17042 for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
17043 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
17044 << " offset 0x" << offset << std::dec
17045 << " -> " << oldo->oid << dendl;
17046 string key;
17047 get_zone_offset_object_key(zone, offset, oldo->oid, &key);
17048 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
17049
17050 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
17051 << " offset 0x" << offset << std::dec
17052 << " -> " << newo->oid << dendl;
17053 get_zone_offset_object_key(zone, offset, newo->oid, &key);
17054 bufferlist v;
17055 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
17056 }
17057 }
17058#endif
17059
7c673cae
FG
17060 out:
17061 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
17062 << new_oid << " = " << r << dendl;
17063 return r;
17064}
17065
17066// collections
17067
17068int BlueStore::_create_collection(
17069 TransContext *txc,
17070 const coll_t &cid,
17071 unsigned bits,
17072 CollectionRef *c)
17073{
17074 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
17075 int r;
17076 bufferlist bl;
17077
17078 {
9f95a23c 17079 std::unique_lock l(coll_lock);
7c673cae
FG
17080 if (*c) {
17081 r = -EEXIST;
17082 goto out;
17083 }
11fdf7f2
TL
17084 auto p = new_coll_map.find(cid);
17085 ceph_assert(p != new_coll_map.end());
17086 *c = p->second;
7c673cae
FG
17087 (*c)->cnode.bits = bits;
17088 coll_map[cid] = *c;
11fdf7f2 17089 new_coll_map.erase(p);
7c673cae 17090 }
11fdf7f2 17091 encode((*c)->cnode, bl);
7c673cae
FG
17092 txc->t->set(PREFIX_COLL, stringify(cid), bl);
17093 r = 0;
17094
17095 out:
17096 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
17097 return r;
17098}
17099
17100int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
17101 CollectionRef *c)
17102{
17103 dout(15) << __func__ << " " << cid << dendl;
17104 int r;
17105
11fdf7f2 17106 (*c)->flush_all_but_last();
7c673cae 17107 {
9f95a23c 17108 std::unique_lock l(coll_lock);
7c673cae
FG
17109 if (!*c) {
17110 r = -ENOENT;
17111 goto out;
17112 }
17113 size_t nonexistent_count = 0;
11fdf7f2 17114 ceph_assert((*c)->exists);
39ae355f 17115 if ((*c)->onode_space.map_any([&](Onode* o) {
f67539c2
TL
17116 if (o->exists) {
17117 dout(1) << __func__ << " " << o->oid << " " << o
17118 << " exists in onode_map" << dendl;
7c673cae 17119 return true;
f67539c2
TL
17120 }
17121 ++nonexistent_count;
17122 return false;
17123 })) {
7c673cae
FG
17124 r = -ENOTEMPTY;
17125 goto out;
17126 }
7c673cae
FG
17127 vector<ghobject_t> ls;
17128 ghobject_t next;
17129 // Enumerate onodes in db, up to nonexistent_count + 1
17130 // then check if all of them are marked as non-existent.
11fdf7f2 17131 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 17132 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 17133 nonexistent_count + 1, false, &ls, &next);
7c673cae 17134 if (r >= 0) {
11fdf7f2
TL
17135 // If true, it means the collection has more objects than nonexistent_count,
17136 // so bypass the check.
17137 bool exists = (!next.is_max());
7c673cae
FG
17138 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
17139 dout(10) << __func__ << " oid " << *it << dendl;
39ae355f 17140 auto onode = (*c)->onode_space.lookup(*it);
7c673cae
FG
17141 exists = !onode || onode->exists;
17142 if (exists) {
494da23a 17143 dout(1) << __func__ << " " << *it
f67539c2
TL
17144 << " exists in db, "
17145 << (!onode ? "not present in ram" : "present in ram")
17146 << dendl;
7c673cae
FG
17147 }
17148 }
17149 if (!exists) {
f67539c2 17150 _do_remove_collection(txc, c);
7c673cae
FG
17151 r = 0;
17152 } else {
17153 dout(10) << __func__ << " " << cid
17154 << " is non-empty" << dendl;
f67539c2 17155 r = -ENOTEMPTY;
7c673cae
FG
17156 }
17157 }
17158 }
f67539c2 17159out:
7c673cae
FG
17160 dout(10) << __func__ << " " << cid << " = " << r << dendl;
17161 return r;
17162}
17163
11fdf7f2
TL
17164void BlueStore::_do_remove_collection(TransContext *txc,
17165 CollectionRef *c)
17166{
17167 coll_map.erase((*c)->cid);
17168 txc->removed_collections.push_back(*c);
17169 (*c)->exists = false;
17170 _osr_register_zombie((*c)->osr.get());
17171 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
17172 c->reset();
17173}
17174
7c673cae
FG
17175int BlueStore::_split_collection(TransContext *txc,
17176 CollectionRef& c,
17177 CollectionRef& d,
17178 unsigned bits, int rem)
17179{
17180 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
17181 << " bits " << bits << dendl;
9f95a23c
TL
17182 std::unique_lock l(c->lock);
17183 std::unique_lock l2(d->lock);
7c673cae
FG
17184 int r;
17185
17186 // flush all previous deferred writes on this sequencer. this is a bit
17187 // heavyweight, but we need to make sure all deferred writes complete
17188 // before we split as the new collection's sequencer may need to order
17189 // this after those writes, and we don't bother with the complexity of
17190 // moving those TransContexts over to the new osr.
17191 _osr_drain_preceding(txc);
17192
17193 // move any cached items (onodes and referenced shared blobs) that will
17194 // belong to the child collection post-split. leave everything else behind.
17195 // this may include things that don't strictly belong to the now-smaller
17196 // parent split, but the OSD will always send us a split for every new
17197 // child.
17198
17199 spg_t pgid, dest_pgid;
17200 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 17201 ceph_assert(is_pg);
7c673cae 17202 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 17203 ceph_assert(is_pg);
7c673cae
FG
17204
17205 // the destination should initially be empty.
39ae355f 17206 ceph_assert(d->onode_space.empty());
11fdf7f2
TL
17207 ceph_assert(d->shared_blob_set.empty());
17208 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
17209
17210 c->split_cache(d.get());
17211
17212 // adjust bits. note that this will be redundant for all but the first
17213 // split call for this parent (first child).
17214 c->cnode.bits = bits;
11fdf7f2 17215 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
17216 r = 0;
17217
17218 bufferlist bl;
11fdf7f2 17219 encode(c->cnode, bl);
7c673cae
FG
17220 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
17221
17222 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
17223 << " bits " << bits << " = " << r << dendl;
17224 return r;
17225}
17226
11fdf7f2
TL
17227int BlueStore::_merge_collection(
17228 TransContext *txc,
17229 CollectionRef *c,
17230 CollectionRef& d,
17231 unsigned bits)
17232{
17233 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
17234 << " bits " << bits << dendl;
9f95a23c
TL
17235 std::unique_lock l((*c)->lock);
17236 std::unique_lock l2(d->lock);
11fdf7f2
TL
17237 int r;
17238
17239 coll_t cid = (*c)->cid;
17240
17241 // flush all previous deferred writes on the source collection to ensure
17242 // that all deferred writes complete before we merge as the target collection's
17243 // sequencer may need to order new ops after those writes.
17244
17245 _osr_drain((*c)->osr.get());
17246
17247 // move any cached items (onodes and referenced shared blobs) that
17248 // belong to the source collection into the target collection it is
17249 // being merged into. nothing should be left behind, since the source
17250 // collection is removed below once its cached items have migrated.
17251 // (split_cache() performs the migration for merges as well as splits.)
17252
17253 spg_t pgid, dest_pgid;
17254 bool is_pg = cid.is_pg(&pgid);
17255 ceph_assert(is_pg);
17256 is_pg = d->cid.is_pg(&dest_pgid);
17257 ceph_assert(is_pg);
17258
17259 // adjust bits. note that this will be redundant for all but the first
17260 // merge call for the parent/target.
17261 d->cnode.bits = bits;
17262
17263 // behavior depends on the target's (d) bits, so do this after they are updated.
17264 (*c)->split_cache(d.get());
17265
17266 // remove source collection
17267 {
9f95a23c 17268 std::unique_lock l3(coll_lock);
11fdf7f2
TL
17269 _do_remove_collection(txc, c);
17270 }
17271
17272 r = 0;
17273
17274 bufferlist bl;
17275 encode(d->cnode, bl);
17276 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
17277
17278 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
17279 << " bits " << bits << " = " << r << dendl;
17280 return r;
17281}
17282
494da23a
TL
17283void BlueStore::log_latency(
17284 const char* name,
17285 int idx,
17286 const ceph::timespan& l,
17287 double lat_threshold,
17288 const char* info) const
17289{
17290 logger->tinc(idx, l);
17291 if (lat_threshold > 0.0 &&
17292 l >= make_timespan(lat_threshold)) {
17293 dout(0) << __func__ << " slow operation observed for " << name
17294 << ", latency = " << l
17295 << info
17296 << dendl;
17297 }
17298}
17299
11fdf7f2 17300void BlueStore::log_latency_fn(
494da23a 17301 const char* name,
11fdf7f2
TL
17302 int idx,
17303 const ceph::timespan& l,
494da23a
TL
17304 double lat_threshold,
17305 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 17306{
494da23a
TL
17307 logger->tinc(idx, l);
17308 if (lat_threshold > 0.0 &&
17309 l >= make_timespan(lat_threshold)) {
17310 dout(0) << __func__ << " slow operation observed for " << name
17311 << ", latency = " << l
17312 << fn(l)
17313 << dendl;
17314 }
11fdf7f2
TL
17315}
17316
9f95a23c
TL
17317#if defined(WITH_LTTNG)
17318void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
17319 KeyValueDB &db,
17320 TransContext &txc,
17321 mono_clock::time_point start_throttle_acquire)
17322{
17323 pending_kv_ios += txc.ios;
17324 if (txc.deferred_txn) {
17325 pending_deferred_ios += txc.ios;
17326 }
17327
17328 uint64_t started = 0;
17329 uint64_t completed = 0;
17330 if (should_trace(&started, &completed)) {
17331 txc.tracing = true;
17332 uint64_t rocksdb_base_level,
17333 rocksdb_estimate_pending_compaction_bytes,
17334 rocksdb_cur_size_all_mem_tables,
17335 rocksdb_compaction_pending,
17336 rocksdb_mem_table_flush_pending,
17337 rocksdb_num_running_compactions,
17338 rocksdb_num_running_flushes,
17339 rocksdb_actual_delayed_write_rate;
17340 db.get_property(
17341 "rocksdb.base-level",
17342 &rocksdb_base_level);
17343 db.get_property(
17344 "rocksdb.estimate-pending-compaction-bytes",
17345 &rocksdb_estimate_pending_compaction_bytes);
17346 db.get_property(
17347 "rocksdb.cur-size-all-mem-tables",
17348 &rocksdb_cur_size_all_mem_tables);
17349 db.get_property(
17350 "rocksdb.compaction-pending",
17351 &rocksdb_compaction_pending);
17352 db.get_property(
17353 "rocksdb.mem-table-flush-pending",
17354 &rocksdb_mem_table_flush_pending);
17355 db.get_property(
17356 "rocksdb.num-running-compactions",
17357 &rocksdb_num_running_compactions);
17358 db.get_property(
17359 "rocksdb.num-running-flushes",
17360 &rocksdb_num_running_flushes);
17361 db.get_property(
17362 "rocksdb.actual-delayed-write-rate",
17363 &rocksdb_actual_delayed_write_rate);
17364
17365
17366 tracepoint(
17367 bluestore,
17368 transaction_initial_state,
17369 txc.osr->get_sequencer_id(),
17370 txc.seq,
17371 throttle_bytes.get_current(),
17372 throttle_deferred_bytes.get_current(),
17373 pending_kv_ios,
17374 pending_deferred_ios,
17375 started,
17376 completed,
17377 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
17378
17379 tracepoint(
17380 bluestore,
17381 transaction_initial_state_rocksdb,
17382 txc.osr->get_sequencer_id(),
17383 txc.seq,
17384 rocksdb_base_level,
17385 rocksdb_estimate_pending_compaction_bytes,
17386 rocksdb_cur_size_all_mem_tables,
17387 rocksdb_compaction_pending,
17388 rocksdb_mem_table_flush_pending,
17389 rocksdb_num_running_compactions,
17390 rocksdb_num_running_flushes,
17391 rocksdb_actual_delayed_write_rate);
17392 }
17393}
17394#endif
17395
17396mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
17397 TransContext &txc, PerfCounters *logger, int state)
17398{
17399 mono_clock::time_point now = mono_clock::now();
17400 mono_clock::duration lat = now - txc.last_stamp;
17401 logger->tinc(state, lat);
17402#if defined(WITH_LTTNG)
17403 if (txc.tracing &&
17404 state >= l_bluestore_state_prepare_lat &&
17405 state <= l_bluestore_state_done_lat) {
17406 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
17407 tracepoint(
17408 bluestore,
17409 transaction_state_duration,
17410 txc.osr->get_sequencer_id(),
17411 txc.seq,
17412 state,
17413 ceph::to_seconds<double>(lat));
17414 }
17415#endif
17416 txc.last_stamp = now;
17417 return lat;
17418}
17419
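// Admission control for a new transaction: the main bytes throttle below is
// acquired unconditionally (and may block), while the deferred-bytes
// throttle is only try-acquired; on failure the caller is expected to call
// finish_start_transaction() later to take the deferred budget for real.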
17420bool BlueStore::BlueStoreThrottle::try_start_transaction(
17421 KeyValueDB &db,
17422 TransContext &txc,
17423 mono_clock::time_point start_throttle_acquire)
17424{
17425 throttle_bytes.get(txc.cost);
17426
17427 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
17428 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17429 return true;
17430 } else {
17431 return false;
17432 }
17433}
17434
17435void BlueStore::BlueStoreThrottle::finish_start_transaction(
17436 KeyValueDB &db,
17437 TransContext &txc,
17438 mono_clock::time_point start_throttle_acquire)
17439{
17440 ceph_assert(txc.deferred_txn);
17441 throttle_deferred_bytes.get(txc.cost);
17442 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17443}
17444
17445#if defined(WITH_LTTNG)
17446void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
17447{
17448 pending_kv_ios -= 1;
17449 ios_completed_since_last_traced++;
17450 if (txc.tracing) {
17451 tracepoint(
17452 bluestore,
17453 transaction_commit_latency,
17454 txc.osr->get_sequencer_id(),
17455 txc.seq,
17456 ceph::to_seconds<double>(mono_clock::now() - txc.start));
17457 }
17458}
17459#endif
17460
17461#if defined(WITH_LTTNG)
17462void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
17463{
17464 if (txc.deferred_txn) {
17465 pending_deferred_ios -= 1;
17466 }
17467 if (txc.tracing) {
17468 mono_clock::time_point now = mono_clock::now();
17469 mono_clock::duration lat = now - txc.start;
17470 tracepoint(
17471 bluestore,
17472 transaction_total_duration,
17473 txc.osr->get_sequencer_id(),
17474 txc.seq,
17475 ceph::to_seconds<double>(lat));
17476 }
17477}
17478#endif
11fdf7f2 17479
7c673cae
FG
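// pseudo-prefixes used only to bucket the histogram below: PREFIX_OBJ keys
// are split by key suffix into onodes ("o") and extent shards ("x"), and
// any unrecognized prefix is lumped under "Z"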
17480const string prefix_onode = "o";
17481const string prefix_onode_shard = "x";
17482const string prefix_other = "Z";
7c673cae
FG
17483//Iterates through the db and collects the stats
17484void BlueStore::generate_db_histogram(Formatter *f)
17485{
17486 //globals
17487 uint64_t num_onodes = 0;
17488 uint64_t num_shards = 0;
17489 uint64_t num_super = 0;
17490 uint64_t num_coll = 0;
17491 uint64_t num_omap = 0;
11fdf7f2 17492 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
17493 uint64_t num_deferred = 0;
17494 uint64_t num_alloc = 0;
17495 uint64_t num_stat = 0;
17496 uint64_t num_others = 0;
17497 uint64_t num_shared_shards = 0;
17498 size_t max_key_size =0, max_value_size = 0;
17499 uint64_t total_key_size = 0, total_value_size = 0;
17500 size_t key_size = 0, value_size = 0;
20effc67 17501 KeyValueHistogram hist;
7c673cae 17502
11fdf7f2 17503 auto start = coarse_mono_clock::now();
7c673cae 17504
11fdf7f2 17505 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
17506 iter->seek_to_first();
17507 while (iter->valid()) {
17508 dout(30) << __func__ << " Key: " << iter->key() << dendl;
17509 key_size = iter->key_size();
17510 value_size = iter->value_size();
17511 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
17512 max_key_size = std::max(max_key_size, key_size);
17513 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
17514 total_key_size += key_size;
17515 total_value_size += value_size;
17516
17517 pair<string,string> key(iter->raw_key());
17518
17519 if (key.first == PREFIX_SUPER) {
17520 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
17521 num_super++;
17522 } else if (key.first == PREFIX_STAT) {
17523 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
17524 num_stat++;
17525 } else if (key.first == PREFIX_COLL) {
17526 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
17527 num_coll++;
17528 } else if (key.first == PREFIX_OBJ) {
17529 if (key.second.back() == ONODE_KEY_SUFFIX) {
17530 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
17531 num_onodes++;
17532 } else {
17533 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
17534 num_shards++;
17535 }
17536 } else if (key.first == PREFIX_OMAP) {
17537 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
17538 num_omap++;
f67539c2
TL
17539 } else if (key.first == PREFIX_PERPOOL_OMAP) {
17540 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
17541 num_omap++;
17542 } else if (key.first == PREFIX_PERPG_OMAP) {
17543 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
17544 num_omap++;
11fdf7f2
TL
17545 } else if (key.first == PREFIX_PGMETA_OMAP) {
17546 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
17547 num_pgmeta_omap++;
7c673cae
FG
17548 } else if (key.first == PREFIX_DEFERRED) {
17549 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
17550 num_deferred++;
11fdf7f2 17551 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
17552 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
17553 num_alloc++;
17554 } else if (key.first == PREFIX_SHARED_BLOB) {
17555 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
17556 num_shared_shards++;
17557 } else {
17558 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
17559 num_others++;
17560 }
17561 iter->next();
17562 }
17563
11fdf7f2 17564 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
17565 f->open_object_section("rocksdb_key_value_stats");
17566 f->dump_unsigned("num_onodes", num_onodes);
17567 f->dump_unsigned("num_shards", num_shards);
17568 f->dump_unsigned("num_super", num_super);
17569 f->dump_unsigned("num_coll", num_coll);
17570 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 17571 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
17572 f->dump_unsigned("num_deferred", num_deferred);
17573 f->dump_unsigned("num_alloc", num_alloc);
17574 f->dump_unsigned("num_stat", num_stat);
17575 f->dump_unsigned("num_shared_shards", num_shared_shards);
17576 f->dump_unsigned("num_others", num_others);
17577 f->dump_unsigned("max_key_size", max_key_size);
17578 f->dump_unsigned("max_value_size", max_value_size);
17579 f->dump_unsigned("total_key_size", total_key_size);
17580 f->dump_unsigned("total_value_size", total_value_size);
17581 f->close_section();
17582
17583 hist.dump(f);
17584
17585 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
17586
17587}
17588
f6b5b4d7 17589void BlueStore::_shutdown_cache()
7c673cae
FG
17590{
17591 dout(10) << __func__ << dendl;
9f95a23c
TL
17592 for (auto i : buffer_cache_shards) {
17593 i->flush();
11fdf7f2 17594 ceph_assert(i->empty());
7c673cae
FG
17595 }
17596 for (auto& p : coll_map) {
39ae355f 17597 p.second->onode_space.clear();
3efd9988
FG
17598 if (!p.second->shared_blob_set.empty()) {
17599 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 17600 p.second->shared_blob_set.dump<0>(cct);
3efd9988 17601 }
39ae355f 17602 ceph_assert(p.second->onode_space.empty());
11fdf7f2 17603 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
17604 }
17605 coll_map.clear();
f6b5b4d7
TL
17606 for (auto i : onode_cache_shards) {
17607 ceph_assert(i->empty());
17608 }
7c673cae
FG
17609}
17610
31f18b77
FG
17611// For external caller.
17612// We use a best-effort policy instead, e.g.,
17613// we don't care if there are still some pinned onodes/data in the cache
17614// after this command is completed.
11fdf7f2 17615int BlueStore::flush_cache(ostream *os)
31f18b77
FG
17616{
17617 dout(10) << __func__ << dendl;
9f95a23c
TL
17618 for (auto i : onode_cache_shards) {
17619 i->flush();
17620 }
17621 for (auto i : buffer_cache_shards) {
17622 i->flush();
31f18b77 17623 }
11fdf7f2
TL
17624
17625 return 0;
31f18b77
FG
17626}
17627
7c673cae
FG
17628void BlueStore::_apply_padding(uint64_t head_pad,
17629 uint64_t tail_pad,
7c673cae
FG
17630 bufferlist& padded)
17631{
7c673cae 17632 if (head_pad) {
224ce89b 17633 padded.prepend_zero(head_pad);
7c673cae
FG
17634 }
17635 if (tail_pad) {
17636 padded.append_zero(tail_pad);
17637 }
17638 if (head_pad || tail_pad) {
17639 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
17640 << " tail 0x" << tail_pad << std::dec << dendl;
17641 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
17642 }
17643}
17644
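// Persist an onode: finalize (and, if needed, reshard) its extent-map
// shards, then encode the onode, its spanning blobs, and the inline extent
// map (if unsharded) into a single PREFIX_OBJ value via a bound-then-actual
// two-pass encode.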
39ae355f 17645void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
11fdf7f2
TL
17646{
17647 // finalize extent_map shards
17648 o->extent_map.update(txn, false);
17649 if (o->extent_map.needs_reshard()) {
17650 o->extent_map.reshard(db, txn);
17651 o->extent_map.update(txn, true);
17652 if (o->extent_map.needs_reshard()) {
17653 dout(20) << __func__ << " warning: still wants reshard, check options?"
17654 << dendl;
17655 o->extent_map.clear_needs_reshard();
17656 }
17657 logger->inc(l_bluestore_onode_reshard);
17658 }
17659
17660 // bound encode
17661 size_t bound = 0;
17662 denc(o->onode, bound);
17663 o->extent_map.bound_encode_spanning_blobs(bound);
17664 if (o->onode.extent_map_shards.empty()) {
17665 denc(o->extent_map.inline_bl, bound);
17666 }
17667
17668 // encode
17669 bufferlist bl;
17670 unsigned onode_part, blob_part, extent_part;
17671 {
17672 auto p = bl.get_contiguous_appender(bound, true);
17673 denc(o->onode, p);
17674 onode_part = p.get_logical_offset();
17675 o->extent_map.encode_spanning_blobs(p);
17676 blob_part = p.get_logical_offset() - onode_part;
17677 if (o->onode.extent_map_shards.empty()) {
17678 denc(o->extent_map.inline_bl, p);
17679 }
17680 extent_part = p.get_logical_offset() - onode_part - blob_part;
17681 }
17682
17683 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
17684 << " (" << onode_part << " bytes onode + "
17685 << blob_part << " bytes spanning blobs + "
17686 << extent_part << " bytes inline extents)"
17687 << dendl;
17688
17689
17690 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
17691}
17692
17693void BlueStore::_log_alerts(osd_alert_list_t& alerts)
17694{
17695 std::lock_guard l(qlock);
1e59de90
TL
17696 size_t used = bluefs && bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ?
17697 bluefs->get_used(BlueFS::BDEV_SLOW) : 0;
17698 if (used > 0) {
17699 auto db_used = bluefs->get_used(BlueFS::BDEV_DB);
17700 auto db_total = bluefs->get_total(BlueFS::BDEV_DB);
17701 ostringstream ss;
17702 ss << "spilled over " << byte_u_t(used)
17703 << " metadata from 'db' device (" << byte_u_t(db_used)
17704 << " used of " << byte_u_t(db_total) << ") to slow device";
17705 spillover_alert = ss.str();
17706 } else if (!spillover_alert.empty()){
17707 spillover_alert.clear();
17708 }
11fdf7f2 17709
522d829b
TL
17710 if (!spurious_read_errors_alert.empty() &&
17711 cct->_conf->bluestore_warn_on_spurious_read_errors) {
f67539c2
TL
17712 alerts.emplace(
17713 "BLUESTORE_SPURIOUS_READ_ERRORS",
17714 spurious_read_errors_alert);
17715 }
81eedcae
TL
17716 if (!disk_size_mismatch_alert.empty()) {
17717 alerts.emplace(
17718 "BLUESTORE_DISK_SIZE_MISMATCH",
17719 disk_size_mismatch_alert);
17720 }
17721 if (!legacy_statfs_alert.empty()) {
17722 alerts.emplace(
17723 "BLUESTORE_LEGACY_STATFS",
17724 legacy_statfs_alert);
17725 }
11fdf7f2
TL
17726 if (!spillover_alert.empty() &&
17727 cct->_conf->bluestore_warn_on_bluefs_spillover) {
17728 alerts.emplace(
17729 "BLUEFS_SPILLOVER",
17730 spillover_alert);
17731 }
f67539c2
TL
17732 if (!no_per_pg_omap_alert.empty()) {
17733 alerts.emplace(
17734 "BLUESTORE_NO_PER_PG_OMAP",
17735 no_per_pg_omap_alert);
17736 }
9f95a23c
TL
17737 if (!no_per_pool_omap_alert.empty()) {
17738 alerts.emplace(
17739 "BLUESTORE_NO_PER_POOL_OMAP",
17740 no_per_pool_omap_alert);
17741 }
11fdf7f2
TL
17742 string s0(failed_cmode);
17743
17744 if (!failed_compressors.empty()) {
17745 if (!s0.empty()) {
17746 s0 += ", ";
17747 }
17748 s0 += "unable to load:";
17749 bool first = true;
17750 for (auto& s : failed_compressors) {
17751 if (first) {
17752 first = false;
17753 } else {
17754 s0 += ", ";
17755 }
17756 s0 += s;
17757 }
17758 alerts.emplace(
17759 "BLUESTORE_NO_COMPRESSION",
17760 s0);
17761 }
17762}
17763
9f95a23c 17764void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
20effc67 17765 const PExtentVector& extents)
9f95a23c
TL
17766{
17767 alloc_stats_count++;
20effc67 17768 alloc_stats_fragments += extents.size();
9f95a23c 17769 alloc_stats_size += need;
20effc67
TL
17770
17771 for (auto& e : extents) {
17772 logger->hinc(l_bluestore_allocate_hist, e.length, need);
17773 }
9f95a23c
TL
17774}
17775
17776void BlueStore::_record_allocation_stats()
17777{
17778 // don't care about data consistency,
17779 // fields can be partially modified while making the tuple
17780 auto t0 = std::make_tuple(
17781 alloc_stats_count.exchange(0),
17782 alloc_stats_fragments.exchange(0),
17783 alloc_stats_size.exchange(0));
17784
17785 dout(0) << " allocation stats probe "
17786 << probe_count << ":"
17787 << " cnt: " << std::get<0>(t0)
17788 << " frags: " << std::get<1>(t0)
17789 << " size: " << std::get<2>(t0)
17790 << dendl;
17791
17792
17793 //
17794 // Keep the history for probes from the power-of-two sequence:
17795 // -1, -2, -4, -8, -16
17796 //
17797 size_t base = 1;
17798 for (auto& t : alloc_stats_history) {
17799 dout(0) << " probe -"
17800 << base + (probe_count % base) << ": "
17801 << std::get<0>(t)
17802 << ", " << std::get<1>(t)
17803 << ", " << std::get<2>(t)
17804 << dendl;
17805 base <<= 1;
17806 }
17807 dout(0) << "------------" << dendl;
17808
f67539c2 17809 ++ probe_count;
9f95a23c 17810
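// shift the history: slot i is refreshed only every 2^i probes, which
// produces the -1, -2, -4, -8, -16 cadence described above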
f67539c2
TL
17811 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
17812 if ((probe_count % (1 << i)) == 0) {
17813 alloc_stats_history[i] = alloc_stats_history[i - 1];
17814 }
9f95a23c
TL
17815 }
17816 alloc_stats_history[0].swap(t0);
17817}
17818
7c673cae 17819// ===========================================
11fdf7f2
TL
17820// BlueStoreRepairer
17821
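// Keep only the bloom-filter buckets that overlap the given extents (each
// bucket covers 'granularity' bytes); buckets with no recorded elements are
// dropped as well, so later lookups scan a reduced vector.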
17822size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
17823 const interval_set<uint64_t>& extents)
17824{
17825 ceph_assert(granularity); // initialized
17826 // can't be called a second time
17827 ceph_assert(!was_filtered_out);
17828 ceph_assert(collections_bfs.size() == objects_bfs.size());
17829
17830 uint64_t prev_pos = 0;
17831 uint64_t npos = collections_bfs.size();
17832
17833 bloom_vector collections_reduced;
17834 bloom_vector objects_reduced;
17835
17836 for (auto e : extents) {
17837 if (e.second == 0) {
17838 continue;
17839 }
17840 uint64_t pos = max(e.first / granularity, prev_pos);
17841 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
17842 while (pos != npos && pos < end_pos) {
17843 ceph_assert( collections_bfs[pos].element_count() ==
17844 objects_bfs[pos].element_count());
17845 if (collections_bfs[pos].element_count()) {
17846 collections_reduced.push_back(std::move(collections_bfs[pos]));
17847 objects_reduced.push_back(std::move(objects_bfs[pos]));
17848 }
17849 ++pos;
17850 }
17851 prev_pos = end_pos;
17852 }
17853 collections_reduced.swap(collections_bfs);
17854 objects_reduced.swap(objects_bfs);
17855 was_filtered_out = true;
17856 return collections_bfs.size();
17857}
17858
17859bool BlueStoreRepairer::remove_key(KeyValueDB *db,
17860 const string& prefix,
17861 const string& key)
17862{
b3b6e05e 17863 std::lock_guard l(lock);
11fdf7f2
TL
17864 if (!remove_key_txn) {
17865 remove_key_txn = db->get_transaction();
17866 }
17867 ++to_repair_cnt;
17868 remove_key_txn->rmkey(prefix, key);
17869
17870 return true;
17871}
17872
f67539c2 17873void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
9f95a23c 17874{
b3b6e05e
TL
17875 std::lock_guard l(lock); // possibly redundant
17876 ceph_assert(fix_per_pool_omap_txn == nullptr);
9f95a23c
TL
17877 fix_per_pool_omap_txn = db->get_transaction();
17878 ++to_repair_cnt;
17879 bufferlist bl;
f67539c2 17880 bl.append(stringify(val));
9f95a23c
TL
17881 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
17882}
17883
11fdf7f2 17884bool BlueStoreRepairer::fix_shared_blob(
20effc67 17885 KeyValueDB::Transaction txn,
11fdf7f2 17886 uint64_t sbid,
20effc67
TL
17887 bluestore_extent_ref_map_t* ref_map,
17888 size_t repaired)
11fdf7f2 17889{
11fdf7f2
TL
17890 string key;
17891 get_shared_blob_key(sbid, &key);
20effc67
TL
17892 if (ref_map) {
17893 bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
17894 bufferlist bl;
17895 encode(persistent, bl);
17896 txn->set(PREFIX_SHARED_BLOB, key, bl);
11fdf7f2
TL
17897 } else {
17898 txn->rmkey(PREFIX_SHARED_BLOB, key);
17899 }
20effc67 17900 to_repair_cnt += repaired;
11fdf7f2
TL
17901 return true;
17902}
17903
17904bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
17905 const string& key,
17906 const store_statfs_t& new_statfs)
17907{
b3b6e05e 17908 std::lock_guard l(lock);
11fdf7f2
TL
17909 if (!fix_statfs_txn) {
17910 fix_statfs_txn = db->get_transaction();
17911 }
17912 BlueStore::volatile_statfs vstatfs;
17913 vstatfs = new_statfs;
17914 bufferlist bl;
17915 vstatfs.encode(bl);
17916 ++to_repair_cnt;
17917 fix_statfs_txn->set(PREFIX_STAT, key, bl);
17918 return true;
17919}
17920
17921bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
17922 FreelistManager* fm,
17923 uint64_t offset, uint64_t len)
17924{
b3b6e05e 17925 std::lock_guard l(lock);
20effc67
TL
17926 ceph_assert(!fm->is_null_manager());
17927
11fdf7f2
TL
17928 if (!fix_fm_leaked_txn) {
17929 fix_fm_leaked_txn = db->get_transaction();
17930 }
17931 ++to_repair_cnt;
17932 fm->release(offset, len, fix_fm_leaked_txn);
17933 return true;
17934}
17935bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
17936 FreelistManager* fm,
17937 uint64_t offset, uint64_t len)
17938{
b3b6e05e 17939 std::lock_guard l(lock);
20effc67
TL
17940 ceph_assert(!fm->is_null_manager());
17941
11fdf7f2
TL
17942 if (!fix_fm_false_free_txn) {
17943 fix_fm_false_free_txn = db->get_transaction();
17944 }
17945 ++to_repair_cnt;
17946 fm->allocate(offset, len, fix_fm_false_free_txn);
17947 return true;
17948}
17949
b3b6e05e
TL
17950bool BlueStoreRepairer::fix_spanning_blobs(
17951 KeyValueDB* db,
17952 std::function<void(KeyValueDB::Transaction)> f)
adb31ebb 17953{
b3b6e05e 17954 std::lock_guard l(lock);
adb31ebb
TL
17955 if (!fix_onode_txn) {
17956 fix_onode_txn = db->get_transaction();
17957 }
b3b6e05e 17958 f(fix_onode_txn);
adb31ebb 17959 ++to_repair_cnt;
b3b6e05e 17960 return true;
adb31ebb
TL
17961}
17962
11fdf7f2
TL
17963bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
17964{
b3b6e05e 17965 // NB: not for use in multithreaded mode!
11fdf7f2
TL
17966 if (misreferenced_extents.size()) {
17967 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
17968 ceph_assert(n > 0);
17969 if (!fix_misreferences_txn) {
17970 fix_misreferences_txn = db->get_transaction();
17971 }
17972 return true;
17973 }
17974 return false;
17975}
17976
17977unsigned BlueStoreRepairer::apply(KeyValueDB* db)
17978{
b3b6e05e 17979 // NB: not for use in multithreaded mode!
9f95a23c 17980 if (fix_per_pool_omap_txn) {
20effc67
TL
17981 auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
17982 ceph_assert(ok);
9f95a23c
TL
17983 fix_per_pool_omap_txn = nullptr;
17984 }
11fdf7f2 17985 if (fix_fm_leaked_txn) {
20effc67
TL
17986 auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
17987 ceph_assert(ok);
11fdf7f2
TL
17988 fix_fm_leaked_txn = nullptr;
17989 }
17990 if (fix_fm_false_free_txn) {
20effc67
TL
17991 auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
17992 ceph_assert(ok);
11fdf7f2
TL
17993 fix_fm_false_free_txn = nullptr;
17994 }
17995 if (remove_key_txn) {
20effc67
TL
17996 auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
17997 ceph_assert(ok);
11fdf7f2
TL
17998 remove_key_txn = nullptr;
17999 }
18000 if (fix_misreferences_txn) {
20effc67
TL
18001 auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
18002 ceph_assert(ok);
11fdf7f2
TL
18003 fix_misreferences_txn = nullptr;
18004 }
adb31ebb 18005 if (fix_onode_txn) {
20effc67
TL
18006 auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
18007 ceph_assert(ok);
adb31ebb
TL
18008 fix_onode_txn = nullptr;
18009 }
11fdf7f2 18010 if (fix_shared_blob_txn) {
20effc67
TL
18011 auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
18012 ceph_assert(ok);
11fdf7f2
TL
18013 fix_shared_blob_txn = nullptr;
18014 }
11fdf7f2 18015 if (fix_statfs_txn) {
20effc67
TL
18016 auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
18017 ceph_assert(ok);
11fdf7f2
TL
18018 fix_statfs_txn = nullptr;
18019 }
522d829b
TL
18020 if (need_compact) {
18021 db->compact();
18022 need_compact = false;
18023 }
11fdf7f2
TL
18024 unsigned repaired = to_repair_cnt;
18025 to_repair_cnt = 0;
18026 return repaired;
18027}
18028
18029// =======================================================
9f95a23c
TL
18030// RocksDBBlueFSVolumeSelector
18031
18032uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
18033 ceph_assert(h != nullptr);
18034 uint64_t hint = reinterpret_cast<uint64_t>(h);
18035 uint8_t res;
18036 switch (hint) {
18037 case LEVEL_SLOW:
18038 res = BlueFS::BDEV_SLOW;
18039 if (db_avail4slow > 0) {
18040 // considering statically available db space vs.
18041 // - observed maximums on DB dev for DB/WAL/UNSORTED data
18042 // - observed maximum spillovers
18043 uint64_t max_db_use = 0; // max db usage we potentially observed
f6b5b4d7 18044 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
9f95a23c
TL
18045 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
18046 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
18047 // this could go to db hence using it in the estimation
18048 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
18049
18050 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
18051 uint64_t avail = min(
18052 db_avail4slow,
18053 max_db_use < db_total ? db_total - max_db_use : 0);
18054
18055 // considering current DB dev usage for SLOW data
18056 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
18057 res = BlueFS::BDEV_DB;
18058 }
18059 }
18060 break;
f6b5b4d7 18061 case LEVEL_LOG:
9f95a23c
TL
18062 case LEVEL_WAL:
18063 res = BlueFS::BDEV_WAL;
18064 break;
18065 case LEVEL_DB:
18066 default:
18067 res = BlueFS::BDEV_DB;
18068 break;
18069 }
18070 return res;
18071}
18072
18073void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
18074{
a4b75251
TL
18075 auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
18076 res.emplace_back(base, db_size);
18077 auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
18078 if (slow_size == 0) {
18079 slow_size = db_size;
18080 }
18081 res.emplace_back(base + ".slow", slow_size);
9f95a23c
TL
18082}
18083
b3b6e05e 18084void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
18085 uint8_t res = LEVEL_DB;
18086 if (dirname.length() > 5) {
18087 // the "db.slow" and "db.wal" directory names are hard-coded to
18088 // match up with bluestore. the slow device is always the second
18089 // one (when a dedicated block.db device is present and used at
18090 // bdev 0). the wal device is always last.
18091 if (boost::algorithm::ends_with(dirname, ".slow")) {
18092 res = LEVEL_SLOW;
18093 }
18094 else if (boost::algorithm::ends_with(dirname, ".wal")) {
18095 res = LEVEL_WAL;
18096 }
18097 }
18098 return reinterpret_cast<void*>(res);
18099}
18100
18101void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
18102 auto max_x = per_level_per_dev_usage.get_max_x();
18103 auto max_y = per_level_per_dev_usage.get_max_y();
1e59de90
TL
18104
18105 sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
f6b5b4d7 18106 constexpr std::array<const char*, 8> names{ {
9f95a23c
TL
18107 "DEV/LEV",
18108 "WAL",
18109 "DB",
18110 "SLOW",
18111 "*",
18112 "*",
f6b5b4d7
TL
18113 "REAL",
18114 "FILES",
9f95a23c
TL
18115 } };
18116 const size_t width = 12;
18117 for (size_t i = 0; i < names.size(); ++i) {
18118 sout.setf(std::ios::left, std::ios::adjustfield);
18119 sout.width(width);
18120 sout << names[i];
18121 }
18122 sout << std::endl;
18123 for (size_t l = 0; l < max_y; l++) {
18124 sout.setf(std::ios::left, std::ios::adjustfield);
18125 sout.width(width);
18126 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
18127 case LEVEL_LOG:
18128 sout << "LOG"; break;
9f95a23c
TL
18129 case LEVEL_WAL:
18130 sout << "WAL"; break;
18131 case LEVEL_DB:
18132 sout << "DB"; break;
18133 case LEVEL_SLOW:
18134 sout << "SLOW"; break;
18135 case LEVEL_MAX:
1e59de90 18136 sout << "TOTAL"; break;
9f95a23c 18137 }
f6b5b4d7 18138 for (size_t d = 0; d < max_x; d++) {
9f95a23c
TL
18139 sout.setf(std::ios::left, std::ios::adjustfield);
18140 sout.width(width);
18141 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
18142 }
18143 sout.setf(std::ios::left, std::ios::adjustfield);
18144 sout.width(width);
f6b5b4d7 18145 sout << stringify(per_level_files[l]) << std::endl;
9f95a23c
TL
18146 }
18147 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
18148 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
18149 sout << "MAXIMUMS:" << std::endl;
18150 for (size_t l = 0; l < max_y; l++) {
18151 sout.setf(std::ios::left, std::ios::adjustfield);
18152 sout.width(width);
18153 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
18154 case LEVEL_LOG:
18155 sout << "LOG"; break;
9f95a23c
TL
18156 case LEVEL_WAL:
18157 sout << "WAL"; break;
18158 case LEVEL_DB:
18159 sout << "DB"; break;
18160 case LEVEL_SLOW:
18161 sout << "SLOW"; break;
18162 case LEVEL_MAX:
1e59de90 18163 sout << "TOTAL"; break;
9f95a23c
TL
18164 }
18165 for (size_t d = 0; d < max_x - 1; d++) {
18166 sout.setf(std::ios::left, std::ios::adjustfield);
18167 sout.width(width);
18168 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
18169 }
18170 sout.setf(std::ios::left, std::ios::adjustfield);
18171 sout.width(width);
18172 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
1e59de90 18173 sout << std::endl;
9f95a23c 18174 }
1e59de90
TL
18175 string sizes[] = {
18176 ">> SIZE <<",
18177 stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
18178 stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
18179 stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
18180 };
18181 for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) {
18182 sout.setf(std::ios::left, std::ios::adjustfield);
18183 sout.width(width);
18184 sout << sizes[i];
18185 }
18186 sout << std::endl;
9f95a23c 18187}
11fdf7f2 18188
20effc67
TL
18189BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
18190 RocksDBBlueFSVolumeSelector* ns =
18191 new RocksDBBlueFSVolumeSelector(0, 0, 0,
18192 0, 0, 0,
18193 0, 0, false);
18194 return ns;
18195}
18196
18197bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
18198 RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
18199 ceph_assert(o);
18200 bool equal = true;
18201 for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
18202 for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {
18203 equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
18204 }
18205 }
18206 for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
18207 equal &= (per_level_files[t] == o->per_level_files[t]);
18208 }
18209 return equal;
18210}
18211
9f95a23c 18212// =======================================================
20effc67
TL
18213
18214//================================================================================================================
18215// BlueStore commits all allocation information (alloc/release) into RocksDB before the client write is acknowledged.
18216// This causes a delay in the write path and adds significant load to the CPU/memory/disk.
18217// The reason for the RocksDB updates is that they allow Ceph to survive any failure without losing the allocation state.
18218//
18219// We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the allocator object
18220// with all the OSD allocation state in a single step during umount().
18221// This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
18222// to losing allocation info in failure cases where we don't call umount.
18223// We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, used in those failure cases.
18224// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from the flat file
18225// where we stored it during umount().
18226//================================================================================================================
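//
// On-disk image layout, as implemented by the structs and helpers below:
// a 48-byte allocator_image_header, the free-extent list written in chunks
// of up to 4K extents with a running CRC32C appended after each chunk, and
// a 56-byte allocator_image_trailer repeating the header fields plus the
// entry count and total allocation size.
//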
18227
18228#undef dout_prefix
18229#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
18230
18231static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
18232static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
18233static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
18234static uint32_t s_serial = 0x01;
18235
18236#if 1
18237#define CEPHTOH_32 le32toh
18238#define CEPHTOH_64 le64toh
18239#define HTOCEPH_32 htole32
18240#define HTOCEPH_64 htole64
18241#else
18242// help debug the encode/decode by forcing alien format
18243#define CEPHTOH_32 be32toh
18244#define CEPHTOH_64 be64toh
18245#define HTOCEPH_32 htobe32
18246#define HTOCEPH_64 htobe64
18247#endif
18248
18249// 48-byte header for the on-disk allocator image
18250const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
18251struct allocator_image_header {
18252 uint32_t format_version; // 0x00
18253 uint32_t valid_signature; // 0x04
18254 utime_t timestamp; // 0x08
18255 uint32_t serial; // 0x10
18256 uint32_t pad[0x7]; // 0x14
18257
18258 allocator_image_header() {
18259 memset((char*)this, 0, sizeof(allocator_image_header));
18260 }
18261
18262 // create header in CEPH format
18263 allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
18264 this->format_version = format_version;
18265 this->timestamp = timestamp;
18266 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18267 this->serial = serial;
18268 memset(this->pad, 0, sizeof(this->pad));
18269 }
18270
18271 friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
18272 out << "format_version = " << header.format_version << std::endl;
18273 out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18274 out << "timestamp = " << header.timestamp << std::endl;
18275 out << "serial = " << header.serial << std::endl;
18276 for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
18277 if (header.pad[i]) {
18278 out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
18279 }
18280 }
18281 return out;
18282 }
18283
18284 DENC(allocator_image_header, v, p) {
18285 denc(v.format_version, p);
18286 denc(v.valid_signature, p);
18287 denc(v.timestamp.tv.tv_sec, p);
18288 denc(v.timestamp.tv.tv_nsec, p);
18289 denc(v.serial, p);
18290 for (auto& pad: v.pad) {
18291 denc(pad, p);
18292 }
18293 }
18294
18295
18296 int verify(CephContext* cct, const std::string &path) {
18297 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18298 for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
18299 if (this->pad[i]) {
18300 derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
18301 return -1;
18302 }
18303 }
18304 return 0;
18305 }
18306 else {
18307 derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18308 return -1;
18309 }
18310 }
18311};
18312WRITE_CLASS_DENC(allocator_image_header)
18313
18314// 56-byte trailer for the on-disk allocator image
18315struct allocator_image_trailer {
18316 extent_t null_extent; // 0x00
18317
18318 uint32_t format_version; // 0x10
18319 uint32_t valid_signature; // 0x14
18320
18321 utime_t timestamp; // 0x18
18322
18323 uint32_t serial; // 0x20
18324 uint32_t pad; // 0x24
18325 uint64_t entries_count; // 0x28
18326 uint64_t allocation_size; // 0x30
18327
18328 // trailer is created in CEPH format
18329 allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
18330 memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
18331 this->format_version = format_version;
18332 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18333 this->timestamp = timestamp;
18334 this->serial = serial;
18335 this->pad = 0;
18336 this->entries_count = entries_count;
18337 this->allocation_size = allocation_size;
18338 }
18339
18340 allocator_image_trailer() {
18341 memset((char*)this, 0, sizeof(allocator_image_trailer));
18342 }
18343
18344 friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
18345 if (trailer.null_extent.offset || trailer.null_extent.length) {
18346 out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
18347 out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
18348 }
18349 out << "format_version = " << trailer.format_version << std::endl;
18350 out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18351 out << "timestamp = " << trailer.timestamp << std::endl;
18352 out << "serial = " << trailer.serial << std::endl;
18353 if (trailer.pad) {
18354 out << "trailer.pad= " << trailer.pad << std::endl;
18355 }
18356 out << "entries_count = " << trailer.entries_count << std::endl;
18357 out << "allocation_size = " << trailer.allocation_size << std::endl;
18358 return out;
18359 }
18360
18361 int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
18362 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18363
18364 // trailer must start with a null extent (both fields set to zero; no need to convert formats for zero)
18365 if (null_extent.offset || null_extent.length) {
18366 derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
18367 return -1;
18368 }
18369
18370 if (serial != p_header->serial) {
18371 derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
18372 return -1;
18373 }
18374
18375 if (format_version != p_header->format_version) {
18376 derr << "Illegal trailer: header->format_version(" << p_header->format_version
18377 << ") != trailer->format_version(" << format_version << ")" << dendl;
18378 return -1;
18379 }
18380
18381 if (timestamp != p_header->timestamp) {
18382 derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
18383 << ") != trailer->timestamp(" << timestamp << ")" << dendl;
18384 return -1;
18385 }
18386
18387 if (this->entries_count != entries_count) {
18388 derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
18389 << this->entries_count << ")" << dendl;
18390 return -1;
18391 }
18392
18393 if (this->allocation_size != allocation_size) {
18394 derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
18395 << this->allocation_size << ")" << dendl;
18396 return -1;
18397 }
18398
18399 if (pad) {
18400 derr << "Illegal Trailer - pad="<< pad << dendl;
18401 return -1;
18402 }
18403
18404 // if arrived here -> trailer is valid !!
18405 return 0;
18406 } else {
18407 derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18408 return -1;
18409 }
18410 }
18411
18412 DENC(allocator_image_trailer, v, p) {
18413 denc(v.null_extent.offset, p);
18414 denc(v.null_extent.length, p);
18415 denc(v.format_version, p);
18416 denc(v.valid_signature, p);
18417 denc(v.timestamp.tv.tv_sec, p);
18418 denc(v.timestamp.tv.tv_nsec, p);
18419 denc(v.serial, p);
18420 denc(v.pad, p);
18421 denc(v.entries_count, p);
18422 denc(v.allocation_size, p);
18423 }
18424};
18425WRITE_CLASS_DENC(allocator_image_trailer)
18426
18427
18428//-------------------------------------------------------------------------------------
18429// invalidate the old allocation file if it exists so we will go directly to recovery after a failure
18430// we can safely ignore a non-existent file
18431int BlueStore::invalidate_allocation_file_on_bluefs()
18432{
18433 // mark that the allocation-file was invalidated and we should destage a new copy when closing db
18434 need_to_destage_allocation_file = true;
39ae355f 18435 dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl;
20effc67
TL
18436
18437 BlueFS::FileWriter *p_handle = nullptr;
18438 if (!bluefs->dir_exists(allocator_dir)) {
18439 dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
18440 // nothing to do -> return
18441 return 0;
18442 }
18443
18444 int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18445 if (ret != 0) {
39ae355f 18446 dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl;
20effc67
TL
18447 // nothing to do -> return
18448 return 0;
18449 }
18450
18451
18452 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
18453 if (ret != 0) {
39ae355f
TL
18454 derr << __func__ << "::NCB:: Failed open_for_write with error-code "
18455 << ret << dendl;
20effc67
TL
18456 return -1;
18457 }
18458
18459 dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18460 ret = bluefs->truncate(p_handle, 0);
18461 if (ret != 0) {
39ae355f
TL
18462 derr << __func__ << "::NCB:: Failed truncaste with error-code "
18463 << ret << dendl;
20effc67
TL
18464 bluefs->close_writer(p_handle);
18465 return -1;
18466 }
18467
18468 bluefs->fsync(p_handle);
18469 bluefs->close_writer(p_handle);
18470
18471 return 0;
18472}
18473
20effc67
TL
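// Note: a truncated (zero-length) allocation file is exactly what
// __restore_allocator() below rejects with "No valid allocation info on disk",
// which forces a full allocation recovery from the onodes on the next startup.
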
//-----------------------------------------------------------------------------------
int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
{
  *p_num_entries = 0;
  auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
    (*p_num_entries)++;
  };
  src_alloc->foreach(count_entries);

  dout(5) << "count num_entries=" << *p_num_entries << dendl;

  // add 16K extra entries in case new allocations happened after the count
  (*p_num_entries) += 16 * 1024;
  unique_ptr<extent_t[]> arr;
  try {
    arr = make_unique<extent_t[]>(*p_num_entries);
  } catch (std::bad_alloc&) {
    derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
    return -1;
  }

  uint64_t idx = 0;
  auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
    if (extent_length > 0) {
      if (idx < *p_num_entries) {
	arr[idx] = {extent_offset, extent_length};
      }
      idx++;
    } else {
      derr << "zero-length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
    }
  };
  src_alloc->foreach(copy_entries);

  dout(5) << "copy num_entries=" << idx << dendl;
  if (idx > *p_num_entries) {
    derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
    ceph_assert(idx <= *p_num_entries);
  }

  *p_num_entries = idx;

  for (idx = 0; idx < *p_num_entries; idx++) {
    const extent_t *p_extent = &arr[idx];
    dest_alloc->init_add_free(p_extent->offset, p_extent->length);
  }

  return 0;
}


//-----------------------------------------------------------------------------------
static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
{
  std::ptrdiff_t length = p_curr - buffer;
  p_handle->append(buffer, length);

  crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
  uint32_t encoded_crc = HTOCEPH_32(crc);
  p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));

  return crc;
}

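// On-disk layout of the allocator image file, as implied by store_allocator()
// below and __restore_allocator() further down (a reader-oriented sketch, not
// normative documentation):
//
//   allocator_image_header  + crc32c     (DENC-encoded, with its own crc)
//   <= 4K extent_t records               (offset/length in HTOCEPH_64 order)
//   crc32c                               (running crc, seeded with -1,
//                                         appended after each full buffer)
//   ... extent buffers + crc repeated until all extents are written ...
//   allocator_image_trailer + crc32c
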
const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
// write the allocator to a flat bluefs file - 4K extents at a time
//-----------------------------------------------------------------------------------
int BlueStore::store_allocator(Allocator* src_allocator)
{
  // when storing allocations to file we must be sure there are no background compactions
  // the easiest way to achieve that is to make sure the db is closed
  ceph_assert(db == nullptr);
  utime_t start_time = ceph_clock_now();
  int ret = 0;

  // create the dir if it doesn't exist already
  if (!bluefs->dir_exists(allocator_dir)) {
    ret = bluefs->mkdir(allocator_dir);
    if (ret != 0) {
      derr << "Failed mkdir with error-code " << ret << dendl;
      return -1;
    }
  }
  bluefs->compact_log();
  // reuse the previous file allocation if it exists
  ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
  bool overwrite_file = (ret == 0);
  BlueFS::FileWriter *p_handle = nullptr;
  ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
  if (ret != 0) {
    derr << __func__ << " Failed open_for_write with error-code " << ret << dendl;
    return -1;
  }

  uint64_t file_size = p_handle->file->fnode.size;
  uint64_t allocated = p_handle->file->fnode.get_allocated();
  dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;

  bluefs->sync_metadata(false);
  unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
  if (!allocator) {
    bluefs->close_writer(p_handle);
    return -1;
  }

  // store all extents (except for the bluefs extents we removed) in a single flat file
  utime_t timestamp = ceph_clock_now();
  uint32_t crc = -1;
  {
    allocator_image_header header(timestamp, s_format_version, s_serial);
    bufferlist header_bl;
    encode(header, header_bl);
    crc = header_bl.crc32c(crc);
    encode(crc, header_bl);
    p_handle->append(header_bl);
  }

  crc = -1; // reset crc
  extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
  extent_t *p_curr = buffer;
  const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
  uint64_t extent_count = 0;
  uint64_t allocation_size = 0;
  auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
    if (extent_length == 0) {
      derr << __func__ << " zero-length extent at idx=" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
      ret = -1;
      return;
    }
    p_curr->offset = HTOCEPH_64(extent_offset);
    p_curr->length = HTOCEPH_64(extent_length);
    extent_count++;
    allocation_size += extent_length;
    p_curr++;

    if (p_curr == p_end) {
      crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
      p_curr = buffer; // recycle the buffer
    }
  };
  allocator->foreach(iterated_allocation);
  // if we got a null extent -> fail the operation
  if (ret != 0) {
    derr << "Illegal extent, failing the store operation" << dendl;
    derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
    bluefs->truncate(p_handle, 0);
    bluefs->close_writer(p_handle);
    return -1;
  }

  // if we got any leftovers -> add crc and append to file
  if (p_curr > buffer) {
    crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
  }

  {
    allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
    bufferlist trailer_bl;
    encode(trailer, trailer_bl);
    uint32_t crc = -1;
    crc = trailer_bl.crc32c(crc);
    encode(crc, trailer_bl);
    p_handle->append(trailer_bl);
  }

  bluefs->fsync(p_handle);
  bluefs->truncate(p_handle, p_handle->pos);
  bluefs->fsync(p_handle);

  utime_t duration = ceph_clock_now() - start_time;
  dout(5) << "WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
  dout(5) << "p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;

  bluefs->close_writer(p_handle);
  need_to_destage_allocation_file = false;
  return 0;
}

//-----------------------------------------------------------------------------------
Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
  // create allocator
  uint64_t alloc_size = min_alloc_size;
  Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
				       zone_size, first_sequential_zone,
				       "recovery");
  if (alloc) {
    return alloc;
  } else {
    derr << "Failed Allocator Creation" << dendl;
    return nullptr;
  }
}

//-----------------------------------------------------------------------------------
size_t calc_allocator_image_header_size()
{
  utime_t timestamp = ceph_clock_now();
  allocator_image_header header(timestamp, s_format_version, s_serial);
  bufferlist header_bl;
  encode(header, header_bl);
  uint32_t crc = -1;
  crc = header_bl.crc32c(crc);
  encode(crc, header_bl);

  return header_bl.length();
}

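// Note: the two calc functions above/below derive the on-disk header/trailer
// sizes by encoding throwaway objects, so the computed sizes automatically
// stay in sync with the DENC-encoded formats (including the appended crc32c).
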
//-----------------------------------------------------------------------------------
int calc_allocator_image_trailer_size()
{
  utime_t timestamp = ceph_clock_now();
  uint64_t extent_count = -1;
  uint64_t allocation_size = -1;
  uint32_t crc = -1;
  bufferlist trailer_bl;
  allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);

  encode(trailer, trailer_bl);
  crc = trailer_bl.crc32c(crc);
  encode(crc, trailer_bl);
  return trailer_bl.length();
}

//-----------------------------------------------------------------------------------
int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
{
  if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) {
    boost::mt11213b rng(time(NULL));
    boost::uniform_real<> ur(0, 1);
    if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) {
      derr << __func__ << " failure injected." << dendl;
      return -1;
    }
  }
  utime_t start_time = ceph_clock_now();
  BlueFS::FileReader *p_temp_handle = nullptr;
  int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
  if (ret != 0) {
    dout(1) << "Failed open_for_read with error-code " << ret << dendl;
    return -1;
  }
  unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
  uint64_t read_alloc_size = 0;
  uint64_t file_size = p_handle->file->fnode.size;
  dout(5) << "file_size=" << file_size << ", sizeof(extent_t)=" << sizeof(extent_t) << dendl;

  // make sure we were able to store a valid copy
  if (file_size == 0) {
    dout(1) << "No valid allocation info on disk (empty file)" << dendl;
    return -1;
  }

  // first read the header
  size_t offset = 0;
  allocator_image_header header;
  int header_size = calc_allocator_image_header_size();
  {
    bufferlist header_bl, temp_bl;
    int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
    if (read_bytes != header_size) {
      derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
      return -1;
    }

    offset += read_bytes;

    header_bl.claim_append(temp_bl);
    auto p = header_bl.cbegin();
    decode(header, p);
    if (header.verify(cct, path) != 0) {
      derr << "header = \n" << header << dendl;
      return -1;
    }

    uint32_t crc_calc = -1, crc;
    crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); // crc from the beginning to the current pos
    decode(crc, p);
    if (crc != crc_calc) {
      derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
      derr << "header = \n" << header << dendl;
      return -1;
    }

    // increment version for the next store
    s_serial = header.serial + 1;
  }

  // then read the payload (extents list) using a recycled buffer
  extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
  uint32_t crc = -1;
  int trailer_size = calc_allocator_image_trailer_size();
  uint64_t extent_count = 0;
  uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
  while (extents_bytes_left) {
    int req_bytes = std::min(extents_bytes_left, static_cast<uint64_t>(sizeof(buffer)));
    int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
    if (read_bytes != req_bytes) {
      derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
      return -1;
    }

    offset += read_bytes;
    extents_bytes_left -= read_bytes;

    const unsigned num_extent_in_buffer = read_bytes / sizeof(extent_t);
    const extent_t *p_end = buffer + num_extent_in_buffer;
    for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
      uint64_t offset = CEPHTOH_64(p_ext->offset);
      uint64_t length = CEPHTOH_64(p_ext->length);
      read_alloc_size += length;

      if (length > 0) {
	allocator->init_add_free(offset, length);
	extent_count++;
      } else {
	derr << "extent with zero length at idx=" << extent_count << dendl;
	return -1;
      }
    }

    uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
    read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
    if (read_bytes == sizeof(crc)) {
      crc = CEPHTOH_32(crc);
      if (crc != calc_crc) {
	derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
	derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
	return -1;
      }

      offset += read_bytes;
      if (extents_bytes_left) {
	extents_bytes_left -= read_bytes;
      }
    } else {
      derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
      return -1;
    }
  }

  // finally, read the trailer and verify it is in good shape and that we got all the extents
  {
    bufferlist trailer_bl, temp_bl;
    int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
    if (read_bytes != trailer_size) {
      derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
      return -1;
    }
    offset += read_bytes;

    trailer_bl.claim_append(temp_bl);
    uint32_t crc_calc = -1;
    uint32_t crc;
    allocator_image_trailer trailer;
    auto p = trailer_bl.cbegin();
    decode(trailer, p);
    if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0) {
      derr << "trailer=\n" << trailer << dendl;
      return -1;
    }

    crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); // crc from the beginning to the current pos
    decode(crc, p);
    if (crc != crc_calc) {
      derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
      derr << "trailer=\n" << trailer << dendl;
      return -1;
    }
  }

  utime_t duration = ceph_clock_now() - start_time;
  dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
	  << read_alloc_size << ", file_size=" << file_size << dendl;
  dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
  *num = extent_count;
  *bytes = read_alloc_size;
  return 0;
}

//-----------------------------------------------------------------------------------
int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
{
  utime_t start = ceph_clock_now();
  auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
  int ret = __restore_allocator(temp_allocator.get(), num, bytes);
  if (ret != 0) {
    return ret;
  }

  uint64_t num_entries = 0;
  dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
  copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
  utime_t duration = ceph_clock_now() - start;
  dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
  return ret;
}

//-----------------------------------------------------------------------------------
void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
{
  dout(30) << __func__ << " 0x" << std::hex
	   << offset << "~" << length
	   << " " << min_alloc_size_mask
	   << dendl;
  ceph_assert((offset & min_alloc_size_mask) == 0);
  ceph_assert((length & min_alloc_size_mask) == 0);
  sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
}
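
// Illustrative example (hypothetical values): with min_alloc_size = 4096 the
// shift order is 12, so a call with offset=0x100000 and length=0x2000 marks
// two bits starting at bit 256 in the simple bitmap:
//   sbmap->set(0x100000 >> 12 /* 256 */, 0x2000 >> 12 /* 2 */);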

void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
                                                        uint64_t extent_no,
                                                        uint64_t sbid,
                                                        BlobRef b)
{
  [[maybe_unused]] auto cct = store.cct;
  ceph_assert(per_pool_statfs);
  ceph_assert(oid != ghobject_t());

  auto &blob = b->get_blob();
  if (spanning) {
    dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
    ceph_assert(b->id >= 0);
    spanning_blobs[b->id] = b;
    ++stats.spanning_blob_count;
  } else {
    dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
    blobs[extent_no] = b;
  }
  bool compressed = blob.is_compressed();
  if (!blob.is_shared()) {
    for (auto& pe : blob.get_extents()) {
      if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
        ++stats.skipped_illegal_extent;
        continue;
      }
      store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);

      per_pool_statfs->allocated() += pe.length;
      if (compressed) {
        per_pool_statfs->compressed_allocated() += pe.length;
      }
    }
    if (compressed) {
      per_pool_statfs->compressed() +=
        blob.get_compressed_payload_length();
      ++stats.compressed_blob_count;
    }
  } else {
    auto it = sb_info.find(sbid);
    if (it == sb_info.end()) {
      derr << __func__ << " shared blob not found:" << sbid
           << dendl;
    }
    // dereferencing an end() iterator below would be undefined behavior
    ceph_assert(it != sb_info.end());
    auto &sbi = *it;
    auto pool_id = oid.hobj.get_logical_pool();
    if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
      sbi.pool_id = pool_id;
      size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
      per_pool_statfs->allocated() += alloc_delta;
      if (compressed) {
        per_pool_statfs->compressed_allocated() += alloc_delta;
        ++stats.compressed_blob_count;
      }
    }
    if (compressed) {
      per_pool_statfs->compressed() +=
        blob.get_compressed_payload_length();
    }
  }
}

void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
                                                     bool spanning,
                                                     uint64_t blobid)
{
  [[maybe_unused]] auto cct = store.cct;
  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
  auto &map = spanning ? spanning_blobs : blobs;
  auto it = map.find(blobid);
  ceph_assert(it != map.end());
  per_pool_statfs->stored() += le->length;
  if (it->second->get_blob().is_compressed()) {
    per_pool_statfs->compressed_original() += le->length;
  }
}

void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
                                                   uint64_t extent_no,
                                                   uint64_t sbid,
                                                   BlobRef b)
{
  _consume_new_blob(false, extent_no, sbid, b);
  per_pool_statfs->stored() += le->length;
  if (b->get_blob().is_compressed()) {
    per_pool_statfs->compressed_original() += le->length;
  }
}

void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
                                                            BlobRef b)
{
  _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
}

void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
                                            volatile_statfs* _per_pool_statfs)
{
  oid = _oid;
  per_pool_statfs = _per_pool_statfs;
  blob_map_t empty;
  blob_map_t empty2;
  std::swap(blobs, empty);
  std::swap(spanning_blobs, empty2);
}

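// Note: swapping with freshly constructed maps, instead of calling clear(),
// guarantees the old contents are destroyed and, depending on blob_map_t's
// implementation (e.g., hash-table buckets), may also release capacity that
// clear() would retain between objects during the full-store scan.
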
int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
{
  sb_info_space_efficient_map_t sb_info;
  // iterate over all shared blobs
  auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
  if (!it) {
    derr << "failed getting shared blob's iterator" << dendl;
    return -ENOENT;
  }
  for (it->lower_bound(string()); it->valid(); it->next()) {
    const auto& key = it->key();
    dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
    uint64_t sbid = 0;
    if (get_key_shared_blob(key, &sbid) != 0) {
      derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
	   << "'" << dendl;
    }
    bluestore_shared_blob_t shared_blob(sbid);
    bufferlist bl = it->value();
    auto blp = bl.cbegin();
    try {
      decode(shared_blob, blp);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode Shared Blob "
	   << pretty_binary_string(key) << dendl;
      continue;
    }
    dout(20) << __func__ << " " << shared_blob << dendl;
    uint64_t allocated = 0;
    for (auto& r : shared_blob.ref_map.ref_map) {
      ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
      set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
      allocated += r.second.length;
    }
    auto &sbi = sb_info.add_or_adopt(sbid);
    ceph_assert(p2phase(allocated, min_alloc_size) == 0);
    sbi.allocated_chunks += (allocated >> min_alloc_size_order);
    ++stats.shared_blob_count;
  }

  it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
  if (!it) {
    derr << "failed getting onode's iterator" << dendl;
    return -ENOENT;
  }

  uint64_t kv_count = 0;
  uint64_t count_interval = 1'000'000;
  ExtentDecoderPartial edecoder(*this,
				stats,
				*sbmap,
				sb_info,
				min_alloc_size_order);

  // iterate over all ONodes stored in RocksDB
  for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
    // trace an event after every million processed objects (typically every 5-10 seconds)
    if (kv_count && (kv_count % count_interval == 0)) {
      dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
    }

    auto key = it->key();
    auto okey = key;
    dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
    ghobject_t oid;
    if (!is_extent_shard_key(it->key())) {
      int r = get_key_object(okey, &oid);
      if (r != 0) {
	derr << __func__ << " failed to decode onode key = "
	     << pretty_binary_string(okey) << dendl;
	return -EIO;
      }
      edecoder.reset(oid,
		     &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
      Onode dummy_on(cct);
      Onode::decode_raw(&dummy_on,
			it->value(),
			edecoder);
      ++stats.onode_count;
    } else {
      uint32_t offset;
      int r = get_key_extent_shard(key, &okey, &offset);
      if (r != 0) {
	derr << __func__ << " failed to decode onode extent key = "
	     << pretty_binary_string(key) << dendl;
	return -EIO;
      }
      r = get_key_object(okey, &oid);
      if (r != 0) {
	derr << __func__
	     << " failed to decode onode key= " << pretty_binary_string(okey)
	     << " from extent key= " << pretty_binary_string(key)
	     << dendl;
	return -EIO;
      }
      ceph_assert(oid == edecoder.get_oid());
      edecoder.decode_some(it->value(), nullptr);
      ++stats.shard_count;
    }
  }

  std::lock_guard l(vstatfs_lock);
  store_statfs_t s;
  osd_pools.clear();
  for (auto& p : stats.actual_pool_vstatfs) {
    if (per_pool_stat_collection) {
      osd_pools[p.first] = p.second;
    }
    stats.actual_store_vstatfs += p.second;
    p.second.publish(&s);
    dout(5) << __func__ << " recovered pool "
	    << std::hex
	    << p.first << "->" << s
	    << std::dec
	    << " per-pool:" << per_pool_stat_collection
	    << dendl;
  }
  vstatfs = stats.actual_store_vstatfs;
  vstatfs.publish(&s);
  dout(5) << __func__ << " recovered " << s
	  << dendl;
  return 0;
}

//---------------------------------------------------------
int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
{
  // first set space used by superblock
  auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
  set_allocation_in_simple_bmap(sbmap, 0, super_length);
  stats.extent_count++;

  // then set all space taken by Objects
  int ret = read_allocation_from_onodes(sbmap, stats);
  if (ret < 0) {
    derr << "failed read_allocation_from_onodes()" << dendl;
    return ret;
  }

  return 0;
}

//-----------------------------------------------------------------------------------
static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
{
  int alloc_size_shift = std::countr_zero(alloc_size);
  uint64_t offset = 0;
  extent_t ext = sbmap->get_next_clr_extent(offset);
  while (ext.length != 0) {
    dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
    offset = ext.offset + ext.length;
    ext = sbmap->get_next_clr_extent(offset);
  }
}

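// Illustrative example (hypothetical values): with alloc_size = 4096,
// std::countr_zero(4096) == 12, so a clear (i.e. free) bitmap extent
// {offset: 16, length: 4} is handed to the allocator in byte units as
//   dest_alloc->init_add_free(16 << 12 /* 64 KiB */, 4 << 12 /* 16 KiB */);
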
//---------------------------------------------------------
int BlueStore::read_allocation_from_drive_on_startup()
{
  int ret = 0;

  ret = _open_collections();
  if (ret < 0) {
    return ret;
  }
  auto shutdown_cache = make_scope_guard([&] {
    _shutdown_cache();
  });

  utime_t start = ceph_clock_now();
  read_alloc_stats_t stats = {};
  SimpleBitmap sbmap(cct, (bdev->get_size() / min_alloc_size));
  ret = reconstruct_allocations(&sbmap, stats);
  if (ret != 0) {
    return ret;
  }

  copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);

  utime_t duration = ceph_clock_now() - start;
  dout(1) << "Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
  return ret;
}


// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
// Not meant to be run by customers
#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION

#include <stdlib.h>
#include <algorithm>
//---------------------------------------------------------
int cmpfunc(const void *a, const void *b)
{
  if (((extent_t*)a)->offset > ((extent_t*)b)->offset) {
    return 1;
  } else if (((extent_t*)a)->offset < ((extent_t*)b)->offset) {
    return -1;
  } else {
    return 0;
  }
}
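
// A more idiomatic C++ equivalent of the qsort()/cmpfunc pairing used below
// would be std::sort with a lambda (shown for illustration only; the existing
// qsort() calls are kept as-is):
//   std::sort(arr1.get(), arr1.get() + n,
//             [](const extent_t& a, const extent_t& b) {
//               return a.offset < b.offset;
//             });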

// compare the allocator built from Onodes with the system allocator (CF-B)
//---------------------------------------------------------
int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
{
  uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
  uint64_t extent_count = allocation_size / sizeof(extent_t);
  dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count=" << extent_count << dendl;

  unique_ptr<extent_t[]> arr1;
  unique_ptr<extent_t[]> arr2;
  try {
    arr1 = make_unique<extent_t[]>(extent_count);
    arr2 = make_unique<extent_t[]>(extent_count);
  } catch (std::bad_alloc&) {
    derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
    return -1;
  }

  // copy the extents from the allocators into simple arrays and then compare them
  uint64_t size1 = 0, size2 = 0;
  uint64_t idx1 = 0, idx2 = 0;
  auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
    size1 += length;
    if (idx1 < extent_count) {
      arr1[idx1++] = {offset, length};
    } else if (idx1 == extent_count) {
      derr << "(2)compare_allocators:: spillover" << dendl;
      idx1++;
    }
  };

  auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
    size2 += length;
    if (idx2 < extent_count) {
      arr2[idx2++] = {offset, length};
    } else if (idx2 == extent_count) {
      derr << "(2)compare_allocators:: spillover" << dendl;
      idx2++;
    }
  };

  alloc1->foreach(iterated_mapper1);
  alloc2->foreach(iterated_mapper2);

  qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
  qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);

  if (idx1 == idx2) {
    idx1 = idx2 = std::min(idx1, extent_count);
    if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
      return 0;
    }
    derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
    for (uint64_t i = 0; i < idx1; i++) {
      if (memcmp(arr1.get() + i, arr2.get() + i, sizeof(extent_t)) != 0) {
	derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
	derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
	return -1;
      }
    }
    return 0;
  } else {
    derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
    return -1;
  }
}

//---------------------------------------------------------
int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
{
  // then add space used by bluefs to store rocksdb
  unsigned extent_count = 0;
  if (bluefs) {
    bluefs->foreach_block_extents(
      bluefs_layout.shared_bdev,
      [&](uint64_t start, uint32_t len) {
	allocator->init_rm_free(start, len);
	extent_count++; // keep the local count in sync so the dout below reports the real number
	stats.extent_count++;
      }
    );
  }

  dout(5) << "bluefs extent_count=" << extent_count << dendl;
  return 0;
}


//---------------------------------------------------------
int BlueStore::read_allocation_from_drive_for_bluestore_tool()
{
  dout(5) << __func__ << dendl;
  int ret = 0;
  uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  ret = _open_db_and_around(true, false);
  if (ret < 0) {
    return ret;
  }

  ret = _open_collections();
  if (ret < 0) {
    _close_db_and_around();
    return ret;
  }

  utime_t duration;
  read_alloc_stats_t stats = {};
  utime_t start = ceph_clock_now();

  auto shutdown_cache = make_scope_guard([&] {
    dout(1) << "Allocation Recovery was completed in " << duration
	    << " seconds; insert_count=" << stats.insert_count
	    << "; extent_count=" << stats.extent_count << dendl;
    _shutdown_cache();
    _close_db_and_around();
  });

  {
    auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
    // reconstruct allocations into a temp simple-bitmap and copy into the allocator
    {
      SimpleBitmap sbmap(cct, (bdev->get_size() / min_alloc_size));
      ret = reconstruct_allocations(&sbmap, stats);
      if (ret != 0) {
	return ret;
      }
      copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
    }

    // add allocation space used by the bluefs itself
    ret = add_existing_bluefs_allocation(allocator.get(), stats);
    if (ret < 0) {
      return ret;
    }

    duration = ceph_clock_now() - start;
    stats.insert_count = 0;
    auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
      stats.insert_count++;
    };
    allocator->foreach(count_entries);
    ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
    if (ret == 0) {
      dout(5) << "Allocator drive - file integrity check OK" << dendl;
    } else {
      derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
    }
  }

  dout(1) << stats << dendl;
  return ret;
}

//---------------------------------------------------------
Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
{
  uint64_t bdev_size = bdev->get_size();
  Allocator* allocator = create_bitmap_allocator(bdev_size);
  if (allocator) {
    dout(5) << "bitmap-allocator=" << allocator << dendl;
  } else {
    derr << "****failed create_bitmap_allocator()" << dendl;
    return nullptr;
  }

  uint64_t num_entries = 0;
  copy_allocator(src_allocator, allocator, &num_entries);

  // BlueFS stores its internal allocation outside RocksDB (FM) so we should not destage it to the allocator-file
  // we are going to hide bluefs allocations during allocator-destage as they are stored elsewhere
  {
    bluefs->foreach_block_extents(
      bluefs_layout.shared_bdev,
      [&](uint64_t start, uint32_t len) {
	allocator->init_add_free(start, len);
      }
    );
  }

  return allocator;
}

//---------------------------------------------------------
static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
{
  dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
  KeyValueDB::Transaction t = db->get_transaction();
  t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
  db->submit_transaction_sync(t);
}

//---------------------------------------------------------
void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
{
  unsigned max_txn = 1024;
  dout(5) << "max_transaction_submit=" << max_txn << dendl;
  uint64_t size = 0, idx = 0;
  KeyValueDB::Transaction txn = db->get_transaction();
  auto iterated_insert = [&](uint64_t offset, uint64_t length) {
    size += length;
    real_fm->release(offset, length, txn);
    if ((++idx % max_txn) == 0) {
      db->submit_transaction_sync(txn);
      txn = db->get_transaction();
    }
  };
  allocator->foreach(iterated_insert);
  if (idx % max_txn != 0) {
    db->submit_transaction_sync(txn);
  }
  dout(5) << "size=" << size << ", num extents=" << idx << dendl;
}

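// Batching illustration (hypothetical numbers): with max_txn = 1024, releasing
// 2500 free extents issues sync submits after extents 1024 and 2048, and the
// trailing partial batch of 452 extents is submitted by the check after the
// foreach() loop (2500 % 1024 == 452 != 0).
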
//---------------------------------------------------------
Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
{
  dout(5) << "real_fm->enumerate_next" << dendl;
  Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
  if (allocator2) {
    dout(5) << "bitmap-allocator=" << allocator2 << dendl;
  } else {
    return nullptr;
  }

  uint64_t size2 = 0, idx2 = 0;
  real_fm->enumerate_reset();
  uint64_t offset, length;
  while (real_fm->enumerate_next(db, &offset, &length)) {
    allocator2->init_add_free(offset, length);
    ++idx2;
    size2 += length;
  }
  real_fm->enumerate_reset();

  dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
  return allocator2;
}

//---------------------------------------------------------
// close the active fm and open it in a new mode like makefs()
// but make sure to mark the full device space as allocated
// later we will mark all extents from the allocator as free
int BlueStore::reset_fm_for_restore()
{
  dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
  fm->shutdown();
  delete fm;
  fm = nullptr;
  freelist_type = "bitmap";
  KeyValueDB::Transaction t = db->get_transaction();
  // call _open_fm() with fm_restore set to TRUE
  // this will mark the full device space as allocated (and not just the reserved space)
  _open_fm(t, true, true, true);
  if (fm == nullptr) {
    derr << "Failed _open_fm()" << dendl;
    return -1;
  }
  db->submit_transaction_sync(t);
  ceph_assert(!fm->is_null_manager());
  dout(5) << "fm was reactivated in full mode" << dendl;
  return 0;
}


//---------------------------------------------------------
// create a temp allocator filled with allocation state from the fm
// and compare it to the base allocator passed in
int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
{
  dout(5) << "verify that alloc content is identical to FM" << dendl;
  // initialize from freelist
  Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
  if (temp_allocator == nullptr) {
    return -1;
  }

  uint64_t insert_count = 0;
  auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
    insert_count++;
  };
  temp_allocator->foreach(count_entries);
  uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);

  delete temp_allocator;

  if (ret == 0) {
    dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
    return 0;
  } else {
    derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
    return -1;
  }
}

//---------------------------------------------------------
int BlueStore::db_cleanup(int ret)
{
  _shutdown_cache();
  _close_db_and_around();
  return ret;
}

//---------------------------------------------------------
// convert the system back from null-allocator to using rocksdb to store allocation
int BlueStore::push_allocation_to_rocksdb()
{
  if (cct->_conf->bluestore_allocation_from_file) {
    derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
    derr << "please change the default to false in the ceph.conf file" << dendl;
    return -1;
  }

  dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
  int ret = _open_db_and_around(false);
  if (ret < 0) {
    return ret;
  }

  if (!fm->is_null_manager()) {
    derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
    return db_cleanup(0);
  }

  // start by creating a clone copy of the shared-allocator
  unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
  if (!allocator) {
    return db_cleanup(-1);
  }

  // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
  clear_allocation_objects_from_rocksdb(db, cct, path);

  // then open the fm in the new mode with the full device marked as allocated
  if (reset_fm_for_restore() != 0) {
    return db_cleanup(-1);
  }

  // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
  copy_allocator_content_to_fm(allocator.get(), fm);

  // compare the allocator info with the info stored in the fm/rocksdb
  if (verify_rocksdb_allocations(allocator.get()) == 0) {
    // all is good -> we can commit to the rocksdb allocator
    commit_to_real_manager();
  } else {
    return db_cleanup(-1);
  }

  // can't be too paranoid :-)
  dout(5) << "Running full scale verification..." << dendl;
  // close db/fm/allocator and start fresh
  db_cleanup(0);
  dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
  ret = _open_db_and_around(true);
  if (ret < 0) {
    return db_cleanup(ret);
  }
  ceph_assert(!fm->is_null_manager());
  ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);

  return db_cleanup(ret);
}

#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION

//-------------------------------------------------------------------------------------
int BlueStore::commit_freelist_type()
{
  // When freelist_type is set to "bitmap" we will store allocation in RocksDB
  // When allocation-info is stored in a single file we set freelist_type to "null"
  // This will direct the startup code to read allocation from file and not RocksDB
  KeyValueDB::Transaction t = db->get_transaction();
  if (t == nullptr) {
    derr << "db->get_transaction() failed!!!" << dendl;
    return -1;
  }

  bufferlist bl;
  bl.append(freelist_type);
  t->set(PREFIX_SUPER, "freelist_type", bl);

  int ret = db->submit_transaction_sync(t);
  if (ret != 0) {
    derr << "Failed db->submit_transaction_sync(t)" << dendl;
  }
  return ret;
}
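
// Illustrative read-back of the "freelist_type" key written above (a sketch;
// the call site shown here is hypothetical and not the actual mount path):
//   bufferlist bl;
//   if (db->get(PREFIX_SUPER, "freelist_type", &bl) == 0) {
//     freelist_type = std::string(bl.c_str(), bl.length());
//   }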

//-------------------------------------------------------------------------------------
int BlueStore::commit_to_null_manager()
{
  dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
  fm->set_null_manager();
  freelist_type = "null";
#if 1
  return commit_freelist_type();
#else
  // should check how long this step takes on a big configuration as deletes are expensive
  if (commit_freelist_type() == 0) {
    // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
    clear_allocation_objects_from_rocksdb(db, cct, path);
  }
#endif
}


//-------------------------------------------------------------------------------------
int BlueStore::commit_to_real_manager()
{
  dout(5) << "Set FreelistManager to Real FM..." << dendl;
  ceph_assert(!fm->is_null_manager());
  freelist_type = "bitmap";
  int ret = commit_freelist_type();
  if (ret == 0) {
    // remove the allocation_file
    invalidate_allocation_file_on_bluefs();
    ret = bluefs->unlink(allocator_dir, allocator_file);
    bluefs->sync_metadata(false);
    if (ret == 0) {
      dout(5) << "Removed allocation file successfully" << dendl;
    } else {
      derr << "Remove allocation file ret_code=" << ret << dendl;
    }
  }

  return ret;
}

//================================================================================================================
//================================================================================================================