// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <algorithm>

#include <boost/container/flat_set.hpp>
#include <boost/algorithm/string.hpp>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "simple_bitmap.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/url_escape.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"
#include "kv/KeyValueHistogram.h"

#ifdef HAVE_LIBZBD
#include "ZonedAllocator.h"
#include "ZonedFreelistManager.h"
#endif

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
using std::byte;
using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::less;
using std::list;
using std::make_unique;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 SB id -> shared_blob_t

#ifdef HAVE_LIBZBD
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
#endif

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block. always use this size. note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192
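
// Illustrative layout sketch (not in the original source, derived from the
// constants above): the main device starts with the 4 KiB bdev label at
// offset 0x0000, the 4 KiB BlueFS superblock at offset 0x1000, and
// SUPER_RESERVED (0x2000) is the first offset usable for anything else.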

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8 // has spanning blob id
#define BLOBID_SHIFT_BITS 4

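// Illustrative sketch (an assumption drawn from the flag/shift constants
// above, not code from this file): an encoded blobid packs the flags into
// the low BLOBID_SHIFT_BITS bits, e.g.
//   uint64_t encoded = (blobid << BLOBID_SHIFT_BITS)
//                    | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET;
// See ExtentMap::{encode,decode}_some() for the authoritative logic.
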
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
 *         we are done. otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

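// Worked example (illustrative; field order follows the comment above and
// _get_object_key() below): an object named "foo" with an empty namespace,
// no locator key, snap 0 and generation 0 encodes roughly as
//   [u8 shard+0x80][u64 pool+2^63][u32 reversed hash] "!" "foo!" "="
//   [u64 snap][u64 gen] 'o'
// where "!" terminates each escaped string and 'o' is ONODE_KEY_SUFFIX.
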
/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does. We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering. Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments. Instead we do additional sorting
 * where it is needed.
 */
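// Illustrative note (one plausible reading of the bug described above, not
// a statement from the original source): on platforms where plain char is
// signed, a byte such as 0x80 compares as a negative value, so `*i <= '#'`
// is unexpectedly true and the byte gets escaped with '#', which no longer
// matches ghobject_t's unsigned-byte ordering. That is why the comparisons
// below are annotated as buggy for *i > 0x7f.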
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}
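
// Worked example (illustrative, values assumed): escaping "a#b" yields
// "a#23b!" -- 'a' passes through, '#' (0x23) satisfies *i <= '#' so it
// becomes "#23", 'b' passes through, and '!' terminates the string. A
// plain "abc" simply becomes "abc!". decode_escaped() below reverses this.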

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end, bool legacy)
{
  spg_t pgid;
  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
  // use different nspaces because we use different schemes when encoding
  // keys for listing objects
  const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > MAX_HASH) {
      // make sure end hobj is even greater than the maximum possible hobj
      end->hobj.set_bitwise_key_u32(MAX_HASH);
      temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
      end->hobj.nspace = MAX_NSPACE;
    } else {
      end->hobj.set_bitwise_key_u32(end_hash);
      temp_end->hobj.set_bitwise_key_u32(end_hash);
    }
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(MAX_HASH);
    end->hobj.nspace = MAX_NSPACE;
    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)

static int _get_key_object(const char *p, ghobject_t *oid)
{
  int r;

  p = _key_decode_prefix(p, oid);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;
  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;
  const char *p = key.c_str();
  return _get_key_object(p, oid);
}

template<typename S>
static void _get_object_key(const ghobject_t& oid, S *key)
{
  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();
  _get_object_key(oid, key);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << " r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << " t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

#ifdef HAVE_LIBZBD
static void get_zone_offset_object_key(
  uint32_t zone,
  uint64_t offset,
  ghobject_t oid,
  std::string *key)
{
  key->clear();
  _key_encode_u32(zone, key);
  _key_encode_u64(offset, key);
  _get_object_key(oid, key);
}

static int get_key_zone_offset_object(
  const string& key,
  uint32_t *zone,
  uint64_t *offset,
  ghobject_t *oid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
    return -1;
  p = _key_decode_u32(p, zone);
  p = _key_decode_u64(p, offset);
  int r = _get_key_object(p, oid);
  if (r < 0) {
    return r;
  }
  return 0;
}
#endif

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto& [zone, offset] : o.onode.zone_offset_refs) {
    dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
                    << " offset 0x" << offset << std::dec << dendl;
  }
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in the key string encoding (see the comment for
 * append_escaped) the KeyValueDB iterator does not sort lexicographically
 * the same way that ghobject_t does: objects with the same hash may appear
 * in the wrong order.
 *
 * This iterator wrapper fixes the key ordering.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
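
// Worked example (illustrative numbers, not from the source): suppose a
// compressed blob occupies 64 KiB on disk with min_alloc_size = 16 KiB, so
// blob_expected_for_release = 4 allocation units. If rewriting its still-
// referenced data is expected to consume 1 new unit (expected_allocations
// = 1), the per-blob benefit is 4 - 1 = 3; the blob is collected once that
// benefit reaches bluestore_gc_enable_blob_threshold.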

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
      o->cache_age_bin = age_bins.front();
      *(o->cache_age_bin) += 1;
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
             << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      *(o->cache_age_bin) -= 1;
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    *(o->cache_age_bin) -= 1;
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    o->cache_age_bin = age_bins.front();
    *(o->cache_age_bin) += 1;
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << " rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      *(o->cache_age_bin) -= 1;
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    assert(*(b->cache_age_bin) >= b->length);
    *(b->cache_age_bin) -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert(*(b->cache_age_bin) + delta >= 0);
    *(b->cache_age_bin) += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in. move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot. fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    b->cache_age_bin = age_bins.front();
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
      assert(*(b->cache_age_bin) + delta >= 0);
      *(b->cache_age_bin) += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }
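
      // Worked example (illustrative numbers; the ratios are config
      // options, values assumed): with max = 100 MiB and a kin_ratio of
      // 0.5, kin and khot start at 50 MiB each. If the shard holds 25600
      // buffers averaging 4 KiB, calculated_num = 25600, so kout =
      // 25600 * kout_ratio empty warm_out entries are retained.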

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        assert(*(b->cache_age_bin) >= b->length);
        *(b->cache_age_bin) -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard
7c673cae 1654
9f95a23c
TL
1655BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1656 CephContext* cct,
1657 string type,
1658 PerfCounters *logger)
1659{
1660 BufferCacheShard *c = nullptr;
1661 if (type == "lru")
1662 c = new LruBufferCacheShard(cct);
1663 else if (type == "2q")
1664 c = new TwoQBufferCacheShard(cct);
1665 else
1666 ceph_abort_msg("unrecognized cache type");
1667 c->logger = logger;
1668 return c;
7c673cae 1669}
7c673cae
FG
1670
1671// BufferSpace
1672
1673#undef dout_prefix
1674#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1675
9f95a23c 1676void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
7c673cae
FG
1677{
1678 // note: we already hold cache->lock
1679 ldout(cache->cct, 20) << __func__ << dendl;
1680 while (!buffer_map.empty()) {
1681 _rm_buffer(cache, buffer_map.begin());
1682 }
1683}
1684
9f95a23c 1685int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
7c673cae
FG
1686{
1687 // note: we already hold cache->lock
1688 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1689 << std::dec << dendl;
1690 int cache_private = 0;
1691 cache->_audit("discard start");
1692 auto i = _data_lower_bound(offset);
1693 uint32_t end = offset + length;
1694 while (i != buffer_map.end()) {
1695 Buffer *b = i->second.get();
1696 if (b->offset >= end) {
1697 break;
1698 }
1699 if (b->cache_private > cache_private) {
1700 cache_private = b->cache_private;
1701 }
1702 if (b->offset < offset) {
1703 int64_t front = offset - b->offset;
1704 if (b->end() > end) {
1705 // drop middle (split)
1706 uint32_t tail = b->end() - end;
1707 if (b->data.length()) {
1708 bufferlist bl;
1709 bl.substr_of(b->data, b->length - tail, tail);
f67539c2 1710 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
31f18b77
FG
1711 nb->maybe_rebuild();
1712 _add_buffer(cache, nb, 0, b);
7c673cae 1713 } else {
f67539c2
TL
1714 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
1715 b->flags),
1716 0, b);
7c673cae
FG
1717 }
1718 if (!b->is_writing()) {
9f95a23c 1719 cache->_adjust_size(b, front - (int64_t)b->length);
7c673cae
FG
1720 }
1721 b->truncate(front);
31f18b77 1722 b->maybe_rebuild();
7c673cae
FG
1723 cache->_audit("discard end 1");
1724 break;
1725 } else {
1726 // drop tail
1727 if (!b->is_writing()) {
9f95a23c 1728 cache->_adjust_size(b, front - (int64_t)b->length);
7c673cae
FG
1729 }
1730 b->truncate(front);
31f18b77 1731 b->maybe_rebuild();
7c673cae
FG
1732 ++i;
1733 continue;
1734 }
1735 }
1736 if (b->end() <= end) {
1737 // drop entire buffer
1738 _rm_buffer(cache, i++);
1739 continue;
1740 }
1741 // drop front
1742 uint32_t keep = b->end() - end;
1743 if (b->data.length()) {
1744 bufferlist bl;
1745 bl.substr_of(b->data, b->length - keep, keep);
f67539c2 1746 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
31f18b77
FG
1747 nb->maybe_rebuild();
1748 _add_buffer(cache, nb, 0, b);
7c673cae 1749 } else {
f67539c2
TL
1750 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
1751 b->flags),
1752 0, b);
7c673cae
FG
1753 }
1754 _rm_buffer(cache, i);
1755 cache->_audit("discard end 2");
1756 break;
1757 }
1758 return cache_private;
1759}
1760
1761void BlueStore::BufferSpace::read(
9f95a23c 1762 BufferCacheShard* cache,
224ce89b
WB
1763 uint32_t offset,
1764 uint32_t length,
7c673cae 1765 BlueStore::ready_regions_t& res,
91327a77
AA
1766 interval_set<uint32_t>& res_intervals,
1767 int flags)
7c673cae 1768{
7c673cae
FG
1769 res.clear();
1770 res_intervals.clear();
1771 uint32_t want_bytes = length;
1772 uint32_t end = offset + length;
224ce89b
WB
1773
1774 {
11fdf7f2 1775 std::lock_guard l(cache->lock);
224ce89b
WB
1776 for (auto i = _data_lower_bound(offset);
1777 i != buffer_map.end() && offset < end && i->first < end;
1778 ++i) {
1779 Buffer *b = i->second.get();
11fdf7f2 1780 ceph_assert(b->end() > offset);
91327a77
AA
1781
1782 bool val = false;
1783 if (flags & BYPASS_CLEAN_CACHE)
1784 val = b->is_writing();
1785 else
1786 val = b->is_writing() || b->is_clean();
1787 if (val) {
224ce89b
WB
1788 if (b->offset < offset) {
1789 uint32_t skip = offset - b->offset;
11fdf7f2 1790 uint32_t l = min(length, b->length - skip);
224ce89b
WB
1791 res[offset].substr_of(b->data, skip, l);
1792 res_intervals.insert(offset, l);
1793 offset += l;
1794 length -= l;
1795 if (!b->is_writing()) {
9f95a23c 1796 cache->_touch(b);
f67539c2 1797 }
224ce89b
WB
1798 continue;
1799 }
1800 if (b->offset > offset) {
1801 uint32_t gap = b->offset - offset;
1802 if (length <= gap) {
1803 break;
1804 }
1805 offset += gap;
1806 length -= gap;
1807 }
1808 if (!b->is_writing()) {
9f95a23c 1809 cache->_touch(b);
224ce89b
WB
1810 }
1811 if (b->length > length) {
1812 res[offset].substr_of(b->data, 0, length);
1813 res_intervals.insert(offset, length);
7c673cae 1814 break;
224ce89b
WB
1815 } else {
1816 res[offset].append(b->data);
1817 res_intervals.insert(offset, b->length);
1818 if (b->length == length)
1819 break;
1820 offset += b->length;
1821 length -= b->length;
1822 }
7c673cae
FG
1823 }
1824 }
1825 }
1826
1827 uint64_t hit_bytes = res_intervals.size();
11fdf7f2 1828 ceph_assert(hit_bytes <= want_bytes);
7c673cae
FG
1829 uint64_t miss_bytes = want_bytes - hit_bytes;
1830 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1831 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1832}
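// Illustration of the hit/miss accounting above (invented offsets, not
// from this file): with cached buffers [0x0000~0x1000) clean and
// [0x2000~0x1000) writing,
//   read(cache, 0x0000, 0x3000, res, res_intervals, 0) yields
//     res           = { 0x0000 -> 4 KiB, 0x2000 -> 4 KiB }
//     res_intervals = { [0x0000,0x1000), [0x2000,0x3000) }
// so hit_bytes = 8 KiB and miss_bytes = 4 KiB (the uncached middle),
// which the caller must read from disk. With BYPASS_CLEAN_CACHE only
// buffers still in the writing state are returned, so clean data is
// deliberately re-fetched from disk.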
1833
9f95a23c 1834void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
7c673cae 1835{
7c673cae
FG
1836 auto i = writing.begin();
1837 while (i != writing.end()) {
1838 if (i->seq > seq) {
1839 break;
1840 }
1841 if (i->seq < seq) {
1842 ++i;
1843 continue;
1844 }
1845
1846 Buffer *b = &*i;
11fdf7f2 1847 ceph_assert(b->is_writing());
7c673cae
FG
1848
1849 if (b->flags & Buffer::FLAG_NOCACHE) {
1850 writing.erase(i++);
1851 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1852 buffer_map.erase(b->offset);
1853 } else {
1854 b->state = Buffer::STATE_CLEAN;
1855 writing.erase(i++);
31f18b77
FG
1856 b->maybe_rebuild();
1857 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
9f95a23c 1858 cache->_add(b, 1, nullptr);
7c673cae
FG
1859 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1860 }
1861 }
9f95a23c 1862 cache->_trim();
7c673cae
FG
1863 cache->_audit("finish_write end");
1864}
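// Note: the writing list is appended in seq order, so the scan above
// stops at the first entry with a larger seq; entries with a smaller
// seq belong to other still-pending transactions and are skipped.
// Only buffers tagged with exactly this seq are finished: FLAG_NOCACHE
// ones are dropped outright, the rest become STATE_CLEAN and are
// (re)accounted in the cache shard.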
1865
9f95a23c 1866void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
7c673cae 1867{
11fdf7f2 1868 std::lock_guard lk(cache->lock);
7c673cae
FG
1869 if (buffer_map.empty())
1870 return;
1871
1872 auto p = --buffer_map.end();
1873 while (true) {
1874 if (p->second->end() <= pos)
1875 break;
1876
1877 if (p->second->offset < pos) {
1878 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1879 size_t left = pos - p->second->offset;
1880 size_t right = p->second->length - left;
1881 if (p->second->data.length()) {
1882 bufferlist bl;
1883 bl.substr_of(p->second->data, left, right);
f67539c2
TL
1884 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1885 0, bl, p->second->flags),
7c673cae
FG
1886 0, p->second.get());
1887 } else {
f67539c2
TL
1888 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1889 0, right, p->second->flags),
7c673cae
FG
1890 0, p->second.get());
1891 }
9f95a23c 1892 cache->_adjust_size(p->second.get(), -right);
7c673cae
FG
1893 p->second->truncate(left);
1894 break;
1895 }
1896
11fdf7f2 1897 ceph_assert(p->second->end() > pos);
7c673cae
FG
1898 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1899 if (p->second->data.length()) {
1900 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
f67539c2 1901 p->second->offset - pos, p->second->data, p->second->flags),
7c673cae
FG
1902 0, p->second.get());
1903 } else {
1904 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
f67539c2 1905 p->second->offset - pos, p->second->length, p->second->flags),
7c673cae
FG
1906 0, p->second.get());
1907 }
1908 if (p == buffer_map.begin()) {
1909 _rm_buffer(cache, p);
1910 break;
1911 } else {
1912 _rm_buffer(cache, p--);
1913 }
1914 }
11fdf7f2 1915 ceph_assert(writing.empty());
9f95a23c 1916 cache->_trim();
7c673cae
FG
1917}
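// Worked example for split() (invented offsets): with buffers
// [0x0000~0x2000) and [0x3000~0x1000) and pos = 0x1000,
//   - [0x3000~0x1000) lies wholly right of pos and moves to 'r',
//     rebased to offset 0x2000 (= 0x3000 - pos);
//   - [0x0000~0x2000) straddles pos: it is truncated to [0x0000~0x1000)
//     and its right half is re-added to 'r' at offset 0.
// Buffers that end at or before pos are left untouched.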
1918
1919// OnodeSpace
1920
1921#undef dout_prefix
1922#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1923
f6b5b4d7
TL
1924BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
1925 OnodeRef& o)
7c673cae 1926{
11fdf7f2 1927 std::lock_guard l(cache->lock);
7c673cae
FG
1928 auto p = onode_map.find(oid);
1929 if (p != onode_map.end()) {
1930 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1931 << " raced, returning existing " << p->second
1932 << dendl;
1933 return p->second;
1934 }
f6b5b4d7 1935 ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
7c673cae 1936 onode_map[oid] = o;
f6b5b4d7 1937 cache->_add(o.get(), 1);
9f95a23c 1938 cache->_trim();
7c673cae
FG
1939 return o;
1940}
1941
f6b5b4d7
TL
1942void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1943{
1944 ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
1945 onode_map.erase(oid);
1946}
1947
7c673cae
FG
1948BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1949{
7c673cae 1950 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b 1951 OnodeRef o;
224ce89b
WB
1952
1953 {
11fdf7f2 1954 std::lock_guard l(cache->lock);
224ce89b
WB
1955 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1956 if (p == onode_map.end()) {
20effc67 1957 cache->logger->inc(l_bluestore_onode_misses);
224ce89b
WB
1958 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1959 } else {
1960 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
f6b5b4d7
TL
1961 << " " << p->second->nref
1962 << " " << p->second->cached
1963 << " " << p->second->pinned
224ce89b 1964 << dendl;
f6b5b4d7
TL
 1965 // This will pin the onode and implicitly touch the cache when the
 1966 // Onode eventually becomes unpinned
224ce89b 1967 o = p->second;
f6b5b4d7
TL
1968 ceph_assert(!o->cached || o->pinned);
1969
20effc67 1970 cache->logger->inc(l_bluestore_onode_hits);
224ce89b
WB
1971 }
1972 }
1973
224ce89b 1974 return o;
7c673cae
FG
1975}
1976
1977void BlueStore::OnodeSpace::clear()
1978{
11fdf7f2 1979 std::lock_guard l(cache->lock);
f6b5b4d7 1980 ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl;
7c673cae 1981 for (auto &p : onode_map) {
f6b5b4d7 1982 cache->_rm(p.second.get());
7c673cae
FG
1983 }
1984 onode_map.clear();
1985}
1986
1987bool BlueStore::OnodeSpace::empty()
1988{
11fdf7f2 1989 std::lock_guard l(cache->lock);
7c673cae
FG
1990 return onode_map.empty();
1991}
1992
1993void BlueStore::OnodeSpace::rename(
1994 OnodeRef& oldo,
1995 const ghobject_t& old_oid,
1996 const ghobject_t& new_oid,
f91f0fd5 1997 const mempool::bluestore_cache_meta::string& new_okey)
7c673cae 1998{
11fdf7f2 1999 std::lock_guard l(cache->lock);
7c673cae
FG
2000 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
2001 << dendl;
2002 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
2003 po = onode_map.find(old_oid);
2004 pn = onode_map.find(new_oid);
11fdf7f2 2005 ceph_assert(po != pn);
7c673cae 2006
11fdf7f2 2007 ceph_assert(po != onode_map.end());
7c673cae
FG
2008 if (pn != onode_map.end()) {
2009 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
2010 << dendl;
f6b5b4d7 2011 cache->_rm(pn->second.get());
7c673cae
FG
2012 onode_map.erase(pn);
2013 }
2014 OnodeRef o = po->second;
2015
2016 // install a non-existent onode at old location
2017 oldo.reset(new Onode(o->c, old_oid, o->key));
2018 po->second = oldo;
f6b5b4d7
TL
2019 cache->_add(oldo.get(), 1);
 2020 // add at the new position and fix oid and key.
 2021 // This will pin 'o' and implicitly touch the cache
 2022 // when it eventually becomes unpinned
7c673cae 2023 onode_map.insert(make_pair(new_oid, o));
f6b5b4d7
TL
2024 ceph_assert(o->pinned);
2025
7c673cae
FG
2026 o->oid = new_oid;
2027 o->key = new_okey;
9f95a23c 2028 cache->_trim();
7c673cae
FG
2029}
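// Note: installing a fresh placeholder Onode under the old key (rather
// than erasing it) lets a concurrent lookup(old_oid) observe a definite
// "no such object" entry instead of the renamed one; the placeholder is
// also handed back to the caller via 'oldo'. Roughly, from a caller's
// perspective (a sketch, not a call site from this file):
//
//   OnodeRef stub;
//   onode_map.rename(stub, src_oid, dst_oid, new_key);
//   // 'stub' now occupies src_oid in the map, marking it logically absent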
2030
adb31ebb 2031bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
7c673cae 2032{
11fdf7f2 2033 std::lock_guard l(cache->lock);
7c673cae
FG
2034 ldout(cache->cct, 20) << __func__ << dendl;
2035 for (auto& i : onode_map) {
adb31ebb 2036 if (f(i.second.get())) {
7c673cae
FG
2037 return true;
2038 }
2039 }
2040 return false;
2041}
2042
11fdf7f2
TL
2043template <int LogLevelV = 30>
2044void BlueStore::OnodeSpace::dump(CephContext *cct)
3efd9988
FG
2045{
2046 for (auto& i : onode_map) {
f6b5b4d7
TL
2047 ldout(cct, LogLevelV) << i.first << " : " << i.second
2048 << " " << i.second->nref
2049 << " " << i.second->cached
2050 << " " << i.second->pinned
2051 << dendl;
3efd9988
FG
2052 }
2053}
7c673cae
FG
2054
2055// SharedBlob
2056
2057#undef dout_prefix
2058#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
9f95a23c
TL
2059#undef dout_context
2060#define dout_context coll->store->cct
7c673cae 2061
9f95a23c 2062void BlueStore::SharedBlob::dump(Formatter* f) const
7c673cae 2063{
9f95a23c
TL
2064 f->dump_bool("loaded", loaded);
2065 if (loaded) {
2066 persistent->dump(f);
2067 } else {
2068 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
2069 }
2070}
2071
2072ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
2073{
2074 out << "SharedBlob(" << &sb;
2075
7c673cae
FG
2076 if (sb.loaded) {
2077 out << " loaded " << *sb.persistent;
2078 } else {
2079 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
2080 }
2081 return out << ")";
2082}
2083
2084BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
2085 : coll(_coll), sbid_unloaded(i)
2086{
11fdf7f2 2087 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
2088 if (get_cache()) {
2089 get_cache()->add_blob();
2090 }
2091}
2092
2093BlueStore::SharedBlob::~SharedBlob()
2094{
7c673cae
FG
2095 if (loaded && persistent) {
2096 delete persistent;
2097 }
2098}
2099
2100void BlueStore::SharedBlob::put()
2101{
2102 if (--nref == 0) {
9f95a23c
TL
2103 dout(20) << __func__ << " " << this
2104 << " removing self from set " << get_parent()
2105 << dendl;
1adf2230
AA
2106 again:
2107 auto coll_snap = coll;
2108 if (coll_snap) {
11fdf7f2 2109 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
2110 if (coll_snap != coll) {
2111 goto again;
2112 }
91327a77
AA
2113 if (!coll_snap->shared_blob_set.remove(this, true)) {
2114 // race with lookup
2115 return;
2116 }
1adf2230
AA
2117 bc._clear(coll_snap->cache);
2118 coll_snap->cache->rm_blob();
7c673cae 2119 }
28e407b8 2120 delete this;
7c673cae
FG
2121 }
2122}
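// Note: the 'again' loop above re-validates 'coll' under its cache lock
// because split_cache() may swap the collection between the unlocked
// read and the lock acquisition. The early return on a failed remove()
// covers a racing lookup that has just resurrected this SharedBlob
// (bumped nref back above zero); deleting it here would leave the other
// thread with a dangling pointer.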
2123
2124void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2125{
11fdf7f2 2126 ceph_assert(persistent);
7c673cae
FG
2127 persistent->ref_map.get(offset, length);
2128}
2129
2130void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 2131 PExtentVector *r,
11fdf7f2 2132 bool *unshare)
7c673cae 2133{
11fdf7f2
TL
2134 ceph_assert(persistent);
2135 persistent->ref_map.put(offset, length, r,
2136 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
2137}
2138
f64942e4
AA
2139void BlueStore::SharedBlob::finish_write(uint64_t seq)
2140{
2141 while (true) {
9f95a23c 2142 BufferCacheShard *cache = coll->cache;
11fdf7f2 2143 std::lock_guard l(cache->lock);
f64942e4 2144 if (coll->cache != cache) {
9f95a23c
TL
2145 dout(20) << __func__
2146 << " raced with sb cache update, was " << cache
2147 << ", now " << coll->cache << ", retrying"
2148 << dendl;
f64942e4
AA
2149 continue;
2150 }
2151 bc._finish_write(cache, seq);
2152 break;
2153 }
2154}
2155
3efd9988
FG
2156// SharedBlobSet
2157
2158#undef dout_prefix
2159#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2160
11fdf7f2
TL
2161template <int LogLevelV = 30>
2162void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 2163{
11fdf7f2 2164 std::lock_guard l(lock);
3efd9988 2165 for (auto& i : sb_map) {
11fdf7f2 2166 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
2167 }
2168}
2169
7c673cae
FG
2170// Blob
2171
2172#undef dout_prefix
2173#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2174
9f95a23c
TL
2175void BlueStore::Blob::dump(Formatter* f) const
2176{
2177 if (is_spanning()) {
2178 f->dump_unsigned("spanning_id ", id);
2179 }
2180 blob.dump(f);
2181 if (shared_blob) {
2182 f->dump_object("shared", *shared_blob);
2183 }
2184}
2185
7c673cae
FG
2186ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2187{
2188 out << "Blob(" << &b;
2189 if (b.is_spanning()) {
2190 out << " spanning " << b.id;
2191 }
35e4c445
FG
2192 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2193 if (b.shared_blob) {
2194 out << " " << *b.shared_blob;
2195 } else {
2196 out << " (shared_blob=NULL)";
2197 }
2198 out << ")";
7c673cae
FG
2199 return out;
2200}
2201
2202void BlueStore::Blob::discard_unallocated(Collection *coll)
2203{
224ce89b 2204 if (get_blob().is_shared()) {
7c673cae
FG
2205 return;
2206 }
224ce89b 2207 if (get_blob().is_compressed()) {
7c673cae
FG
2208 bool discard = false;
2209 bool all_invalid = true;
224ce89b 2210 for (auto e : get_blob().get_extents()) {
7c673cae
FG
2211 if (!e.is_valid()) {
2212 discard = true;
2213 } else {
2214 all_invalid = false;
2215 }
2216 }
11fdf7f2 2217 ceph_assert(discard == all_invalid); // in a compressed blob either all
7c673cae
FG
 2218 // or none of the pextents are invalid.
2219 if (discard) {
224ce89b
WB
2220 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2221 get_blob().get_logical_length());
7c673cae
FG
2222 }
2223 } else {
2224 size_t pos = 0;
224ce89b 2225 for (auto e : get_blob().get_extents()) {
7c673cae 2226 if (!e.is_valid()) {
9f95a23c
TL
2227 dout(20) << __func__ << " 0x" << std::hex << pos
2228 << "~" << e.length
2229 << std::dec << dendl;
7c673cae
FG
2230 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2231 }
2232 pos += e.length;
2233 }
224ce89b
WB
2234 if (get_blob().can_prune_tail()) {
2235 dirty_blob().prune_tail();
2236 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 2237 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
2238 }
2239 }
2240}
2241
2242void BlueStore::Blob::get_ref(
2243 Collection *coll,
2244 uint32_t offset,
2245 uint32_t length)
2246{
 2247 // The caller has to initialize the Blob's logical length prior to
 2248 // incrementing references. Otherwise it is impossible to determine the
 2249 // required number of counters for per-au tracking, or to obtain
 2250 // min_release_size for single-counter mode.
11fdf7f2 2251 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
2252 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2253 << std::dec << " " << *this << dendl;
2254
2255 if (used_in_blob.is_empty()) {
2256 uint32_t min_release_size =
224ce89b
WB
2257 get_blob().get_release_size(coll->store->min_alloc_size);
2258 uint64_t l = get_blob().get_logical_length();
2259 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2260 << min_release_size << std::dec << dendl;
7c673cae
FG
2261 used_in_blob.init(l, min_release_size);
2262 }
2263 used_in_blob.get(
2264 offset,
2265 length);
2266}
2267
2268bool BlueStore::Blob::put_ref(
2269 Collection *coll,
2270 uint32_t offset,
2271 uint32_t length,
2272 PExtentVector *r)
2273{
2274 PExtentVector logical;
2275
7c673cae
FG
2276 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2277 << std::dec << " " << *this << dendl;
2278
2279 bool empty = used_in_blob.put(
2280 offset,
2281 length,
2282 &logical);
2283 r->clear();
2284 // nothing to release
2285 if (!empty && logical.empty()) {
2286 return false;
2287 }
2288
2289 bluestore_blob_t& b = dirty_blob();
2290 return b.release_extents(empty, logical, r);
2291}
2292
224ce89b 2293bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
2294 uint32_t target_blob_size,
2295 uint32_t b_offset,
2296 uint32_t *length0) {
11fdf7f2
TL
2297 ceph_assert(min_alloc_size);
2298 ceph_assert(target_blob_size);
7c673cae
FG
2299 if (!get_blob().is_mutable()) {
2300 return false;
2301 }
2302
2303 uint32_t length = *length0;
2304 uint32_t end = b_offset + length;
2305
2306 // Currently for the sake of simplicity we omit blob reuse if data is
2307 // unaligned with csum chunk. Later we can perform padding if needed.
2308 if (get_blob().has_csum() &&
2309 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2310 (end % get_blob().get_csum_chunk_size()) != 0)) {
2311 return false;
2312 }
2313
2314 auto blen = get_blob().get_logical_length();
2315 uint32_t new_blen = blen;
2316
2317 // make sure target_blob_size isn't less than current blob len
11fdf7f2 2318 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
2319
2320 if (b_offset >= blen) {
224ce89b
WB
2321 // new data totally stands out of the existing blob
2322 new_blen = end;
7c673cae 2323 } else {
224ce89b 2324 // new data overlaps with the existing blob
11fdf7f2 2325 new_blen = std::max(blen, end);
224ce89b
WB
2326
2327 uint32_t overlap = 0;
2328 if (new_blen > blen) {
2329 overlap = blen - b_offset;
2330 } else {
2331 overlap = length;
2332 }
2333
2334 if (!get_blob().is_unallocated(b_offset, overlap)) {
2335 // abort if any piece of the overlap has already been allocated
2336 return false;
7c673cae
FG
2337 }
2338 }
224ce89b 2339
7c673cae
FG
2340 if (new_blen > blen) {
2341 int64_t overflow = int64_t(new_blen) - target_blob_size;
2342 // Unable to decrease the provided length to fit into max_blob_size
2343 if (overflow >= length) {
2344 return false;
2345 }
2346
2347 // FIXME: in some cases we could reduce unused resolution
2348 if (get_blob().has_unused()) {
2349 return false;
2350 }
2351
2352 if (overflow > 0) {
2353 new_blen -= overflow;
2354 length -= overflow;
2355 *length0 = length;
2356 }
224ce89b 2357
7c673cae
FG
2358 if (new_blen > blen) {
2359 dirty_blob().add_tail(new_blen);
2360 used_in_blob.add_tail(new_blen,
224ce89b 2361 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
2362 }
2363 }
2364 return true;
2365}
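// Numeric walk-through of can_reuse_blob() (invented sizes): take
// blen = 0x8000, target_blob_size = 0x10000, min_alloc_size = 0x1000,
// and a write of length 0x9000 at b_offset = 0x9000:
//   end      = 0x12000, new_blen = 0x12000 (fully past the old tail)
//   overflow = 0x12000 - 0x10000 = 0x2000, which is < length,
// so the request is trimmed: length -> 0x7000 (returned via *length0),
// new_blen -> 0x10000, and the blob grows a tail to 0x10000. The caller
// is expected to place the remaining 0x2000 elsewhere, typically in a
// new blob.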
2366
2367void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2368{
7c673cae
FG
2369 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2370 << " start " << *this << dendl;
11fdf7f2
TL
2371 ceph_assert(blob.can_split());
2372 ceph_assert(used_in_blob.can_split());
7c673cae
FG
2373 bluestore_blob_t &lb = dirty_blob();
2374 bluestore_blob_t &rb = r->dirty_blob();
2375
2376 used_in_blob.split(
2377 blob_offset,
2378 &(r->used_in_blob));
2379
2380 lb.split(blob_offset, rb);
2381 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2382
2383 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2384 << " finish " << *this << dendl;
2385 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2386 << " and " << *r << dendl;
2387}
2388
2389#ifndef CACHE_BLOB_BL
2390void BlueStore::Blob::decode(
2391 Collection *coll,
11fdf7f2 2392 bufferptr::const_iterator& p,
7c673cae
FG
2393 uint64_t struct_v,
2394 uint64_t* sbid,
2395 bool include_ref_map)
2396{
2397 denc(blob, p, struct_v);
2398 if (blob.is_shared()) {
2399 denc(*sbid, p);
2400 }
2401 if (include_ref_map) {
2402 if (struct_v > 1) {
2403 used_in_blob.decode(p);
2404 } else {
2405 used_in_blob.clear();
2406 bluestore_extent_ref_map_t legacy_ref_map;
2407 legacy_ref_map.decode(p);
2408 for (auto r : legacy_ref_map.ref_map) {
2409 get_ref(
2410 coll,
2411 r.first,
2412 r.second.refs * r.second.length);
2413 }
2414 }
2415 }
2416}
2417#endif
2418
2419// Extent
2420
9f95a23c
TL
2421void BlueStore::Extent::dump(Formatter* f) const
2422{
2423 f->dump_unsigned("logical_offset", logical_offset);
2424 f->dump_unsigned("length", length);
2425 f->dump_unsigned("blob_offset", blob_offset);
2426 f->dump_object("blob", *blob);
2427}
2428
7c673cae
FG
2429ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2430{
2431 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2432 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2433 << " " << *e.blob;
2434}
2435
2436// OldExtent
2437BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2438 uint32_t lo,
2439 uint32_t o,
2440 uint32_t l,
2441 BlobRef& b) {
2442 OldExtent* oe = new OldExtent(lo, o, l, b);
2443 b->put_ref(c.get(), o, l, &(oe->r));
adb31ebb 2444 oe->blob_empty = !b->is_referenced();
7c673cae
FG
2445 return oe;
2446}
2447
2448// ExtentMap
2449
2450#undef dout_prefix
2451#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
9f95a23c
TL
2452#undef dout_context
2453#define dout_context onode->c->store->cct
7c673cae
FG
2454
2455BlueStore::ExtentMap::ExtentMap(Onode *o)
2456 : onode(o),
2457 inline_bl(
2458 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2459}
2460
9f95a23c
TL
2461void BlueStore::ExtentMap::dump(Formatter* f) const
2462{
2463 f->open_array_section("extents");
2464
2465 for (auto& e : extent_map) {
2466 f->dump_object("extent", e);
2467 }
2468 f->close_section();
2469}
2470
11fdf7f2
TL
2471void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2472 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2473 uint64_t& length, uint64_t& dstoff) {
2474
2475 auto cct = onode->c->store->cct;
2476 bool inject_21040 =
2477 cct->_conf->bluestore_debug_inject_bug21040;
2478 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2479 for (auto& e : oldo->extent_map.extent_map) {
2480 e.blob->last_encoded_id = -1;
2481 }
2482
2483 int n = 0;
2484 uint64_t end = srcoff + length;
2485 uint32_t dirty_range_begin = 0;
2486 uint32_t dirty_range_end = 0;
2487 bool src_dirty = false;
2488 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2489 ep != oldo->extent_map.extent_map.end();
2490 ++ep) {
2491 auto& e = *ep;
2492 if (e.logical_offset >= end) {
2493 break;
2494 }
2495 dout(20) << __func__ << " src " << e << dendl;
2496 BlobRef cb;
2497 bool blob_duped = true;
2498 if (e.blob->last_encoded_id >= 0) {
2499 cb = id_to_blob[e.blob->last_encoded_id];
2500 blob_duped = false;
2501 } else {
2502 // dup the blob
2503 const bluestore_blob_t& blob = e.blob->get_blob();
2504 // make sure it is shared
2505 if (!blob.is_shared()) {
2506 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2507 if (!inject_21040 && !src_dirty) {
2508 src_dirty = true;
2509 dirty_range_begin = e.logical_offset;
2510 } else if (inject_21040 &&
2511 dirty_range_begin == 0 && dirty_range_end == 0) {
2512 dirty_range_begin = e.logical_offset;
2513 }
2514 ceph_assert(e.logical_end() > 0);
2515 // -1 to exclude next potential shard
2516 dirty_range_end = e.logical_end() - 1;
2517 } else {
2518 c->load_shared_blob(e.blob->shared_blob);
2519 }
2520 cb = new Blob();
2521 e.blob->last_encoded_id = n;
2522 id_to_blob[n] = cb;
2523 e.blob->dup(*cb);
2524 // bump the extent refs on the copied blob's extents
2525 for (auto p : blob.get_extents()) {
2526 if (p.is_valid()) {
2527 e.blob->shared_blob->get_ref(p.offset, p.length);
2528 }
2529 }
2530 txc->write_shared_blob(e.blob->shared_blob);
2531 dout(20) << __func__ << " new " << *cb << dendl;
2532 }
2533
2534 int skip_front, skip_back;
2535 if (e.logical_offset < srcoff) {
2536 skip_front = srcoff - e.logical_offset;
2537 } else {
2538 skip_front = 0;
2539 }
2540 if (e.logical_end() > end) {
2541 skip_back = e.logical_end() - end;
2542 } else {
2543 skip_back = 0;
2544 }
2545
2546 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2547 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2548 newo->extent_map.extent_map.insert(*ne);
2549 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2550 // fixme: we may leave parts of new blob unreferenced that could
2551 // be freed (relative to the shared_blob).
2552 txc->statfs_delta.stored() += ne->length;
2553 if (e.blob->get_blob().is_compressed()) {
2554 txc->statfs_delta.compressed_original() += ne->length;
2555 if (blob_duped) {
2556 txc->statfs_delta.compressed() +=
2557 cb->get_blob().get_compressed_payload_length();
2558 }
2559 }
2560 dout(20) << __func__ << " dst " << *ne << dendl;
2561 ++n;
2562 }
2563 if ((!inject_21040 && src_dirty) ||
2564 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2565 oldo->extent_map.dirty_range(dirty_range_begin,
2566 dirty_range_end - dirty_range_begin);
2567 txc->write_onode(oldo);
2568 }
2569 txc->write_onode(newo);
2570
2571 if (dstoff + length > newo->onode.size) {
2572 newo->onode.size = dstoff + length;
2573 }
2574 newo->extent_map.dirty_range(dstoff, length);
2575}
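// Note: dup() clones extent structure, not blob bytes. last_encoded_id
// is reused here as a scratch "already copied" marker so that several
// source extents pointing at one blob share a single cloned BlobRef via
// id_to_blob, and each source blob is converted to shared at most once.
// Resulting mapping, with invented extents:
//
//   src: [0~4K)->blobA  [4K~4K)->blobA  [8K~4K)->blobB
//   dst: [0~4K)->blobA' [4K~4K)->blobA' [8K~4K)->blobB'
//
// where A' and B' are the copies and A, B became shared on first use.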
7c673cae
FG
2576void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2577 bool force)
2578{
2579 auto cct = onode->c->store->cct; //used by dout
2580 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2581 if (onode->onode.extent_map_shards.empty()) {
2582 if (inline_bl.length() == 0) {
2583 unsigned n;
2584 // we need to encode inline_bl to measure encoded length
2585 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
f91f0fd5 2586 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
11fdf7f2 2587 ceph_assert(!never_happen);
7c673cae
FG
2588 size_t len = inline_bl.length();
2589 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2590 << " extents" << dendl;
2591 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2592 request_reshard(0, OBJECT_MAX_SIZE);
2593 return;
2594 }
2595 }
2596 // will persist in the onode key.
2597 } else {
2598 // pending shard update
2599 struct dirty_shard_t {
2600 Shard *shard;
2601 bufferlist bl;
2602 dirty_shard_t(Shard *s) : shard(s) {}
2603 };
2604 vector<dirty_shard_t> encoded_shards;
2605 // allocate slots for all shards in a single call instead of
2606 // doing multiple allocations - one per each dirty shard
2607 encoded_shards.reserve(shards.size());
2608
2609 auto p = shards.begin();
2610 auto prev_p = p;
2611 while (p != shards.end()) {
11fdf7f2 2612 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2613 auto n = p;
2614 ++n;
2615 if (p->dirty) {
2616 uint32_t endoff;
2617 if (n == shards.end()) {
2618 endoff = OBJECT_MAX_SIZE;
2619 } else {
2620 endoff = n->shard_info->offset;
2621 }
2622 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2623 bufferlist& bl = encoded_shards.back().bl;
2624 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2625 bl, &p->extents)) {
2626 if (force) {
2627 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2628 ceph_assert(!force);
7c673cae
FG
2629 }
2630 }
2631 size_t len = bl.length();
2632
2633 dout(20) << __func__ << " shard 0x" << std::hex
2634 << p->shard_info->offset << std::dec << " is " << len
2635 << " bytes (was " << p->shard_info->bytes << ") from "
2636 << p->extents << " extents" << dendl;
2637
2638 if (!force) {
2639 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2640 // we are big; reshard ourselves
2641 request_reshard(p->shard_info->offset, endoff);
2642 }
2643 // avoid resharding the trailing shard, even if it is small
2644 else if (n != shards.end() &&
11fdf7f2
TL
2645 len < g_conf()->bluestore_extent_map_shard_min_size) {
2646 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2647 if (p == shards.begin()) {
2648 // we are the first shard, combine with next shard
7c673cae 2649 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2650 } else {
31f18b77
FG
2651 // combine either with the previous shard or the next,
2652 // whichever is smaller
7c673cae
FG
2653 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2654 request_reshard(p->shard_info->offset, endoff + 1);
2655 } else {
2656 request_reshard(prev_p->shard_info->offset, endoff);
2657 }
2658 }
2659 }
2660 }
2661 }
2662 prev_p = p;
2663 p = n;
2664 }
2665 if (needs_reshard()) {
2666 return;
2667 }
2668
2669 // schedule DB update for dirty shards
2670 string key;
2671 for (auto& it : encoded_shards) {
20effc67
TL
2672 dout(20) << __func__ << " encoding key for shard 0x" << std::hex
2673 << it.shard->shard_info->offset << std::dec << dendl;
7c673cae
FG
2674 it.shard->dirty = false;
2675 it.shard->shard_info->bytes = it.bl.length();
2676 generate_extent_shard_key_and_apply(
2677 onode->key,
2678 it.shard->shard_info->offset,
2679 &key,
2680 [&](const string& final_key) {
2681 t->set(PREFIX_OBJ, final_key, it.bl);
2682 }
2683 );
2684 }
2685 }
2686}
2687
31f18b77
FG
2688bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2689{
2690 if (spanning_blob_map.empty())
2691 return 0;
2692 bid_t bid = spanning_blob_map.rbegin()->first + 1;
 2694 // if bid did not overflow, it is greater than any id in use and thus free.
 2694 if (bid >= 0)
 2695 return bid;
 2696 // Otherwise, probe for the next unused bid.
2697 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2698 const auto begin_bid = bid;
2699 do {
2700 if (!spanning_blob_map.count(bid))
2701 return bid;
2702 else {
2703 bid++;
2704 if (bid < 0) bid = 0;
2705 }
2706 } while (bid != begin_bid);
81eedcae
TL
2707 auto cct = onode->c->store->cct; // used by dout
2708 _dump_onode<0>(cct, *onode);
11fdf7f2 2709 ceph_abort_msg("no available blob id");
31f18b77
FG
2710}
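// Note: allocation is O(1) in the common case (one past the largest
// key); only when that increment overflows to a negative value does it
// fall back to a linear probe from a random non-negative start, with
// wrap-around at the top of the range. Assuming a 16-bit signed bid_t,
// the probe visits at most 32768 candidates before aborting.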
2711
7c673cae
FG
2712void BlueStore::ExtentMap::reshard(
2713 KeyValueDB *db,
2714 KeyValueDB::Transaction t)
2715{
2716 auto cct = onode->c->store->cct; // used by dout
2717
2718 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2719 << needs_reshard_end << ")" << std::dec
2720 << " of " << onode->onode.extent_map_shards.size()
2721 << " shards on " << onode->oid << dendl;
2722 for (auto& p : spanning_blob_map) {
2723 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2724 << dendl;
2725 }
2726 // determine shard index range
2727 unsigned si_begin = 0, si_end = 0;
2728 if (!shards.empty()) {
2729 while (si_begin + 1 < shards.size() &&
2730 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2731 ++si_begin;
2732 }
2733 needs_reshard_begin = shards[si_begin].shard_info->offset;
2734 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2735 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2736 needs_reshard_end = shards[si_end].shard_info->offset;
2737 break;
2738 }
2739 }
2740 if (si_end == shards.size()) {
2741 needs_reshard_end = OBJECT_MAX_SIZE;
2742 }
2743 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2744 << " over 0x[" << std::hex << needs_reshard_begin << ","
2745 << needs_reshard_end << ")" << std::dec << dendl;
2746 }
2747
181888fb 2748 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2749
 2750 // we may need to fault in a larger interval later; we must have all
2751 // referring extents for spanning blobs loaded in order to have
2752 // accurate use_tracker values.
2753 uint32_t spanning_scan_begin = needs_reshard_begin;
2754 uint32_t spanning_scan_end = needs_reshard_end;
2755
2756 // remove old keys
2757 string key;
2758 for (unsigned i = si_begin; i < si_end; ++i) {
2759 generate_extent_shard_key_and_apply(
2760 onode->key, shards[i].shard_info->offset, &key,
2761 [&](const string& final_key) {
2762 t->rmkey(PREFIX_OBJ, final_key);
2763 }
2764 );
2765 }
2766
2767 // calculate average extent size
2768 unsigned bytes = 0;
2769 unsigned extents = 0;
2770 if (onode->onode.extent_map_shards.empty()) {
2771 bytes = inline_bl.length();
2772 extents = extent_map.size();
2773 } else {
2774 for (unsigned i = si_begin; i < si_end; ++i) {
2775 bytes += shards[i].shard_info->bytes;
2776 extents += shards[i].extents;
2777 }
2778 }
2779 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2780 unsigned slop = target *
2781 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2782 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2783 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2784 << ", slop " << slop << dendl;
2785
2786 // reshard
2787 unsigned estimate = 0;
31f18b77 2788 unsigned offset = needs_reshard_begin;
7c673cae
FG
2789 vector<bluestore_onode_t::shard_info> new_shard_info;
2790 unsigned max_blob_end = 0;
2791 Extent dummy(needs_reshard_begin);
2792 for (auto e = extent_map.lower_bound(dummy);
2793 e != extent_map.end();
2794 ++e) {
2795 if (e->logical_offset >= needs_reshard_end) {
2796 break;
2797 }
2798 dout(30) << " extent " << *e << dendl;
2799
2800 // disfavor shard boundaries that span a blob
2801 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2802 if (estimate &&
2803 estimate + extent_avg > target + (would_span ? slop : 0)) {
2804 // new shard
31f18b77 2805 if (offset == needs_reshard_begin) {
7c673cae
FG
2806 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2807 new_shard_info.back().offset = offset;
2808 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2809 << std::dec << dendl;
7c673cae
FG
2810 }
2811 offset = e->logical_offset;
2812 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2813 new_shard_info.back().offset = offset;
2814 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2815 << std::dec << dendl;
2816 estimate = 0;
2817 }
2818 estimate += extent_avg;
31f18b77
FG
2819 unsigned bs = e->blob_start();
2820 if (bs < spanning_scan_begin) {
2821 spanning_scan_begin = bs;
7c673cae
FG
2822 }
2823 uint32_t be = e->blob_end();
2824 if (be > max_blob_end) {
2825 max_blob_end = be;
2826 }
2827 if (be > spanning_scan_end) {
2828 spanning_scan_end = be;
2829 }
2830 }
2831 if (new_shard_info.empty() && (si_begin > 0 ||
2832 si_end < shards.size())) {
2833 // we resharded a partial range; we must produce at least one output
2834 // shard
2835 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2836 new_shard_info.back().offset = needs_reshard_begin;
2837 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2838 << std::dec << " (singleton degenerate case)" << dendl;
2839 }
2840
2841 auto& sv = onode->onode.extent_map_shards;
2842 dout(20) << __func__ << " new " << new_shard_info << dendl;
2843 dout(20) << __func__ << " old " << sv << dendl;
2844 if (sv.empty()) {
2845 // no old shards to keep
2846 sv.swap(new_shard_info);
2847 init_shards(true, true);
2848 } else {
2849 // splice in new shards
2850 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2851 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2852 sv.insert(
2853 sv.begin() + si_begin,
2854 new_shard_info.begin(),
2855 new_shard_info.end());
2856 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2857 si_end = si_begin + new_shard_info.size();
31f18b77 2858
11fdf7f2 2859 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2860
2861 // note that we need to update every shard_info of shards here,
2862 // as sv might have been totally re-allocated above
2863 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2864 shards[i].shard_info = &sv[i];
31f18b77
FG
2865 }
2866
2867 // mark newly added shards as dirty
2868 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2869 shards[i].loaded = true;
2870 shards[i].dirty = true;
2871 }
7c673cae
FG
2872 }
2873 dout(20) << __func__ << " fin " << sv << dendl;
2874 inline_bl.clear();
2875
2876 if (sv.empty()) {
2877 // no more shards; unspan all previously spanning blobs
2878 auto p = spanning_blob_map.begin();
2879 while (p != spanning_blob_map.end()) {
2880 p->second->id = -1;
2881 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2882 p = spanning_blob_map.erase(p);
2883 }
2884 } else {
2885 // identify new spanning blobs
2886 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2887 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2888 if (spanning_scan_begin < needs_reshard_begin) {
2889 fault_range(db, spanning_scan_begin,
2890 needs_reshard_begin - spanning_scan_begin);
2891 }
2892 if (spanning_scan_end > needs_reshard_end) {
2893 fault_range(db, needs_reshard_end,
31f18b77 2894 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2895 }
2896 auto sp = sv.begin() + si_begin;
2897 auto esp = sv.end();
2898 unsigned shard_start = sp->offset;
2899 unsigned shard_end;
2900 ++sp;
2901 if (sp == esp) {
2902 shard_end = OBJECT_MAX_SIZE;
2903 } else {
2904 shard_end = sp->offset;
2905 }
7c673cae 2906 Extent dummy(needs_reshard_begin);
9f95a23c
TL
2907
2908 bool was_too_many_blobs_check = false;
2909 auto too_many_blobs_threshold =
2910 g_conf()->bluestore_debug_too_many_blobs_threshold;
2911 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2912 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2913 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2914
7c673cae
FG
2915 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2916 if (e->logical_offset >= needs_reshard_end) {
2917 break;
2918 }
2919 dout(30) << " extent " << *e << dendl;
2920 while (e->logical_offset >= shard_end) {
2921 shard_start = shard_end;
11fdf7f2 2922 ceph_assert(sp != esp);
7c673cae
FG
2923 ++sp;
2924 if (sp == esp) {
2925 shard_end = OBJECT_MAX_SIZE;
2926 } else {
2927 shard_end = sp->offset;
2928 }
2929 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2930 << " to 0x" << shard_end << std::dec << dendl;
2931 }
9f95a23c 2932
7c673cae
FG
2933 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2934 if (!e->blob->is_spanning()) {
2935 // We have two options: (1) split the blob into pieces at the
2936 // shard boundaries (and adjust extents accordingly), or (2)
2937 // mark it spanning. We prefer to cut the blob if we can. Note that
2938 // we may have to split it multiple times--potentially at every
2939 // shard boundary.
2940 bool must_span = false;
2941 BlobRef b = e->blob;
2942 if (b->can_split()) {
2943 uint32_t bstart = e->blob_start();
2944 uint32_t bend = e->blob_end();
2945 for (const auto& sh : shards) {
2946 if (bstart < sh.shard_info->offset &&
2947 bend > sh.shard_info->offset) {
2948 uint32_t blob_offset = sh.shard_info->offset - bstart;
2949 if (b->can_split_at(blob_offset)) {
2950 dout(20) << __func__ << " splitting blob, bstart 0x"
2951 << std::hex << bstart << " blob_offset 0x"
2952 << blob_offset << std::dec << " " << *b << dendl;
2953 b = split_blob(b, blob_offset, sh.shard_info->offset);
2954 // switch b to the new right-hand side, in case it
2955 // *also* has to get split.
2956 bstart += blob_offset;
2957 onode->c->store->logger->inc(l_bluestore_blob_split);
2958 } else {
2959 must_span = true;
2960 break;
2961 }
2962 }
2963 }
2964 } else {
2965 must_span = true;
2966 }
2967 if (must_span) {
31f18b77
FG
2968 auto bid = allocate_spanning_blob_id();
2969 b->id = bid;
7c673cae
FG
2970 spanning_blob_map[b->id] = b;
2971 dout(20) << __func__ << " adding spanning " << *b << dendl;
9f95a23c
TL
2972 if (!was_too_many_blobs_check &&
2973 too_many_blobs_threshold &&
2974 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2975
2976 was_too_many_blobs_check = true;
2977 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2978 if (dumped_onodes[i].first == onode->oid) {
2979 oid_slot = &dumped_onodes[i];
2980 break;
2981 }
2982 if (!oldest_slot || (oldest_slot &&
2983 dumped_onodes[i].second < oldest_slot->second)) {
2984 oldest_slot = &dumped_onodes[i];
2985 }
2986 }
2987 }
7c673cae
FG
2988 }
2989 }
2990 } else {
2991 if (e->blob->is_spanning()) {
2992 spanning_blob_map.erase(e->blob->id);
2993 e->blob->id = -1;
2994 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2995 }
2996 }
2997 }
9f95a23c
TL
2998 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2999 (oid_slot &&
3000 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
3001 if (do_dump) {
3002 dout(0) << __func__
3003 << " spanning blob count exceeds threshold, "
3004 << spanning_blob_map.size() << " spanning blobs"
3005 << dendl;
3006 _dump_onode<0>(cct, *onode);
3007 if (oid_slot) {
3008 oid_slot->second = mono_clock::now();
3009 } else {
3010 ceph_assert(oldest_slot);
3011 oldest_slot->first = onode->oid;
3012 oldest_slot->second = mono_clock::now();
3013 }
3014 }
7c673cae
FG
3015 }
3016
3017 clear_needs_reshard();
3018}
3019
3020bool BlueStore::ExtentMap::encode_some(
3021 uint32_t offset,
3022 uint32_t length,
3023 bufferlist& bl,
3024 unsigned *pn)
3025{
7c673cae
FG
3026 Extent dummy(offset);
3027 auto start = extent_map.lower_bound(dummy);
3028 uint32_t end = offset + length;
3029
3030 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
3031 // serialization only. Hence there is no specific
3032 // handling at ExtentMap level.
3033
3034 unsigned n = 0;
3035 size_t bound = 0;
7c673cae
FG
3036 bool must_reshard = false;
3037 for (auto p = start;
3038 p != extent_map.end() && p->logical_offset < end;
3039 ++p, ++n) {
11fdf7f2 3040 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
3041 p->blob->last_encoded_id = -1;
3042 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
3043 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3044 << std::dec << " hit new spanning blob " << *p << dendl;
3045 request_reshard(p->blob_start(), p->blob_end());
3046 must_reshard = true;
3047 }
31f18b77
FG
3048 if (!must_reshard) {
3049 denc_varint(0, bound); // blobid
3050 denc_varint(0, bound); // logical_offset
3051 denc_varint(0, bound); // len
3052 denc_varint(0, bound); // blob_offset
7c673cae 3053
31f18b77
FG
3054 p->blob->bound_encode(
3055 bound,
3056 struct_v,
3057 p->blob->shared_blob->get_sbid(),
3058 false);
3059 }
7c673cae
FG
3060 }
3061 if (must_reshard) {
3062 return true;
3063 }
3064
31f18b77
FG
3065 denc(struct_v, bound);
3066 denc_varint(0, bound); // number of extents
3067
7c673cae
FG
3068 {
3069 auto app = bl.get_contiguous_appender(bound);
3070 denc(struct_v, app);
3071 denc_varint(n, app);
3072 if (pn) {
3073 *pn = n;
3074 }
3075
3076 n = 0;
3077 uint64_t pos = 0;
3078 uint64_t prev_len = 0;
3079 for (auto p = start;
3080 p != extent_map.end() && p->logical_offset < end;
3081 ++p, ++n) {
3082 unsigned blobid;
3083 bool include_blob = false;
3084 if (p->blob->is_spanning()) {
3085 blobid = p->blob->id << BLOBID_SHIFT_BITS;
3086 blobid |= BLOBID_FLAG_SPANNING;
3087 } else if (p->blob->last_encoded_id < 0) {
3088 p->blob->last_encoded_id = n + 1; // so it is always non-zero
3089 include_blob = true;
3090 blobid = 0; // the decoder will infer the id from n
3091 } else {
3092 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
3093 }
3094 if (p->logical_offset == pos) {
3095 blobid |= BLOBID_FLAG_CONTIGUOUS;
3096 }
3097 if (p->blob_offset == 0) {
3098 blobid |= BLOBID_FLAG_ZEROOFFSET;
3099 }
3100 if (p->length == prev_len) {
3101 blobid |= BLOBID_FLAG_SAMELENGTH;
3102 } else {
3103 prev_len = p->length;
3104 }
3105 denc_varint(blobid, app);
3106 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3107 denc_varint_lowz(p->logical_offset - pos, app);
3108 }
3109 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3110 denc_varint_lowz(p->blob_offset, app);
3111 }
3112 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3113 denc_varint_lowz(p->length, app);
3114 }
3115 pos = p->logical_end();
3116 if (include_blob) {
3117 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3118 }
3119 }
3120 }
3121 /*derr << __func__ << bl << dendl;
3122 derr << __func__ << ":";
3123 bl.hexdump(*_dout);
3124 *_dout << dendl;
3125 */
3126 return false;
3127}
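// Note: per-extent layout produced above. The low bits of the leading
// varint are flags and the remainder (above BLOBID_SHIFT_BITS) is
// either a spanning-blob id or a 1-based back-reference to a blob
// already encoded in this shard; 0 means "blob body follows inline".
// Flag meanings, as mirrored by decode_some() below:
//   BLOBID_FLAG_CONTIGUOUS - no logical_offset delta follows
//   BLOBID_FLAG_ZEROOFFSET - blob_offset is 0 and is not encoded
//   BLOBID_FLAG_SAMELENGTH - length repeats the previous extent's
//   BLOBID_FLAG_SPANNING   - the id refers to spanning_blob_map
// (Numeric flag values live in the header and are not restated here.)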
3128
3129unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3130{
7c673cae
FG
3131 /*
3132 derr << __func__ << ":";
3133 bl.hexdump(*_dout);
3134 *_dout << dendl;
3135 */
3136
11fdf7f2 3137 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
3138 auto p = bl.front().begin_deep();
3139 __u8 struct_v;
3140 denc(struct_v, p);
3141 // Version 2 differs from v1 in blob's ref_map
3142 // serialization only. Hence there is no specific
3143 // handling at ExtentMap level below.
11fdf7f2 3144 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
3145
3146 uint32_t num;
3147 denc_varint(num, p);
3148 vector<BlobRef> blobs(num);
3149 uint64_t pos = 0;
3150 uint64_t prev_len = 0;
3151 unsigned n = 0;
3152
3153 while (!p.end()) {
3154 Extent *le = new Extent();
3155 uint64_t blobid;
3156 denc_varint(blobid, p);
3157 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3158 uint64_t gap;
3159 denc_varint_lowz(gap, p);
3160 pos += gap;
3161 }
3162 le->logical_offset = pos;
3163 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3164 denc_varint_lowz(le->blob_offset, p);
3165 } else {
3166 le->blob_offset = 0;
3167 }
3168 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3169 denc_varint_lowz(prev_len, p);
3170 }
3171 le->length = prev_len;
3172
3173 if (blobid & BLOBID_FLAG_SPANNING) {
3174 dout(30) << __func__ << " getting spanning blob "
3175 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3176 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3177 } else {
3178 blobid >>= BLOBID_SHIFT_BITS;
3179 if (blobid) {
3180 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 3181 ceph_assert(le->blob);
7c673cae
FG
3182 } else {
3183 Blob *b = new Blob();
3184 uint64_t sbid = 0;
3185 b->decode(onode->c, p, struct_v, &sbid, false);
3186 blobs[n] = b;
3187 onode->c->open_shared_blob(sbid, b);
3188 le->assign_blob(b);
3189 }
3190 // we build ref_map dynamically for non-spanning blobs
3191 le->blob->get_ref(
3192 onode->c,
3193 le->blob_offset,
3194 le->length);
3195 }
3196 pos += prev_len;
3197 ++n;
3198 extent_map.insert(*le);
3199 }
3200
11fdf7f2 3201 ceph_assert(n == num);
7c673cae
FG
3202 return num;
3203}
3204
3205void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3206{
3207 // Version 2 differs from v1 in blob's ref_map
3208 // serialization only. Hence there is no specific
3209 // handling at ExtentMap level.
3210 __u8 struct_v = 2;
3211
3212 denc(struct_v, p);
3213 denc_varint((uint32_t)0, p);
3214 size_t key_size = 0;
3215 denc_varint((uint32_t)0, key_size);
3216 p += spanning_blob_map.size() * key_size;
3217 for (const auto& i : spanning_blob_map) {
3218 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3219 }
3220}
3221
3222void BlueStore::ExtentMap::encode_spanning_blobs(
3223 bufferlist::contiguous_appender& p)
3224{
3225 // Version 2 differs from v1 in blob's ref_map
3226 // serialization only. Hence there is no specific
3227 // handling at ExtentMap level.
3228 __u8 struct_v = 2;
3229
3230 denc(struct_v, p);
3231 denc_varint(spanning_blob_map.size(), p);
3232 for (auto& i : spanning_blob_map) {
3233 denc_varint(i.second->id, p);
3234 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3235 }
3236}
3237
3238void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 3239 bufferptr::const_iterator& p)
7c673cae
FG
3240{
3241 __u8 struct_v;
3242 denc(struct_v, p);
3243 // Version 2 differs from v1 in blob's ref_map
3244 // serialization only. Hence there is no specific
3245 // handling at ExtentMap level.
11fdf7f2 3246 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
3247
3248 unsigned n;
3249 denc_varint(n, p);
3250 while (n--) {
3251 BlobRef b(new Blob());
3252 denc_varint(b->id, p);
3253 spanning_blob_map[b->id] = b;
3254 uint64_t sbid = 0;
3255 b->decode(onode->c, p, struct_v, &sbid, true);
3256 onode->c->open_shared_blob(sbid, b);
3257 }
3258}
3259
3260void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3261{
3262 shards.resize(onode->onode.extent_map_shards.size());
3263 unsigned i = 0;
3264 for (auto &s : onode->onode.extent_map_shards) {
3265 shards[i].shard_info = &s;
3266 shards[i].loaded = loaded;
3267 shards[i].dirty = dirty;
3268 ++i;
3269 }
3270}
3271
3272void BlueStore::ExtentMap::fault_range(
3273 KeyValueDB *db,
3274 uint32_t offset,
3275 uint32_t length)
3276{
7c673cae
FG
3277 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3278 << std::dec << dendl;
3279 auto start = seek_shard(offset);
3280 auto last = seek_shard(offset + length);
3281
3282 if (start < 0)
3283 return;
3284
11fdf7f2 3285 ceph_assert(last >= start);
7c673cae
FG
3286 string key;
3287 while (start <= last) {
11fdf7f2 3288 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3289 auto p = &shards[start];
3290 if (!p->loaded) {
3291 dout(30) << __func__ << " opening shard 0x" << std::hex
3292 << p->shard_info->offset << std::dec << dendl;
3293 bufferlist v;
3294 generate_extent_shard_key_and_apply(
3295 onode->key, p->shard_info->offset, &key,
3296 [&](const string& final_key) {
3297 int r = db->get(PREFIX_OBJ, final_key, &v);
3298 if (r < 0) {
3299 derr << __func__ << " missing shard 0x" << std::hex
3300 << p->shard_info->offset << std::dec << " for " << onode->oid
3301 << dendl;
11fdf7f2 3302 ceph_assert(r >= 0);
7c673cae
FG
3303 }
3304 }
3305 );
3306 p->extents = decode_some(v);
3307 p->loaded = true;
3308 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
3309 << p->shard_info->offset
3310 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 3311 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
3312 ceph_assert(p->dirty == false);
3313 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
3314 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3315 } else {
3316 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3317 }
3318 ++start;
3319 }
3320}
3321
3322void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
3323 uint32_t offset,
3324 uint32_t length)
3325{
7c673cae
FG
3326 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3327 << std::dec << dendl;
3328 if (shards.empty()) {
3329 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3330 inline_bl.clear();
3331 return;
3332 }
3333 auto start = seek_shard(offset);
11fdf7f2
TL
3334 if (length == 0) {
3335 length = 1;
3336 }
3337 auto last = seek_shard(offset + length - 1);
7c673cae
FG
3338 if (start < 0)
3339 return;
3340
11fdf7f2 3341 ceph_assert(last >= start);
7c673cae 3342 while (start <= last) {
11fdf7f2 3343 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3344 auto p = &shards[start];
3345 if (!p->loaded) {
11fdf7f2
TL
 3346 derr << __func__ << " on write 0x" << std::hex << offset
3347 << "~" << length << " shard 0x" << p->shard_info->offset
3348 << std::dec << " is not loaded, can't mark dirty" << dendl;
3349 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
3350 }
3351 if (!p->dirty) {
3352 dout(20) << __func__ << " mark shard 0x" << std::hex
3353 << p->shard_info->offset << std::dec << " dirty" << dendl;
3354 p->dirty = true;
3355 }
3356 ++start;
3357 }
3358}
3359
3360BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3361 uint64_t offset)
3362{
3363 Extent dummy(offset);
3364 return extent_map.find(dummy);
3365}
3366
7c673cae
FG
3367BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3368 uint64_t offset)
3369{
3370 Extent dummy(offset);
3371 auto fp = extent_map.lower_bound(dummy);
3372 if (fp != extent_map.begin()) {
3373 --fp;
3374 if (fp->logical_end() <= offset) {
3375 ++fp;
3376 }
3377 }
3378 return fp;
3379}
3380
3381BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3382 uint64_t offset) const
3383{
3384 Extent dummy(offset);
3385 auto fp = extent_map.lower_bound(dummy);
3386 if (fp != extent_map.begin()) {
3387 --fp;
3388 if (fp->logical_end() <= offset) {
3389 ++fp;
3390 }
3391 }
3392 return fp;
3393}
3394
3395bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3396{
3397 auto fp = seek_lextent(offset);
3398 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3399 return false;
3400 }
3401 return true;
3402}
3403
3404int BlueStore::ExtentMap::compress_extent_map(
3405 uint64_t offset,
3406 uint64_t length)
3407{
7c673cae
FG
3408 if (extent_map.empty())
3409 return 0;
3410 int removed = 0;
3411 auto p = seek_lextent(offset);
3412 if (p != extent_map.begin()) {
3413 --p; // start to the left of offset
3414 }
3415 // the caller should have just written to this region
11fdf7f2 3416 ceph_assert(p != extent_map.end());
7c673cae
FG
3417
3418 // identify the *next* shard
3419 auto pshard = shards.begin();
3420 while (pshard != shards.end() &&
3421 p->logical_offset >= pshard->shard_info->offset) {
3422 ++pshard;
3423 }
3424 uint64_t shard_end;
3425 if (pshard != shards.end()) {
3426 shard_end = pshard->shard_info->offset;
3427 } else {
3428 shard_end = OBJECT_MAX_SIZE;
3429 }
3430
3431 auto n = p;
3432 for (++n; n != extent_map.end(); p = n++) {
3433 if (n->logical_offset > offset + length) {
3434 break; // stop after end
3435 }
3436 while (n != extent_map.end() &&
3437 p->logical_end() == n->logical_offset &&
3438 p->blob == n->blob &&
3439 p->blob_offset + p->length == n->blob_offset &&
3440 n->logical_offset < shard_end) {
3441 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3442 << " next shard 0x" << shard_end << std::dec
3443 << " merging " << *p << " and " << *n << dendl;
3444 p->length += n->length;
3445 rm(n++);
3446 ++removed;
3447 }
3448 if (n == extent_map.end()) {
3449 break;
3450 }
3451 if (n->logical_offset >= shard_end) {
11fdf7f2 3452 ceph_assert(pshard != shards.end());
7c673cae
FG
3453 ++pshard;
3454 if (pshard != shards.end()) {
3455 shard_end = pshard->shard_info->offset;
3456 } else {
3457 shard_end = OBJECT_MAX_SIZE;
3458 }
3459 }
3460 }
11fdf7f2 3461 if (removed) {
7c673cae
FG
3462 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3463 }
3464 return removed;
3465}
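// Note: "compress" here means coalescing adjacent lextents, not data
// compression. Two neighbours merge only when all of the following
// hold, mirroring the loop condition above:
//   p->logical_end() == n->logical_offset         (logically contiguous)
//   p->blob == n->blob                            (same blob)
//   p->blob_offset + p->length == n->blob_offset  (contiguous in blob)
//   n->logical_offset < shard_end                 (no shard crossing)
// e.g. [0~4K)@blobA+0 and [4K~4K)@blobA+4K fold into [0~8K)@blobA+0.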
3466
3467void BlueStore::ExtentMap::punch_hole(
3468 CollectionRef &c,
3469 uint64_t offset,
3470 uint64_t length,
3471 old_extent_map_t *old_extents)
3472{
3473 auto p = seek_lextent(offset);
3474 uint64_t end = offset + length;
3475 while (p != extent_map.end()) {
3476 if (p->logical_offset >= end) {
3477 break;
3478 }
3479 if (p->logical_offset < offset) {
3480 if (p->logical_end() > end) {
3481 // split and deref middle
3482 uint64_t front = offset - p->logical_offset;
3483 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3484 length, p->blob);
3485 old_extents->push_back(*oe);
3486 add(end,
3487 p->blob_offset + front + length,
3488 p->length - front - length,
3489 p->blob);
3490 p->length = front;
3491 break;
3492 } else {
3493 // deref tail
11fdf7f2 3494 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3495 uint64_t keep = offset - p->logical_offset;
3496 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3497 p->length - keep, p->blob);
3498 old_extents->push_back(*oe);
3499 p->length = keep;
3500 ++p;
3501 continue;
3502 }
3503 }
3504 if (p->logical_offset + p->length <= end) {
3505 // deref whole lextent
3506 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3507 p->length, p->blob);
3508 old_extents->push_back(*oe);
3509 rm(p++);
3510 continue;
3511 }
3512 // deref head
3513 uint64_t keep = p->logical_end() - end;
3514 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3515 p->length - keep, p->blob);
3516 old_extents->push_back(*oe);
3517
3518 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3519 rm(p);
3520 break;
3521 }
3522}
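// Note: punch_hole() resolves each overlapping lextent into one of four
// cases for a hole [offset, end):
//   1. straddles both edges -> keep the front, deref the middle,
//      re-add the part beyond 'end' ("split and deref middle");
//   2. crosses only 'offset' -> truncate in place, deref the tail;
//   3. wholly inside the hole -> deref and remove the whole lextent;
//   4. crosses only 'end'     -> deref the head, re-add the remainder.
// Every dereferenced piece is queued on old_extents so the caller can
// release the underlying blob space once the transaction commits.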
3523
3524BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3525 CollectionRef &c,
3526 uint64_t logical_offset,
3527 uint64_t blob_offset, uint64_t length, BlobRef b,
3528 old_extent_map_t *old_extents)
3529{
 3530 // We need a completely initialized Blob to increment its ref counters.
11fdf7f2 3531 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3532
 3533 // Do get_ref prior to punch_hole to avoid putting a reused blob into
 3534 // the old_extents list if we overwrite the blob totally.
3535 // This might happen during WAL overwrite.
3536 b->get_ref(onode->c, blob_offset, length);
3537
3538 if (old_extents) {
3539 punch_hole(c, logical_offset, length, old_extents);
3540 }
3541
3542 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3543 extent_map.insert(*le);
3544 if (spans_shard(logical_offset, length)) {
3545 request_reshard(logical_offset, logical_offset + length);
3546 }
3547 return le;
3548}
3549
3550BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3551 BlobRef lb,
3552 uint32_t blob_offset,
3553 uint32_t pos)
3554{
7c673cae
FG
3555 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3556 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3557 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3558 << dendl;
3559 BlobRef rb = onode->c->new_blob();
3560 lb->split(onode->c, blob_offset, rb.get());
3561
3562 for (auto ep = seek_lextent(pos);
3563 ep != extent_map.end() && ep->logical_offset < end_pos;
3564 ++ep) {
3565 if (ep->blob != lb) {
3566 continue;
3567 }
3568 if (ep->logical_offset < pos) {
3569 // split extent
3570 size_t left = pos - ep->logical_offset;
3571 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3572 extent_map.insert(*ne);
3573 ep->length = left;
3574 dout(30) << __func__ << " split " << *ep << dendl;
3575 dout(30) << __func__ << " to " << *ne << dendl;
3576 } else {
3577 // switch blob
11fdf7f2 3578 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3579
3580 ep->blob = rb;
3581 ep->blob_offset -= blob_offset;
3582 dout(30) << __func__ << " adjusted " << *ep << dendl;
3583 }
3584 }
3585 return rb;
3586}
3587
// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
{
  if (bluestore_onode_t::is_pgmeta_omap(flags)) {
    return PREFIX_PGMETA_OMAP;
  }
  if (bluestore_onode_t::is_perpg_omap(flags)) {
    return PREFIX_PERPG_OMAP;
  }
  if (bluestore_onode_t::is_perpool_omap(flags)) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

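// All omap keys for one onode share a common prefix (pool id and pg
// hash for the per-pg layout, pool id alone for per-pool, nothing extra
// for pgmeta/legacy; then the nid) followed by a separator byte: '-'
// marks the header, '.' prefixes user keys, '~' marks the tail. Since
// '-' < '.' < '~' in ASCII, a range scan from the header key to the
// tail key visits exactly this onode's entries. Illustrative per-pool
// layout:
//
//   <pool><nid>'-'             omap header
//   <pool><nid>'.'<user key>   one row per user key
//   <pool><nid>'~'             tail sentinel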
// '-' < '.' < '~'
void BlueStore::Onode::calc_omap_header(
  uint8_t flags,
  const Onode* o,
  std::string* out)
{
  if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
    if (bluestore_onode_t::is_perpg_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
      _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
    } else if (bluestore_onode_t::is_perpool_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
    }
  }
  _key_encode_u64(o->onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::calc_omap_key(uint8_t flags,
                                     const Onode* o,
                                     const std::string& key,
                                     std::string* out)
{
  if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
    if (bluestore_onode_t::is_perpg_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
      _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
    } else if (bluestore_onode_t::is_perpool_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
    }
  }
  _key_encode_u64(o->onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::calc_omap_tail(
  uint8_t flags,
  const Onode* o,
  std::string* out)
{
  if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
    if (bluestore_onode_t::is_perpg_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
      _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
    } else if (bluestore_onode_t::is_perpool_omap(flags)) {
      _key_encode_u64(o->c->pool(), out);
    }
  }
  _key_encode_u64(o->onode.nid, out);
  out->push_back('~');
}

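// Reference counting with cache pinning: get() pins the onode in its
// OnodeCacheShard once a second reference appears, and put() unpins it
// (and, for no-longer-existing onodes, removes it from the map) when
// the external references drop away. put_nref defers the final delete
// until all concurrent put() calls have finished.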
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that, while we were waiting for the lock,
    // split_cache moved us to a different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool was_pinned = pinned;
    pinned = nref >= 2;
    bool r = !was_pinned && pinned;
    if (cached && r) {
      ocs->_pin(this);
    }
    ocs->lock.unlock();
  }
}
void BlueStore::Onode::put() {
  ++put_nref;
  int n = --nref;
  if (n == 1) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that, while we were waiting for the lock,
    // split_cache moved us to a different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool need_unpin = pinned;
    pinned = pinned && nref >= 2;
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref
        c->onode_map._remove(oid);
      }
    }
    ocs->lock.unlock();
  }
  auto pn = --put_nref;
  if (nref == 0 && pn == 0) {
    delete this;
  }
}

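// Materializes an Onode from its serialized key/value representation:
// decodes the onode metadata, moves attrs into the cache mempool, and
// either decodes the inline extent map or sets up its shards for lazy
// loading.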
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}
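// Inverse of calc_omap_key: strips the onode-specific prefix (pool id
// and pg hash where present, the nid, and the separator byte) from a
// raw omap db key, leaving just the user-visible key.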
void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  size_t pos = sizeof(uint64_t) + 1;
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      pos += sizeof(uint64_t) + sizeof(uint32_t);
    } else if (onode.is_perpool_omap()) {
      pos += sizeof(uint64_t);
    }
  }
  *user_key = key.substr(pos);
}

// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}

// =======================================================

// DeferredBatch
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

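// Queues a deferred write at [offset, offset+length) for transaction
// seq, first discarding any overlapping bytes already queued so that
// iomap always holds non-overlapping, most-recent data.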
void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second); // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}

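// Carves [offset, offset+length) out of the queued IO map: trims the
// head and/or tail of any overlapping entry (re-inserting a kept tail
// under its new offset), drops entries fully covered by the range, and
// keeps the per-seq byte accounting in seq_bytes in step with every
// byte removed.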
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << " keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << " keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << " truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << " drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}

void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0; // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}


// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}

void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {

    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}

void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}

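// Looks up (or, with create=true, instantiates) the onode for oid:
// first in the per-collection onode cache, then in the kv store. With
// is_createop set the kv lookup is skipped entirely, since the caller
// already knows the object is about to be created.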
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}

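// Migrates the onodes (and their shared blobs and cached buffers) that
// now belong to child collection dest out of this collection's caches,
// as part of a PG split.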
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  auto *ocache = get_onode_cache();
  auto *ocache_dest = dest->get_onode_cache();

  // lock cache shards
  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
  std::lock_guard l(ocache->lock, std::adopt_lock);
  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
  std::lock_guard l3(cache->lock, std::adopt_lock);
  std::lock_guard l4(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensure that nref stays >= 2, so the onode remains pinned and
      // physically out of the cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (o->cached) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers. cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}

// =======================================================

// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

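// Background thread: periodically rotates cache age bins, rebalances
// memory between the kv/meta/data caches (via PriorityCache when
// autotuning), resizes the cache shards, and force-submits deferred
// writes that have been pending longer than max_defer_interval.
//
// Worked example of the max-memory computation below (illustrative
// values, not defaults from this source): with osd_memory_target =
// 4 GiB, osd_memory_expected_fragmentation = 0.15, osd_memory_base =
// 768 MiB and osd_memory_cache_min = 128 MiB, ltarget = 0.85 * 4 GiB
// ~= 3.4 GiB, which exceeds base + min, so max = ltarget - base
// ~= 2.65 GiB of cache.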
void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true, "bluestore-pricache");
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
    if (binned_kv_onode_cache != nullptr) {
      pcm->insert("kv_onode", binned_kv_onode_cache, true);
    }
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_bin_rotation = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // define various intervals for background work
    double age_bin_interval = store->cache_age_bin_interval;
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;
    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    // alloc stats dump
    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    // cache age binning
    if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
      if (binned_kv_cache != nullptr) {
        binned_kv_cache->import_bins(store->kv_bins);
      }
      if (binned_kv_onode_cache != nullptr) {
        binned_kv_onode_cache->import_bins(store->kv_onode_bins);
      }
      meta_cache->import_bins(store->meta_bins);
      data_cache->import_bins(store->data_bins);

      if (pcm != nullptr) {
        pcm->shift_bins();
      }
      next_bin_rotation = ceph_clock_now();
      next_bin_rotation += age_bin_interval;
    }
    // cache balancing
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      if (binned_kv_cache != nullptr) {
        binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
      }
      if (binned_kv_onode_cache != nullptr) {
        binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
      }
      meta_cache->set_cache_ratio(store->cache_meta_ratio);
      data_cache->set_cache_ratio(store->cache_data_ratio);

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    // memory resizing (i.e. autotuning)
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }
    // deferred force submit
    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}

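// Recomputes per-shard onode and buffer limits from the current cache
// allocations (PriorityCache-committed sizes when autotuning, fixed
// ratios of cache_size otherwise) and pushes them down to every onode
// and buffer cache shard.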
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t kv_onode_alloc =
    static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
    if (binned_kv_onode_cache != nullptr) {
      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
    }
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " kv_onode_alloc: " << kv_onode_alloc
            << " kv_onode_used: " << kv_onode_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " kv_onode_alloc: " << kv_onode_alloc
             << " kv_onode_used: " << kv_onode_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}

void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

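// The iterator position is valid only while the raw key stays strictly
// below the tail sentinel; at or past the tail we have left this
// onode's omap range.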
bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct

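// C-style trampolines handed to the block device layer (registered
// elsewhere in this file, when the device is opened): they recover the
// BlueStore instance and per-request context from the opaque priv
// pointers and dispatch to the real completion handlers.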
static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
                     const string& path,
                     uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
#ifdef HAVE_LIBZBD
    zoned_cleaner_thread(this),
#endif
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

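// Config observer interface: the keys listed below are re-read at
// runtime, and handle_conf_change() re-derives whatever cached state
// depends on each of them.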
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_cache_age_bin_interval",
    "bluestore_cache_kv_age_bins",
    "bluestore_cache_kv_onode_age_bins",
    "bluestore_cache_meta_age_bins",
    "bluestore_cache_data_age_bins",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_warn_on_no_per_pg_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
      changed.count("bluestore_warn_on_no_per_pg_omap")) {
    _check_no_per_pg_or_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}
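// Re-derives the compression mode and the min/max blob sizes (falling
// back to HDD or SSD defaults when unset) and instantiates the
// configured compressor, raising or clearing health alerts on bad
// values.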

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
           << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

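// Derives the global cache size and the kv/kv_onode/meta/data split
// from configuration. Each ratio must lie in [0, 1] and the data share
// is whatever remains. For example (sample values, not defaults from
// this source): kv 0.40 + kv_onode 0.05 + meta 0.45 leaves
// data = 1.0 - 0.40 - 0.05 - 0.45 = 0.10 of the cache.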
int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  cache_age_bin_interval =
    cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
  auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
  {
    std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
    std::istringstream interval_stream(intervals_str);
    std::copy(
      std::istream_iterator<uint64_t>(interval_stream),
      std::istream_iterator<uint64_t>(),
      std::back_inserter(*intervals));
  };
  _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
  _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
  _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
  _set_bin("bluestore_cache_age_bins_data", &data_bins);

  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio = (double)1.0 -
                     (double)cache_meta_ratio -
                     (double)cache_kv_ratio -
                     (double)cache_kv_onode_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
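// write_meta/read_meta mirror ObjectStore metadata into the block
// device label when one is present; reads prefer the label copy and
// fall back to the data directory.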
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

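// Builds the "bluestore" PerfCounters set: every l_bluestore_* index
// is registered here with its name, description, priority and unit,
// grouped by subsystem (space usage, op states, transactions, reads,
// kv threads, writes, compression, caches).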
4975void BlueStore::_init_logger()
4976{
4977 PerfCountersBuilder b(cct, "bluestore",
4978 l_bluestore_first, l_bluestore_last);
20effc67
TL
4979
4980 // space utilization stats
4981 //****************************************
4982 b.add_u64(l_bluestore_allocated, "allocated",
4983 "Sum for allocated bytes",
4984 "al_b",
4985 PerfCountersBuilder::PRIO_CRITICAL,
4986 unit_t(UNIT_BYTES));
4987 b.add_u64(l_bluestore_stored, "stored",
4988 "Sum for stored bytes",
4989 "st_b",
4990 PerfCountersBuilder::PRIO_CRITICAL,
4991 unit_t(UNIT_BYTES));
4992 b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
4993 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4994 b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
4995 "allocation unit size in bytes",
4996 "au_b",
4997 PerfCountersBuilder::PRIO_CRITICAL,
4998 unit_t(UNIT_BYTES));
4999 //****************************************
5000
5001 // Update op processing state latencies
5002 //****************************************
7c673cae 5003 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
20effc67
TL
5004 "Average prepare state latency",
5005 "sprl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
5006 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
5007 "Average aio_wait state latency",
20effc67 5008 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 5009 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
20effc67
TL
5010 "Average io_done state latency",
5011 "sidl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5012 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
20effc67
TL
5013 "Average kv_queued state latency",
5014 "skql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5015 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
20effc67
TL
5016 "Average kv_commiting state latency",
5017 "skcl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5018 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
20effc67
TL
5019 "Average kv_done state latency",
5020 "skdl", PerfCountersBuilder::PRIO_USEFUL);
5021 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
5022 "Average finishing state latency",
5023 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
5024 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
5025 "Average done state latency",
5026 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5027 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
20effc67
TL
5028 "Average deferred_queued state latency",
5029 "sdql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5030 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
20effc67
TL
5031 "Average aio_wait state latency",
5032 "sdal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5033 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
20effc67
TL
5034 "Average cleanup state latency",
5035 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
5036 //****************************************
5037
5038 // Update Transaction stats
5039 //****************************************
5040 b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
7c673cae
FG
5041 "Average submit throttle latency",
5042 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5043 b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
7c673cae
FG
5044 "Average submit latency",
5045 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5046 b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
7c673cae
FG
5047 "Average commit latency",
5048 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67
TL
5049 b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
5050 //****************************************
5051
5052 // Read op stats
5053 //****************************************
7c673cae 5054 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
20effc67
TL
5055 "Average read onode metadata latency",
5056 "roml", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5057 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
20effc67
TL
5058 "Average read I/O waiting latency",
5059 "rwal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5060 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
20effc67
TL
5061 "Average checksum latency",
5062 "csml", PerfCountersBuilder::PRIO_USEFUL);
5063 b.add_u64_counter(l_bluestore_read_eio, "read_eio",
5064 "Read EIO errors propagated to high level callers");
5065 b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
5066 "Read operations that required at least one retry due to failed checksum validation",
5067 "rd_r", PerfCountersBuilder::PRIO_USEFUL);
5068 b.add_time_avg(l_bluestore_read_lat, "read_lat",
5069 "Average read latency",
5070 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
5071 //****************************************
5072
5073 // kv_thread latencies
5074 //****************************************
5075 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
5076 "Average kv_thread flush latency",
5077 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
5078 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
5079 "Average kv_thread commit latency",
5080 "kcol", PerfCountersBuilder::PRIO_USEFUL);
5081 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
5082 "Average kv_sync thread latency",
5083 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
5084 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
5085 "Average kv_finalize thread latency",
5086 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
5087 //****************************************
5088
5089 // write op stats
5090 //****************************************
5091 b.add_u64_counter(l_bluestore_write_big, "write_big",
7c673cae 5092 "Large aligned writes into fresh blobs");
20effc67
TL
5093 b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
5094 "Large aligned writes into fresh blobs (bytes)",
5095 NULL,
5096 PerfCountersBuilder::PRIO_DEBUGONLY,
5097 unit_t(UNIT_BYTES));
5098 b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
7c673cae 5099 "Large aligned writes into fresh blobs (blobs)");
f67539c2 5100 b.add_u64_counter(l_bluestore_write_big_deferred,
20effc67 5101 "write_big_deferred",
f67539c2 5102 "Big overwrites using deferred");
20effc67
TL
5103
5104 b.add_u64_counter(l_bluestore_write_small, "write_small",
7c673cae 5105 "Small writes into existing or sparse small blobs");
20effc67
TL
5106 b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
5107 "Small writes into existing or sparse small blobs (bytes)",
5108 NULL,
5109 PerfCountersBuilder::PRIO_DEBUGONLY,
5110 unit_t(UNIT_BYTES));
7c673cae 5111 b.add_u64_counter(l_bluestore_write_small_unused,
20effc67 5112 "write_small_unused",
7c673cae 5113 "Small writes into unused portion of existing blob");
7c673cae 5114 b.add_u64_counter(l_bluestore_write_small_pre_read,
20effc67 5115 "write_small_pre_read",
7c673cae
FG
5116 "Small writes that required we read some data (possibly "
5117 "cached) to fill out the block");
7c673cae 5118
20effc67
TL
5119 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
5120 "Sum for write-op padded bytes",
5121 NULL,
5122 PerfCountersBuilder::PRIO_DEBUGONLY,
5123 unit_t(UNIT_BYTES));
5124 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
5125 "Sum for write penalty read ops");
5126 b.add_u64_counter(l_bluestore_write_new, "write_new",
5127 "Write into new blob");
5128
5129 b.add_u64_counter(l_bluestore_issued_deferred_writes,
5130 "issued_deferred_writes",
5131 "Total deferred writes issued");
5132 b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
5133 "issued_deferred_write_bytes",
5134 "Total bytes in issued deferred writes",
5135 NULL,
5136 PerfCountersBuilder::PRIO_DEBUGONLY,
5137 unit_t(UNIT_BYTES));
5138 b.add_u64_counter(l_bluestore_submitted_deferred_writes,
5139 "submitted_deferred_writes",
5140 "Total deferred writes submitted to disk");
5141 b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
5142 "submitted_deferred_write_bytes",
5143 "Total bytes submitted to disk by deferred writes",
5144 NULL,
5145 PerfCountersBuilder::PRIO_DEBUGONLY,
5146 unit_t(UNIT_BYTES));
5147
5148 b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
5149 "write_big_skipped_blobs",
5150 "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
5151 b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
5152 "write_big_skipped_bytes",
5153 "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
5154 b.add_u64_counter(l_bluestore_write_small_skipped,
5155 "write_small_skipped",
5156 "Small writes into existing or sparse small blobs skipped due to zero detection");
5157 b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
5158 "write_small_skipped_bytes",
5159 "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
5160 //****************************************
5161
5162 // compression stats
5163 //****************************************
5164 b.add_u64(l_bluestore_compressed, "compressed",
5165 "Sum for stored compressed bytes",
5166 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5167 b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
5168 "Sum for bytes allocated for compressed data",
5169 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5170 b.add_u64(l_bluestore_compressed_original, "compressed_original",
5171 "Sum for original bytes that were compressed",
5172 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
5173 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
5174 "Average compress latency",
5175 "_cpl", PerfCountersBuilder::PRIO_USEFUL);
5176 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
5177 "Average decompress latency",
5178 "dcpl", PerfCountersBuilder::PRIO_USEFUL);
5179 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
5180 "Sum for beneficial compress ops");
5181 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
5182 "Sum for compress ops rejected due to low net gain of space");
5183 //****************************************
5184
5185 // onode cache stats
5186 //****************************************
5187 b.add_u64(l_bluestore_onodes, "onodes",
5188 "Number of onodes in cache");
5189 b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
5190 "Number of pinned onodes in cache");
5191 b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
5192 "Count of onode cache lookup hits",
5193 "o_ht", PerfCountersBuilder::PRIO_USEFUL);
5194 b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
5195 "Count of onode cache lookup misses",
5196 "o_ms", PerfCountersBuilder::PRIO_USEFUL);
5197 b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
5198 "Count of onode shard cache lookups hits");
5199 b.add_u64_counter(l_bluestore_onode_shard_misses,
5200 "onode_shard_misses",
5201 "Count of onode shard cache lookups misses");
5202 b.add_u64(l_bluestore_extents, "onode_extents",
5203 "Number of extents in cache");
5204 b.add_u64(l_bluestore_blobs, "onode_blobs",
5205 "Number of blobs in cache");
5206 //****************************************
5207
5208 // buffer cache stats
5209 //****************************************
5210 b.add_u64(l_bluestore_buffers, "buffers",
5211 "Number of buffers in cache");
5212 b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
5213 "Number of buffer bytes in cache",
5214 NULL,
5215 PerfCountersBuilder::PRIO_DEBUGONLY,
5216 unit_t(UNIT_BYTES));
5217 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
5218 "Sum for bytes of read hit in the cache",
5219 NULL,
5220 PerfCountersBuilder::PRIO_DEBUGONLY,
5221 unit_t(UNIT_BYTES));
5222 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
5223 "Sum for bytes of read missed in the cache",
5224 NULL,
5225 PerfCountersBuilder::PRIO_DEBUGONLY,
5226 unit_t(UNIT_BYTES));
5227 //****************************************
5228
5229 // internal stats
5230 //****************************************
5231 b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
5232 "Onode extent map reshard events");
5233 b.add_u64_counter(l_bluestore_blob_split, "blob_split",
7c673cae 5234 "Sum for blob splitting due to resharding");
20effc67 5235 b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
7c673cae 5236 "Sum for extents that have been removed due to compression");
20effc67 5237 b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
7c673cae
FG
5238 "Sum for extents that have been merged due to garbage "
5239 "collection");
20effc67
TL
5240 //****************************************
5241
5242 // other client ops latencies
5243 //****************************************
11fdf7f2 5244 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
20effc67
TL
5245 "Average omap iterator seek_to_first call latency",
5246 "osfl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5247 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
20effc67
TL
5248 "Average omap iterator upper_bound call latency",
5249 "oubl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5250 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
20effc67
TL
5251 "Average omap iterator lower_bound call latency",
5252 "olbl", PerfCountersBuilder::PRIO_USEFUL);
11fdf7f2 5253 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
20effc67
TL
5254 "Average omap iterator next call latency",
5255 "onxl", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5256 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
20effc67
TL
5257 "Average omap get_keys call latency",
5258 "ogkl", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5259 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
20effc67
TL
5260 "Average omap get_values call latency",
5261 "ogvl", PerfCountersBuilder::PRIO_USEFUL);
5262 b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
5263 "Average omap clear call latency");
494da23a 5264 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
20effc67
TL
5265 "Average collection listing latency",
5266 "cl_l", PerfCountersBuilder::PRIO_USEFUL);
adb31ebb 5267 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
20effc67
TL
5268 "Average removal latency",
5269 "rm_l", PerfCountersBuilder::PRIO_USEFUL);
5270 b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
5271 "Average truncate latency",
5272 "tr_l", PerfCountersBuilder::PRIO_USEFUL);
5273 //****************************************
5274
5275 // Resulting size axis configuration for op histograms, values are in bytes
5276 PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
5277 "Given size (bytes)",
5278 PerfHistogramCommon::SCALE_LOG2, ///< Given size in logarithmic scale
5279 0, ///< Start at 0
5280 4096, ///< Quantization unit
5281 13, ///< Enough to cover 4+M requests
5282 };
5283 // Req size axis configuration for op histograms, values are in bytes
5284 PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
5285 "Request size (bytes)",
5286 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
5287 0, ///< Start at 0
5288 4096, ///< Quantization unit
5289 13, ///< Enough to cover 4+M requests
5290 };
5291 b.add_u64_counter_histogram(
5292 l_bluestore_allocate_hist, "allocate_histogram",
5293 alloc_hist_x_axis_config, alloc_hist_y_axis_config,
5294 "Histogram of requested block allocations vs. given ones");
adb31ebb 5295
7c673cae
FG
5296 logger = b.create_perf_counters();
5297 cct->get_perfcounters_collection()->add(logger);
5298}
5299
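// A minimal usage sketch (illustrative; these exact call sites are
// hypothetical, but inc()/tinc() are the standard PerfCounters calls
// used for counters registered the way _open_logger does above):
//
//   logger->inc(l_bluestore_write_big);                // count an event
//   logger->inc(l_bluestore_write_big_bytes, length);  // add a byte total
//   logger->tinc(l_bluestore_read_lat,
//                ceph_clock_now() - start);            // feed a time-avg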
5300int BlueStore::_reload_logger()
5301{
5302 struct store_statfs_t store_statfs;
7c673cae 5303 int r = statfs(&store_statfs);
11fdf7f2 5304 if (r >= 0) {
7c673cae 5305 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
5306 logger->set(l_bluestore_stored, store_statfs.data_stored);
5307 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5308 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5309 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
5310 }
5311 return r;
5312}
5313
5314void BlueStore::_shutdown_logger()
5315{
5316 cct->get_perfcounters_collection()->remove(logger);
5317 delete logger;
5318}
5319
5320int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5321 uuid_d *fsid)
5322{
5323 bluestore_bdev_label_t label;
5324 int r = _read_bdev_label(cct, path, &label);
5325 if (r < 0)
5326 return r;
5327 *fsid = label.osd_uuid;
5328 return 0;
5329}
5330
5331int BlueStore::_open_path()
5332{
b32b8144 5333 // sanity check(s)
11fdf7f2 5334 ceph_assert(path_fd < 0);
91327a77 5335 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
5336 if (path_fd < 0) {
5337 int r = -errno;
5338 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5339 << dendl;
5340 return r;
5341 }
5342 return 0;
5343}
5344
5345void BlueStore::_close_path()
5346{
5347 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5348 path_fd = -1;
5349}
5350
3efd9988 5351int BlueStore::_write_bdev_label(CephContext *cct,
20effc67 5352 const string &path, bluestore_bdev_label_t label)
7c673cae
FG
5353{
5354 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5355 bufferlist bl;
11fdf7f2 5356 encode(label, bl);
7c673cae 5357 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
5358 encode(crc, bl);
5359 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
5360 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5361 z.zero();
5362 bl.append(std::move(z));
5363
91327a77 5364 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
7c673cae
FG
5365 if (fd < 0) {
5366 fd = -errno;
5367 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5368 << dendl;
5369 return fd;
5370 }
5371 int r = bl.write_fd(fd);
5372 if (r < 0) {
5373 derr << __func__ << " failed to write to " << path
5374 << ": " << cpp_strerror(r) << dendl;
11fdf7f2 5375 goto out;
7c673cae 5376 }
3efd9988
FG
5377 r = ::fsync(fd);
5378 if (r < 0) {
5379 derr << __func__ << " failed to fsync " << path
5380 << ": " << cpp_strerror(r) << dendl;
5381 }
11fdf7f2 5382out:
7c673cae
FG
5383 VOID_TEMP_FAILURE_RETRY(::close(fd));
5384 return r;
5385}
5386
20effc67 5387int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
7c673cae
FG
5388 bluestore_bdev_label_t *label)
5389{
5390 dout(10) << __func__ << dendl;
91327a77 5391 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
5392 if (fd < 0) {
5393 fd = -errno;
5394 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5395 << dendl;
5396 return fd;
5397 }
5398 bufferlist bl;
5399 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5400 VOID_TEMP_FAILURE_RETRY(::close(fd));
5401 if (r < 0) {
5402 derr << __func__ << " failed to read from " << path
5403 << ": " << cpp_strerror(r) << dendl;
5404 return r;
5405 }
5406
5407 uint32_t crc, expected_crc;
11fdf7f2 5408 auto p = bl.cbegin();
7c673cae 5409 try {
11fdf7f2 5410 decode(*label, p);
7c673cae
FG
5411 bufferlist t;
5412 t.substr_of(bl, 0, p.get_off());
5413 crc = t.crc32c(-1);
11fdf7f2 5414 decode(expected_crc, p);
7c673cae 5415 }
f67539c2 5416 catch (ceph::buffer::error& e) {
b32b8144 5417 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
5418 << ": " << e.what()
5419 << dendl;
b32b8144 5420 return -ENOENT;
7c673cae
FG
5421 }
5422 if (crc != expected_crc) {
5423 derr << __func__ << " bad crc on label, expected " << expected_crc
5424 << " != actual " << crc << dendl;
5425 return -EIO;
5426 }
5427 dout(10) << __func__ << " got " << *label << dendl;
5428 return 0;
5429}
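// On-disk layout written/parsed by the two functions above (sketch):
//
//   offset 0: [encoded bluestore_bdev_label_t][crc32c][zero pad]
//             |<------------ BDEV_LABEL_BLOCK_SIZE ------------>|
//
// The crc32c is seeded with -1 and covers only the encoded label bytes,
// which is why _read_bdev_label() substr_of()'s the buffer up to the
// decode offset before recomputing the checksum.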
5430
5431int BlueStore::_check_or_set_bdev_label(
5432 string path, uint64_t size, string desc, bool create)
5433{
5434 bluestore_bdev_label_t label;
5435 if (create) {
5436 label.osd_uuid = fsid;
5437 label.size = size;
5438 label.btime = ceph_clock_now();
5439 label.description = desc;
3efd9988 5440 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
5441 if (r < 0)
5442 return r;
5443 } else {
5444 int r = _read_bdev_label(cct, path, &label);
5445 if (r < 0)
5446 return r;
31f18b77
FG
5447 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5448 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5449 << " and fsid " << fsid << " check bypassed" << dendl;
1911f103 5450 } else if (label.osd_uuid != fsid) {
7c673cae
FG
5451 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5452 << " does not match our fsid " << fsid << dendl;
5453 return -EIO;
5454 }
5455 }
5456 return 0;
5457}
5458
5459void BlueStore::_set_alloc_sizes(void)
5460{
7c673cae
FG
5461 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5462
20effc67
TL
5463#ifdef HAVE_LIBZBD
5464 ceph_assert(bdev);
5465 if (bdev->is_smr()) {
5466 prefer_deferred_size = 0;
5467 } else
5468#endif
7c673cae
FG
5469 if (cct->_conf->bluestore_prefer_deferred_size) {
5470 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5471 } else {
9f95a23c 5472 if (_use_rotational_settings()) {
7c673cae
FG
5473 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5474 } else {
5475 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5476 }
5477 }
5478
5479 if (cct->_conf->bluestore_deferred_batch_ops) {
5480 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5481 } else {
9f95a23c 5482 if (_use_rotational_settings()) {
7c673cae
FG
5483 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5484 } else {
5485 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5486 }
5487 }
5488
5489 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 5490 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
5491 << " max_alloc_size 0x" << std::hex << max_alloc_size
5492 << " prefer_deferred_size 0x" << prefer_deferred_size
5493 << std::dec
5494 << " deferred_batch_ops " << deferred_batch_ops
5495 << dendl;
5496}
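// How these knobs are consumed (a sketch based on the write path, which
// is not shown here): writes smaller than prefer_deferred_size are first
// committed to the RocksDB WAL and replayed to the block device later,
// submitted in batches once deferred_batch_ops of them accumulate, while
// larger writes go to the device directly. SMR sets prefer_deferred_size
// to 0, presumably because deferred replay performs in-place overwrites
// that sequential zones cannot accept.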
5497
5498int BlueStore::_open_bdev(bool create)
5499{
11fdf7f2 5500 ceph_assert(bdev == NULL);
7c673cae 5501 string p = path + "/block";
11fdf7f2 5502 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
5503 int r = bdev->open(p);
5504 if (r < 0)
5505 goto fail;
5506
11fdf7f2
TL
5507 if (create && cct->_conf->bdev_enable_discard) {
5508 bdev->discard(0, bdev->get_size());
5509 }
5510
7c673cae
FG
5511 if (bdev->supported_bdev_label()) {
5512 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5513 if (r < 0)
5514 goto fail_close;
5515 }
5516
5517 // initialize global block parameters
5518 block_size = bdev->get_block_size();
5519 block_mask = ~(block_size - 1);
5520 block_size_order = ctz(block_size);
11fdf7f2 5521 ceph_assert(block_size == 1u << block_size_order);
9f95a23c 5522 _set_max_defer_interval();
224ce89b
WB
5523 // and set cache_size based on device type
5524 r = _set_cache_sizes();
5525 if (r < 0) {
5526 goto fail_close;
5527 }
20effc67
TL
5528 // get block dev optimal io size
5529 optimal_io_size = bdev->get_optimal_io_size();
f67539c2 5530
7c673cae
FG
5531 return 0;
5532
5533 fail_close:
5534 bdev->close();
5535 fail:
5536 delete bdev;
5537 bdev = NULL;
5538 return r;
5539}
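// Worked example for the block parameters initialized above, assuming a
// 4 KiB device block:
//   block_size       = 0x1000
//   block_size_order = ctz(0x1000) = 12
//   block_mask       = ~(0x1000 - 1) = 0xfffffffffffff000
// so (offset & block_mask) aligns an offset down to a block boundary and
// (offset >> block_size_order) yields the block index.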
5540
11fdf7f2
TL
5541void BlueStore::_validate_bdev()
5542{
5543 ceph_assert(bdev);
11fdf7f2 5544 uint64_t dev_size = bdev->get_size();
f67539c2 5545 ceph_assert(dev_size > _get_ondisk_reserved());
11fdf7f2
TL
5546}
5547
7c673cae
FG
5548void BlueStore::_close_bdev()
5549{
11fdf7f2 5550 ceph_assert(bdev);
7c673cae
FG
5551 bdev->close();
5552 delete bdev;
5553 bdev = NULL;
5554}
5555
20effc67 5556int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_restore)
7c673cae 5557{
1911f103 5558 int r;
1911f103 5559
20effc67 5560 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
11fdf7f2 5561 ceph_assert(fm == NULL);
20effc67
TL
5562 // fm_restore means we are transitioning from null-fm to bitmap-fm
5563 ceph_assert(!fm_restore || (freelist_type != "null"));
5564 // fm restore must pass in a valid transaction
5565 ceph_assert(!fm_restore || (t != nullptr));
5566
5567 // When allocation-info is stored in a single file we set freelist_type to "null"
5568 bool set_null_freemap = false;
5569 if (freelist_type == "null") {
5570 // use BitmapFreelistManager with the null option to stop allocations from going to RocksDB
5571 // we will store the allocation info in a single file during umount()
5572 freelist_type = "bitmap";
5573 set_null_freemap = true;
5574 }
11fdf7f2
TL
5575 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5576 ceph_assert(fm);
20effc67
TL
5577 if (set_null_freemap) {
5578 fm->set_null_manager();
5579 }
11fdf7f2
TL
5580 if (t) {
5581 // create mode. initialize freespace
7c673cae 5582 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
5583 {
5584 bufferlist bl;
5585 bl.append(freelist_type);
5586 t->set(PREFIX_SUPER, "freelist_type", bl);
5587 }
b32b8144
FG
5588 // being able to allocate in units less than bdev block size
5589 // seems to be a bad idea.
20effc67 5590 ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
f67539c2
TL
5591
5592 uint64_t alloc_size = min_alloc_size;
20effc67 5593#ifdef HAVE_LIBZBD
f67539c2 5594 if (bdev->is_smr()) {
20effc67
TL
5595 if (freelist_type != "zoned") {
5596 derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
5597 << dendl;
5598 return -EINVAL;
5599 }
5600 } else
5601#endif
5602 if (freelist_type == "zoned") {
5603 derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
5604 << dendl;
5605 return -EINVAL;
f67539c2
TL
5606 }
5607
20effc67
TL
5608 fm->create(bdev->get_size(), alloc_size,
5609 zone_size, first_sequential_zone,
5610 t);
7c673cae
FG
5611
5612 // allocate superblock reserved space. note that we do not mark
5613 // bluefs space as allocated in the freelist; we instead rely on
f67539c2 5614 // bluefs doing that itself.
11fdf7f2 5615 auto reserved = _get_ondisk_reserved();
20effc67
TL
5616 if (fm_restore) {
5617 // we need to allocate the full space in restore case
5618 // as later we will add free-space marked in the allocator file
5619 fm->allocate(0, bdev->get_size(), t);
5620 } else {
5624 fm->allocate(0, reserved, t);
5625 }
5626 // debug code - not needed for NULL FM
7c673cae
FG
5627 if (cct->_conf->bluestore_debug_prefill > 0) {
5628 uint64_t end = bdev->get_size() - reserved;
5629 dout(1) << __func__ << " pre-fragmenting freespace, using "
5630 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5631 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 5632 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
5633 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5634 float r = cct->_conf->bluestore_debug_prefill;
5635 r /= 1.0 - r;
5636 bool stop = false;
5637
5638 while (!stop && start < end) {
5639 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5640 if (start + l > end) {
5641 l = end - start;
11fdf7f2 5642 l = p2align(l, min_alloc_size);
7c673cae 5643 }
11fdf7f2 5644 ceph_assert(start + l <= end);
7c673cae
FG
5645
5646 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 5647 u = p2roundup(u, min_alloc_size);
7c673cae
FG
5648 if (start + l + u > end) {
5649 u = end - (start + l);
5650 // trim to align so we don't overflow again
11fdf7f2 5651 u = p2align(u, min_alloc_size);
7c673cae
FG
5652 stop = true;
5653 }
11fdf7f2 5654 ceph_assert(start + l + u <= end);
7c673cae 5655
11fdf7f2 5656 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
5657 << " use 0x" << u << std::dec << dendl;
5658
5659 if (u == 0) {
5660 // break if u has been trimmed to nothing
5661 break;
5662 }
5663
5664 fm->allocate(start + l, u, t);
5665 start += l + u;
5666 }
5667 }
f67539c2 5668 r = _write_out_fm_meta(0);
1911f103
TL
5669 ceph_assert(r == 0);
5670 } else {
f67539c2
TL
5671 r = fm->init(db, read_only,
5672 [&](const std::string& key, std::string* result) {
5673 return read_meta(key, result);
5674 });
1911f103 5675 if (r < 0) {
f67539c2 5676 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
1911f103
TL
5677 delete fm;
5678 fm = NULL;
5679 return r;
5680 }
7c673cae 5681 }
81eedcae
TL
5682 // if the space size tracked by the freelist manager is higher than the
5683 // actual device size, one can hit an out-of-space allocation, which will
5684 // result in data loss and/or assertions
5685 // Probably the user altered the device size somehow.
5686 // The only fix for now is to redeploy the OSD.
5687 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5688 ostringstream ss;
5689 ss << "slow device size mismatch detected, "
5690 << " fm size(" << fm->get_size()
5691 << ") > slow device size(" << bdev->get_size()
5692 << "), Please stop using this OSD as it might cause data loss.";
5693 _set_disk_size_mismatch_alert(ss.str());
5694 }
7c673cae
FG
5695 return 0;
5696}
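// Worked example for the debug prefill loop above (illustrative):
// bluestore_debug_prefill = 0.2 gives r = 0.2 / (1 - 0.2) = 0.25.
// For a random free extent l = 0x40000 (256 KiB), the allocated span is
// u = 1 + 0.25 * 0x40000 = 0x10001, p2roundup'ed to 0x11000 with a
// 4 KiB min_alloc_size -- i.e. u / (l + u) ~= 21%, so roughly the
// requested fraction of the device ends up marked as used.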
5697
5698void BlueStore::_close_fm()
5699{
5700 dout(10) << __func__ << dendl;
11fdf7f2 5701 ceph_assert(fm);
7c673cae
FG
5702 fm->shutdown();
5703 delete fm;
5704 fm = NULL;
5705}
5706
f67539c2 5707int BlueStore::_write_out_fm_meta(uint64_t target_size)
1911f103 5708{
f67539c2 5709 int r = 0;
1911f103
TL
5710 string p = path + "/block";
5711
5712 std::vector<std::pair<string, string>> fm_meta;
5713 fm->get_meta(target_size, &fm_meta);
5714
1911f103 5715 for (auto& m : fm_meta) {
f67539c2
TL
5716 r = write_meta(m.first, m.second);
5717 ceph_assert(r == 0);
1911f103 5718 }
1911f103
TL
5719 return r;
5720}
5721
f67539c2 5722int BlueStore::_create_alloc()
7c673cae 5723{
20effc67 5724 ceph_assert(alloc == NULL);
f67539c2 5725 ceph_assert(shared_alloc.a == NULL);
11fdf7f2
TL
5726 ceph_assert(bdev->get_size());
5727
f67539c2 5728 uint64_t alloc_size = min_alloc_size;
20effc67
TL
5729
5730 std::string allocator_type = cct->_conf->bluestore_allocator;
5731
5732#ifdef HAVE_LIBZBD
5733 if (freelist_type == "zoned") {
5734 allocator_type = "zoned";
11fdf7f2 5735 }
20effc67 5736#endif
11fdf7f2 5737
20effc67
TL
5738 alloc = Allocator::create(
5739 cct, allocator_type,
f67539c2 5740 bdev->get_size(),
20effc67
TL
5741 alloc_size,
5742 zone_size,
5743 first_sequential_zone,
5744 "block");
5745 if (!alloc) {
5746 lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
5747 << dendl;
7c673cae
FG
5748 return -EINVAL;
5749 }
20effc67
TL
5750
5751#ifdef HAVE_LIBZBD
5752 if (freelist_type == "zoned") {
5753 Allocator *a = Allocator::create(
5754 cct, cct->_conf->bluestore_allocator,
5755 bdev->get_conventional_region_size(),
5756 alloc_size,
5757 0, 0,
5758 "zoned_block");
5759 if (!a) {
5760 lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
5761 << " allocator" << dendl;
5762 delete alloc;
5763 return -EINVAL;
5764 }
5765 shared_alloc.set(a);
5766 } else
5767#endif
5768 {
5769 // BlueFS will share the same allocator
5770 shared_alloc.set(alloc);
5771 }
5772
f67539c2
TL
5773 return 0;
5774}
5775
20effc67 5776int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
f67539c2
TL
5777{
5778 int r = _create_alloc();
5779 if (r < 0) {
5780 return r;
5781 }
20effc67 5782 ceph_assert(alloc != NULL);
f67539c2 5783
20effc67 5784#ifdef HAVE_LIBZBD
f67539c2 5785 if (bdev->is_smr()) {
20effc67
TL
5786 auto a = dynamic_cast<ZonedAllocator*>(alloc);
5787 ceph_assert(a);
5788 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5789 ceph_assert(f);
5790 vector<uint64_t> wp = bdev->get_zones();
5791 vector<zone_state_t> zones = f->get_zone_states(db);
5792 ceph_assert(wp.size() == zones.size());
5793
5794 // reconcile zone state
5795 auto num_zones = bdev->get_size() / zone_size;
5796 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
5797 ceph_assert(wp[i] >= i * zone_size);
5798 ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
5799 uint64_t p = wp[i] - i * zone_size;
5800 if (zones[i].write_pointer > p) {
5801 derr << __func__ << " zone 0x" << std::hex << i
5802 << " bluestore write pointer 0x" << zones[i].write_pointer
5803 << " > device write pointer 0x" << p
5804 << std::dec << " -- VERY SUSPICIOUS!" << dendl;
5805 } else if (zones[i].write_pointer < p) {
5806 // this is "normal" in that it can happen after any crash (if we have a
5807 // write in flight but did not manage to commit the transaction)
5808 auto delta = p - zones[i].write_pointer;
5809 dout(1) << __func__ << " zone 0x" << std::hex << i
5810 << " device write pointer 0x" << p
5811 << " > bluestore pointer 0x" << zones[i].write_pointer
5812 << ", advancing 0x" << delta << std::dec << dendl;
5813 (*zone_adjustments)[zones[i].write_pointer] = delta;
5814 zones[i].num_dead_bytes += delta;
5815 zones[i].write_pointer = p;
5816 }
5817 }
5818
5819 // start with conventional zone "free" (bluefs may adjust this when it starts up)
5820 auto reserved = _get_ondisk_reserved();
5821 // for now we require a conventional zone
5822 ceph_assert(bdev->get_conventional_region_size());
5823 ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
5824 shared_alloc.a->init_add_free(
5825 reserved,
5826 p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
5827
5828 // init sequential zone based on the device's write pointers
5829 a->init_from_zone_pointers(std::move(zones));
5830 dout(1) << __func__
5831 << " loaded zone pointers: "
5832 << std::hex
5833 << ", allocator type " << alloc->get_type()
5834 << ", capacity 0x" << alloc->get_capacity()
5835 << ", block size 0x" << alloc->get_block_size()
5836 << ", free 0x" << alloc->get_free()
5837 << ", fragmentation " << alloc->get_fragmentation()
5838 << std::dec << dendl;
5839
5840 return 0;
f67539c2 5841 }
20effc67 5842#endif
7c673cae
FG
5843
5844 uint64_t num = 0, bytes = 0;
20effc67
TL
5845 utime_t start_time = ceph_clock_now();
5846 if (!fm->is_null_manager()) {
5847 // This is the original path - loading the allocation map from RocksDB and feeding it into the allocator
5848 dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
5849 // initialize from freelist
5850 fm->enumerate_reset();
5851 uint64_t offset, length;
5852 while (fm->enumerate_next(db, &offset, &length)) {
5853 alloc->init_add_free(offset, length);
5854 ++num;
5855 bytes += length;
5856 }
5857 fm->enumerate_reset();
5858
5859 utime_t duration = ceph_clock_now() - start_time;
5860 dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
5861 alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
5862 } else {
5863 // This is the new path - reading the allocation map from a flat bluefs file and feeding it into the allocator
7c673cae 5864
20effc67
TL
5865 if (!cct->_conf->bluestore_allocation_from_file) {
5866 derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
5867 derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
5868 return -ENOTSUP; // Operation not supported
5869 }
7c673cae 5870
20effc67
TL
5871 if (restore_allocator(alloc, &num, &bytes) == 0) {
5872 dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
5873 } else {
5874 // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
5875 dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
5876 // if it failed, we must recover from the on-disk ONode internal state
5877 if (read_allocation_from_drive_on_startup() != 0) {
5878 derr << __func__ << "::NCB::Failed Recovery" << dendl;
5879 derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
5880 derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
5881 return -ENOTRECOVERABLE;
5882 }
5883 }
5884 }
f67539c2
TL
5885 dout(1) << __func__
5886 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5887 << std::hex
20effc67
TL
5888 << ", allocator type " << alloc->get_type()
5889 << ", capacity 0x" << alloc->get_capacity()
5890 << ", block size 0x" << alloc->get_block_size()
5891 << ", free 0x" << alloc->get_free()
5892 << ", fragmentation " << alloc->get_fragmentation()
f67539c2 5893 << std::dec << dendl;
1911f103 5894
7c673cae
FG
5895 return 0;
5896}
5897
20effc67
TL
5898void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
5899{
5900#ifdef HAVE_LIBZBD
5901 assert(bdev->is_smr());
5902 dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
5903 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
5904 ceph_assert(f);
5905 KeyValueDB::Transaction t = db->get_transaction();
5906 for (auto& i : zone_adjustments) {
5907 // allocate AND release since this gap is now dead space
5908 // note that the offset is imprecise, but we only need it to select the zone
5909 f->allocate(i.first, i.second, t);
5910 f->release(i.first, i.second, t);
5911 }
5912 int r = db->submit_transaction_sync(t);
5913 ceph_assert(r == 0);
5914#endif
5915}
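// Worked example for the SMR reconciliation above (illustrative
// numbers): with zone_size = 0x10000000 (256 MiB), suppose zone i = 4
// reports a device write pointer wp[4] = 0x40300000, i.e. an in-zone
// position p = wp[4] - 4 * zone_size = 0x300000. If the freelist
// recorded write_pointer = 0x200000 (a crash lost the last commit),
// _init_alloc stashes a 0x100000 adjustment and advances the cached
// pointer; _post_init_alloc later allocates+releases that span so the
// gap is accounted as dead bytes instead of allocatable space.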
5916
7c673cae
FG
5917void BlueStore::_close_alloc()
5918{
11fdf7f2
TL
5919 ceph_assert(bdev);
5920 bdev->discard_drain();
5921
20effc67
TL
5922 ceph_assert(alloc);
5923 alloc->shutdown();
5924 delete alloc;
5925
f67539c2 5926 ceph_assert(shared_alloc.a);
20effc67
TL
5927 if (alloc != shared_alloc.a) {
5928 shared_alloc.a->shutdown();
5929 delete shared_alloc.a;
5930 }
5931
f67539c2 5932 shared_alloc.reset();
20effc67 5933 alloc = nullptr;
7c673cae
FG
5934}
5935
5936int BlueStore::_open_fsid(bool create)
5937{
11fdf7f2 5938 ceph_assert(fsid_fd < 0);
91327a77 5939 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
5940 if (create)
5941 flags |= O_CREAT;
5942 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5943 if (fsid_fd < 0) {
5944 int err = -errno;
5945 derr << __func__ << " " << cpp_strerror(err) << dendl;
5946 return err;
5947 }
5948 return 0;
5949}
5950
5951int BlueStore::_read_fsid(uuid_d *uuid)
5952{
5953 char fsid_str[40];
5954 memset(fsid_str, 0, sizeof(fsid_str));
5955 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5956 if (ret < 0) {
5957 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5958 return ret;
5959 }
5960 if (ret > 36)
5961 fsid_str[36] = 0;
5962 else
5963 fsid_str[ret] = 0;
5964 if (!uuid->parse(fsid_str)) {
5965 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5966 return -EINVAL;
5967 }
5968 return 0;
5969}
5970
5971int BlueStore::_write_fsid()
5972{
5973 int r = ::ftruncate(fsid_fd, 0);
5974 if (r < 0) {
5975 r = -errno;
5976 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5977 return r;
5978 }
5979 string str = stringify(fsid) + "\n";
5980 r = safe_write(fsid_fd, str.c_str(), str.length());
5981 if (r < 0) {
5982 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5983 return r;
5984 }
5985 r = ::fsync(fsid_fd);
5986 if (r < 0) {
5987 r = -errno;
5988 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5989 return r;
5990 }
5991 return 0;
5992}
5993
5994void BlueStore::_close_fsid()
5995{
5996 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5997 fsid_fd = -1;
5998}
5999
6000int BlueStore::_lock_fsid()
6001{
6002 struct flock l;
6003 memset(&l, 0, sizeof(l));
6004 l.l_type = F_WRLCK;
6005 l.l_whence = SEEK_SET;
6006 int r = ::fcntl(fsid_fd, F_SETLK, &l);
6007 if (r < 0) {
6008 int err = errno;
6009 derr << __func__ << " failed to lock " << path << "/fsid"
6010 << " (is another ceph-osd still running?)"
6011 << cpp_strerror(err) << dendl;
6012 return -err;
6013 }
6014 return 0;
6015}
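// Behavioral note (per fcntl(2)): F_SETLK is the non-blocking variant,
// so if another ceph-osd already holds the write lock the call fails
// immediately with EAGAIN/EACCES rather than waiting -- which is what
// test_mount_in_use() below relies on to detect a live mount.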
6016
31f18b77
FG
6017bool BlueStore::is_rotational()
6018{
6019 if (bdev) {
6020 return bdev->is_rotational();
6021 }
6022
6023 bool rotational = true;
6024 int r = _open_path();
6025 if (r < 0)
6026 goto out;
6027 r = _open_fsid(false);
6028 if (r < 0)
6029 goto out_path;
6030 r = _read_fsid(&fsid);
6031 if (r < 0)
6032 goto out_fsid;
6033 r = _lock_fsid();
6034 if (r < 0)
6035 goto out_fsid;
6036 r = _open_bdev(false);
6037 if (r < 0)
6038 goto out_fsid;
6039 rotational = bdev->is_rotational();
6040 _close_bdev();
6041 out_fsid:
6042 _close_fsid();
6043 out_path:
6044 _close_path();
6045 out:
6046 return rotational;
6047}
6048
d2e6a577
FG
6049bool BlueStore::is_journal_rotational()
6050{
6051 if (!bluefs) {
6052 dout(5) << __func__ << " bluefs disabled, default to store media type"
6053 << dendl;
6054 return is_rotational();
6055 }
6056 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
6057 return bluefs->wal_is_rotational();
6058}
6059
1d09f67e
TL
6060bool BlueStore::is_db_rotational()
6061{
6062 if (!bluefs) {
6063 dout(5) << __func__ << " bluefs disabled, default to store media type"
6064 << dendl;
6065 return is_rotational();
6066 }
6067 dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
6068 return bluefs->db_is_rotational();
6069}
6070
9f95a23c
TL
6071bool BlueStore::_use_rotational_settings()
6072{
6073 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
6074 return true;
6075 }
6076 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
6077 return false;
6078 }
6079 return bdev->is_rotational();
6080}
6081
7c673cae
FG
6082bool BlueStore::test_mount_in_use()
6083{
6084 // most error conditions mean the mount is not in use (e.g., because
6085 // it doesn't exist). only if we fail to lock do we conclude it is
6086 // in use.
6087 bool ret = false;
6088 int r = _open_path();
6089 if (r < 0)
6090 return false;
6091 r = _open_fsid(false);
6092 if (r < 0)
6093 goto out_path;
6094 r = _lock_fsid();
6095 if (r < 0)
6096 ret = true; // if we can't lock, it is in use
6097 _close_fsid();
6098 out_path:
6099 _close_path();
6100 return ret;
6101}
6102
11fdf7f2 6103int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
6104{
6105 int r;
11fdf7f2 6106 bluefs = new BlueFS(cct);
7c673cae 6107
11fdf7f2
TL
6108 string bfn;
6109 struct stat st;
6110
6111 bfn = path + "/block.db";
6112 if (::stat(bfn.c_str(), &st) == 0) {
eafe8130
TL
6113 r = bluefs->add_block_device(
6114 BlueFS::BDEV_DB, bfn,
f67539c2
TL
6115 create && cct->_conf->bdev_enable_discard,
6116 SUPER_RESERVED);
7c673cae 6117 if (r < 0) {
11fdf7f2
TL
6118 derr << __func__ << " add block device(" << bfn << ") returned: "
6119 << cpp_strerror(r) << dendl;
6120 goto free_bluefs;
7c673cae 6121 }
7c673cae 6122
11fdf7f2
TL
6123 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
6124 r = _check_or_set_bdev_label(
6125 bfn,
6126 bluefs->get_block_device_size(BlueFS::BDEV_DB),
6127 "bluefs db", create);
6128 if (r < 0) {
6129 derr << __func__
6130 << " check block device(" << bfn << ") label returned: "
6131 << cpp_strerror(r) << dendl;
6132 goto free_bluefs;
6133 }
7c673cae 6134 }
9f95a23c
TL
6135 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6136 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
6137 } else {
6138 r = -errno;
6139 if (::lstat(bfn.c_str(), &st) == -1) {
6140 r = 0;
9f95a23c 6141 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7c673cae 6142 } else {
11fdf7f2
TL
6143 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6144 << cpp_strerror(r) << dendl;
6145 goto free_bluefs;
7c673cae
FG
6146 }
6147 }
7c673cae 6148
11fdf7f2
TL
6149 // shared device
6150 bfn = path + "/block";
6151 // never trim here
9f95a23c 6152 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
f67539c2
TL
6153 0, // no need to provide valid 'reserved' for shared dev
6154 &shared_alloc);
11fdf7f2
TL
6155 if (r < 0) {
6156 derr << __func__ << " add block device(" << bfn << ") returned: "
6157 << cpp_strerror(r) << dendl;
6158 goto free_bluefs;
6159 }
11fdf7f2
TL
6160
6161 bfn = path + "/block.wal";
6162 if (::stat(bfn.c_str(), &st) == 0) {
6163 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
f67539c2
TL
6164 create && cct->_conf->bdev_enable_discard,
6165 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
6166 if (r < 0) {
6167 derr << __func__ << " add block device(" << bfn << ") returned: "
6168 << cpp_strerror(r) << dendl;
6169 goto free_bluefs;
6170 }
7c673cae 6171
11fdf7f2
TL
6172 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
6173 r = _check_or_set_bdev_label(
6174 bfn,
6175 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
6176 "bluefs wal", create);
7c673cae 6177 if (r < 0) {
11fdf7f2
TL
6178 derr << __func__ << " check block device(" << bfn
6179 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
6180 goto free_bluefs;
6181 }
7c673cae
FG
6182 }
6183
9f95a23c 6184 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
6185 } else {
6186 r = 0;
6187 if (::lstat(bfn.c_str(), &st) != -1) {
6188 r = -errno;
6189 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
6190 << cpp_strerror(r) << dendl;
7c673cae
FG
6191 goto free_bluefs;
6192 }
11fdf7f2
TL
6193 }
6194 return 0;
7c673cae 6195
11fdf7f2
TL
6196free_bluefs:
6197 ceph_assert(bluefs);
6198 delete bluefs;
6199 bluefs = NULL;
6200 return r;
6201}
7c673cae 6202
f67539c2 6203int BlueStore::_open_bluefs(bool create, bool read_only)
11fdf7f2
TL
6204{
6205 int r = _minimal_open_bluefs(create);
6206 if (r < 0) {
6207 return r;
6208 }
f67539c2 6209 BlueFSVolumeSelector* vselector = nullptr;
9f95a23c
TL
6210 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6211
6212 string options = cct->_conf->bluestore_rocksdb_options;
cd265ab1
TL
6213 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6214 if (!options_annex.empty()) {
6215 if (!options.empty() &&
6216 *options.rbegin() != ',') {
6217 options += ',';
6218 }
6219 options += options_annex;
6220 }
9f95a23c
TL
6221
6222 rocksdb::Options rocks_opts;
f67539c2 6223 r = RocksDBStore::ParseOptionsFromStringStatic(
9f95a23c
TL
6224 cct,
6225 options,
6226 rocks_opts,
6227 nullptr);
6228 if (r < 0) {
6229 return r;
6230 }
f67539c2
TL
6231 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
6232 vselector = new FitToFastVolumeSelector(
9f95a23c
TL
6233 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6234 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
f67539c2
TL
6235 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
6236 } else {
6237 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
6238 vselector =
6239 new RocksDBBlueFSVolumeSelector(
6240 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
6241 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
6242 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
6243 1024 * 1024 * 1024, //FIXME: set expected l0 size here
6244 rocks_opts.max_bytes_for_level_base,
6245 rocks_opts.max_bytes_for_level_multiplier,
6246 reserved_factor,
6247 cct->_conf->bluestore_volume_selection_reserved,
6248 cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
6249 }
9f95a23c 6250 }
11fdf7f2 6251 if (create) {
9f95a23c 6252 bluefs->mkfs(fsid, bluefs_layout);
11fdf7f2 6253 }
9f95a23c 6254 bluefs->set_volume_selector(vselector);
11fdf7f2
TL
6255 r = bluefs->mount();
6256 if (r < 0) {
6257 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
6258 }
9f95a23c 6259 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
11fdf7f2
TL
6260 return r;
6261}
6262
20effc67 6263void BlueStore::_close_bluefs()
11fdf7f2 6264{
20effc67 6265 bluefs->umount(db_was_opened_read_only);
11fdf7f2
TL
6266 _minimal_close_bluefs();
6267}
6268
6269void BlueStore::_minimal_close_bluefs()
6270{
6271 delete bluefs;
6272 bluefs = NULL;
6273}
6274
6275int BlueStore::_is_bluefs(bool create, bool* ret)
6276{
6277 if (create) {
6278 *ret = cct->_conf->bluestore_bluefs;
6279 } else {
6280 string s;
6281 int r = read_meta("bluefs", &s);
6282 if (r < 0) {
6283 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
6284 return -EIO;
6285 }
6286 if (s == "1") {
6287 *ret = true;
6288 } else if (s == "0") {
6289 *ret = false;
31f18b77 6290 } else {
11fdf7f2
TL
6291 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
6292 << dendl;
6293 return -EIO;
6294 }
6295 }
6296 return 0;
6297}
6298
6299/*
6300* opens both DB and dependent super_meta, FreelistManager and allocator
6301* in the proper order
6302*/
f67539c2 6303int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
11fdf7f2 6304{
20effc67 6305 dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
f67539c2
TL
6306 {
6307 string type;
6308 int r = read_meta("type", &type);
6309 if (r < 0) {
6310 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6311 << dendl;
11fdf7f2 6312 return r;
f67539c2 6313 }
11fdf7f2 6314
f67539c2
TL
6315 if (type != "bluestore") {
6316 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6317 return -EIO;
11fdf7f2 6318 }
f67539c2 6319 }
11fdf7f2 6320
20effc67
TL
6321 // SMR devices may require a freelist adjustment, but that can only happen after
6322 // the db is read-write. we'll stash pending changes here.
6323 std::map<uint64_t, uint64_t> zone_adjustments;
6324
f67539c2
TL
6325 int r = _open_path();
6326 if (r < 0)
6327 return r;
6328 r = _open_fsid(false);
6329 if (r < 0)
6330 goto out_path;
11fdf7f2 6331
f67539c2
TL
6332 r = _read_fsid(&fsid);
6333 if (r < 0)
6334 goto out_fsid;
11fdf7f2 6335
f67539c2
TL
6336 r = _lock_fsid();
6337 if (r < 0)
6338 goto out_fsid;
11fdf7f2 6339
f67539c2
TL
6340 r = _open_bdev(false);
6341 if (r < 0)
6342 goto out_fsid;
7c673cae 6343
20effc67
TL
6344 // GBH: can probably skip the open_db step in read-only mode when operating in NULL-FM mode
6345 // (might need to open if failed to restore from file)
6346
f67539c2
TL
6347 // open in read-only first to read FM list and init allocator
6348 // as they might be needed for some BlueFS procedures
6349 r = _open_db(false, false, true);
6350 if (r < 0)
6351 goto out_bdev;
11fdf7f2 6352
f67539c2
TL
6353 r = _open_super_meta();
6354 if (r < 0) {
6355 goto out_db;
6356 }
6357
6358 r = _open_fm(nullptr, true);
6359 if (r < 0)
6360 goto out_db;
6361
20effc67 6362 r = _init_alloc(&zone_adjustments);
f67539c2
TL
6363 if (r < 0)
6364 goto out_fm;
6365
6366 // Re-open in the proper mode(s).
6367
6368 // Can't simply bypass the second open in read-only mode, as we need to
6369 // load the extents allocated by bluefs into the allocator;
6370 // do that now.
6371 //
20effc67 6372 _close_db();
f67539c2
TL
6373 r = _open_db(false, to_repair, read_only);
6374 if (r < 0) {
6375 goto out_alloc;
11fdf7f2 6376 }
20effc67
TL
6377
6378 if (!read_only && !zone_adjustments.empty()) {
6379 // for SMR devices that have freelist mismatch with device write pointers
6380 _post_init_alloc(zone_adjustments);
6381 }
6382
6383 // when the function is called in repair mode (to_repair=true) we skip db->open()/create();
6384 // we can't change bluestore allocations then, so there is no need to invalidate the allocation file
6385 if (fm->is_null_manager() && !read_only && !to_repair) {
6386 // Now that we have loaded the allocation map, we need to invalidate the file, as new allocations won't be reflected in it.
6387 // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount().
6388 // This means we must not use the existing file in the failure case (unplanned shutdown) and must resort
6389 // to recovery from RocksDB::ONodes
6390 r = invalidate_allocation_file_on_bluefs();
6391 if (r != 0) {
6392 derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
6393 goto out_alloc;
6394 }
6395 }
6396
6397 // likewise, committing to the null manager is skipped in read-only and repair modes
1d09f67e 6398 if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
20effc67
TL
6399#ifdef HAVE_LIBZBD
6400 && !bdev->is_smr()
6401#endif
6402 ) {
6403 dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
6404 commit_to_null_manager();
6405 need_to_destage_allocation_file = true;
6406 dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
6407 }
6408
11fdf7f2
TL
6409 return 0;
6410
f67539c2
TL
6411out_alloc:
6412 _close_alloc();
6413out_fm:
11fdf7f2
TL
6414 _close_fm();
6415 out_db:
20effc67 6416 _close_db();
f67539c2
TL
6417 out_bdev:
6418 _close_bdev();
6419 out_fsid:
6420 _close_fsid();
6421 out_path:
6422 _close_path();
11fdf7f2
TL
6423 return r;
6424}
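// Condensed view of the ordering enforced above (sketch, not code):
//   _open_path -> _open_fsid -> _read_fsid / _lock_fsid -> _open_bdev
//   -> _open_db(read-only) -> _open_super_meta -> _open_fm
//   -> _init_alloc -> _close_db -> _open_db(final mode)
//   [-> _post_init_alloc for SMR zone adjustments]
// The extra read-only pass exists so the freelist and allocator can be
// populated before the db is reopened in its final mode.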
6425
20effc67 6426void BlueStore::_close_db_and_around()
11fdf7f2 6427{
20effc67
TL
6428 if (db) {
6429 _close_db();
6430 }
6431 if (bluefs) {
6432 _close_bluefs();
6433 }
f67539c2
TL
6434 _close_fm();
6435 _close_alloc();
6436 _close_bdev();
6437 _close_fsid();
6438 _close_path();
6439}
6440
6441int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
6442{
6443 _kv_only = true;
6444 int r = _open_db_and_around(false, to_repair);
6445 if (r == 0) {
6446 *pdb = db;
11fdf7f2 6447 } else {
f67539c2 6448 *pdb = nullptr;
11fdf7f2 6449 }
f67539c2 6450 return r;
11fdf7f2
TL
6451}
6452
f67539c2 6453int BlueStore::close_db_environment()
11fdf7f2 6454{
20effc67 6455 _close_db_and_around();
f67539c2 6456 return 0;
11fdf7f2
TL
6457}
6458
20effc67
TL
6459/* gets access to bluefs supporting RocksDB */
6460BlueFS* BlueStore::get_bluefs() {
6461 return bluefs;
6462}
6463
f67539c2
TL
6464int BlueStore::_prepare_db_environment(bool create, bool read_only,
6465 std::string* _fn, std::string* _kv_backend)
11fdf7f2
TL
6466{
6467 int r;
6468 ceph_assert(!db);
f67539c2
TL
6469 std::string& fn=*_fn;
6470 std::string& kv_backend=*_kv_backend;
6471 fn = path + "/db";
11fdf7f2
TL
6472 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
6473
11fdf7f2
TL
6474 if (create) {
6475 kv_backend = cct->_conf->bluestore_kvbackend;
6476 } else {
6477 r = read_meta("kv_backend", &kv_backend);
7c673cae 6478 if (r < 0) {
11fdf7f2
TL
6479 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
6480 return -EIO;
6481 }
6482 }
6483 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
6484
6485 bool do_bluefs;
6486 r = _is_bluefs(create, &do_bluefs);
6487 if (r < 0) {
6488 return r;
6489 }
6490 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
6491
6492 map<string,string> kv_options;
6493 // force separate wal dir for all new deployments.
6494 kv_options["separate_wal_dir"] = 1;
6495 rocksdb::Env *env = NULL;
6496 if (do_bluefs) {
6497 dout(10) << __func__ << " initializing bluefs" << dendl;
6498 if (kv_backend != "rocksdb") {
6499 derr << " backend must be rocksdb to use bluefs" << dendl;
6500 return -EINVAL;
7c673cae 6501 }
11fdf7f2 6502
f67539c2 6503 r = _open_bluefs(create, read_only);
11fdf7f2
TL
6504 if (r < 0) {
6505 return r;
6506 }
11fdf7f2 6507
7c673cae 6508 if (cct->_conf->bluestore_bluefs_env_mirror) {
9f95a23c
TL
6509 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6510 rocksdb::Env* b = rocksdb::Env::Default();
7c673cae 6511 if (create) {
9f95a23c
TL
6512 string cmd = "rm -rf " + path + "/db " +
6513 path + "/db.slow " +
6514 path + "/db.wal";
6515 int r = system(cmd.c_str());
6516 (void)r;
7c673cae
FG
6517 }
6518 env = new rocksdb::EnvMirror(b, a, false, true);
1911f103 6519 } else {
7c673cae
FG
6520 env = new BlueRocksEnv(bluefs);
6521
6522 // simplify the dir names, too, as "seen" by rocksdb
6523 fn = "db";
6524 }
9f95a23c
TL
6525 BlueFSVolumeSelector::paths paths;
6526 bluefs->get_vselector_paths(fn, paths);
7c673cae 6527
522d829b 6528 {
7c673cae 6529 ostringstream db_paths;
9f95a23c
TL
6530 bool first = true;
6531 for (auto& p : paths) {
6532 if (!first) {
6533 db_paths << " ";
6534 }
6535 first = false;
6536 db_paths << p.first << "," << p.second;
6537
6538 }
11fdf7f2 6539 kv_options["db_paths"] = db_paths.str();
9f95a23c 6540 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
6541 }
6542
6543 if (create) {
9f95a23c
TL
6544 for (auto& p : paths) {
6545 env->CreateDir(p.first);
6546 }
6547 // Selectors don't provide the wal path so far, hence create it explicitly
11fdf7f2 6548 env->CreateDir(fn + ".wal");
11fdf7f2
TL
6549 } else {
6550 std::vector<std::string> res;
6551 // check for dir presence
6552 auto r = env->GetChildren(fn+".wal", &res);
6553 if (r.IsNotFound()) {
6554 kv_options.erase("separate_wal_dir");
6555 }
7c673cae 6556 }
11fdf7f2
TL
6557 } else {
6558 string walfn = path + "/db.wal";
7c673cae 6559
11fdf7f2
TL
6560 if (create) {
6561 int r = ::mkdir(fn.c_str(), 0755);
6562 if (r < 0)
6563 r = -errno;
6564 if (r < 0 && r != -EEXIST) {
6565 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6566 << dendl;
6567 return r;
6568 }
6569
6570 // wal_dir, too!
7c673cae
FG
6571 r = ::mkdir(walfn.c_str(), 0755);
6572 if (r < 0)
6573 r = -errno;
6574 if (r < 0 && r != -EEXIST) {
6575 derr << __func__ << " failed to create " << walfn
6576 << ": " << cpp_strerror(r)
6577 << dendl;
6578 return r;
6579 }
11fdf7f2
TL
6580 } else {
6581 struct stat st;
6582 r = ::stat(walfn.c_str(), &st);
6583 if (r < 0 && errno == ENOENT) {
6584 kv_options.erase("separate_wal_dir");
6585 }
7c673cae
FG
6586 }
6587 }
6588
91327a77 6589
7c673cae
FG
6590 db = KeyValueDB::create(cct,
6591 kv_backend,
6592 fn,
11fdf7f2 6593 kv_options,
7c673cae
FG
6594 static_cast<void*>(env));
6595 if (!db) {
6596 derr << __func__ << " error creating db" << dendl;
6597 if (bluefs) {
20effc67 6598 _close_bluefs();
7c673cae
FG
6599 }
6600 // delete env manually here since we can't depend on db to do this
6601 // under this case
6602 delete env;
6603 env = NULL;
6604 return -EIO;
6605 }
6606
f67539c2 6607 FreelistManager::setup_merge_operators(db, freelist_type);
7c673cae 6608 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 6609 db->set_cache_size(cache_kv_ratio * cache_size);
f67539c2
TL
6610 return 0;
6611}
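// Illustrative db_paths value as assembled above (sizes made up):
//   "db,32212254720 db.slow,322122547200"
// i.e. space-separated "<path>,<size-in-bytes>" pairs, one per
// vselector path, handed to RocksDB via kv_options.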
31f18b77 6612
f67539c2
TL
6613int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6614{
6615 int r;
6616 ceph_assert(!(create && read_only));
6617 string options;
6618 string options_annex;
6619 stringstream err;
6620 string kv_dir_fn;
6621 string kv_backend;
6622 std::string sharding_def;
20effc67
TL
6623 // prevent write attempts to BlueFS in case we failed before BlueFS was opened
6624 db_was_opened_read_only = true;
f67539c2
TL
6625 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6626 if (r < 0) {
6627 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6628 return -EIO;
6629 }
20effc67
TL
6630 // if reached here then BlueFS is already opened
6631 db_was_opened_read_only = read_only;
6632 dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
11fdf7f2 6633 if (kv_backend == "rocksdb") {
7c673cae 6634 options = cct->_conf->bluestore_rocksdb_options;
cd265ab1
TL
6635 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6636 if (!options_annex.empty()) {
6637 if (!options.empty() &&
6638 *options.rbegin() != ',') {
6639 options += ',';
6640 }
6641 options += options_annex;
6642 }
11fdf7f2 6643
f67539c2
TL
6644 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6645 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
11fdf7f2
TL
6646 }
6647 }
6648
7c673cae 6649 db->init(options);
11fdf7f2
TL
6650 if (to_repair_db)
6651 return 0;
6652 if (create) {
f67539c2 6653 r = db->create_and_open(err, sharding_def);
11fdf7f2
TL
6654 } else {
6655 // we pass in cf list here, but it is only used if the db already has
6656 // column families created.
6657 r = read_only ?
f67539c2
TL
6658 db->open_read_only(err, sharding_def) :
6659 db->open(err, sharding_def);
11fdf7f2 6660 }
7c673cae
FG
6661 if (r) {
6662 derr << __func__ << " erroring opening db: " << err.str() << dendl;
20effc67 6663 _close_db();
7c673cae
FG
6664 return -EIO;
6665 }
6666 dout(1) << __func__ << " opened " << kv_backend
f67539c2 6667 << " path " << kv_dir_fn << " options " << options << dendl;
7c673cae 6668 return 0;
7c673cae
FG
6669}
6670
20effc67 6671void BlueStore::_close_db_leave_bluefs()
7c673cae 6672{
11fdf7f2 6673 ceph_assert(db);
7c673cae 6674 delete db;
20effc67
TL
6675 db = nullptr;
6676}
6677
6678void BlueStore::_close_db()
6679{
6680 dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
6681 _close_db_leave_bluefs();
6682
6683 if (need_to_destage_allocation_file) {
6684 ceph_assert(fm && fm->is_null_manager());
6685 int ret = store_allocator(alloc);
6686 if (ret != 0) {
6687 derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
6688 }
6689 }
6690
7c673cae 6691 if (bluefs) {
20effc67 6692 _close_bluefs();
7c673cae
FG
6693 }
6694}
6695
11fdf7f2 6696void BlueStore::_dump_alloc_on_failure()
7c673cae 6697{
11fdf7f2
TL
6698 auto dump_interval =
6699 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6700 if (dump_interval > 0 &&
6701 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
f67539c2 6702 shared_alloc.a->dump();
11fdf7f2
TL
6703 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6704 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 6705 }
11fdf7f2 6706}
7c673cae 6707
eafe8130 6708int BlueStore::_open_collections()
7c673cae 6709{
20effc67
TL
6710 if (!coll_map.empty()) {
6711 // could be opened from another path
6712 dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
6713 return 0;
6714 }
6715
28e407b8 6716 dout(10) << __func__ << dendl;
eafe8130 6717 collections_had_errors = false;
7c673cae 6718 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
20effc67 6719 size_t load_cnt = 0;
7c673cae
FG
6720 for (it->upper_bound(string());
6721 it->valid();
6722 it->next()) {
6723 coll_t cid;
6724 if (cid.parse(it->key())) {
9f95a23c 6725 auto c = ceph::make_ref<Collection>(
7c673cae 6726 this,
9f95a23c
TL
6727 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6728 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6729 cid);
7c673cae 6730 bufferlist bl = it->value();
11fdf7f2 6731 auto p = bl.cbegin();
7c673cae 6732 try {
11fdf7f2 6733 decode(c->cnode, p);
f67539c2 6734 } catch (ceph::buffer::error& e) {
7c673cae
FG
6735 derr << __func__ << " failed to decode cnode, key:"
6736 << pretty_binary_string(it->key()) << dendl;
6737 return -EIO;
6738 }
28e407b8
AA
6739 dout(20) << __func__ << " opened " << cid << " " << c
6740 << " " << c->cnode << dendl;
11fdf7f2 6741 _osr_attach(c.get());
7c673cae 6742 coll_map[cid] = c;
20effc67 6743 load_cnt++;
7c673cae
FG
6744 } else {
6745 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 6746 collections_had_errors = true;
7c673cae
FG
6747 }
6748 }
20effc67
TL
6749 dout(10) << __func__ << " collections loaded: " << load_cnt
6750 << dendl;
7c673cae
FG
6751 return 0;
6752}
6753
eafe8130
TL
6754void BlueStore::_fsck_collections(int64_t* errors)
6755{
6756 if (collections_had_errors) {
6757 dout(10) << __func__ << dendl;
f67539c2 6758 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
6759 for (it->upper_bound(string());
6760 it->valid();
6761 it->next()) {
6762 coll_t cid;
6763 if (!cid.parse(it->key())) {
6764 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6765 if (errors) {
6766 (*errors)++;
6767 }
6768 }
6769 }
6770 }
6771}
6772
9f95a23c
TL
6773void BlueStore::_set_per_pool_omap()
6774{
f67539c2 6775 per_pool_omap = OMAP_BULK;
9f95a23c
TL
6776 bufferlist bl;
6777 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6778 if (bl.length()) {
f67539c2
TL
6779 auto s = bl.to_str();
6780 if (s == stringify(OMAP_PER_POOL)) {
6781 per_pool_omap = OMAP_PER_POOL;
a4b75251 6782 } else if (s == stringify(OMAP_PER_PG)) {
f67539c2 6783 per_pool_omap = OMAP_PER_PG;
a4b75251
TL
6784 } else {
6785 ceph_assert(s == stringify(OMAP_BULK));
f67539c2
TL
6786 }
6787 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
9f95a23c
TL
6788 } else {
6789 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6790 }
f67539c2 6791 _check_no_per_pg_or_pool_omap_alert();
9f95a23c
TL
6792}
6793
224ce89b 6794void BlueStore::_open_statfs()
31f18b77 6795{
11fdf7f2
TL
6796 osd_pools.clear();
6797 vstatfs.reset();
6798
31f18b77 6799 bufferlist bl;
11fdf7f2 6800 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6801 if (r >= 0) {
11fdf7f2 6802 per_pool_stat_collection = false;
31f18b77 6803 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6804 auto it = bl.cbegin();
31f18b77 6805 vstatfs.decode(it);
11fdf7f2 6806 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 6807 } else {
31f18b77
FG
6808 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6809 }
81eedcae 6810 _check_legacy_statfs_alert();
11fdf7f2
TL
6811 } else {
6812 per_pool_stat_collection = true;
6813 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
f67539c2 6814 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
6815 for (it->upper_bound(string());
6816 it->valid();
6817 it->next()) {
6818
6819 uint64_t pool_id;
6820 int r = get_key_pool_stat(it->key(), &pool_id);
6821 ceph_assert(r == 0);
6822
6823 bufferlist bl;
6824 bl = it->value();
6825 auto p = bl.cbegin();
6826 auto& st = osd_pools[pool_id];
6827 try {
6828 st.decode(p);
6829 vstatfs += st;
6830
6831 dout(30) << __func__ << " pool " << pool_id
6832 << " statfs " << st << dendl;
f67539c2 6833 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
6834 derr << __func__ << " failed to decode pool stats, key:"
6835 << pretty_binary_string(it->key()) << dendl;
6836 }
6837 }
31f18b77 6838 }
11fdf7f2
TL
6839 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6840
31f18b77
FG
6841}
6842
7c673cae
FG
6843int BlueStore::_setup_block_symlink_or_file(
6844 string name,
6845 string epath,
6846 uint64_t size,
6847 bool create)
6848{
6849 dout(20) << __func__ << " name " << name << " path " << epath
6850 << " size " << size << " create=" << (int)create << dendl;
6851 int r = 0;
91327a77 6852 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6853 if (create)
6854 flags |= O_CREAT;
6855 if (epath.length()) {
6856 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6857 if (r < 0) {
6858 r = -errno;
6859 derr << __func__ << " failed to create " << name << " symlink to "
6860 << epath << ": " << cpp_strerror(r) << dendl;
6861 return r;
6862 }
6863
6864 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6865 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6866 if (fd < 0) {
6867 r = -errno;
6868 derr << __func__ << " failed to open " << epath << " file: "
6869 << cpp_strerror(r) << dendl;
6870 return r;
6871 }
11fdf7f2
TL
6872 // write the Transport ID of the NVMe device
6873 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6874 // where "0000:02:00.0" is the selector of a PCI device, see
6875 // the first column of "lspci -mm -n -D"
6876 string trid{"trtype:PCIe "};
6877 trid += "traddr:";
6878 trid += epath.substr(strlen(SPDK_PREFIX));
6879 r = ::write(fd, trid.c_str(), trid.size());
6880 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
6881 dout(1) << __func__ << " created " << name << " symlink to "
6882 << epath << dendl;
6883 VOID_TEMP_FAILURE_RETRY(::close(fd));
6884 }
6885 }
6886 if (size) {
6887 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6888 if (fd >= 0) {
6889 // block file is present
6890 struct stat st;
6891 int r = ::fstat(fd, &st);
6892 if (r == 0 &&
6893 S_ISREG(st.st_mode) && // if it is a regular file
6894 st.st_size == 0) { // and is 0 bytes
6895 r = ::ftruncate(fd, size);
6896 if (r < 0) {
6897 r = -errno;
6898 derr << __func__ << " failed to resize " << name << " file to "
6899 << size << ": " << cpp_strerror(r) << dendl;
6900 VOID_TEMP_FAILURE_RETRY(::close(fd));
6901 return r;
6902 }
6903
6904 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
6905 r = ::ceph_posix_fallocate(fd, 0, size);
6906 if (r > 0) {
7c673cae
FG
6907 derr << __func__ << " failed to prefallocate " << name << " file to "
6908 << size << ": " << cpp_strerror(r) << dendl;
6909 VOID_TEMP_FAILURE_RETRY(::close(fd));
6910 return -r;
6911 }
7c673cae
FG
6912 }
6913 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 6914 << byte_u_t(size) << dendl;
7c673cae
FG
6915 }
6916 VOID_TEMP_FAILURE_RETRY(::close(fd));
6917 } else {
6918 int r = -errno;
6919 if (r != -ENOENT) {
6920 derr << __func__ << " failed to open " << name << " file: "
6921 << cpp_strerror(r) << dendl;
6922 return r;
6923 }
6924 }
6925 }
6926 return 0;
6927}
6928
6929int BlueStore::mkfs()
6930{
6931 dout(1) << __func__ << " path " << path << dendl;
6932 int r;
6933 uuid_d old_fsid;
f67539c2 6934 uint64_t reserved;
eafe8130
TL
6935 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6936 derr << __func__ << " osd_max_object_size "
6937 << cct->_conf->osd_max_object_size << " > bluestore max "
6938 << OBJECT_MAX_SIZE << dendl;
6939 return -EINVAL;
6940 }
6941
7c673cae
FG
6942 {
6943 string done;
6944 r = read_meta("mkfs_done", &done);
6945 if (r == 0) {
6946 dout(1) << __func__ << " already created" << dendl;
6947 if (cct->_conf->bluestore_fsck_on_mkfs) {
6948 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6949 if (r < 0) {
6950 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6951 << dendl;
6952 return r;
6953 }
6954 if (r > 0) {
6955 derr << __func__ << " fsck found " << r << " errors" << dendl;
6956 r = -EIO;
6957 }
6958 }
6959 return r; // idempotent
6960 }
6961 }
6962
6963 {
6964 string type;
6965 r = read_meta("type", &type);
6966 if (r == 0) {
6967 if (type != "bluestore") {
6968 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6969 return -EIO;
6970 }
6971 } else {
6972 r = write_meta("type", "bluestore");
6973 if (r < 0)
6974 return r;
6975 }
6976 }
6977
7c673cae
FG
6978 r = _open_path();
6979 if (r < 0)
6980 return r;
6981
6982 r = _open_fsid(true);
6983 if (r < 0)
6984 goto out_path_fd;
6985
6986 r = _lock_fsid();
6987 if (r < 0)
6988 goto out_close_fsid;
6989
6990 r = _read_fsid(&old_fsid);
6991 if (r < 0 || old_fsid.is_zero()) {
6992 if (fsid.is_zero()) {
6993 fsid.generate_random();
6994 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6995 } else {
6996 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6997 }
6998 // we'll write it later.
6999 } else {
7000 if (!fsid.is_zero() && fsid != old_fsid) {
7001 derr << __func__ << " on-disk fsid " << old_fsid
7002 << " != provided " << fsid << dendl;
7003 r = -EINVAL;
7004 goto out_close_fsid;
7005 }
7006 fsid = old_fsid;
7007 }
7008
7009 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
7010 cct->_conf->bluestore_block_size,
7011 cct->_conf->bluestore_block_create);
7012 if (r < 0)
7013 goto out_close_fsid;
7014 if (cct->_conf->bluestore_bluefs) {
7015 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
7016 cct->_conf->bluestore_block_wal_size,
7017 cct->_conf->bluestore_block_wal_create);
7018 if (r < 0)
7019 goto out_close_fsid;
7020 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
7021 cct->_conf->bluestore_block_db_size,
7022 cct->_conf->bluestore_block_db_create);
7023 if (r < 0)
7024 goto out_close_fsid;
7025 }
7026
7027 r = _open_bdev(true);
7028 if (r < 0)
7029 goto out_close_fsid;
7030
20effc67
TL
7031 // choose freelist manager
7032#ifdef HAVE_LIBZBD
7033 if (bdev->is_smr()) {
7034 freelist_type = "zoned";
7035 zone_size = bdev->get_zone_size();
7036 first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
7037 bdev->reset_all_zones();
7038 } else
7039#endif
7040 {
7041 freelist_type = "bitmap";
7042 }
7043 dout(10) << " freelist_type " << freelist_type << dendl;
7044
3efd9988 7045 // choose min_alloc_size
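// Selection precedence: the device's optimal_io_size (when
// bluestore_use_optimal_io_size_for_min_alloc_size is enabled), then
// an explicit bluestore_min_alloc_size, then the hdd/ssd default
// chosen via rotational detection.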
20effc67
TL
7046 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7047 << " block_size: 0x" << block_size << std::dec << dendl;
7048 if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
7049 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7050 << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
7051 min_alloc_size = optimal_io_size;
7052 }
7053 else if (cct->_conf->bluestore_min_alloc_size) {
3efd9988
FG
7054 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
7055 } else {
11fdf7f2 7056 ceph_assert(bdev);
f67539c2 7057 if (_use_rotational_settings()) {
3efd9988
FG
7058 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
7059 } else {
7060 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
7061 }
7062 }
11fdf7f2 7063 _validate_bdev();
3efd9988
FG
7064
7065 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 7066 if (!isp2(min_alloc_size)) {
3efd9988
FG
7067 derr << __func__ << " min_alloc_size 0x"
7068 << std::hex << min_alloc_size << std::dec
7069 << " is not power of 2 aligned!"
7070 << dendl;
7071 r = -EINVAL;
7072 goto out_close_bdev;
7073 }
7074
20effc67
TL
7075 // make sure min_alloc_size is >= block size and aligned to it
7076 if (min_alloc_size % block_size != 0) {
7077 derr << __func__ << " min_alloc_size 0x"
7078 << std::hex << min_alloc_size
7079 << " is smaller than or not aligned to block_size: 0x"
7080 << block_size << std::dec << dendl;
7081 r = -EINVAL;
7082 goto out_close_bdev;
7083 }
7084
f67539c2
TL
7085 r = _create_alloc();
7086 if (r < 0) {
7087 goto out_close_bdev;
7088 }
7089
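// The first _get_ondisk_reserved() bytes of the device hold on-disk
// metadata (e.g. the bdev label) and are excluded from the
// allocator's free space.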
7090 reserved = _get_ondisk_reserved();
20effc67 7091 alloc->init_add_free(reserved,
f67539c2 7092 p2align(bdev->get_size(), min_alloc_size) - reserved);
20effc67
TL
7093#ifdef HAVE_LIBZBD
7094 if (bdev->is_smr() && alloc != shared_alloc.a) {
7095 shared_alloc.a->init_add_free(reserved,
7096 p2align(bdev->get_conventional_region_size(),
7097 min_alloc_size) - reserved);
7098 }
7099#endif
f67539c2 7100
7c673cae
FG
7101 r = _open_db(true);
7102 if (r < 0)
f67539c2 7103 goto out_close_alloc;
7c673cae 7104
7c673cae
FG
7105 {
7106 KeyValueDB::Transaction t = db->get_transaction();
1911f103 7107 r = _open_fm(t, true);
11fdf7f2
TL
7108 if (r < 0)
7109 goto out_close_db;
7c673cae
FG
7110 {
7111 bufferlist bl;
11fdf7f2 7112 encode((uint64_t)0, bl);
7c673cae
FG
7113 t->set(PREFIX_SUPER, "nid_max", bl);
7114 t->set(PREFIX_SUPER, "blobid_max", bl);
7115 }
7116
7c673cae
FG
7117 {
7118 bufferlist bl;
11fdf7f2 7119 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
7120 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7121 }
9f95a23c
TL
7122 {
7123 bufferlist bl;
a4b75251
TL
7124 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
7125 bl.append(stringify(OMAP_BULK));
7126 } else {
7127 bl.append(stringify(OMAP_PER_PG));
7128 }
9f95a23c
TL
7129 t->set(PREFIX_SUPER, "per_pool_omap", bl);
7130 }
20effc67
TL
7131
7132#ifdef HAVE_LIBZBD
7133 if (bdev->is_smr()) {
7134 {
7135 bufferlist bl;
7136 encode((uint64_t)zone_size, bl);
7137 t->set(PREFIX_SUPER, "zone_size", bl);
7138 }
7139 {
7140 bufferlist bl;
7141 encode((uint64_t)first_sequential_zone, bl);
7142 t->set(PREFIX_SUPER, "first_sequential_zone", bl);
7143 }
7144 }
7145#endif
7146
7c673cae
FG
7147 ondisk_format = latest_ondisk_format;
7148 _prepare_ondisk_format_super(t);
7149 db->submit_transaction_sync(t);
7150 }
7151
7c673cae
FG
7152 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
7153 if (r < 0)
224ce89b
WB
7154 goto out_close_fm;
7155
3efd9988 7156 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 7157 if (r < 0)
224ce89b 7158 goto out_close_fm;
7c673cae
FG
7159
7160 if (fsid != old_fsid) {
7161 r = _write_fsid();
7162 if (r < 0) {
7163 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 7164 goto out_close_fm;
7c673cae
FG
7165 }
7166 }
7167
7c673cae
FG
7168 out_close_fm:
7169 _close_fm();
7170 out_close_db:
20effc67 7171 _close_db();
f67539c2
TL
7172 out_close_alloc:
7173 _close_alloc();
7c673cae
FG
7174 out_close_bdev:
7175 _close_bdev();
7176 out_close_fsid:
7177 _close_fsid();
7178 out_path_fd:
7179 _close_path();
7180
7181 if (r == 0 &&
7182 cct->_conf->bluestore_fsck_on_mkfs) {
7183 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7184 if (rc < 0)
7185 return rc;
7186 if (rc > 0) {
7187 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7188 r = -EIO;
7189 }
11fdf7f2
TL
7190 }
7191
7192 if (r == 0) {
7193 // indicate success by writing the 'mkfs_done' file
7194 r = write_meta("mkfs_done", "yes");
7195 }
7196
7197 if (r < 0) {
7198 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7199 } else {
7200 dout(0) << __func__ << " success" << dendl;
7201 }
7202 return r;
7203}
7204
11fdf7f2
TL
7205int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
7206{
7207 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7208 int r;
7209 ceph_assert(path_fd < 0);
7210
7211 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7212
7213 if (!cct->_conf->bluestore_bluefs) {
7214 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7215 return -EIO;
7216 }
20effc67 7217 dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
f67539c2 7218 r = _open_db_and_around(true);
20effc67
TL
7219 if (r < 0) {
7220 return r;
7221 }
11fdf7f2 7222
11fdf7f2
TL
7223 if (id == BlueFS::BDEV_NEWWAL) {
7224 string p = path + "/block.wal";
7225 r = _setup_block_symlink_or_file("block.wal", dev_path,
7226 cct->_conf->bluestore_block_wal_size,
7227 true);
7228 ceph_assert(r == 0);
7229
7230 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
f67539c2
TL
7231 cct->_conf->bdev_enable_discard,
7232 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7233 ceph_assert(r == 0);
7234
7235 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7236 r = _check_or_set_bdev_label(
7237 p,
7238 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7239 "bluefs wal",
7240 true);
7241 ceph_assert(r == 0);
7242 }
7243
9f95a23c 7244 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7245 } else if (id == BlueFS::BDEV_NEWDB) {
7246 string p = path + "/block.db";
7247 r = _setup_block_symlink_or_file("block.db", dev_path,
7248 cct->_conf->bluestore_block_db_size,
7249 true);
7250 ceph_assert(r == 0);
7251
7252 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
f67539c2
TL
7253 cct->_conf->bdev_enable_discard,
7254 SUPER_RESERVED);
11fdf7f2
TL
7255 ceph_assert(r == 0);
7256
7257 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7258 r = _check_or_set_bdev_label(
7259 p,
7260 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7261 "bluefs db",
7262 true);
7263 ceph_assert(r == 0);
7264 }
9f95a23c
TL
7265 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7266 bluefs_layout.dedicated_db = true;
11fdf7f2 7267 }
11fdf7f2
TL
7268 bluefs->umount();
7269 bluefs->mount();
7270
9f95a23c 7271 r = bluefs->prepare_new_device(id, bluefs_layout);
11fdf7f2
TL
7272 ceph_assert(r == 0);
7273
7274 if (r < 0) {
7275 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7276 } else {
7277 dout(0) << __func__ << " success" << dendl;
7278 }
7279
20effc67 7280 _close_db_and_around();
11fdf7f2
TL
7281 return r;
7282}
7283
7284int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
7285 int id)
7286{
7287 dout(10) << __func__ << " id:" << id << dendl;
7288 ceph_assert(path_fd < 0);
7289
7290 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
7291
7292 if (!cct->_conf->bluestore_bluefs) {
7293 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7294 return -EIO;
7295 }
7296
f67539c2 7297 int r = _open_db_and_around(true);
20effc67
TL
7298 if (r < 0) {
7299 return r;
7300 }
7301 auto close_db = make_scope_guard([&] {
7302 _close_db_and_around();
7303 });
f67539c2 7304 uint64_t used_space = 0;
11fdf7f2 7305 for(auto src_id : devs_source) {
f67539c2 7306 used_space += bluefs->get_used(src_id);
11fdf7f2
TL
7307 }
7308 uint64_t target_free = bluefs->get_free(id);
f67539c2 7309 if (target_free < used_space) {
11fdf7f2
TL
7310 derr << __func__
7311 << " can't migrate, free space at target: " << target_free
7312 << " is less than required space: " << used_space
7313 << dendl;
20effc67 7314 return -ENOSPC;
11fdf7f2 7315 }
9f95a23c
TL
7316 if (devs_source.count(BlueFS::BDEV_DB)) {
7317 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7318 bluefs_layout.dedicated_db = false;
7319 }
7320 if (devs_source.count(BlueFS::BDEV_WAL)) {
7321 bluefs_layout.dedicated_wal = false;
7322 }
7323 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
11fdf7f2
TL
7324 if (r < 0) {
7325 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7326 return r;
11fdf7f2
TL
7327 }
7328
7329 if (devs_source.count(BlueFS::BDEV_DB)) {
7330 r = unlink(string(path + "/block.db").c_str());
7331 ceph_assert(r == 0);
7332 }
7333 if (devs_source.count(BlueFS::BDEV_WAL)) {
7334 r = unlink(string(path + "/block.wal").c_str());
7335 ceph_assert(r == 0);
7336 }
11fdf7f2
TL
7337 return r;
7338}
7339
7340int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7341 int id,
7342 const string& dev_path)
7343{
7344 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
11fdf7f2
TL
7345 ceph_assert(path_fd < 0);
7346
7347 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7348
7349 if (!cct->_conf->bluestore_bluefs) {
7350 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7351 return -EIO;
7352 }
7353
20effc67
TL
7354 int r = _open_db_and_around(true);
7355 if (r < 0) {
7356 return r;
7357 }
7358 auto close_db = make_scope_guard([&] {
7359 _close_db_and_around();
7360 });
11fdf7f2 7361
11fdf7f2
TL
7362 string link_db;
7363 string link_wal;
7364 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 7365 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 7366 link_db = path + "/block.db";
9f95a23c
TL
7367 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7368 bluefs_layout.dedicated_db = false;
11fdf7f2
TL
7369 }
7370 if (devs_source.count(BlueFS::BDEV_WAL)) {
7371 link_wal = path + "/block.wal";
9f95a23c 7372 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
7373 }
7374
20effc67 7375 size_t target_size = 0;
11fdf7f2
TL
7376 string target_name;
7377 if (id == BlueFS::BDEV_NEWWAL) {
7378 target_name = "block.wal";
7379 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 7380 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7381
7382 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
f67539c2
TL
7383 cct->_conf->bdev_enable_discard,
7384 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7385 ceph_assert(r == 0);
7386
7387 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7388 r = _check_or_set_bdev_label(
7389 dev_path,
7390 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7391 "bluefs wal",
7392 true);
7393 ceph_assert(r == 0);
7394 }
11fdf7f2
TL
7395 } else if (id == BlueFS::BDEV_NEWDB) {
7396 target_name = "block.db";
7397 target_size = cct->_conf->bluestore_block_db_size;
9f95a23c
TL
7398 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7399 bluefs_layout.dedicated_db = true;
31f18b77 7400
11fdf7f2 7401 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
f67539c2
TL
7402 cct->_conf->bdev_enable_discard,
7403 SUPER_RESERVED);
11fdf7f2
TL
7404 ceph_assert(r == 0);
7405
7406 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7407 r = _check_or_set_bdev_label(
7408 dev_path,
7409 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7410 "bluefs db",
7411 true);
7412 ceph_assert(r == 0);
7413 }
31f18b77
FG
7414 }
7415
11fdf7f2
TL
7416 bluefs->umount();
7417 bluefs->mount();
7418
9f95a23c 7419 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 7420
7c673cae 7421 if (r < 0) {
11fdf7f2 7422 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7423 return r;
11fdf7f2
TL
7424 }
7425
7426 if (!link_db.empty()) {
7427 r = unlink(link_db.c_str());
7428 ceph_assert(r == 0);
7429 }
7430 if (!link_wal.empty()) {
7431 r = unlink(link_wal.c_str());
7432 ceph_assert(r == 0);
7433 }
7434 r = _setup_block_symlink_or_file(
7435 target_name,
7436 dev_path,
7437 target_size,
7438 true);
7439 ceph_assert(r == 0);
7440 dout(0) << __func__ << " success" << dendl;
7441
11fdf7f2
TL
7442 return r;
7443}
7444
7445string BlueStore::get_device_path(unsigned id)
7446{
7447 string res;
7448 if (id < BlueFS::MAX_BDEV) {
7449 switch (id) {
7450 case BlueFS::BDEV_WAL:
7451 res = path + "/block.wal";
7452 break;
7453 case BlueFS::BDEV_DB:
9f95a23c 7454 if (id == bluefs_layout.shared_bdev) {
11fdf7f2
TL
7455 res = path + "/block";
7456 } else {
7457 res = path + "/block.db";
7458 }
7459 break;
7460 case BlueFS::BDEV_SLOW:
7461 res = path + "/block";
7462 break;
7463 }
7464 }
7465 return res;
7466}
7467
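// Helper for expand_devices() below: reads the bdev label, patches
// its size field and writes it back in place.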
f67539c2
TL
7468int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
7469{
7470 bluestore_bdev_label_t label;
7471 int r = _read_bdev_label(cct, path, &label);
7472 if (r < 0) {
7473 derr << "unable to read label for " << path << ": "
7474 << cpp_strerror(r) << dendl;
7475 } else {
7476 label.size = size;
7477 r = _write_bdev_label(cct, path, label);
7478 if (r < 0) {
7479 derr << "unable to write label for " << path << ": "
7480 << cpp_strerror(r) << dendl;
7481 }
7482 }
7483 return r;
7484}
7485
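// Expansion flow: dedicated WAL/DB devices only need their size label
// refreshed, while growing the main (shared) device also updates the
// freelist metadata and, with the null freelist manager, the
// allocation file; a read/write remount then syncs the changes.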
11fdf7f2
TL
7486int BlueStore::expand_devices(ostream& out)
7487{
f67539c2 7488 int r = _open_db_and_around(true);
11fdf7f2
TL
7489 ceph_assert(r == 0);
7490 bluefs->dump_block_extents(out);
1911f103 7491 out << "Expanding DB/WAL..." << std::endl;
11fdf7f2 7492 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 7493 if (devid == bluefs_layout.shared_bdev ) {
11fdf7f2
TL
7494 continue;
7495 }
7496 uint64_t size = bluefs->get_block_device_size(devid);
7497 if (size == 0) {
7498 // no bdev
7499 continue;
7500 }
7501
f67539c2
TL
7502 out << devid
7503 << " : expanding to 0x" << std::hex << size << std::dec << std::endl;
7504 string p = get_device_path(devid);
7506 if (p.empty()) {
7507 derr << devid
7508 <<": can't find device path " << dendl;
7509 continue;
7510 }
7511 if (bluefs->bdev_support_label(devid)) {
7512 if (_set_bdev_label_size(p, size) >= 0) {
7513 out << devid
7514 << " : size label updated to " << size
7515 << std::endl;
11fdf7f2 7516 }
11fdf7f2
TL
7517 }
7518 }
7519 uint64_t size0 = fm->get_size();
7520 uint64_t size = bdev->get_size();
7521 if (size0 < size) {
9f95a23c 7522 out << bluefs_layout.shared_bdev
1911f103
TL
7523 << " : expanding from 0x" << std::hex
7524 << size0 << " to 0x" << size << std::dec << std::endl;
f67539c2
TL
7525 _write_out_fm_meta(size);
7526 if (bdev->supported_bdev_label()) {
7527 if (_set_bdev_label_size(path, size) >= 0) {
7528 out << bluefs_layout.shared_bdev
7529 << " : size label updated to " << size
7530 << std::endl;
7531 }
7532 }
20effc67 7533
1d09f67e
TL
7534 if (fm && fm->is_null_manager()) {
7535 // we grow the allocation range, must reflect it in the allocation file
7536 alloc->init_add_free(size0, size - size0);
7537 need_to_destage_allocation_file = true;
7538 }
20effc67 7539 _close_db_and_around();
1911f103
TL
7540
7541 // mount in read/write to sync expansion changes
f67539c2 7542 r = _mount();
11fdf7f2 7543 ceph_assert(r == 0);
1911f103
TL
7544 umount();
7545 } else {
20effc67 7546 _close_db_and_around();
7c673cae 7547 }
1911f103
TL
7548 return r;
7549}
7550
7551int BlueStore::dump_bluefs_sizes(ostream& out)
7552{
f67539c2 7553 int r = _open_db_and_around(true);
1911f103
TL
7554 ceph_assert(r == 0);
7555 bluefs->dump_block_extents(out);
20effc67 7556 _close_db_and_around();
7c673cae
FG
7557 return r;
7558}
7559
7560void BlueStore::set_cache_shards(unsigned num)
7561{
7562 dout(10) << __func__ << " " << num << dendl;
9f95a23c
TL
7563 size_t oold = onode_cache_shards.size();
7564 size_t bold = buffer_cache_shards.size();
7565 ceph_assert(num >= oold && num >= bold);
7566 onode_cache_shards.resize(num);
7567 buffer_cache_shards.resize(num);
7568 for (unsigned i = oold; i < num; ++i) {
7569 onode_cache_shards[i] =
7570 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7571 logger);
7572 }
7573 for (unsigned i = bold; i < num; ++i) {
7574 buffer_cache_shards[i] =
7575 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7576 logger);
7c673cae
FG
7577 }
7578}
7579
1d09f67e
TL
7580//---------------------------------------------
7581bool BlueStore::has_null_manager()
7582{
7583 return (fm && fm->is_null_manager());
7584}
7585
f67539c2 7586int BlueStore::_mount()
7c673cae 7587{
20effc67 7588 dout(5) << __func__ << "::NCB::path " << path << dendl;
1d09f67e 7589
f67539c2 7590 _kv_only = false;
7c673cae 7591 if (cct->_conf->bluestore_fsck_on_mount) {
20effc67 7592 dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
7c673cae
FG
7593 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7594 if (rc < 0)
7595 return rc;
7596 if (rc > 0) {
7597 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7598 return -EIO;
7599 }
7600 }
7601
eafe8130
TL
7602 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7603 derr << __func__ << " osd_max_object_size "
7604 << cct->_conf->osd_max_object_size << " > bluestore max "
7605 << OBJECT_MAX_SIZE << dendl;
7606 return -EINVAL;
7607 }
7608
20effc67 7609 dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
f67539c2 7610 int r = _open_db_and_around(false);
9f95a23c 7611 if (r < 0) {
f67539c2 7612 return r;
11fdf7f2 7613 }
20effc67
TL
7614 auto close_db = make_scope_guard([&] {
7615 if (!mounted) {
7616 _close_db_and_around();
7617 }
7618 });
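// The scope guards (close_db above, shutdown_cache and stop_kv below)
// unwind partially completed mount steps on failure; they are all
// disarmed by setting 'mounted' at the end of this function.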
7c673cae 7619
11fdf7f2
TL
7620 r = _upgrade_super();
7621 if (r < 0) {
20effc67 7622 return r;
11fdf7f2 7623 }
7c673cae 7624
20effc67 7625 // The recovery process for allocation-map needs to open collection early
7c673cae 7626 r = _open_collections();
20effc67
TL
7627 if (r < 0) {
7628 return r;
7629 }
7630 auto shutdown_cache = make_scope_guard([&] {
7631 if (!mounted) {
7632 _shutdown_cache();
7633 }
7634 });
7c673cae
FG
7635
7636 r = _reload_logger();
20effc67
TL
7637 if (r < 0) {
7638 return r;
7639 }
7c673cae 7640
31f18b77 7641 _kv_start();
20effc67
TL
7642 auto stop_kv = make_scope_guard([&] {
7643 if (!mounted) {
7644 _kv_stop();
7645 }
7646 });
7647
7648 r = _deferred_replay();
7649 if (r < 0) {
7650 return r;
7651 }
7c673cae 7652
20effc67 7653#ifdef HAVE_LIBZBD
f67539c2
TL
7654 if (bdev->is_smr()) {
7655 _zoned_cleaner_start();
7656 }
20effc67 7657#endif
7c673cae
FG
7658
7659 mempool_thread.init();
7660
f67539c2 7661 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
eafe8130 7662 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c 7663
f67539c2 7664 auto was_per_pool_omap = per_pool_omap;
9f95a23c 7665
eafe8130
TL
7666 dout(1) << __func__ << " quick-fix on mount" << dendl;
7667 _fsck_on_open(FSCK_SHALLOW, true);
7668
7669 //reread statfs
7670 //FIXME minor: replace with actual open/close?
7671 _open_statfs();
eafe8130 7672 _check_legacy_statfs_alert();
9f95a23c
TL
7673
7674 //set again as hopefully it has been fixed
f67539c2 7675 if (was_per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
7676 _set_per_pool_omap();
7677 }
eafe8130
TL
7678 }
7679
7c673cae
FG
7680 mounted = true;
7681 return 0;
7c673cae
FG
7682}
7683
7684int BlueStore::umount()
7685{
11fdf7f2 7686 ceph_assert(_kv_only || mounted);
7c673cae 7687 _osr_drain_all();
7c673cae 7688
7c673cae 7689 mounted = false;
20effc67
TL
7690
7691 ceph_assert(alloc);
7692
3efd9988
FG
7693 if (!_kv_only) {
7694 mempool_thread.shutdown();
20effc67 7695#ifdef HAVE_LIBZBD
f67539c2
TL
7696 if (bdev->is_smr()) {
7697 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7698 _zoned_cleaner_stop();
7699 }
20effc67 7700#endif
3efd9988
FG
7701 dout(20) << __func__ << " stopping kv thread" << dendl;
7702 _kv_stop();
1d09f67e
TL
7703 // skip cache cleanup step on fast shutdown
7704 if (likely(!m_fast_shutdown)) {
7705 _shutdown_cache();
7706 }
3efd9988 7707 dout(20) << __func__ << " closing" << dendl;
3efd9988 7708 }
20effc67 7709 _close_db_and_around();
1d09f67e
TL
7710 // disable fsck on fast-shutdown
7711 if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
7c673cae
FG
7712 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7713 if (rc < 0)
7714 return rc;
7715 if (rc > 0) {
7716 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7717 return -EIO;
7718 }
7719 }
7720 return 0;
7721}
7722
eafe8130
TL
7723int BlueStore::cold_open()
7724{
f67539c2 7725 return _open_db_and_around(true);
eafe8130 7726}
f67539c2 7727
eafe8130
TL
7728int BlueStore::cold_close()
7729{
20effc67 7730 _close_db_and_around();
eafe8130
TL
7731 return 0;
7732}
7733
9f95a23c
TL
7734// derr wrapper to limit enormous output and avoid log flooding.
7735// Of limited use where such output is expected for now
7736#define fsck_derr(err_cnt, threshold) \
7737 if (err_cnt <= threshold) { \
7738 bool need_skip_print = err_cnt == threshold; \
7739 derr
7740
7741#define fsck_dendl \
7742 dendl; \
7743 if (need_skip_print) \
7744 derr << "more error lines skipped..." << dendl; \
7c673cae 7745 }
7c673cae 7746
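// The two macros are always used as a pair, e.g.:
// fsck_derr(errors, MAX_FSCK_ERROR_LINES)
// << "fsck error: " << oid << " ..." << fsck_dendl;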
eafe8130
TL
7747int _fsck_sum_extents(
7748 const PExtentVector& extents,
7749 bool compressed,
7750 store_statfs_t& expected_statfs)
7751{
7752 for (auto e : extents) {
7753 if (!e.is_valid())
7754 continue;
7755 expected_statfs.allocated += e.length;
7756 if (compressed) {
7757 expected_statfs.data_compressed_allocated += e.length;
7758 }
7759 }
7760 return 0;
7761}
7762
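// Unlike _fsck_sum_extents() above, this variant also marks each
// extent in the used_blocks bitset: a bit that is already set means
// two references claim the same block (a misreference); extents past
// the device end are flagged as well.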
7c673cae 7763int BlueStore::_fsck_check_extents(
20effc67 7764 std::string_view ctx_descr,
7c673cae
FG
7765 const PExtentVector& extents,
7766 bool compressed,
7767 mempool_dynamic_bitset &used_blocks,
b32b8144 7768 uint64_t granularity,
11fdf7f2 7769 BlueStoreRepairer* repairer,
eafe8130
TL
7770 store_statfs_t& expected_statfs,
7771 FSCKDepth depth)
7c673cae 7772{
20effc67 7773 dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
7c673cae
FG
7774 int errors = 0;
7775 for (auto e : extents) {
7776 if (!e.is_valid())
7777 continue;
7778 expected_statfs.allocated += e.length;
7779 if (compressed) {
11fdf7f2 7780 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7781 }
eafe8130
TL
7782 if (depth != FSCK_SHALLOW) {
7783 bool already = false;
9f95a23c 7784 apply_for_bitset_range(
eafe8130
TL
7785 e.offset, e.length, granularity, used_blocks,
7786 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7787 if (bs.test(pos)) {
7788 if (repairer) {
7789 repairer->note_misreference(
7790 pos * min_alloc_size, min_alloc_size, !already);
7791 }
7792 if (!already) {
20effc67 7793 derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7794 << " or a subset is already allocated (misreferenced)" << dendl;
7795 ++errors;
7796 already = true;
7797 }
11fdf7f2 7798 }
eafe8130
TL
7799 else
7800 bs.set(pos);
7801 });
11fdf7f2 7802
eafe8130 7803 if (e.end() > bdev->get_size()) {
20effc67 7804 derr << "fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7805 << " past end of block device" << dendl;
7806 ++errors;
7807 }
7c673cae
FG
7808 }
7809 }
7810 return errors;
7811}
7812
11fdf7f2
TL
7813void BlueStore::_fsck_check_pool_statfs(
7814 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7815 int64_t& errors,
7816 int64_t& warnings,
11fdf7f2
TL
7817 BlueStoreRepairer* repairer)
7818{
f67539c2 7819 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
7820 if (it) {
7821 for (it->lower_bound(string()); it->valid(); it->next()) {
7822 string key = it->key();
7823 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7824 if (repairer) {
eafe8130
TL
7825 ++errors;
7826 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7827 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7828 << dendl;
7829 }
7830 continue;
7831 }
11fdf7f2
TL
7832 uint64_t pool_id;
7833 if (get_key_pool_stat(key, &pool_id) < 0) {
7834 derr << "fsck error: bad key " << key
7835 << " in statfs namespace" << dendl;
7836 if (repairer) {
7837 repairer->remove_key(db, PREFIX_STAT, key);
7838 }
7839 ++errors;
7840 continue;
7841 }
7842
7843 volatile_statfs vstatfs;
7844 bufferlist bl = it->value();
7845 auto blp = bl.cbegin();
7846 try {
7847 vstatfs.decode(blp);
f67539c2 7848 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
7849 derr << "fsck error: failed to decode Pool StatFS record"
7850 << pretty_binary_string(key) << dendl;
7851 if (repairer) {
7852 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7853 << pretty_binary_string(key)
7854 << "', removing" << dendl;
7855 repairer->remove_key(db, PREFIX_STAT, key);
7856 }
7857 ++errors;
7858 vstatfs.reset();
7859 }
7860 auto stat_it = expected_pool_statfs.find(pool_id);
7861 if (stat_it == expected_pool_statfs.end()) {
7862 if (vstatfs.is_empty()) {
7863 // we don't consider that as an error since empty pool statfs
7864 // are left in DB for now
7865 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7866 << std::hex << pool_id << std::dec << dendl;
7867 if (repairer) {
7868 // but we need to increment error count in case of repair
7869 // to have proper counters at the end
7870 // (as repairer increments recovery counter anyway).
7871 ++errors;
7872 }
7873 } else {
7874 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7875 << std::hex << pool_id << std::dec << dendl;
7876 ++errors;
7877 }
7878 if (repairer) {
522d829b 7879 repairer->remove_key(db, PREFIX_STAT, key);
11fdf7f2
TL
7880 }
7881 continue;
7882 }
7883 store_statfs_t statfs;
7884 vstatfs.publish(&statfs);
7885 if (!(stat_it->second == statfs)) {
7886 derr << "fsck error: actual " << statfs
7887 << " != expected " << stat_it->second
7888 << " for pool "
7889 << std::hex << pool_id << std::dec << dendl;
7890 if (repairer) {
7891 repairer->fix_statfs(db, key, stat_it->second);
7892 }
7893 ++errors;
7894 }
7895 expected_pool_statfs.erase(stat_it);
7896 }
7897 } // if (it)
eafe8130
TL
7898 for (auto& s : expected_pool_statfs) {
7899 if (s.second.is_zero()) {
11fdf7f2
TL
7900 // we might lack empty statfs recs in DB
7901 continue;
7902 }
7903 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7904 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7905 if (repairer) {
7906 string key;
eafe8130
TL
7907 get_pool_stat_key(s.first, &key);
7908 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7909 }
7910 ++errors;
7911 }
eafe8130 7912 if (!per_pool_stat_collection &&
eafe8130
TL
7913 repairer) {
7914 // by virtue of running this method, we correct the top-level
7915 // error of having global stats
7916 repairer->inc_repaired();
7917 }
11fdf7f2
TL
7918}
7919
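// Repair strategy: sb_ref_counts (a hash-based ref tracker) estimates
// how many shared-blob refs mismatch; two passes over all onodes then
// (1) identify the broken sbids and (2) rebuild their ref maps from
// the actual extents, after which the shared blob records are
// rewritten in batched transactions and stray records removed.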
20effc67
TL
7920void BlueStore::_fsck_repair_shared_blobs(
7921 BlueStoreRepairer& repairer,
7922 shared_blob_2hash_tracker_t& sb_ref_counts,
7923 sb_info_space_efficient_map_t& sb_info)
7924{
7925 auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
7926 dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
7927 << sb_ref_mismatches << dendl;
7928 if (!sb_ref_mismatches) // not expected to be true, just in case
7929 return;
7930
7931
7932 auto foreach_shared_blob = [&](std::function<
7933 void (coll_t,
7934 ghobject_t,
7935 uint64_t,
7936 const bluestore_blob_t&)> cb) {
7937 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
7938 if (it) {
7939 CollectionRef c;
7940 spg_t pgid;
7941 for (it->lower_bound(string()); it->valid(); it->next()) {
7942 dout(30) << __func__ << " key "
7943 << pretty_binary_string(it->key())
7944 << dendl;
7945 if (is_extent_shard_key(it->key())) {
7946 continue;
7947 }
7948
7949 ghobject_t oid;
7950 int r = get_key_object(it->key(), &oid);
7951 if (r < 0) {
7952 continue;
7953 }
7954
7955 if (!c ||
7956 oid.shard_id != pgid.shard ||
7957 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7958 !c->contains(oid)) {
7959 c = nullptr;
7960 for (auto& p : coll_map) {
7961 if (p.second->contains(oid)) {
7962 c = p.second;
7963 break;
7964 }
7965 }
7966 if (!c) {
7967 continue;
7968 }
7969 }
7970 dout(20) << __func__
7971 << " inspecting shared blob refs for col:" << c->cid
7972 << " obj:" << oid
7973 << dendl;
7974
7975 OnodeRef o;
7976 o.reset(Onode::decode(c, oid, it->key(), it->value()));
7977 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7978
7979 _dump_onode<30>(cct, *o);
7980
7981 mempool::bluestore_fsck::set<BlobRef> passed_sbs;
7982 for (auto& e : o->extent_map.extent_map) {
7983 auto& b = e.blob->get_blob();
7984 if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
7985 auto sbid = e.blob->shared_blob->get_sbid();
7986 cb(c->cid, oid, sbid, b);
7987 passed_sbs.emplace(e.blob);
7988 }
7989 } // for ... extent_map
7990 } // for ... it->valid
7991 } //if (it(PREFIX_OBJ))
7992 }; //foreach_shared_blob fn declaration
7993
7994 mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
7995
7996 // first iteration over objects to identify all the broken sbids
7997 foreach_shared_blob( [&](coll_t cid,
7998 ghobject_t oid,
7999 uint64_t sbid,
8000 const bluestore_blob_t& b) {
8001 auto it = refs_map.lower_bound(sbid);
8002 if(it != refs_map.end() && it->first == sbid) {
8003 return;
8004 }
8005 for (auto& p : b.get_extents()) {
8006 if (p.is_valid() &&
8007 !sb_ref_counts.test_all_zero_range(sbid,
8008 p.offset,
8009 p.length)) {
8010 refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
8011 dout(20) << __func__
8012 << " broken shared blob found for col:" << cid
8013 << " obj:" << oid
8014 << " sbid 0x" << std::hex << sbid << std::dec
8015 << dendl;
8016 break;
8017 }
8018 }
8019 });
8020
8021 // second iteration over objects to build new ref map for the broken sbids
8022 foreach_shared_blob( [&](coll_t cid,
8023 ghobject_t oid,
8024 uint64_t sbid,
8025 const bluestore_blob_t& b) {
8026 auto it = refs_map.find(sbid);
8027 if(it == refs_map.end()) {
8028 return;
8029 }
8030 for (auto& p : b.get_extents()) {
8031 if (p.is_valid()) {
8032 it->second.get(p.offset, p.length);
8033 break;
8034 }
8035 }
8036 });
8037
8038 // update shared blob records
8039 auto ref_it = refs_map.begin();
8040 while (ref_it != refs_map.end()) {
8041 size_t cnt = 0;
8042 const size_t max_transactions = 4096;
8043 KeyValueDB::Transaction txn = db->get_transaction();
8044 for (cnt = 0;
8045 cnt < max_transactions && ref_it != refs_map.end();
8046 ref_it++) {
8047 auto sbid = ref_it->first;
8048 dout(20) << __func__ << " repaired shared_blob 0x"
8049 << std::hex << sbid << std::dec
8050 << ref_it->second << dendl;
8051 repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
8052 cnt++;
8053 }
8054 if (cnt) {
8055 db->submit_transaction_sync(txn);
8056 cnt = 0;
8057 }
8058 }
8059 // remove stray shared blob records
8060 size_t cnt = 0;
8061 const size_t max_transactions = 4096;
8062 KeyValueDB::Transaction txn = db->get_transaction();
8063 sb_info.foreach_stray([&](const sb_info_t& sbi) {
8064 auto sbid = sbi.get_sbid();
8065 dout(20) << __func__ << " removing stray shared_blob 0x"
8066 << std::hex << sbid << std::dec
8067 << dendl;
8068 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
8069 cnt++;
8070 if (cnt >= max_transactions) {
8071 db->submit_transaction_sync(txn);
8072 txn = db->get_transaction();
8073 cnt = 0;
}
8074 });
8075 if (cnt > 0) {
8076 db->submit_transaction_sync(txn);
8077 }
8078
8079 // amount of repairs to report to be equal to previously
8080 // determined error estimation, not the actual number of updated shared blobs
8081 repairer.inc_repaired(sb_ref_mismatches);
8082}
8083
eafe8130
TL
8084BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
8085 BlueStore::FSCKDepth depth,
8086 int64_t pool_id,
8087 BlueStore::CollectionRef c,
8088 const ghobject_t& oid,
8089 const string& key,
8090 const bufferlist& value,
9f95a23c 8091 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
8092 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
8093 const BlueStore::FSCK_ObjectCtx& ctx)
8094{
8095 auto& errors = ctx.errors;
8096 auto& num_objects = ctx.num_objects;
8097 auto& num_extents = ctx.num_extents;
8098 auto& num_blobs = ctx.num_blobs;
8099 auto& num_sharded_objects = ctx.num_sharded_objects;
8100 auto& num_spanning_blobs = ctx.num_spanning_blobs;
8101 auto used_blocks = ctx.used_blocks;
8102 auto sb_info_lock = ctx.sb_info_lock;
8103 auto& sb_info = ctx.sb_info;
20effc67 8104 auto& sb_ref_counts = ctx.sb_ref_counts;
eafe8130
TL
8105 auto repairer = ctx.repairer;
8106
8107 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
8108 &ctx.expected_pool_statfs[pool_id] :
8109 &ctx.expected_store_statfs;
8110
20effc67
TL
8111 map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
8112
eafe8130
TL
8113 dout(10) << __func__ << " " << oid << dendl;
8114 OnodeRef o;
8115 o.reset(Onode::decode(c, oid, key, value));
8116 ++num_objects;
7c673cae 8117
eafe8130 8118 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 8119
eafe8130
TL
8120 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8121 _dump_onode<30>(cct, *o);
8122 // shards
8123 if (!o->extent_map.shards.empty()) {
8124 ++num_sharded_objects;
8125 if (depth != FSCK_SHALLOW) {
9f95a23c 8126 ceph_assert(expecting_shards);
eafe8130
TL
8127 for (auto& s : o->extent_map.shards) {
8128 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 8129 expecting_shards->push_back(string());
eafe8130 8130 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 8131 &expecting_shards->back());
eafe8130
TL
8132 if (s.shard_info->offset >= o->onode.size) {
8133 derr << "fsck error: " << oid << " shard 0x" << std::hex
8134 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
8135 << std::dec << dendl;
8136 ++errors;
8137 }
8138 }
8139 }
8140 }
7c673cae 8141
eafe8130
TL
8142 // lextents
8143 uint64_t pos = 0;
8144 mempool::bluestore_fsck::map<BlobRef,
8145 bluestore_blob_use_tracker_t> ref_map;
8146 for (auto& l : o->extent_map.extent_map) {
8147 dout(20) << __func__ << " " << l << dendl;
8148 if (l.logical_offset < pos) {
8149 derr << "fsck error: " << oid << " lextent at 0x"
8150 << std::hex << l.logical_offset
8151 << " overlaps with the previous, which ends at 0x" << pos
8152 << std::dec << dendl;
8153 ++errors;
8154 }
8155 if (depth != FSCK_SHALLOW &&
8156 o->extent_map.spans_shard(l.logical_offset, l.length)) {
8157 derr << "fsck error: " << oid << " lextent at 0x"
8158 << std::hex << l.logical_offset << "~" << l.length
8159 << " spans a shard boundary"
8160 << std::dec << dendl;
8161 ++errors;
8162 }
8163 pos = l.logical_offset + l.length;
8164 res_statfs->data_stored += l.length;
8165 ceph_assert(l.blob);
8166 const bluestore_blob_t& blob = l.blob->get_blob();
8167
20effc67
TL
8168#ifdef HAVE_LIBZBD
8169 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8170 for (auto& e : blob.get_extents()) {
8171 if (e.is_valid()) {
8172 uint32_t zone = e.offset / zone_size;
8173 uint64_t offset = e.offset % zone_size;
8174 auto p = zone_first_offsets.find(zone);
8175 if (p == zone_first_offsets.end() || p->second > offset) {
8176 // FIXME: use iterator for guided insert?
8177 zone_first_offsets[zone] = offset;
8178 }
8179 }
8180 }
8181 }
8182#endif
8183
8184 auto& ref = ref_map[l.blob];
8185 if (ref.is_empty()) {
8186 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
8187 uint32_t l = blob.get_logical_length();
8188 ref.init(l, min_release_size);
eafe8130
TL
8189 }
8190 ref.get(
8191 l.blob_offset,
8192 l.length);
8193 ++num_extents;
8194 if (depth != FSCK_SHALLOW &&
8195 blob.has_unused()) {
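// 'unused' is a fixed-width bitmap over the blob's logical length,
// i.e. blob_len / (sizeof(unused_t) * 8) bytes per bit; mark every
// chunk touched by this lextent so the blob's unused bits can be
// cross-checked later.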
8196 ceph_assert(referenced);
8197 auto p = referenced->find(l.blob);
8198 bluestore_blob_t::unused_t* pu;
8199 if (p == referenced->end()) {
8200 pu = &(*referenced)[l.blob];
8201 }
8202 else {
8203 pu = &p->second;
8204 }
8205 uint64_t blob_len = blob.get_logical_length();
8206 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
8207 ceph_assert(l.blob_offset + l.length <= blob_len);
8208 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
8209 uint64_t start = l.blob_offset / chunk_size;
8210 uint64_t end =
8211 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
8212 for (auto i = start; i < end; ++i) {
8213 (*pu) |= (1u << i);
8214 }
8215 }
8216 } //for (auto& l : o->extent_map.extent_map)
8217
8218 for (auto& i : ref_map) {
8219 ++num_blobs;
8220 const bluestore_blob_t& blob = i.first->get_blob();
8221 bool equal =
8222 depth == FSCK_SHALLOW ? true :
8223 i.first->get_blob_use_tracker().equal(i.second);
8224 if (!equal) {
8225 derr << "fsck error: " << oid << " blob " << *i.first
8226 << " doesn't match expected ref_map " << i.second << dendl;
8227 ++errors;
8228 }
8229 if (blob.is_compressed()) {
8230 res_statfs->data_compressed += blob.get_compressed_payload_length();
8231 res_statfs->data_compressed_original +=
8232 i.first->get_referenced_bytes();
8233 }
20effc67
TL
8234 if (depth != FSCK_SHALLOW && repairer) {
8235 for (auto e : blob.get_extents()) {
8236 if (!e.is_valid())
8237 continue;
8238 repairer->set_space_used(e.offset, e.length, c->cid, oid);
8239 }
8240 }
eafe8130
TL
8241 if (blob.is_shared()) {
8242 if (i.first->shared_blob->get_sbid() > blobid_max) {
8243 derr << "fsck error: " << oid << " blob " << blob
8244 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
8245 << blobid_max << dendl;
8246 ++errors;
20effc67 8247 } else if (i.first->shared_blob->get_sbid() == 0) {
eafe8130
TL
8248 derr << "fsck error: " << oid << " blob " << blob
8249 << " marked as shared but has uninitialized sbid"
8250 << dendl;
8251 ++errors;
8252 }
8253 // the below lock is optional and provided in multithreading mode only
8254 if (sb_info_lock) {
8255 sb_info_lock->lock();
8256 }
20effc67
TL
8257 auto sbid = i.first->shared_blob->get_sbid();
8258 sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
8259 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
eafe8130 8260 sbi.pool_id == oid.hobj.get_logical_pool());
eafe8130 8261 sbi.pool_id = oid.hobj.get_logical_pool();
20effc67 8262 bool compressed = blob.is_compressed();
eafe8130
TL
8263 for (auto e : blob.get_extents()) {
8264 if (e.is_valid()) {
20effc67
TL
8265 if (compressed) {
8266 ceph_assert(sbi.allocated_chunks <= 0);
8267 sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
8268 } else {
8269 ceph_assert(sbi.allocated_chunks >= 0);
8270 sbi.allocated_chunks += (e.length >> min_alloc_size_order);
8271 }
8272 sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
eafe8130
TL
8273 }
8274 }
8275 if (sb_info_lock) {
8276 sb_info_lock->unlock();
8277 }
8278 } else if (depth != FSCK_SHALLOW) {
8279 ceph_assert(used_blocks);
20effc67
TL
8280 string ctx_descr = " oid " + stringify(oid);
8281 errors += _fsck_check_extents(ctx_descr,
8282 blob.get_extents(),
eafe8130
TL
8283 blob.is_compressed(),
8284 *used_blocks,
8285 fm->get_alloc_size(),
20effc67 8286 repairer,
eafe8130
TL
8287 *res_statfs,
8288 depth);
8289 } else {
8290 errors += _fsck_sum_extents(
8291 blob.get_extents(),
8292 blob.is_compressed(),
8293 *res_statfs);
8294 }
8295 } // for (auto& i : ref_map)
9f95a23c 8296
adb31ebb
TL
8297 {
8298 auto &sbm = o->extent_map.spanning_blob_map;
8299 size_t broken = 0;
8300 BlobRef first_broken;
8301 for (auto it = sbm.begin(); it != sbm.end();) {
8302 auto it1 = it++;
8303 if (ref_map.count(it1->second) == 0) {
8304 if (!broken) {
8305 first_broken = it1->second;
8306 ++errors;
8307 }
8308 broken++;
8309 if (repairer) {
8310 sbm.erase(it1);
8311 }
8312 }
8313 }
20effc67
TL
8314
8315#ifdef HAVE_LIBZBD
8316 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8317 for (auto& [zone, first_offset] : zone_first_offsets) {
8318 auto p = (*ctx.zone_refs)[zone].find(oid);
8319 if (p != (*ctx.zone_refs)[zone].end()) {
8320 if (first_offset < p->second) {
8321 dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
8322 << " offset 0x" << p->second
8323 << " but first offset is 0x" << first_offset
8324 << "; this can happen due to clone_range"
8325 << dendl;
8326 } else {
8327 dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
8328 << " <= first offset 0x" << first_offset
8329 << std::dec << dendl;
8330 }
8331 (*ctx.zone_refs)[zone].erase(p);
8332 } else {
8333 derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
8334 << " but there is no zone ref" << std::dec << dendl;
8335 // FIXME: add repair
8336 ++errors;
8337 }
8338 }
8339 }
8340#endif
8341
adb31ebb
TL
8342 if (broken) {
8343 derr << "fsck error: " << oid << " - " << broken
8344 << " zombie spanning blob(s) found, the first one: "
8345 << *first_broken << dendl;
8346 if(repairer) {
b3b6e05e
TL
8347 repairer->fix_spanning_blobs(
8348 db,
8349 [&](KeyValueDB::Transaction txn) {
8350 _record_onode(o, txn);
8351 });
adb31ebb
TL
8352 }
8353 }
8354 }
8355
9f95a23c
TL
8356 if (o->onode.has_omap()) {
8357 _fsck_check_object_omap(depth, o, ctx);
8358 }
8359
eafe8130
TL
8360 return o;
8361}
8362
8363#include "common/WorkQueue.h"
8364
8365class ShallowFSCKThreadPool : public ThreadPool
8366{
8367public:
8368 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
8369 ThreadPool(cct_, nm, tn, n) {
8370 }
8371 void worker(ThreadPool::WorkThread* wt) override {
8372 int next_wq = 0;
8373 while (!_stop) {
8374 next_wq %= work_queues.size();
8375 WorkQueue_ *wq = work_queues[next_wq++];
8376
8377 void* item = wq->_void_dequeue();
8378 if (item) {
8379 processing++;
8380 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
8381 wq->_void_process(item, tp_handle);
8382 processing--;
8383 }
8384 }
8385 }
8386 template <size_t BatchLen>
8387 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
8388 {
8389 struct Entry {
8390 int64_t pool_id;
8391 BlueStore::CollectionRef c;
8392 ghobject_t oid;
8393 string key;
8394 bufferlist value;
8395 };
8396 struct Batch {
8397 std::atomic<size_t> running = { 0 };
8398 size_t entry_count = 0;
8399 std::array<Entry, BatchLen> entries;
8400
8401 int64_t errors = 0;
8402 int64_t warnings = 0;
8403 uint64_t num_objects = 0;
8404 uint64_t num_extents = 0;
8405 uint64_t num_blobs = 0;
8406 uint64_t num_sharded_objects = 0;
8407 uint64_t num_spanning_blobs = 0;
8408 store_statfs_t expected_store_statfs;
8409 BlueStore::per_pool_statfs expected_pool_statfs;
8410 };
8411
8412 size_t batchCount;
8413 BlueStore* store = nullptr;
8414
eafe8130 8415 ceph::mutex* sb_info_lock = nullptr;
20effc67
TL
8416 sb_info_space_efficient_map_t* sb_info = nullptr;
8417 shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
eafe8130
TL
8418 BlueStoreRepairer* repairer = nullptr;
8419
8420 Batch* batches = nullptr;
8421 size_t last_batch_pos = 0;
8422 bool batch_acquired = false;
8423
8424 FSCKWorkQueue(std::string n,
8425 size_t _batchCount,
8426 BlueStore* _store,
eafe8130 8427 ceph::mutex* _sb_info_lock,
20effc67
TL
8428 sb_info_space_efficient_map_t& _sb_info,
8429 shared_blob_2hash_tracker_t& _sb_ref_counts,
eafe8130 8430 BlueStoreRepairer* _repairer) :
f67539c2 8431 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
8432 batchCount(_batchCount),
8433 store(_store),
eafe8130
TL
8434 sb_info_lock(_sb_info_lock),
8435 sb_info(&_sb_info),
20effc67 8436 sb_ref_counts(&_sb_ref_counts),
eafe8130
TL
8437 repairer(_repairer)
8438 {
8439 batches = new Batch[batchCount];
8440 }
8441 ~FSCKWorkQueue() {
8442 delete[] batches;
8443 }
8444
8445 /// Remove all work items from the queue.
8446 void _clear() override {
8447 //do nothing
8448 }
8449 /// Check whether there is anything to do.
8450 bool _empty() override {
8451 ceph_assert(false);
8452 }
8453
8454 /// Get the next work item to process.
8455 void* _void_dequeue() override {
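// Start at a random batch and claim the first one that is both idle
// and non-empty; running.fetch_add(1) == 0 acts as a try-lock.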
8456 size_t pos = rand() % batchCount;
8457 size_t pos0 = pos;
8458 do {
8459 auto& batch = batches[pos];
8460 if (batch.running.fetch_add(1) == 0) {
8461 if (batch.entry_count) {
8462 return &batch;
8463 }
8464 }
8465 batch.running--;
8466 pos++;
8467 pos %= batchCount;
8468 } while (pos != pos0);
8469 return nullptr;
8470 }
8471 /** @brief Process the work item.
8472 * This function will be called several times in parallel
8473 * and must therefore be thread-safe. */
8474 void _void_process(void* item, TPHandle& handle) override {
8475 Batch* batch = (Batch*)item;
8476
8477 BlueStore::FSCK_ObjectCtx ctx(
8478 batch->errors,
8479 batch->warnings,
8480 batch->num_objects,
8481 batch->num_extents,
8482 batch->num_blobs,
8483 batch->num_sharded_objects,
8484 batch->num_spanning_blobs,
8485 nullptr, // used_blocks
9f95a23c 8486 nullptr, //used_omap_head
20effc67 8487 nullptr,
eafe8130
TL
8488 sb_info_lock,
8489 *sb_info,
20effc67 8490 *sb_ref_counts,
eafe8130
TL
8491 batch->expected_store_statfs,
8492 batch->expected_pool_statfs,
8493 repairer);
8494
8495 for (size_t i = 0; i < batch->entry_count; i++) {
8496 auto& entry = batch->entries[i];
8497
8498 store->fsck_check_objects_shallow(
8499 BlueStore::FSCK_SHALLOW,
8500 entry.pool_id,
8501 entry.c,
8502 entry.oid,
8503 entry.key,
8504 entry.value,
9f95a23c 8505 nullptr, // expecting_shards - this will need a protection if passed
eafe8130
TL
8506 nullptr, // referenced
8507 ctx);
8508 }
eafe8130
TL
8509 batch->entry_count = 0;
8510 batch->running--;
8511 }
8512 /** @brief Synchronously finish processing a work item.
8513 * This function is called after _void_process with the global thread pool lock held,
8514 * so at most one copy will execute simultaneously for a given thread pool.
8515 * It can be used for non-thread-safe finalization. */
8516 void _void_process_finish(void*) override {
8517 ceph_assert(false);
8518 }
8519
8520 bool queue(
8521 int64_t pool_id,
8522 BlueStore::CollectionRef c,
8523 const ghobject_t& oid,
8524 const string& key,
8525 const bufferlist& value) {
8526 bool res = false;
8527 size_t pos0 = last_batch_pos;
8528 if (!batch_acquired) {
8529 do {
8530 auto& batch = batches[last_batch_pos];
8531 if (batch.running.fetch_add(1) == 0) {
8532 if (batch.entry_count < BatchLen) {
8533 batch_acquired = true;
8534 break;
8535 }
8536 }
8537 batch.running.fetch_sub(1);
8538 last_batch_pos++;
8539 last_batch_pos %= batchCount;
8540 } while (last_batch_pos != pos0);
8541 }
8542 if (batch_acquired) {
8543 auto& batch = batches[last_batch_pos];
8544 ceph_assert(batch.running);
8545 ceph_assert(batch.entry_count < BatchLen);
8546
8547 auto& entry = batch.entries[batch.entry_count];
8548 entry.pool_id = pool_id;
8549 entry.c = c;
8550 entry.oid = oid;
8551 entry.key = key;
8552 entry.value = value;
8553
8554 ++batch.entry_count;
8555 if (batch.entry_count == BatchLen) {
8556 batch_acquired = false;
8557 batch.running.fetch_sub(1);
8558 last_batch_pos++;
8559 last_batch_pos %= batchCount;
8560 }
8561 res = true;
8562 }
8563 return res;
8564 }
8565
8566 void finalize(ThreadPool& tp,
8567 BlueStore::FSCK_ObjectCtx& ctx) {
8568 if (batch_acquired) {
8569 auto& batch = batches[last_batch_pos];
8570 ceph_assert(batch.running);
8571 batch.running.fetch_sub(1);
8572 }
8573 tp.stop();
8574
8575 for (size_t i = 0; i < batchCount; i++) {
8576 auto& batch = batches[i];
8577
8578 //process leftovers if any
8579 if (batch.entry_count) {
8580 TPHandle tp_handle(store->cct,
8581 nullptr,
8582 timeout_interval,
8583 suicide_interval);
8584 ceph_assert(batch.running == 0);
8585
8586 batch.running++; // just to be on-par with the regular call
8587 _void_process(&batch, tp_handle);
8588 }
8589 ceph_assert(batch.entry_count == 0);
8590
8591 ctx.errors += batch.errors;
8592 ctx.warnings += batch.warnings;
8593 ctx.num_objects += batch.num_objects;
8594 ctx.num_extents += batch.num_extents;
8595 ctx.num_blobs += batch.num_blobs;
8596 ctx.num_sharded_objects += batch.num_sharded_objects;
8597 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 8598
eafe8130
TL
8599 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8600
8601 for (auto it = batch.expected_pool_statfs.begin();
8602 it != batch.expected_pool_statfs.end();
8603 it++) {
8604 ctx.expected_pool_statfs[it->first].add(it->second);
8605 }
8606 }
8607 }
8608 };
8609};
8610
9f95a23c
TL
8611void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8612 OnodeRef& o,
8613 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 8614{
9f95a23c
TL
8615 auto& errors = ctx.errors;
8616 auto& warnings = ctx.warnings;
8617 auto repairer = ctx.repairer;
8618
8619 ceph_assert(o->onode.has_omap());
8620 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 8621 if (per_pool_omap == OMAP_PER_POOL) {
9f95a23c
TL
8622 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8623 << "fsck error: " << o->oid
8624 << " has omap that is not per-pool or pgmeta"
8625 << fsck_dendl;
8626 ++errors;
8627 } else {
8628 const char* w;
8629 int64_t num;
8630 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8631 ++errors;
8632 num = errors;
8633 w = "error";
8634 } else {
8635 ++warnings;
8636 num = warnings;
8637 w = "warning";
8638 }
8639 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8640 << "fsck " << w << ": " << o->oid
8641 << " has omap that is not per-pool or pgmeta"
8642 << fsck_dendl;
8643 }
f67539c2
TL
8644 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
8645 if (per_pool_omap == OMAP_PER_PG) {
8646 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8647 << "fsck error: " << o->oid
8648 << " has omap that is not per-pg or pgmeta"
8649 << fsck_dendl;
8650 ++errors;
8651 } else {
8652 const char* w;
8653 int64_t num;
8654 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
8655 ++errors;
8656 num = errors;
8657 w = "error";
8658 } else {
8659 ++warnings;
8660 num = warnings;
8661 w = "warning";
8662 }
8663 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8664 << "fsck " << w << ": " << o->oid
8665 << " has omap that is not per-pg or pgmeta"
8666 << fsck_dendl;
8667 }
9f95a23c
TL
8668 }
8669 if (repairer &&
f67539c2 8670 !o->onode.is_perpg_omap() &&
9f95a23c 8671 !o->onode.is_pgmeta_omap()) {
f67539c2 8672 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
522d829b 8673 bufferlist header;
9f95a23c 8674 map<string, bufferlist> kv;
522d829b
TL
8675 {
8676 KeyValueDB::Transaction txn = db->get_transaction();
8677 uint64_t txn_cost = 0;
8678 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
8679 uint8_t new_flags = o->onode.flags |
8680 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8681 bluestore_onode_t::FLAG_PERPG_OMAP;
8682 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
8683
8684 KeyValueDB::Iterator it = db->get_iterator(prefix);
8685 string head, tail;
8686 o->get_omap_header(&head);
8687 o->get_omap_tail(&tail);
8688 it->lower_bound(head);
8689 // head
8690 if (it->valid() && it->key() == head) {
8691 dout(30) << __func__ << " got header" << dendl;
8692 header = it->value();
8693 if (header.length()) {
8694 string new_head;
8695 Onode::calc_omap_header(new_flags, o.get(), &new_head);
8696 txn->set(new_omap_prefix, new_head, header);
8697 txn_cost += new_head.length() + header.length();
8698 }
a4b75251 8699 it->next();
522d829b
TL
8700 }
8701 // tail
8702 {
8703 string new_tail;
8704 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
8705 bufferlist empty;
8706 txn->set(new_omap_prefix, new_tail, empty);
8707 txn_cost += new_tail.length() + empty.length(); // the tail value is empty
8708 }
8709 // values
8710 string final_key;
8711 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
8712 size_t base_key_len = final_key.size();
8713 while (it->valid() && it->key() < tail) {
8714 string user_key;
8715 o->decode_omap_key(it->key(), &user_key);
8716 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
8717 << " -> " << user_key << dendl;
8718
8719 final_key.resize(base_key_len);
a4b75251 8720 final_key += user_key;
522d829b
TL
8721 auto v = it->value();
8722 txn->set(new_omap_prefix, final_key, v);
8723 txn_cost += final_key.length() + v.length();
8724
8725 // submit a portion if cost exceeds 16MB
8726 if (txn_cost >= 16 * (1 << 20)) {
8727 db->submit_transaction_sync(txn);
8728 txn = db->get_transaction();
8729 txn_cost = 0;
8730 }
8731 it->next();
8732 }
8733 if (txn_cost > 0) {
8734 db->submit_transaction_sync(txn);
8735 }
8736 }
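 // Hedged note on the chunking above: the 16 MiB threshold only bounds
 // the size of a single RocksDB transaction. Whole-conversion atomicity
 // is not required at this point because the legacy keys are removed,
 // and the onode flags flipped, only in the finalize step below; a crash
 // mid-copy leaves the object readable through its old prefix and merely
 // leaves partially written new keys behind as stray omap data for a
 // later fsck to report.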
8737 // finalize: remove legacy data
8738 {
9f95a23c
TL
8739 KeyValueDB::Transaction txn = db->get_transaction();
8740 // remove old keys
8741 const string& old_omap_prefix = o->get_omap_prefix();
8742 string old_head, old_tail;
8743 o->get_omap_header(&old_head);
8744 o->get_omap_tail(&old_tail);
8745 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8746 txn->rmkey(old_omap_prefix, old_tail);
8747 // set flag
f67539c2 8748 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c 8749 _record_onode(o, txn);
9f95a23c
TL
8750 db->submit_transaction_sync(txn);
8751 repairer->inc_repaired();
522d829b 8752 repairer->request_compaction();
9f95a23c 8753 }
eafe8130 8754 }
9f95a23c 8755}
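 // For reference, the omap key layouts involved in the conversion above
 // are distinguished by their prefixes; the stray-omap scan in
 // _fsck_on_open() below decodes them in exactly this order (sketch only;
 // _key_decode_u64/_key_decode_u32 are the real helpers used there):
 //
 // const char* p = key.c_str();
 // uint64_t pool, head; uint32_t hash;
 // // OMAP_BULK    : <head:u64><user key>
 // // OMAP_PER_POOL: <pool:u64><head:u64><user key>
 // // OMAP_PER_PG  : <pool:u64><hash:u32><head:u64><user key>
 // p = _key_decode_u64(p, &pool); // per-pool / per-pg layouts only
 // p = _key_decode_u32(p, &hash); // per-pg layout only
 // p = _key_decode_u64(p, &head);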
eafe8130 8756
20effc67
TL
8757void BlueStore::_fsck_check_objects(
8758 FSCKDepth depth,
9f95a23c
TL
8759 BlueStore::FSCK_ObjectCtx& ctx)
8760{
eafe8130 8761 auto& errors = ctx.errors;
eafe8130
TL
8762 auto sb_info_lock = ctx.sb_info_lock;
8763 auto& sb_info = ctx.sb_info;
20effc67 8764 auto& sb_ref_counts = ctx.sb_ref_counts;
eafe8130
TL
8765 auto repairer = ctx.repairer;
8766
8767 uint64_t_btree_t used_nids;
8768
8769 size_t processed_myself = 0;
8770
f67539c2 8771 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
8772 mempool::bluestore_fsck::list<string> expecting_shards;
8773 if (it) {
8774 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8775 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8776 std::unique_ptr<WQ> wq(
8777 new WQ(
8778 "FSCKWorkQueue",
8779 (thread_count ? : 1) * 32,
8780 this,
eafe8130
TL
8781 sb_info_lock,
8782 sb_info,
20effc67 8783 sb_ref_counts,
eafe8130
TL
8784 repairer));
8785
8786 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8787
8788 thread_pool.add_work_queue(wq.get());
8789 if (depth == FSCK_SHALLOW && thread_count > 0) {
8790 // not the best place, but let's check anyway
8791 ceph_assert(sb_info_lock);
8792 thread_pool.start();
8793 }
8794
20effc67 8795 // fill global if not overridden below
eafe8130
TL
8796 CollectionRef c;
8797 int64_t pool_id = -1;
8798 spg_t pgid;
8799 for (it->lower_bound(string()); it->valid(); it->next()) {
8800 dout(30) << __func__ << " key "
8801 << pretty_binary_string(it->key()) << dendl;
8802 if (is_extent_shard_key(it->key())) {
8803 if (depth == FSCK_SHALLOW) {
8804 continue;
8805 }
8806 while (!expecting_shards.empty() &&
8807 expecting_shards.front() < it->key()) {
8808 derr << "fsck error: missing shard key "
8809 << pretty_binary_string(expecting_shards.front())
8810 << dendl;
8811 ++errors;
8812 expecting_shards.pop_front();
8813 }
8814 if (!expecting_shards.empty() &&
8815 expecting_shards.front() == it->key()) {
8816 // all good
8817 expecting_shards.pop_front();
8818 continue;
8819 }
8820
8821 uint32_t offset;
8822 string okey;
8823 get_key_extent_shard(it->key(), &okey, &offset);
8824 derr << "fsck error: stray shard 0x" << std::hex << offset
8825 << std::dec << dendl;
8826 if (expecting_shards.empty()) {
8827 derr << "fsck error: " << pretty_binary_string(it->key())
8828 << " is unexpected" << dendl;
8829 ++errors;
8830 continue;
8831 }
8832 while (expecting_shards.front() > it->key()) {
8833 derr << "fsck error: saw " << pretty_binary_string(it->key())
8834 << dendl;
8835 derr << "fsck error: exp "
8836 << pretty_binary_string(expecting_shards.front()) << dendl;
8837 ++errors;
8838 expecting_shards.pop_front();
8839 if (expecting_shards.empty()) {
8840 break;
8841 }
8842 }
8843 continue;
8844 }
8845
8846 ghobject_t oid;
8847 int r = get_key_object(it->key(), &oid);
8848 if (r < 0) {
8849 derr << "fsck error: bad object key "
8850 << pretty_binary_string(it->key()) << dendl;
8851 ++errors;
8852 continue;
8853 }
8854 if (!c ||
8855 oid.shard_id != pgid.shard ||
8856 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8857 !c->contains(oid)) {
8858 c = nullptr;
8859 for (auto& p : coll_map) {
8860 if (p.second->contains(oid)) {
8861 c = p.second;
8862 break;
8863 }
8864 }
8865 if (!c) {
8866 derr << "fsck error: stray object " << oid
8867 << " not owned by any collection" << dendl;
8868 ++errors;
8869 continue;
8870 }
8871 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8872 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8873 << dendl;
8874 }
8875
8876 if (depth != FSCK_SHALLOW &&
8877 !expecting_shards.empty()) {
8878 for (auto& k : expecting_shards) {
8879 derr << "fsck error: missing shard key "
8880 << pretty_binary_string(k) << dendl;
8881 }
8882 ++errors;
8883 expecting_shards.clear();
8884 }
8885
8886 bool queued = false;
8887 if (depth == FSCK_SHALLOW && thread_count > 0) {
8888 queued = wq->queue(
8889 pool_id,
8890 c,
8891 oid,
8892 it->key(),
8893 it->value());
8894 }
8895 OnodeRef o;
8896 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8897
8898 if (!queued) {
8899 ++processed_myself;
eafe8130
TL
8900 o = fsck_check_objects_shallow(
8901 depth,
8902 pool_id,
8903 c,
8904 oid,
8905 it->key(),
8906 it->value(),
9f95a23c 8907 &expecting_shards,
eafe8130
TL
8908 &referenced,
8909 ctx);
8910 }
8911
8912 if (depth != FSCK_SHALLOW) {
8913 ceph_assert(o != nullptr);
8914 if (o->onode.nid) {
8915 if (o->onode.nid > nid_max) {
8916 derr << "fsck error: " << oid << " nid " << o->onode.nid
8917 << " > nid_max " << nid_max << dendl;
8918 ++errors;
8919 }
8920 if (used_nids.count(o->onode.nid)) {
8921 derr << "fsck error: " << oid << " nid " << o->onode.nid
8922 << " already in use" << dendl;
8923 ++errors;
8924 continue; // go for next object
8925 }
8926 used_nids.insert(o->onode.nid);
8927 }
8928 for (auto& i : referenced) {
8929 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8930 << std::dec << " for " << *i.first << dendl;
8931 const bluestore_blob_t& blob = i.first->get_blob();
8932 if (i.second & blob.unused) {
8933 derr << "fsck error: " << oid << " blob claims unused 0x"
8934 << std::hex << blob.unused
8935 << " but extents reference 0x" << i.second << std::dec
8936 << " on blob " << *i.first << dendl;
8937 ++errors;
8938 }
8939 if (blob.has_csum()) {
8940 uint64_t blob_len = blob.get_logical_length();
8941 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8942 unsigned csum_count = blob.get_csum_count();
8943 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8944 for (unsigned p = 0; p < csum_count; ++p) {
8945 unsigned pos = p * csum_chunk_size;
8946 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8947 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8948 unsigned mask = 1u << firstbit;
8949 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8950 mask |= 1u << b;
8951 }
8952 if ((blob.unused & mask) == mask) {
8953 // this csum chunk region is marked unused
8954 if (blob.get_csum_item(p) != 0) {
8955 derr << "fsck error: " << oid
8956 << " blob claims csum chunk 0x" << std::hex << pos
8957 << "~" << csum_chunk_size
8958 << " is unused (mask 0x" << mask << " of unused 0x"
8959 << blob.unused << ") but csum is non-zero 0x"
8960 << blob.get_csum_item(p) << std::dec << " on blob "
8961 << *i.first << dendl;
8962 ++errors;
8963 }
8964 }
8965 }
8966 }
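 // Worked example for the mask arithmetic above (hedged: a 16-bit
 // unused bitmap is assumed here; the code relies only on
 // sizeof(blob.unused)):
 //   blob_len = 64 KiB, 16 bits  -> unused_chunk_size = 4 KiB per bit
 //   csum_chunk_size = 8 KiB, p = 3 -> pos = 24 KiB
 //   firstbit = 24K / 4K = 6, lastbit = (24K + 8K - 1) / 4K = 7
 //   mask = 0b1100'0000
 // i.e. csum item 3 must be zero iff bits 6..7 are both set in 'unused'.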
8967 }
8968 // omap
8969 if (o->onode.has_omap()) {
9f95a23c
TL
8970 ceph_assert(ctx.used_omap_head);
8971 if (ctx.used_omap_head->count(o->onode.nid)) {
8972 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8973 << " already in use" << dendl;
eafe8130
TL
8974 ++errors;
8975 } else {
9f95a23c 8976 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8977 }
9f95a23c 8978 } // if (o->onode.has_omap())
eafe8130
TL
8979 if (depth == FSCK_DEEP) {
8980 bufferlist bl;
8981 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8982 uint64_t offset = 0;
8983 do {
8984 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8985 int r = _do_read(c.get(), o, offset, l, bl,
8986 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8987 if (r < 0) {
8988 ++errors;
8989 derr << "fsck error: " << oid << std::hex
8990 << " error during read: "
8991 << " " << offset << "~" << l
8992 << " " << cpp_strerror(r) << std::dec
8993 << dendl;
8994 break;
8995 }
8996 offset += l;
8997 } while (offset < o->onode.size);
8998 } // deep
8999 } //if (depth != FSCK_SHALLOW)
9000 } // for (it->lower_bound(string()); it->valid(); it->next())
9001 if (depth == FSCK_SHALLOW && thread_count > 0) {
9002 wq->finalize(thread_pool, ctx);
9003 if (processed_myself) {
9004 // maybe it needs more threads?
9005 dout(0) << __func__ << " partial offload"
9006 << ", done myself " << processed_myself
9007 << " of " << ctx.num_objects
9008 << " objects, threads " << thread_count
9009 << dendl;
9010 }
9011 }
9012 } // if (it)
9013}
9014/**
9015An overview of the currently implemented repair logic,
9016performed in fsck in two stages: detection (+preparation) and commit.
9017Detection stage (in processing order):
9018 (Issue -> Repair action to schedule)
9019 - Detect undecodable keys for Shared Blobs -> Remove
9020 - Detect undecodable records for Shared Blobs -> Remove
9021 (might trigger missed Shared Blob detection below)
9022 - Detect stray records for Shared Blobs -> Remove
9023 - Detect misreferenced pextents -> Fix
9024 Prepare a Bloom-like filter to track cid/oid -> pextent
9025 Prepare a list of extents that are improperly referenced
9026 Enumerate Onode records that might use 'misreferenced' pextents
9027 (the Bloom-like filter is applied to reduce computation)
9028 For each questionable Onode, enumerate all blobs and identify broken ones
9029 (i.e. blobs having 'misreferences')
9030 Rewrite each broken blob's data by allocating new extents and
9031 copying the data there
9032 If a blob is shared - unshare it and mark the corresponding Shared Blob
9033 for removal
9034 Release previously allocated space
9035 Update the Extent Map
9036 - Detect missed Shared Blobs -> Recreate
9037 - Detect undecodable deferred transactions -> Remove
9038 - Detect Freelist Manager's 'false free' entries -> Mark as used
9039 - Detect Freelist Manager's leaked entries -> Mark as free
9040 - Detect statfs inconsistency -> Update
9041 Commit stage (separate DB commit per each step):
9042 - Apply leaked FM entries fix
9043 - Apply 'false free' FM entries fix
9044 - Apply 'Remove' actions
9045 - Apply fix for misreferenced pextents
9046 - Apply Shared Blob recreate
9047 (can be merged with the step above if misreferences were detected)
9048 - Apply StatFS update
9049*/
9050int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
9051{
20effc67 9052 dout(5) << __func__
eafe8130
TL
9053 << (repair ? " repair" : " check")
9054 << (depth == FSCK_DEEP ? " (deep)" :
9055 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9056 << dendl;
9057
9058 // in deep mode we need R/W write access to be able to replay deferred ops
20effc67 9059 const bool read_only = !(repair || depth == FSCK_DEEP);
f67539c2 9060 int r = _open_db_and_around(read_only);
20effc67 9061 if (r < 0) {
eafe8130 9062 return r;
20effc67
TL
9063 }
9064 auto close_db = make_scope_guard([&] {
9065 _close_db_and_around();
9066 });
7c673cae 9067
11fdf7f2
TL
9068 if (!read_only) {
9069 r = _upgrade_super();
9070 if (r < 0) {
20effc67 9071 return r;
11fdf7f2
TL
9072 }
9073 }
7c673cae 9074
20effc67 9075 // NullFreelistManager needs to open collection early
eafe8130 9076 r = _open_collections();
20effc67
TL
9077 if (r < 0) {
9078 return r;
9079 }
7c673cae
FG
9080
9081 mempool_thread.init();
20effc67
TL
9082 auto stop_mempool = make_scope_guard([&] {
9083 mempool_thread.shutdown();
9084 _shutdown_cache();
9085 });
11fdf7f2
TL
9086 // we need finisher and kv_{sync,finalize}_thread *just* for replay
9087 // enable in repair or deep mode modes only
9088 if (!read_only) {
9089 _kv_start();
9090 r = _deferred_replay();
9091 _kv_stop();
9092 }
eafe8130 9093
20effc67
TL
9094 if (r < 0) {
9095 return r;
9096 }
9097 return _fsck_on_open(depth, repair);
eafe8130
TL
9098}
9099
9100int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
9101{
20effc67
TL
9102 uint64_t sb_hash_size = uint64_t(
9103 cct->_conf.get_val<Option::size_t>("osd_memory_target") *
9104 cct->_conf.get_val<double>(
9105 "bluestore_fsck_shared_blob_tracker_size"));
9106
eafe8130
TL
9107 dout(1) << __func__
9108 << " <<<START>>>"
9109 << (repair ? " repair" : " check")
9110 << (depth == FSCK_DEEP ? " (deep)" :
9111 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
20effc67
TL
9112 << " start sb_tracker_hash_size:" << sb_hash_size
9113 << dendl;
eafe8130
TL
9114 int64_t errors = 0;
9115 int64_t warnings = 0;
9116 unsigned repaired = 0;
9117
9118 uint64_t_btree_t used_omap_head;
eafe8130
TL
9119 uint64_t_btree_t used_sbids;
9120
f67539c2 9121 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130
TL
9122 KeyValueDB::Iterator it;
9123 store_statfs_t expected_store_statfs, actual_statfs;
9124 per_pool_statfs expected_pool_statfs;
9125
20effc67
TL
9126 sb_info_space_efficient_map_t sb_info;
9127 shared_blob_2hash_tracker_t sb_ref_counts(
9128 sb_hash_size,
9129 min_alloc_size);
9130 size_t sb_ref_mismatches = 0;
9131
9132 /// map of oid -> (first_)offset for each zone
9133 std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
eafe8130
TL
9134
9135 uint64_t num_objects = 0;
9136 uint64_t num_extents = 0;
9137 uint64_t num_blobs = 0;
9138 uint64_t num_spanning_blobs = 0;
9139 uint64_t num_shared_blobs = 0;
9140 uint64_t num_sharded_objects = 0;
9141 BlueStoreRepairer repairer;
9142
f67539c2
TL
9143 auto alloc_size = fm->get_alloc_size();
9144
eafe8130
TL
9145 utime_t start = ceph_clock_now();
9146
9147 _fsck_collections(&errors);
b32b8144 9148 used_blocks.resize(fm->get_alloc_units());
7c673cae
FG
9149
9150 if (bluefs) {
f67539c2 9151 interval_set<uint64_t> bluefs_extents;
11fdf7f2 9152
f67539c2
TL
9153 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
9154 ceph_assert(r == 0);
9155 for (auto [start, len] : bluefs_extents) {
9156 apply_for_bitset_range(start, len, alloc_size, used_blocks,
9157 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
9158 ceph_assert(pos < bs.size());
7c673cae 9159 bs.set(pos);
f67539c2
TL
9160 }
9161 );
9162 }
9163 }
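 // used_blocks carries one bit per allocation unit (alloc_size bytes), so
 // the marking above is equivalent to the following loop for each extent
 // (illustrative expansion of apply_for_bitset_range, not its real body):
 //
 // for (uint64_t pos = start / alloc_size;
 //      pos <= (start + len - 1) / alloc_size;
 //      ++pos) {
 //   used_blocks.set(pos);
 // }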
9164
9165 bluefs_used_blocks = used_blocks;
9166
9167 apply_for_bitset_range(
9168 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
9169 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9170 bs.set(pos);
7c673cae 9171 }
f67539c2
TL
9172 );
9173
9174
9175 if (repair) {
b3b6e05e 9176 repairer.init_space_usage_tracker(
f67539c2
TL
9177 bdev->get_size(),
9178 min_alloc_size);
9179 }
9180
9181 if (bluefs) {
eafe8130 9182 int r = bluefs->fsck();
7c673cae 9183 if (r < 0) {
eafe8130 9184 return r;
7c673cae
FG
9185 }
9186 if (r > 0)
9187 errors += r;
9188 }
9189
eafe8130
TL
9190 if (!per_pool_stat_collection) {
9191 const char *w;
9192 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
9193 w = "error";
9194 ++errors;
9195 } else {
9196 w = "warning";
9197 ++warnings;
9198 }
9199 derr << "fsck " << w << ": store not yet converted to per-pool stats"
9200 << dendl;
9201 }
f67539c2 9202 if (per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
9203 const char *w;
9204 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
9205 w = "error";
9206 ++errors;
9207 } else {
9208 w = "warning";
9209 ++warnings;
9210 }
f67539c2 9211 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9f95a23c
TL
9212 << dendl;
9213 }
9214
11fdf7f2 9215 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
9216 // structs
9217 statfs(&actual_statfs);
11fdf7f2
TL
9218 actual_statfs.total = 0;
9219 actual_statfs.internally_reserved = 0;
9220 actual_statfs.available = 0;
9221 actual_statfs.internal_metadata = 0;
9222 actual_statfs.omap_allocated = 0;
9223
eafe8130
TL
9224 if (g_conf()->bluestore_debug_fsck_abort) {
9225 dout(1) << __func__ << " debug abort" << dendl;
9226 goto out_scan;
9227 }
20effc67
TL
9228
9229#ifdef HAVE_LIBZBD
9230 if (bdev->is_smr()) {
9231 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9232 ceph_assert(a);
9233 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
9234 ceph_assert(f);
9235 vector<uint64_t> wp = bdev->get_zones();
9236 vector<zone_state_t> zones = f->get_zone_states(db);
9237 ceph_assert(wp.size() == zones.size());
9238 auto num_zones = bdev->get_size() / zone_size;
9239 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
9240 uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
9241 if (zones[i].write_pointer > p &&
9242 zones[i].num_dead_bytes < zones[i].write_pointer) {
9243 derr << "fsck error: zone 0x" << std::hex << i
9244 << " bluestore write pointer 0x" << zones[i].write_pointer
9245 << " > device write pointer 0x" << p
9246 << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
9247 << std::dec << dendl;
9248 ++errors;
9249 }
9250 }
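 // Hedged reading of the normalization above: bdev->get_zones() reports
 // absolute write pointers, so a fully written zone i has
 // wp[i] == (i + 1) * zone_size, which must map to an in-zone offset of
 // zone_size rather than wp[i] % zone_size == 0. E.g. with a 256 MiB zone:
 //   wp[i] = i * 256M + 16M -> p = 16M  (partially filled)
 //   wp[i] = (i + 1) * 256M -> p = 256M (full zone, not 0)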
9251
9252 if (depth != FSCK_SHALLOW) {
9253 // load zone refs
9254 zone_refs.resize(bdev->get_size() / zone_size);
9255 it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
9256 if (it) {
9257 for (it->lower_bound(string());
9258 it->valid();
9259 it->next()) {
9260 uint32_t zone = 0;
9261 uint64_t offset = 0;
9262 ghobject_t oid;
9263 string key = it->key();
9264 int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
9265 if (r < 0) {
9266 derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
9267 << dendl;
9268 if (repair) {
9269 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9270 }
9271 ++errors;
9272 continue;
9273 }
9274 dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
9275 << " -> " << std::dec << oid << dendl;
9276 if (zone_refs[zone].count(oid)) {
9277 derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
9278 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
9279 if (repair) {
9280 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9281 }
9282 ++errors;
9283 continue;
9284 }
9285 zone_refs[zone][oid] = offset;
9286 }
9287 }
9288 }
9289 }
9290#endif
9291
9292 dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
9293 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9294 if (it) {
9295 for (it->lower_bound(string()); it->valid(); it->next()) {
9296 string key = it->key();
9297 uint64_t sbid;
9298 if (get_key_shared_blob(key, &sbid) < 0) {
9299 // Failed to parse the key.
9300 // This is going to be handled in the second stage.
9301 continue;
9302 }
9303 bluestore_shared_blob_t shared_blob(sbid);
9304 bufferlist bl = it->value();
9305 auto blp = bl.cbegin();
9306 try {
9307 decode(shared_blob, blp);
9308 }
9309 catch (ceph::buffer::error& e) {
9310 // this is going to be handled in the second stage
9311 continue;
9312 }
9313 dout(20) << __func__ << " " << shared_blob << dendl;
9314 auto& sbi = sb_info.add_maybe_stray(sbid);
9315
9316 // primarily to silence the 'unused' warning
9317 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
9318
9319 for (auto& r : shared_blob.ref_map.ref_map) {
9320 sb_ref_counts.inc_range(
9321 sbid,
9322 r.first,
9323 r.second.length,
9324 -r.second.refs);
9325 }
9326 }
9327 } // if (it) //checking shared_blobs (phase1)
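 // The tracker is primed above with *negative* reference counts taken
 // from the on-disk shared blob records; the object walk below then adds
 // the matching positive counts for every shared-blob reference it
 // actually finds. On a consistent store every counter drains back to
 // zero, so count_non_zero() later yields a lower bound on the number of
 // mismatched references. Tiny model of the idea (illustrative only, not
 // the real shared_blob_2hash_tracker_t, which hashes instead of storing
 // keys):
 //
 // std::map<std::pair<uint64_t, uint64_t>, int64_t> refs; // (sbid,off)->n
 // refs[{sbid, off}] -= on_disk_refs;  // phase 1 (here)
 // refs[{sbid, off}] += observed_refs; // object walk
 // // mismatches = number of entries left with a non-zero count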
9328
7c673cae 9329 // walk PREFIX_OBJ
eafe8130
TL
9330 {
9331 dout(1) << __func__ << " walking object keyspace" << dendl;
9332 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
9333 BlueStore::FSCK_ObjectCtx ctx(
9334 errors,
9335 warnings,
9336 num_objects,
9337 num_extents,
9338 num_blobs,
9339 num_sharded_objects,
9340 num_spanning_blobs,
9341 &used_blocks,
9342 &used_omap_head,
20effc67 9343 &zone_refs,
9f95a23c
TL
9344 //no need for the below lock when in non-shallow mode as
9345 // there is no multithreading in this case
9346 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130 9347 sb_info,
20effc67 9348 sb_ref_counts,
eafe8130
TL
9349 expected_store_statfs,
9350 expected_pool_statfs,
9351 repair ? &repairer : nullptr);
9f95a23c
TL
9352
9353 _fsck_check_objects(depth, ctx);
eafe8130 9354 }
11fdf7f2 9355
20effc67
TL
9356#ifdef HAVE_LIBZBD
9357 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
9358 dout(1) << __func__ << " checking for leaked zone refs" << dendl;
9359 for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
9360 for (auto& [oid, offset] : zone_refs[zone]) {
9361 derr << "fsck error: stray zone ref 0x" << std::hex << zone
9362 << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
9363 // FIXME: add repair
9364 ++errors;
9365 }
9366 }
9367 }
9368#endif
9369
9370 sb_ref_mismatches = sb_ref_counts.count_non_zero();
9371 if (sb_ref_mismatches != 0) {
9372 derr << "fsck error: shared blob references aren't matching, at least "
9373 << sb_ref_mismatches << " found" << dendl;
9374 errors += sb_ref_mismatches;
9375 }
9376
9377 if (depth != FSCK_SHALLOW && repair) {
9378 _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
9379 }
9380 dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
f67539c2 9381 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 9382 if (it) {
eafe8130
TL
9383 // FIXME minor: perhaps simplify for shallow mode?
9384 // fill global if not overridden below
9385 auto expected_statfs = &expected_store_statfs;
7c673cae
FG
9386 for (it->lower_bound(string()); it->valid(); it->next()) {
9387 string key = it->key();
9388 uint64_t sbid;
9389 if (get_key_shared_blob(key, &sbid)) {
3efd9988 9390 derr << "fsck error: bad key '" << key
20effc67 9391 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
9392 if (repair) {
9393 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9394 }
7c673cae
FG
9395 ++errors;
9396 continue;
9397 }
9398 auto p = sb_info.find(sbid);
9399 if (p == sb_info.end()) {
20effc67
TL
9400 if (sb_ref_mismatches > 0) {
9401 // highly likely this has been already reported before, ignoring...
9402 dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
9403 << std::hex << sbid << std::dec << dendl;
9404 } else {
9405 derr<< "fsck error: found stray shared blob data for sbid 0x"
9406 << std::hex << sbid << std::dec << dendl;
9407 ++errors;
9408 if (repair) {
9409 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9410 }
11fdf7f2 9411 }
7c673cae
FG
9412 } else {
9413 ++num_shared_blobs;
20effc67 9414 sb_info_t& sbi = *p;
7c673cae
FG
9415 bluestore_shared_blob_t shared_blob(sbid);
9416 bufferlist bl = it->value();
11fdf7f2
TL
9417 auto blp = bl.cbegin();
9418 try {
20effc67
TL
9419 decode(shared_blob, blp);
9420 }
9421 catch (ceph::buffer::error& e) {
7c673cae 9422 ++errors;
20effc67
TL
9423
9424 derr << "fsck error: failed to decode Shared Blob"
9425 << pretty_binary_string(key) << dendl;
9426 if (repair) {
9427 dout(20) << __func__ << " undecodable Shared Blob, key:'"
9428 << pretty_binary_string(key)
9429 << "', removing" << dendl;
9430 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9431 }
9432 continue;
7c673cae 9433 }
20effc67 9434 dout(20) << __func__ << " " << shared_blob << dendl;
7c673cae 9435 PExtentVector extents;
20effc67 9436 for (auto& r : shared_blob.ref_map.ref_map) {
7c673cae
FG
9437 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
9438 }
20effc67
TL
9439 if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
9440 (per_pool_stat_collection || repair)) {
11fdf7f2
TL
9441 expected_statfs = &expected_pool_statfs[sbi.pool_id];
9442 }
20effc67
TL
9443 std::stringstream ss;
9444 ss << "sbid 0x" << std::hex << sbid << std::dec;
9445 errors += _fsck_check_extents(ss.str(),
9446 extents,
9447 sbi.allocated_chunks < 0,
9448 used_blocks,
9449 fm->get_alloc_size(),
9450 repair ? &repairer : nullptr,
9451 *expected_statfs,
9452 depth);
11fdf7f2
TL
9453 }
9454 }
20effc67 9455 } // if (it) /* checking shared_blobs (phase 2)*/
11fdf7f2
TL
9456
9457 if (repair && repairer.preprocess_misreference(db)) {
9458
9459 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
11fdf7f2
TL
9460 auto& misref_extents = repairer.get_misreferences();
9461 interval_set<uint64_t> to_release;
f67539c2 9462 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 9463 if (it) {
eafe8130
TL
9464 // fill global if not overridden below
9465 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
9466
9467 CollectionRef c;
9468 spg_t pgid;
9469 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
9470 bool bypass_rest = false;
9471 for (it->lower_bound(string()); it->valid() && !bypass_rest;
9472 it->next()) {
9473 dout(30) << __func__ << " key "
9474 << pretty_binary_string(it->key()) << dendl;
9475 if (is_extent_shard_key(it->key())) {
9476 continue;
9477 }
9478
9479 ghobject_t oid;
9480 int r = get_key_object(it->key(), &oid);
b3b6e05e 9481 if (r < 0 || !repairer.is_used(oid)) {
11fdf7f2
TL
9482 continue;
9483 }
9484
9485 if (!c ||
9486 oid.shard_id != pgid.shard ||
9487 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
9488 !c->contains(oid)) {
9489 c = nullptr;
9490 for (auto& p : coll_map) {
9491 if (p.second->contains(oid)) {
9492 c = p.second;
9493 break;
9494 }
9495 }
9496 if (!c) {
9497 continue;
9498 }
eafe8130
TL
9499 if (per_pool_stat_collection || repair) {
9500 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
9501 expected_statfs = &expected_pool_statfs[pool_id];
9502 }
9503 }
b3b6e05e 9504 if (!repairer.is_used(c->cid)) {
11fdf7f2
TL
9505 continue;
9506 }
9507
9508 dout(20) << __func__ << " check misreference for col:" << c->cid
9509 << " obj:" << oid << dendl;
9510
eafe8130
TL
9511 OnodeRef o;
9512 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
9513 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9514 mempool::bluestore_fsck::set<BlobRef> blobs;
9515
9516 for (auto& e : o->extent_map.extent_map) {
9517 blobs.insert(e.blob);
9518 }
9519 bool need_onode_update = false;
9520 bool first_dump = true;
9521 for(auto b : blobs) {
9522 bool broken_blob = false;
9523 auto& pextents = b->dirty_blob().dirty_extents();
9524 for (auto& e : pextents) {
9525 if (!e.is_valid()) {
9526 continue;
9527 }
9528 // for the sake of simplicity and proper shared blob handling
9529 // always rewrite the whole blob even when it's partially
9530 // misreferenced.
9531 if (misref_extents.intersects(e.offset, e.length)) {
9532 if (first_dump) {
9533 first_dump = false;
81eedcae 9534 _dump_onode<10>(cct, *o);
11fdf7f2
TL
9535 }
9536 broken_blob = true;
9537 break;
9538 }
9539 }
9540 if (!broken_blob)
9541 continue;
9542 bool compressed = b->get_blob().is_compressed();
9543 need_onode_update = true;
9544 dout(10) << __func__
9545 << " fix misreferences in oid:" << oid
9546 << " " << *b << dendl;
9547 uint64_t b_off = 0;
9548 PExtentVector pext_to_release;
9549 pext_to_release.reserve(pextents.size());
9550 // rewriting all valid pextents
9551 for (auto e = pextents.begin(); e != pextents.end();
a4b75251
TL
9552 e++) {
9553 auto b_off_cur = b_off;
9554 b_off += e->length;
11fdf7f2
TL
9555 if (!e->is_valid()) {
9556 continue;
9557 }
9558 PExtentVector exts;
20effc67 9559 dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
f67539c2 9560 int64_t alloc_len =
20effc67 9561 alloc->allocate(e->length, min_alloc_size,
f67539c2 9562 0, 0, &exts);
eafe8130 9563 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
9564 derr << __func__
9565 << " failed to allocate 0x" << std::hex << e->length
eafe8130 9566 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 9567 << " min_alloc_size 0x" << min_alloc_size
20effc67 9568 << " available 0x " << alloc->get_free()
11fdf7f2
TL
9569 << std::dec << dendl;
9570 if (alloc_len > 0) {
20effc67 9571 alloc->release(exts);
11fdf7f2
TL
9572 }
9573 bypass_rest = true;
9574 break;
9575 }
9576 expected_statfs->allocated += e->length;
9577 if (compressed) {
9578 expected_statfs->data_compressed_allocated += e->length;
9579 }
9580
9581 bufferlist bl;
20effc67 9582 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11fdf7f2
TL
9583 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
9584 if (r < 0) {
9585 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
9586 <<"~" << e->length << std::dec << dendl;
9587 ceph_abort_msg("read failed, wtf");
9588 }
9589 pext_to_release.push_back(*e);
9590 e = pextents.erase(e);
9591 e = pextents.insert(e, exts.begin(), exts.end());
9592 b->get_blob().map_bl(
20effc67 9593 b_off_cur, bl,
11fdf7f2
TL
9594 [&](uint64_t offset, bufferlist& t) {
9595 int r = bdev->write(offset, t, false);
9596 ceph_assert(r == 0);
9597 });
9598 e += exts.size() - 1;
9599 for (auto& p : exts) {
9600 fm->allocate(p.offset, p.length, txn);
9601 }
9602 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
9603
9604 if (b->get_blob().is_shared()) {
9605 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
9606
20effc67
TL
9607 auto sbid = b->shared_blob->get_sbid();
9608 auto sb_it = sb_info.find(sbid);
11fdf7f2 9609 ceph_assert(sb_it != sb_info.end());
20effc67
TL
9610 sb_info_t& sbi = *sb_it;
9611
9612 if (sbi.allocated_chunks < 0) {
9613 // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
9614 // as we originally used that value while accumulating
9615 // expected_statfs
9616 expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9617 expected_statfs->data_compressed_allocated -=
9618 uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9619 } else {
9620 expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
11fdf7f2 9621 }
20effc67
TL
9622 sbi.allocated_chunks = 0;
9623 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
9624
11fdf7f2
TL
9625 // relying on blob's pextents to decide what to release.
9626 for (auto& p : pext_to_release) {
9627 to_release.union_insert(p.offset, p.length);
9628 }
9629 } else {
9630 for (auto& p : pext_to_release) {
9631 expected_statfs->allocated -= p.length;
9632 if (compressed) {
9633 expected_statfs->data_compressed_allocated -= p.length;
9634 }
9635 to_release.union_insert(p.offset, p.length);
9636 }
9637 }
9638 if (bypass_rest) {
9639 break;
9640 }
9641 } // for(auto b : blobs)
9642 if (need_onode_update) {
9643 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
9644 _record_onode(o, txn);
9645 }
9646 } // for (it->lower_bound(string()); it->valid(); it->next())
9647
9648 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
9649 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9650 << "~" << it.get_len() << std::dec << dendl;
9651 fm->release(it.get_start(), it.get_len(), txn);
9652 }
20effc67 9653 alloc->release(to_release);
11fdf7f2
TL
9654 to_release.clear();
9655 } // if (it) {
9656 } //if (repair && repairer.preprocess_misreference()) {
11fdf7f2 9657 sb_info.clear();
20effc67 9658 sb_ref_counts.reset();
11fdf7f2 9659
eafe8130
TL
9660 // check global stats only if fscking (not repairing) w/o per-pool stats
9661 if (!per_pool_stat_collection &&
9662 !repair &&
9663 !(actual_statfs == expected_store_statfs)) {
9664 derr << "fsck error: actual " << actual_statfs
9665 << " != expected " << expected_store_statfs << dendl;
9666 if (repair) { // note: unreachable here, the enclosing check requires !repair
9667 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
9668 expected_store_statfs);
11fdf7f2 9669 }
eafe8130 9670 ++errors;
7c673cae
FG
9671 }
9672
eafe8130
TL
9673 dout(1) << __func__ << " checking pool_statfs" << dendl;
9674 _fsck_check_pool_statfs(expected_pool_statfs,
9675 errors, warnings, repair ? &repairer : nullptr);
9676
9677 if (depth != FSCK_SHALLOW) {
9f95a23c 9678 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 9679 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9680 if (it) {
9f95a23c 9681 uint64_t last_omap_head = 0;
eafe8130
TL
9682 for (it->lower_bound(string()); it->valid(); it->next()) {
9683 uint64_t omap_head;
f67539c2 9684
eafe8130 9685 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 9686
9f95a23c 9687 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9688 omap_head != last_omap_head) {
20effc67 9689 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9690 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9691 << "fsck error: found stray omap data on omap_head "
20effc67
TL
9692 << omap_head << " " << last_omap_head
9693 << " prefix/key: " << url_escape(rk.first)
9694 << " " << url_escape(rk.second)
9695 << fsck_dendl;
f67539c2
TL
9696 ++errors;
9697 last_omap_head = omap_head;
eafe8130 9698 }
7c673cae
FG
9699 }
9700 }
f67539c2 9701 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9702 if (it) {
9f95a23c 9703 uint64_t last_omap_head = 0;
eafe8130
TL
9704 for (it->lower_bound(string()); it->valid(); it->next()) {
9705 uint64_t omap_head;
9706 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
9707 if (used_omap_head.count(omap_head) == 0 &&
9708 omap_head != last_omap_head) {
20effc67 9709 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9710 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9711 << "fsck error: found stray (pgmeta) omap data on omap_head "
20effc67
TL
9712 << omap_head << " " << last_omap_head
9713 << " prefix/key: " << url_escape(rk.first)
9714 << " " << url_escape(rk.second)
9715 << fsck_dendl;
9f95a23c 9716 last_omap_head = omap_head;
eafe8130
TL
9717 ++errors;
9718 }
11fdf7f2
TL
9719 }
9720 }
f67539c2 9721 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9f95a23c
TL
9722 if (it) {
9723 uint64_t last_omap_head = 0;
9724 for (it->lower_bound(string()); it->valid(); it->next()) {
9725 uint64_t pool;
9726 uint64_t omap_head;
9727 string k = it->key();
9728 const char *c = k.c_str();
9729 c = _key_decode_u64(c, &pool);
9730 c = _key_decode_u64(c, &omap_head);
9731 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9732 omap_head != last_omap_head) {
20effc67 9733 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9734 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9735 << "fsck error: found stray (per-pool) omap data on omap_head "
20effc67
TL
9736 << omap_head << " " << last_omap_head
9737 << " prefix/key: " << url_escape(rk.first)
9738 << " " << url_escape(rk.second)
9739 << fsck_dendl;
9f95a23c 9740 ++errors;
f67539c2
TL
9741 last_omap_head = omap_head;
9742 }
9743 }
9744 }
9745 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9746 if (it) {
9747 uint64_t last_omap_head = 0;
9748 for (it->lower_bound(string()); it->valid(); it->next()) {
9749 uint64_t pool;
9750 uint32_t hash;
9751 uint64_t omap_head;
9752 string k = it->key();
9753 const char* c = k.c_str();
9754 c = _key_decode_u64(c, &pool);
9755 c = _key_decode_u32(c, &hash);
9756 c = _key_decode_u64(c, &omap_head);
9757 if (used_omap_head.count(omap_head) == 0 &&
9758 omap_head != last_omap_head) {
9759 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9760 << "fsck error: found stray (per-pg) omap data on omap_head "
20effc67 9761 << omap_head << " " << last_omap_head
f67539c2
TL
9762 << " key " << pretty_binary_string(it->key()) << fsck_dendl;
9763 ++errors;
9764 last_omap_head = omap_head;
9f95a23c
TL
9765 }
9766 }
9767 }
eafe8130 9768 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 9769 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
9770 if (it) {
9771 for (it->lower_bound(string()); it->valid(); it->next()) {
9772 bufferlist bl = it->value();
9773 auto p = bl.cbegin();
9774 bluestore_deferred_transaction_t wt;
9775 try {
9776 decode(wt, p);
f67539c2 9777 } catch (ceph::buffer::error& e) {
eafe8130
TL
9778 derr << "fsck error: failed to decode deferred txn "
9779 << pretty_binary_string(it->key()) << dendl;
9780 if (repair) {
9781 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9782 << pretty_binary_string(it->key())
9783 << "', removing" << dendl;
9784 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9785 }
9786 continue;
9787 }
9788 dout(20) << __func__ << " deferred " << wt.seq
9789 << " ops " << wt.ops.size()
9790 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9791 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 9792 apply_for_bitset_range(
f67539c2 9793 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 9794 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
9795 bs.set(pos);
9796 }
9797 );
9798 }
7c673cae 9799 }
eafe8130
TL
9800 }
9801
20effc67
TL
9802 // skip freelist vs allocated compare when we have Null fm
9803 if (!fm->is_null_manager()) {
9804 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9805#ifdef HAVE_LIBZBD
9806 if (freelist_type == "zoned") {
9807 // verify per-zone state
9808 // - verify no allocations beyond write pointer
9809 // - verify num_dead_bytes count (neither allocated nor
9810 // free space past the write pointer)
9811 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9812 auto num_zones = bdev->get_size() / zone_size;
9813
9814 // mark the free space past the write pointer
9815 for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
9816 auto wp = a->get_write_pointer(zone);
9817 uint64_t offset = zone_size * zone + wp;
9818 uint64_t length = zone_size - wp;
9819 if (!length) {
9820 continue;
9821 }
9822 bool intersects = false;
9823 dout(10) << " marking zone 0x" << std::hex << zone
9824 << " region after wp 0x" << offset << "~" << length
9825 << std::dec << dendl;
9826 apply_for_bitset_range(
9827 offset, length, alloc_size, used_blocks,
9828 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9829 if (bs.test(pos)) {
9830 derr << "fsck error: zone 0x" << std::hex << zone
9831 << " has used space at 0x" << pos * alloc_size
9832 << " beyond write pointer 0x" << wp
9833 << std::dec << dendl;
9834 intersects = true;
eafe8130 9835 } else {
20effc67 9836 bs.set(pos);
11fdf7f2 9837 }
20effc67
TL
9838 }
9839 );
9840 if (intersects) {
9841 ++errors;
9842 }
9843 }
9844
9845 used_blocks.flip();
9846
9847 // skip conventional zones
9848 uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
9849 pos = used_blocks.find_next(pos);
9850
9851 uint64_t zone_dead = 0;
9852 for (uint32_t zone = first_sequential_zone;
9853 zone < num_zones;
9854 ++zone, zone_dead = 0) {
9855 while (pos != decltype(used_blocks)::npos &&
9856 (pos * min_alloc_size) / zone_size == zone) {
9857 dout(40) << " zone 0x" << std::hex << zone
9858 << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
9859 << std::dec << dendl;
9860 zone_dead += min_alloc_size;
9861 pos = used_blocks.find_next(pos);
9862 }
9863 dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
9864 << std::dec << dendl;
9865 // cross-check dead bytes against zone state
9866 if (a->get_dead_bytes(zone) != zone_dead) {
9867 derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
9868 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
9869 << dendl;
9870 ++errors;
9871 // TODO: repair
9872 }
9873 }
9874 used_blocks.flip();
9875 } else
9876#endif
9877 {
9878 fm->enumerate_reset();
9879 uint64_t offset, length;
9880 while (fm->enumerate_next(db, &offset, &length)) {
9881 bool intersects = false;
9882 apply_for_bitset_range(
9883 offset, length, alloc_size, used_blocks,
9884 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9885 ceph_assert(pos < bs.size());
9886 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
9887 if (offset == SUPER_RESERVED &&
9888 length == min_alloc_size - SUPER_RESERVED) {
9889 // this is due to the change just after luminous to min_alloc_size
9890 // granularity allocations, and our baked-in assumption at the top
9891 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9892 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9893 // since we will never allocate this region below min_alloc_size.
9894 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9895 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9896 << length << std::dec << dendl;
9897 } else {
9898 intersects = true;
9899 if (repair) {
9900 repairer.fix_false_free(db, fm,
9901 pos * min_alloc_size,
9902 min_alloc_size);
9903 }
9904 }
9905 } else {
9906 bs.set(pos);
eafe8130 9907 }
11fdf7f2 9908 }
20effc67
TL
9909 );
9910 if (intersects) {
9911 derr << "fsck error: free extent 0x" << std::hex << offset
9912 << "~" << length << std::dec
9913 << " intersects allocated blocks" << dendl;
9914 ++errors;
b5b8bbf5 9915 }
20effc67
TL
9916 }
9917 fm->enumerate_reset();
9918
9919 // check for leaked extents
9920 size_t count = used_blocks.count();
9921 if (used_blocks.size() != count) {
9922 ceph_assert(used_blocks.size() > count);
9923 used_blocks.flip();
9924 size_t start = used_blocks.find_first();
9925 while (start != decltype(used_blocks)::npos) {
9926 size_t cur = start;
9927 while (true) {
9928 size_t next = used_blocks.find_next(cur);
9929 if (next != cur + 1) {
9930 ++errors;
9931 derr << "fsck error: leaked extent 0x" << std::hex
9932 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9933 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9934 << dendl;
9935 if (repair) {
9936 repairer.fix_leaked(db,
9937 fm,
9938 start * min_alloc_size,
9939 (cur + 1 - start) * min_alloc_size);
9940 }
9941 start = next;
9942 break;
9943 }
9944 cur = next;
9945 }
9946 }
9947 used_blocks.flip();
9948 }
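 // Sketch of the run detection above: after the freelist pass every bit
 // that is either truly allocated or known free is set, so flip() leaves
 // 1-bits exactly on "allocated but unreferenced" units; find_first()/
 // find_next() walk them, and each break in consecutive positions closes
 // one leaked extent. E.g. flipped bits at positions {4, 5, 6, 9} yield
 // two leaks: offset 4*AU length 3*AU, and offset 9*AU length 1*AU
 // (AU = fm->get_alloc_size()).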
b5b8bbf5 9949 }
7c673cae
FG
9950 }
9951 }
11fdf7f2 9952 if (repair) {
f67539c2
TL
9953 if (per_pool_omap != OMAP_PER_PG) {
9954 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
9955 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
9f95a23c
TL
9956 }
9957
11fdf7f2
TL
9958 dout(5) << __func__ << " applying repair results" << dendl;
9959 repaired = repairer.apply(db);
9960 dout(5) << __func__ << " repair applied" << dendl;
9961 }
7c673cae 9962
eafe8130 9963out_scan:
7c673cae
FG
9964 dout(2) << __func__ << " " << num_objects << " objects, "
9965 << num_sharded_objects << " of them sharded. "
9966 << dendl;
9967 dout(2) << __func__ << " " << num_extents << " extents to "
9968 << num_blobs << " blobs, "
9969 << num_spanning_blobs << " spanning, "
9970 << num_shared_blobs << " shared."
9971 << dendl;
9972
9973 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
9974 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9975 << warnings << " warnings, "
9976 << repaired << " repaired, "
9977 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 9978 << duration << " seconds" << dendl;
9f95a23c
TL
9979
9980 // In non-repair mode we should return the error count only, as
9981 // it indicates whether the store status is OK.
9982 // In repair mode both errors and warnings are taken into account
9983 // since the repaired counter relates to them both.
9984 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
9985}
9986
9987/// methods to inject various errors fsck can repair
9988void BlueStore::inject_broken_shared_blob_key(const string& key,
9989 const bufferlist& bl)
9990{
9991 KeyValueDB::Transaction txn;
9992 txn = db->get_transaction();
9993 txn->set(PREFIX_SHARED_BLOB, key, bl);
9994 db->submit_transaction_sync(txn);
9995};
9996
a4b75251
TL
9997void BlueStore::inject_no_shared_blob_key()
9998{
9999 KeyValueDB::Transaction txn;
10000 txn = db->get_transaction();
10001 ceph_assert(blobid_last > 0);
10002 // kill the last used sbid; this can be wrong due to blobid preallocation
10003 // in rare cases, but we leave it as-is for the sake of simplicity
10004 uint64_t sbid = blobid_last;
10005
10006 string key;
10007 dout(5) << __func__<< " " << sbid << dendl;
10008 get_shared_blob_key(sbid, &key);
10009 txn->rmkey(PREFIX_SHARED_BLOB, key);
10010 db->submit_transaction_sync(txn);
10011};
10012
20effc67 10013void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
11fdf7f2
TL
10014{
10015 KeyValueDB::Transaction txn;
10016 txn = db->get_transaction();
10017
20effc67
TL
10018 dout(5) << __func__ << " " << sbid << dendl;
10019
10020 string key;
10021 get_shared_blob_key(sbid, &key);
10022 bluestore_shared_blob_t persistent(sbid);
10023 persistent.ref_map.get(0xdead0000, 0x1000);
10024 bufferlist bl;
10025 encode(persistent, bl);
10026 dout(20) << __func__ << " sbid " << sbid
10027 << " takes " << bl.length() << " bytes, updating"
10028 << dendl;
10029
10030 txn->set(PREFIX_SHARED_BLOB, key, bl);
10031 db->submit_transaction_sync(txn);
10032};
10033
10034
10035void BlueStore::inject_leaked(uint64_t len)
10036{
11fdf7f2 10037 PExtentVector exts;
20effc67 10038 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
11fdf7f2 10039 min_alloc_size * 256, 0, &exts);
20effc67
TL
10040
10041 if (fm->is_null_manager()) {
10042 return;
10043 }
10044
10045 KeyValueDB::Transaction txn;
10046 txn = db->get_transaction();
10047
11fdf7f2
TL
10048 ceph_assert(alloc_len >= (int64_t)len);
10049 for (auto& p : exts) {
10050 fm->allocate(p.offset, p.length, txn);
10051 }
10052 db->submit_transaction_sync(txn);
10053}
10054
10055void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
10056{
20effc67
TL
10057 ceph_assert(!fm->is_null_manager());
10058
11fdf7f2
TL
10059 KeyValueDB::Transaction txn;
10060 OnodeRef o;
10061 CollectionRef c = _get_collection(cid);
10062 ceph_assert(c);
10063 {
9f95a23c 10064 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
10065 o = c->get_onode(oid, false);
10066 ceph_assert(o);
10067 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10068 }
10069
10070 bool injected = false;
10071 txn = db->get_transaction();
10072 auto& em = o->extent_map.extent_map;
10073 std::vector<const PExtentVector*> v;
10074 if (em.size()) {
10075 v.push_back(&em.begin()->blob->get_blob().get_extents());
10076 }
10077 if (em.size() > 1) {
10078 auto it = em.end();
10079 --it;
10080 v.push_back(&(it->blob->get_blob().get_extents()));
10081 }
10082 for (auto pext : v) {
10083 if (pext->size()) {
10084 auto p = pext->begin();
10085 while (p != pext->end()) {
10086 if (p->is_valid()) {
10087 dout(20) << __func__ << " release 0x" << std::hex << p->offset
10088 << "~" << p->length << std::dec << dendl;
10089 fm->release(p->offset, p->length, txn);
10090 injected = true;
10091 break;
10092 }
10093 ++p;
10094 }
10095 }
10096 }
10097 ceph_assert(injected);
10098 db->submit_transaction_sync(txn);
10099}
10100
9f95a23c
TL
10101void BlueStore::inject_legacy_omap()
10102{
10103 dout(1) << __func__ << dendl;
f67539c2 10104 per_pool_omap = OMAP_BULK;
9f95a23c
TL
10105 KeyValueDB::Transaction txn;
10106 txn = db->get_transaction();
10107 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
10108 db->submit_transaction_sync(txn);
10109}
10110
10111void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
10112{
10113 dout(1) << __func__ << " "
10114 << cid << " " << oid
10115 <<dendl;
10116 KeyValueDB::Transaction txn;
10117 OnodeRef o;
10118 CollectionRef c = _get_collection(cid);
10119 ceph_assert(c);
10120 {
10121 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10122 o = c->get_onode(oid, false);
10123 ceph_assert(o);
10124 }
f67539c2
TL
10125 o->onode.clear_flag(
10126 bluestore_onode_t::FLAG_PERPG_OMAP |
10127 bluestore_onode_t::FLAG_PERPOOL_OMAP |
10128 bluestore_onode_t::FLAG_PGMETA_OMAP);
9f95a23c
TL
10129 txn = db->get_transaction();
10130 _record_onode(o, txn);
10131 db->submit_transaction_sync(txn);
10132}
10133
20effc67
TL
10134void BlueStore::inject_stray_omap(uint64_t head, const string& name)
10135{
10136 dout(1) << __func__ << dendl;
10137 KeyValueDB::Transaction txn = db->get_transaction();
10138
10139 string key;
10140 bufferlist bl;
10141 _key_encode_u64(head, &key);
10142 key.append(name);
10143 txn->set(PREFIX_OMAP, key, bl);
10144
10145 db->submit_transaction_sync(txn);
10146}
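 // The inject_* helpers in this block deliberately corrupt on-disk state
 // so that the fsck/repair paths above can be exercised. A typical
 // test-style sequence (hedged sketch; exact expected counts depend on
 // the injected damage):
 //
 // store->inject_stray_omap(123456, "foo");
 // ASSERT_EQ(store->fsck(false), 1);   // detects the stray omap record
 // ASSERT_EQ(store->repair(false), 0); // repair consumes the error
 // ASSERT_EQ(store->fsck(false), 0);   // store is clean afterwards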
9f95a23c 10147
11fdf7f2
TL
10148void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
10149{
10150 BlueStoreRepairer repairer;
10151 repairer.fix_statfs(db, key, new_statfs);
10152 repairer.apply(db);
10153}
10154
eafe8130
TL
10155void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
10156{
10157 KeyValueDB::Transaction t = db->get_transaction();
10158 volatile_statfs v;
10159 v = new_statfs;
10160 bufferlist bl;
10161 v.encode(bl);
10162 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
10163 db->submit_transaction_sync(t);
10164}
10165
11fdf7f2
TL
10166void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
10167 coll_t cid2, ghobject_t oid2,
10168 uint64_t offset)
10169{
10170 OnodeRef o1;
10171 CollectionRef c1 = _get_collection(cid1);
10172 ceph_assert(c1);
10173 {
9f95a23c 10174 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
10175 o1 = c1->get_onode(oid1, false);
10176 ceph_assert(o1);
10177 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10178 }
10179 OnodeRef o2;
10180 CollectionRef c2 = _get_collection(cid2);
10181 ceph_assert(c2);
10182 {
9f95a23c 10183 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
10184 o2 = c2->get_onode(oid2, false);
10185 ceph_assert(o2);
10186 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10187 }
10188 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
10189 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
10190
10191 // require onode/extent layout to be the same (and simple)
10192 // to make things easier
10193 ceph_assert(o1->onode.extent_map_shards.empty());
10194 ceph_assert(o2->onode.extent_map_shards.empty());
10195 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
10196 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
10197 ceph_assert(e1.logical_offset == e2.logical_offset);
10198 ceph_assert(e1.length == e2.length);
10199 ceph_assert(e1.blob_offset == e2.blob_offset);
10200
10201 KeyValueDB::Transaction txn;
10202 txn = db->get_transaction();
10203
10204 // along with misreference error this will create space leaks errors
10205 e2.blob->dirty_blob() = e1.blob->get_blob();
10206 o2->extent_map.dirty_range(offset, e2.length);
10207 o2->extent_map.update(txn, false);
10208
10209 _record_onode(o2, txn);
10210 db->submit_transaction_sync(txn);
7c673cae
FG
10211}
10212
adb31ebb
TL
10213void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
10214 int16_t blob_id)
10215{
10216 OnodeRef o;
10217 CollectionRef c = _get_collection(cid);
10218 ceph_assert(c);
10219 {
10220 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10221 o = c->get_onode(oid, false);
10222 ceph_assert(o);
10223 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10224 }
10225
10226 BlobRef b = c->new_blob();
10227 b->id = blob_id;
10228 o->extent_map.spanning_blob_map[blob_id] = b;
10229
10230 KeyValueDB::Transaction txn;
10231 txn = db->get_transaction();
10232
10233 _record_onode(o, txn);
10234 db->submit_transaction_sync(txn);
10235}
10236
a4b75251
TL
10237void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
10238{
10239 ceph_assert(bluefs);
10240
10241 BlueFS::FileWriter* p_handle = nullptr;
10242 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
10243 ceph_assert(ret == 0);
10244
10245 std::string s(new_size, '0'); // count first, then fill character
10246 bufferlist bl;
10247 bl.append(s);
10248 p_handle->append(bl);
10249
10250 bluefs->fsync(p_handle);
10251 bluefs->close_writer(p_handle);
10252}
10253
7c673cae
FG
10254void BlueStore::collect_metadata(map<string,string> *pm)
10255{
10256 dout(10) << __func__ << dendl;
10257 bdev->collect_metadata("bluestore_bdev_", pm);
10258 if (bluefs) {
10259 (*pm)["bluefs"] = "1";
9f95a23c
TL
10260 // this value is for backward compatibility only
10261 (*pm)["bluefs_single_shared_device"] = \
10262 stringify((int)bluefs_layout.single_shared_device());
10263 (*pm)["bluefs_dedicated_db"] = \
10264 stringify((int)bluefs_layout.dedicated_db);
10265 (*pm)["bluefs_dedicated_wal"] = \
10266 stringify((int)bluefs_layout.dedicated_wal);
10267 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
10268 } else {
10269 (*pm)["bluefs"] = "0";
10270 }
11fdf7f2
TL
10271
10272 // report numa mapping for underlying devices
10273 int node = -1;
10274 set<int> nodes;
10275 set<string> failed;
10276 int r = get_numa_node(&node, &nodes, &failed);
10277 if (r >= 0) {
10278 if (!failed.empty()) {
10279 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
10280 }
10281 if (!nodes.empty()) {
10282 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
10283 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
10284 }
10285 if (node >= 0) {
10286 (*pm)["objectstore_numa_node"] = stringify(node);
10287 }
10288 }
10289}
10290
10291int BlueStore::get_numa_node(
10292 int *final_node,
10293 set<int> *out_nodes,
10294 set<string> *out_failed)
10295{
10296 int node = -1;
10297 set<string> devices;
10298 get_devices(&devices);
10299 set<int> nodes;
10300 set<string> failed;
10301 for (auto& devname : devices) {
10302 int n;
10303 BlkDev bdev(devname);
10304 int r = bdev.get_numa_node(&n);
10305 if (r < 0) {
10306 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
10307 << dendl;
10308 failed.insert(devname);
10309 continue;
10310 }
10311 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
10312 << dendl;
10313 nodes.insert(n);
10314 if (node < 0) {
10315 node = n;
10316 }
10317 }
10318 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
10319 *final_node = node;
10320 }
10321 if (out_nodes) {
10322 *out_nodes = nodes;
10323 }
10324 if (out_failed) {
10325 *out_failed = failed;
10326 }
10327 return 0;
10328}
10329
1d09f67e
TL
10330void BlueStore::prepare_for_fast_shutdown()
10331{
10332 m_fast_shutdown = true;
10333}
10334
11fdf7f2
TL
int BlueStore::get_devices(set<string> *ls)
{
  if (bdev) {
    bdev->get_devices(ls);
    if (bluefs) {
      bluefs->get_devices(ls);
    }
    return 0;
  }

  // grumble, we haven't started up yet.
  if (int r = _open_path(); r < 0) {
    return r;
  }
  auto close_path = make_scope_guard([&] {
    _close_path();
  });
  if (int r = _open_fsid(false); r < 0) {
    return r;
  }
  auto close_fsid = make_scope_guard([&] {
    _close_fsid();
  });
  if (int r = _read_fsid(&fsid); r < 0) {
    return r;
  }
  if (int r = _lock_fsid(); r < 0) {
    return r;
  }
  if (int r = _open_bdev(false); r < 0) {
    return r;
  }
  auto close_bdev = make_scope_guard([&] {
    _close_bdev();
  });
  if (int r = _minimal_open_bluefs(false); r < 0) {
    return r;
  }
  bdev->get_devices(ls);
  if (bluefs) {
    bluefs->get_devices(ls);
  }
  _minimal_close_bluefs();
  return 0;
}

void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
{
  buf->reset();

  auto prefix = per_pool_omap == OMAP_BULK ?
    PREFIX_OMAP :
    per_pool_omap == OMAP_PER_POOL ?
    PREFIX_PERPOOL_OMAP :
    PREFIX_PERPG_OMAP;
  buf->omap_allocated =
    db->estimate_prefix_size(prefix, string());

  uint64_t bfree = alloc->get_free();

  if (bluefs) {
    buf->internally_reserved = 0;
    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
    // call any non-omap bluefs space "internal metadata"
    buf->internal_metadata =
      bluefs->get_used()
      - buf->omap_allocated;
  }

  uint64_t thin_total, thin_avail;
  if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
    buf->total += thin_total;

    // we are limited by both the size of the virtual device and the
    // underlying physical device.
    bfree = std::min(bfree, thin_avail);

    buf->allocated = thin_total - thin_avail;
  } else {
    buf->total += bdev->get_size();
  }
  buf->available = bfree;
}

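// Accounting note (a hedged sketch of the invariants assumed above):
//
//   total             = main device size (+ dedicated DB device, if separate)
//   available         = allocator free space, clamped by a thin-provisioned
//                       backing device when one is present
//   internal_metadata = bluefs used space minus omap_allocated
//
// i.e. omap data is carved out of bluefs usage first, and whatever bluefs
// space remains is reported as "internal metadata".
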
int BlueStore::statfs(struct store_statfs_t *buf,
                      osd_alert_list_t* alerts)
{
  if (alerts) {
    alerts->clear();
    _log_alerts(*alerts);
  }
  _get_statfs_overall(buf);
  {
    std::lock_guard l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->data_stored = vstatfs.stored();
    buf->data_compressed = vstatfs.compressed();
    buf->data_compressed_original = vstatfs.compressed_original();
    buf->data_compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << " " << *buf << dendl;
  return 0;
}

int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                           bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id << dendl;

  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }
  buf->reset();

  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }

  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  *out_per_pool_omap = per_pool_omap != OMAP_BULK;
  // stop calls after db was closed
  if (*out_per_pool_omap && db) {
    auto prefix = per_pool_omap == OMAP_PER_POOL ?
      PREFIX_PERPOOL_OMAP :
      PREFIX_PERPG_OMAP;
    buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
  }

  dout(10) << __func__ << " " << *buf << dendl;
  return 0;
}

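// Key-layout sketch (hedged): per-pool/per-pg omap keys begin with the
// encoded pool id produced by _key_encode_u64() above, so
// estimate_prefix_size(prefix, key_prefix) only counts keys belonging to
// this pool. Conceptually:
//
//   string key_prefix;
//   _key_encode_u64(pool_id, &key_prefix);  // fixed-width pool id prefix
//   // keys then look like: <pool_id><...per-object omap suffix...>
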
void BlueStore::_check_legacy_statfs_alert()
{
  string s;
  if (!per_pool_stat_collection &&
      cct->_conf->bluestore_warn_on_legacy_statfs) {
    s = "legacy statfs reporting detected, "
        "suggest to run store repair to get consistent statistic reports";
  }
  std::lock_guard l(qlock);
  legacy_statfs_alert = s;
}

void BlueStore::_check_no_per_pg_or_pool_omap_alert()
{
  string per_pg, per_pool;
  if (per_pool_omap != OMAP_PER_PG) {
    if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
      per_pg = "legacy (not per-pg) omap detected, "
        "suggest to run store repair to benefit from faster PG removal";
    }
    if (per_pool_omap != OMAP_PER_POOL) {
      if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
        per_pool = "legacy (not per-pool) omap detected, "
          "suggest to run store repair to benefit from per-pool omap usage statistics";
      }
    }
  }
  std::lock_guard l(qlock);
  no_per_pg_omap_alert = per_pg;
  no_per_pool_omap_alert = per_pool;
}

// ---------------
// cache

BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  std::shared_lock l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}

BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
{
  std::shared_lock l(coll_lock);

  // FIXME: we must replace this with something more efficient
  for (auto& i : coll_map) {
    spg_t spgid;
    if (i.first.is_pg(&spgid) &&
        i.second->contains(oid)) {
      return i.second;
    }
  }
  return CollectionRef();
}

void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this run in the same thread,
  // so no lock is needed.
  removed_collections.push_back(c);
}

void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this run in the same thread,
    // so no lock is needed.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](Onode* o) {
          ceph_assert(!o->exists);
          if (o->flushing_count.load()) {
            dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
                     << " flush_txns " << o->flushing_count << dendl;
            return true;
          }
          return false;
        })) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}

void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_pinned_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : onode_cache_shards) {
    c->add_stats(&num_onodes, &num_pinned_onodes);
  }
  for (auto c : buffer_cache_shards) {
    c->add_stats(&num_extents, &num_blobs,
                 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}

// ---------------
// read operations

ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

ObjectStore::CollectionHandle BlueStore::create_new_collection(
  const coll_t& cid)
{
  std::unique_lock l{coll_lock};
  auto c = ceph::make_ref<Collection>(
    this,
    onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
    buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
    cid);
  new_coll_map[cid] = c;
  _osr_attach(c.get());
  return c;
}

void BlueStore::set_collection_commit_queue(
  const coll_t& cid,
  ContextQueue *commit_queue)
{
  if (commit_queue) {
    std::shared_lock l(coll_lock);
    if (coll_map.count(cid)) {
      coll_map[cid]->commit_queue = commit_queue;
    } else if (new_coll_map.count(cid)) {
      new_coll_map[cid]->commit_queue = commit_queue;
    }
  }
}

bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}

int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}

int BlueStore::set_collection_opts(
  CollectionHandle& ch,
  const pool_opts_t& opts)
{
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  std::unique_lock l{c->lock};
  c->pool_opts = opts;
  return 0;
}

int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}

void BlueStore::_read_cache(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  int read_cache_policy,
  ready_regions_t& ready_regions,
  blobs2read_t& blobs2read)
{
  // build blob-wise list of stuff to read (that isn't cached)
  unsigned left = length;
  uint64_t pos = offset;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos] = std::move(pc->second);
        dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          ceph_assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        // merge regions
        {
          uint64_t r_off = b_off;
          uint64_t r_len = l;
          uint64_t front = r_off % chunk_size;
          if (front) {
            r_off -= front;
            r_len += front;
          }
          unsigned tail = r_len % chunk_size;
          if (tail) {
            r_len += chunk_size - tail;
          }
          bool merged = false;
          regions2read_t& r2r = blobs2read[bptr];
          if (r2r.size()) {
            read_req_t& pre = r2r.back();
            if (r_off <= (pre.r_off + pre.r_len)) {
              front += (r_off - pre.r_off);
              pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
              pre.regs.emplace_back(region_t(pos, b_off, l, front));
              merged = true;
            }
          }
          if (!merged) {
            read_req_t req(r_off, r_len);
            req.regs.emplace_back(region_t(pos, b_off, l, front));
            r2r.emplace_back(std::move(req));
          }
        }
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
}

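// Alignment sketch (hedged): the "merge regions" block above rounds every
// cache miss out to the blob's checksum/compression chunk size before
// queueing the device read:
//
//   front = b_off % chunk_size
//   r_off = b_off - front                       (round down)
//   r_len = (front + l) rounded up to chunk_size
//
// e.g. with chunk_size = 4096, a miss at b_off 0x1800 of length 0x400
// becomes a device read of 0x1000~0x1000; region_t remembers 'front' so the
// caller can later carve the requested 0x400 bytes out of the padded buffer.
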
int BlueStore::_prepare_read_ioc(
  blobs2read_t& blobs2read,
  vector<bufferlist>* compressed_blob_bls,
  IOContext* ioc)
{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << " blob " << *bptr << " need "
             << r2r << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls->empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls->reserve(blobs2read.size());
      }
      compressed_blob_bls->push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls->back();
      auto r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          int r = bdev->aio_read(offset, length, &bl, ioc);
          if (r < 0)
            return r;
          return 0;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        ceph_assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& req : r2r) {
        dout(20) << __func__ << " region 0x" << std::hex
                 << req.regs.front().logical_offset
                 << ": 0x" << req.regs.front().blob_xoffset
                 << " reading 0x" << req.r_off
                 << "~" << req.r_len << std::dec
                 << dendl;

        // read it
        auto r = bptr->get_blob().map(
          req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r = bdev->aio_read(offset, length, &req.bl, ioc);
            if (r < 0)
              return r;
            return 0;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          ceph_assert(r == 0);
        }
        ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
  return 0;
}

int BlueStore::_generate_read_result_bl(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  ready_regions_t& ready_regions,
  vector<bufferlist>& compressed_blob_bls,
  blobs2read_t& blobs2read,
  bool buffered,
  bool* csum_error,
  bufferlist& bl)
{
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << " blob " << *bptr << " need "
             << r2r << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       r2r.front().regs.front().logical_offset) < 0) {
        *csum_error = true;
        return -EIO;
      }
      bufferlist raw_bl;
      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& req : r2r) {
        for (auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(
            raw_bl, r.blob_xoffset, r.length);
        }
      }
    } else {
      for (auto& req : r2r) {
        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
                         req.regs.front().logical_offset) < 0) {
          *csum_error = true;
          return -EIO;
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         req.r_off, req.bl);
        }

        // prune and keep result
        for (const auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
        }
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        ceph_assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
  return 0;
}

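// Assembly sketch (hedged): ready_regions is keyed by logical offset, so the
// loop above walks [offset, offset + length) and either claims a prepared
// buffer (cache hit, decompressed blob, or pruned raw read) or fills the gap
// with zeros for unwritten ranges. The three asserts encode the
// postcondition bl.length() == length: every byte is covered exactly once.
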
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  FUNCTRACE(cct);
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  auto start = mono_clock::now();
  o->extent_map.fault_range(db, offset, length);
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build blob-wise list of stuff to read (that isn't cached)
  ready_regions_t ready_regions;
  blobs2read_t blobs2read;
  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

  // read raw blob data.
  // For the sake of simplicity measure the whole block below;
  // the resulting error is negligible.
  start = mono_clock::now();
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
  // we always issue aio for reading, so errors other than EIO are not allowed
  if (r < 0)
    return r;

  int64_t num_ios = blobs2read.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  bool csum_error = false;
  r = _generate_read_result_bl(o, offset, length, ready_regions,
                               compressed_blob_bls, blobs2read,
                               buffered && !ioc.skip_cache(),
                               &csum_error, bl);
  if (csum_error) {
    // Handles spurious read errors caused by a kernel bug.
    // We sometimes get all-zero pages as a result of the read under
    // high memory pressure. Retrying the failing read succeeds in most
    // cases.
    // See also: http://tracker.ceph.com/issues/22464
    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
      return -EIO;
    }
    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
            << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
    stringstream s;
    s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
    _set_spurious_read_errors_alert(s.str());
  }
  return r;
}

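// Read-path sketch (hedged summary of the stages above):
//   1. _read_cache()             split [offset, length) into cache hits and
//                                per-blob read requests (chunk aligned)
//   2. _prepare_read_ioc()       queue aio for compressed blobs (whole blob)
//                                and for each uncached region
//   3. aio_submit + aio_wait     only -EIO may surface from the device
//   4. _generate_read_result_bl  verify csums, decompress, stitch result
// A checksum failure restarts the whole pipeline up to
// bluestore_retry_disk_reads times before returning -EIO.
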
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  auto start = mono_clock::now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  log_latency(__func__,
    l_bluestore_csum_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  if (cct->_conf->bluestore_ignore_data_csum) {
    return 0;
  }
  return r;
}

int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  auto start = mono_clock::now();
  auto i = source.cbegin();
  bluestore_compression_header_t chdr;
  decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if the decompressor isn't available we must fail: without it we
    // cannot return the decompressed data.
    const char* alg_name = Compressor::get_comp_alg_name(alg);
    derr << __func__ << " can't load decompressor " << alg_name << dendl;
    _set_compression_alert(false, alg_name);
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  log_latency(__func__,
    l_bluestore_decompress_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}

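// Format sketch (hedged): a compressed blob starts with an encoded
// bluestore_compression_header_t, so decoding proceeds roughly as:
//
//   auto i = source.cbegin();
//   bluestore_compression_header_t chdr;
//   decode(chdr, i);          // consumes the header
//   // chdr.type   -> compression algorithm to instantiate
//   // chdr.length -> size of the compressed payload that follows
//
// which is why a missing decompressor is a hard -EIO: without the plugin
// the payload after the header cannot be interpreted at all.
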
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    _dump_onode<30>(cct, *o);

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = std::min(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}

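// Semantics sketch (hedged): _fiemap() reports which parts of
// [offset, offset + length) are backed by logical extents; holes are simply
// skipped. E.g. for an object with data at 0x0~0x1000 and 0x3000~0x1000,
// _fiemap(c, oid, 0, 0x4000, dest) would leave dest = {0x0~0x1000,
// 0x3000~0x1000}. The two public fiemap() overloads below only differ in
// how they hand this interval set back to the caller.
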
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    destmap = std::move(m).detach();
  }
  return r;
}

int BlueStore::readv(
  CollectionHandle &c_,
  const ghobject_t& oid,
  interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " fiemap " << m
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (m.empty()) {
      r = 0;
      goto out;
    }

    r = _do_readv(c, o, m, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " fiemap " << m << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}

int BlueStore::_do_readv(
  Collection *c,
  OnodeRef o,
  const interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  FUNCTRACE(cct);
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " fiemap " << m << std::hex
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }
  // this method must be idempotent since we may call it several times
  // before we finally read the expected result.
  bl.clear();

  // call fiemap first!
  ceph_assert(m.range_start() <= o->onode.size);
  ceph_assert(m.range_end() <= o->onode.size);
  auto start = mono_clock::now();
  o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
  raw_results.reserve(m.num_intervals());
  int i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    raw_results.push_back({});
    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
                std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
    r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
    // we always issue aio for reading, so errors other than EIO are not allowed
    if (r < 0)
      return r;
  }

  auto num_ios = m.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
  i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    bool csum_error = false;
    bufferlist t;
    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
                                 std::get<0>(raw_results[i]),
                                 std::get<1>(raw_results[i]),
                                 std::get<2>(raw_results[i]),
                                 buffered, &csum_error, t);
    if (csum_error) {
      // Handles spurious read errors caused by a kernel bug.
      // We sometimes get all-zero pages as a result of the read under
      // high memory pressure. Retrying the failing read succeeds in most
      // cases.
      // See also: http://tracker.ceph.com/issues/22464
      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
        return -EIO;
      }
      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
    }
    bl.claim_append(t);
  }
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read fiemap " << m
            << " failed " << retry_count << " times before succeeding"
            << dendl;
  }
  return bl.length();
}

int BlueStore::dump_onode(CollectionHandle &c_,
                          const ghobject_t& oid,
                          const string& section_name,
                          Formatter *f)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    // FIXME minor: actually the next line isn't enough to
    // load shared blobs. Leaving as is for now..
    //
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);

    _dump_onode<0>(cct, *o);
    f->open_object_section(section_name.c_str());
    o->dump(f);
    f->close_section();
    r = 0;
  }
 out:
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}

int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);
    mempool::bluestore_cache_meta::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}

int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr,less<>>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}

int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}

int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection*>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}

int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, false, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}

int BlueStore::collection_list_legacy(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, true, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}

int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  ghobject_t static_next;
  std::unique_ptr<CollectionListIterator> it;
  ghobject_t coll_range_temp_start, coll_range_temp_end;
  ghobject_t coll_range_start, coll_range_end;
  ghobject_t pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  auto log_latency = make_scope_guard(
    [&, start_time = mono_clock::now(), func_name = __func__] {
    log_latency_fn(
      func_name,
      l_bluestore_remove_lat,
      mono_clock::now() - start_time,
      cct->_conf->bluestore_log_collection_list_age,
      [&](const ceph::timespan& lat) {
        ostringstream ostr;
        ostr << ", lat = " << timespan_str(lat)
             << " cid =" << c->cid
             << " start " << start << " end " << end
             << " max " << max;
        return ostr.str();
      });
  });

  if (start.is_max() || start.hobj.is_max()) {
    *pnext = ghobject_t::get_max();
    return 0;
  }
  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
                 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
  dout(20) << __func__
           << " range " << coll_range_temp_start
           << " to " << coll_range_temp_end
           << " and " << coll_range_start
           << " to " << coll_range_end
           << " start " << start << dendl;
  if (legacy) {
    it = std::make_unique<SimpleCollectionListIterator>(
      cct, db->get_iterator(PREFIX_OBJ));
  } else {
    it = std::make_unique<SortedCollectionListIterator>(
      db->get_iterator(PREFIX_OBJ));
  }
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(coll_range_temp_start);
    temp = true;
  } else {
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
    } else {
      temp = false;
      ceph_assert(start >= coll_range_start && start < coll_range_end);
    }
    dout(20) << __func__ << " temp=" << (int)temp << dendl;
    it->lower_bound(start);
  }
  if (end.hobj.is_max()) {
    pend = temp ? coll_range_temp_end : coll_range_end;
  } else {
    if (end.hobj.is_temp()) {
      if (temp) {
        pend = end;
      } else {
        *pnext = ghobject_t::get_max();
        return 0;
      }
    } else {
      pend = temp ? coll_range_temp_end : end;
    }
  }
  dout(20) << __func__ << " pend " << pend << dendl;
  while (true) {
    if (!it->valid() || it->is_ge(pend)) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          if (it->valid() && it->is_lt(coll_range_temp_end)) {
            *pnext = it->oid();
            return 0;
          }
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(coll_range_start);
        if (end.hobj.is_max())
          pend = coll_range_end;
        else
          pend = end;
        dout(30) << __func__ << " pend " << pend << dendl;
        continue;
      }
      if (it->valid() && it->is_lt(coll_range_end)) {
        *pnext = it->oid();
        return 0;
      }
      break;
    }
    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = it->oid();
      return 0;
    }
    ls->push_back(it->oid());
    it->next();
  }
  *pnext = ghobject_t::get_max();
  return 0;
}

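// Iteration sketch (hedged): a collection's objects live in two key ranges,
// the temp namespace and the regular one. _collection_list() above always
// starts in the temp range (unless 'start' says otherwise) and, on hitting
// its end, re-seeds the iterator at coll_range_start to continue with
// regular objects, so callers see a single ordered stream:
//
//   [coll_range_temp_start, coll_range_temp_end) then
//   [coll_range_start,      coll_range_end)
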
int BlueStore::omap_get(
  CollectionHandle &c_,          ///< [in] Collection containing oid
  const ghobject_t &oid,         ///< [in] Object containing omap
  bufferlist *header,            ///< [out] omap header
  map<string, bufferlist> *out   ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}

int BlueStore::_omap_get(
  Collection *c,                 ///< [in] Collection containing oid
  const ghobject_t &oid,         ///< [in] Object containing omap
  bufferlist *header,            ///< [out] omap header
  map<string, bufferlist> *out   ///< [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  r = _onode_omap_get(o, header, out);
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_onode_omap_get(
  const OnodeRef &o,             ///< [in] Object containing omap
  bufferlist *header,            ///< [out] omap header
  map<string, bufferlist> *out   ///< [out] Key to value map
)
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  return r;
}

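// Key-layout sketch (hedged): an onode's omap entries live in one KV prefix
// (bulk, per-pool or per-pg) under keys derived from the onode:
//
//   o->get_omap_header(&head);   // header sentinel key
//   o->get_omap_key("", &base);  // common key prefix for user keys
//   o->get_omap_tail(&tail);     // exclusive upper bound
//
// so _onode_omap_get() above can scan [head, tail) with a single iterator
// and strip the prefix via decode_omap_key() to recover the user-visible
// key.
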
int BlueStore::omap_get_header(
  CollectionHandle &c_,          ///< [in] Collection containing oid
  const ghobject_t &oid,         ///< [in] Object containing omap
  bufferlist *header,            ///< [out] omap header
  bool allow_eio                 ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    o->get_omap_header(&head);
    if (db->get(o->get_omap_prefix(), head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::omap_get_keys(
  CollectionHandle &c_,          ///< [in] Collection containing oid
  const ghobject_t &oid,         ///< [in] Object containing omap
  set<string> *keys              ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  auto start1 = mono_clock::now();
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    string head, tail;
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      o->decode_omap_key(it->key(), &user_key);
      dout(20) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_keys_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::omap_get_values(
  CollectionHandle &c_,          ///< [in] Collection containing oid
  const ghobject_t &oid,         ///< [in] Object containing omap
  const set<string> &keys,       ///< [in] Keys to get
  map<string, bufferlist> *out   ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " got " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(make_pair(*p, val));
      }
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_values_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

9f95a23c
TL
12041#ifdef WITH_SEASTAR
12042int BlueStore::omap_get_values(
12043 CollectionHandle &c_, ///< [in] Collection containing oid
12044 const ghobject_t &oid, ///< [in] Object containing omap
12045 const std::optional<string> &start_after, ///< [in] return only keys after this one
12046 map<string, bufferlist> *output ///< [out] Returned keys and values
12047 )
12048{
12049 Collection *c = static_cast<Collection *>(c_.get());
12050 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12051 if (!c->exists)
12052 return -ENOENT;
12053 std::shared_lock l(c->lock);
12054 int r = 0;
12055 OnodeRef o = c->get_onode(oid, false);
12056 if (!o || !o->exists) {
12057 r = -ENOENT;
12058 goto out;
12059 }
12060 if (!o->onode.has_omap()) {
12061 goto out;
12062 }
12063 o->flush();
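  // unlike the set-based variant above, this returns every key strictly
  // greater than start_after (upper_bound excludes the key itself); note
  // that start_after is dereferenced unconditionally, so callers are
  // expected to pass an engaged optional.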
12064 {
12065 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
12066 if (!iter) {
12067 r = -ENOENT;
12068 goto out;
12069 }
12070 iter->upper_bound(*start_after);
12071 for (; iter->valid(); iter->next()) {
12072 output->insert(make_pair(iter->key(), iter->value()));
12073 }
12074 }
12075
12076out:
12077 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12078 << dendl;
12079 return r;
12080}
12081#endif
12082
7c673cae
FG
12083int BlueStore::omap_check_keys(
12084 CollectionHandle &c_, ///< [in] Collection containing oid
12085 const ghobject_t &oid, ///< [in] Object containing omap
12086 const set<string> &keys, ///< [in] Keys to check
12087 set<string> *out ///< [out] Subset of keys defined on oid
12088 )
12089{
12090 Collection *c = static_cast<Collection *>(c_.get());
12091 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12092 if (!c->exists)
12093 return -ENOENT;
9f95a23c 12094 std::shared_lock l(c->lock);
7c673cae
FG
12095 int r = 0;
12096 string final_key;
12097 OnodeRef o = c->get_onode(oid, false);
12098 if (!o || !o->exists) {
12099 r = -ENOENT;
12100 goto out;
12101 }
9f95a23c 12102 if (!o->onode.has_omap()) {
7c673cae 12103 goto out;
9f95a23c
TL
12104 }
12105 o->flush();
11fdf7f2 12106 {
9f95a23c
TL
12107 const string& prefix = o->get_omap_prefix();
12108 o->get_omap_key(string(), &final_key);
12109 size_t base_key_len = final_key.size();
11fdf7f2 12110 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 12111 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
12112 final_key += *p;
12113 bufferlist val;
12114 if (db->get(prefix, final_key, &val) >= 0) {
12115 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
12116 << " -> " << *p << dendl;
12117 out->insert(*p);
12118 } else {
12119 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
12120 << " -> " << *p << dendl;
12121 }
7c673cae
FG
12122 }
12123 }
12124 out:
12125 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12126 << dendl;
12127 return r;
12128}
12129
7c673cae
FG
12130ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
12131 CollectionHandle &c_, ///< [in] collection
12132 const ghobject_t &oid ///< [in] object
12133 )
12134{
12135 Collection *c = static_cast<Collection *>(c_.get());
12136 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
12137 if (!c->exists) {
12138 return ObjectMap::ObjectMapIterator();
12139 }
9f95a23c 12140 std::shared_lock l(c->lock);
7c673cae
FG
12141 OnodeRef o = c->get_onode(oid, false);
12142 if (!o || !o->exists) {
12143 dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
12144 return ObjectMap::ObjectMapIterator();
12145 }
12146 o->flush();
12147 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
33c7a0ef
TL
12148 auto bounds = KeyValueDB::IteratorBounds();
12149 if (o->onode.has_omap()) {
12150 std::string lower_bound, upper_bound;
12151 o->get_omap_key(string(), &lower_bound);
12152 o->get_omap_tail(&upper_bound);
12153 bounds.lower_bound = std::move(lower_bound);
12154 bounds.upper_bound = std::move(upper_bound);
12155 }
12156 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
7c673cae
FG
12157 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
12158}
12159
12160// -----------------
12161// write helpers
12162
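// the first SUPER_RESERVED bytes of the device are kept out of the
// allocator (they hold the bdev label and superblock area); the
// reservation is rounded up to min_alloc_size so the usable region stays
// aligned to the allocation unit.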
11fdf7f2 12163uint64_t BlueStore::_get_ondisk_reserved() const {
f67539c2 12164 ceph_assert(min_alloc_size);
11fdf7f2
TL
12165 return round_up_to(
12166 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
12167}
12168
7c673cae
FG
12169void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
12170{
12171 dout(10) << __func__ << " ondisk_format " << ondisk_format
12172 << " min_compat_ondisk_format " << min_compat_ondisk_format
12173 << dendl;
11fdf7f2 12174 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
12175 {
12176 bufferlist bl;
11fdf7f2 12177 encode(ondisk_format, bl);
7c673cae
FG
12178 t->set(PREFIX_SUPER, "ondisk_format", bl);
12179 }
12180 {
12181 bufferlist bl;
11fdf7f2 12182 encode(min_compat_ondisk_format, bl);
7c673cae
FG
12183 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
12184 }
12185}
12186
12187int BlueStore::_open_super_meta()
12188{
12189 // nid
12190 {
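    // nid_max is the persisted high-water mark: ids below it were reserved
    // in batches (see bluestore_nid_prealloc in _kv_sync_thread), so after
    // a restart nid_last can resume from nid_max without ever reusing an
    // id. blobid below follows the same scheme.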
12191 nid_max = 0;
12192 bufferlist bl;
12193 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 12194 auto p = bl.cbegin();
7c673cae
FG
12195 try {
12196 uint64_t v;
11fdf7f2 12197 decode(v, p);
7c673cae 12198 nid_max = v;
f67539c2 12199 } catch (ceph::buffer::error& e) {
7c673cae
FG
12200 derr << __func__ << " unable to read nid_max" << dendl;
12201 return -EIO;
12202 }
f67539c2 12203 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
7c673cae
FG
12204 nid_last = nid_max.load();
12205 }
12206
12207 // blobid
12208 {
12209 blobid_max = 0;
12210 bufferlist bl;
12211 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 12212 auto p = bl.cbegin();
7c673cae
FG
12213 try {
12214 uint64_t v;
11fdf7f2 12215 decode(v, p);
7c673cae 12216 blobid_max = v;
f67539c2 12217 } catch (ceph::buffer::error& e) {
7c673cae
FG
12218 derr << __func__ << " unable to read blobid_max" << dendl;
12219 return -EIO;
12220 }
f67539c2 12221 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
7c673cae
FG
12222 blobid_last = blobid_max.load();
12223 }
12224
12225 // freelist
12226 {
12227 bufferlist bl;
12228 db->get(PREFIX_SUPER, "freelist_type", &bl);
12229 if (bl.length()) {
12230 freelist_type = std::string(bl.c_str(), bl.length());
7c673cae 12231 } else {
11fdf7f2 12232 ceph_abort_msg("extent freelist manager not supported");
7c673cae 12233 }
20effc67 12234 dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
7c673cae 12235 }
7c673cae
FG
12236 // ondisk format
12237 int32_t compat_ondisk_format = 0;
12238 {
12239 bufferlist bl;
12240 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
12241 if (r < 0) {
12242 // base case: kraken bluestore is v1 and readable by v1
12243 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
12244 << dendl;
12245 ondisk_format = 1;
12246 compat_ondisk_format = 1;
12247 } else {
11fdf7f2 12248 auto p = bl.cbegin();
7c673cae 12249 try {
11fdf7f2 12250 decode(ondisk_format, p);
f67539c2 12251 } catch (ceph::buffer::error& e) {
7c673cae
FG
12252 derr << __func__ << " unable to read ondisk_format" << dendl;
12253 return -EIO;
12254 }
12255 bl.clear();
12256 {
12257 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
12258 ceph_assert(!r);
12259 auto p = bl.cbegin();
7c673cae 12260 try {
11fdf7f2 12261 decode(compat_ondisk_format, p);
f67539c2 12262 } catch (ceph::buffer::error& e) {
7c673cae
FG
12263 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
12264 return -EIO;
12265 }
12266 }
12267 }
f67539c2 12268 dout(1) << __func__ << " ondisk_format " << ondisk_format
7c673cae
FG
12269 << " compat_ondisk_format " << compat_ondisk_format
12270 << dendl;
12271 }
12272
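  // compat_ondisk_format is the oldest format a reader must understand to
  // open this store; refuse (EPERM) if this binary's latest understood
  // format is older than that.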
12273 if (latest_ondisk_format < compat_ondisk_format) {
12274 derr << __func__ << " compat_ondisk_format is "
12275 << compat_ondisk_format << " but we only understand version "
12276 << latest_ondisk_format << dendl;
12277 return -EPERM;
12278 }
7c673cae
FG
12279
12280 {
12281 bufferlist bl;
12282 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 12283 auto p = bl.cbegin();
7c673cae
FG
12284 try {
12285 uint64_t val;
11fdf7f2 12286 decode(val, p);
7c673cae 12287 min_alloc_size = val;
224ce89b 12288 min_alloc_size_order = ctz(val);
20effc67
TL
12289 min_alloc_size_mask = min_alloc_size - 1;
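      // min_alloc_size must be a power of two (asserted below): order is
      // its log2 via count-trailing-zeros and mask enables cheap alignment
      // math, e.g. 0x1000 -> order 12, mask 0xfff.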
12290
11fdf7f2 12291 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
f67539c2 12292 } catch (ceph::buffer::error& e) {
7c673cae
FG
12293 derr << __func__ << " unable to read min_alloc_size" << dendl;
12294 return -EIO;
12295 }
f67539c2 12296 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7c673cae 12297 << std::dec << dendl;
20effc67
TL
12298 logger->set(l_bluestore_alloc_unit, min_alloc_size);
12299 }
12300
12301 // smr fields
12302 {
12303 bufferlist bl;
12304 int r = db->get(PREFIX_SUPER, "zone_size", &bl);
12305 if (r >= 0) {
12306 auto p = bl.cbegin();
12307 decode(zone_size, p);
12308 dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
12309 ceph_assert(bdev->is_smr());
12310 } else {
12311 ceph_assert(!bdev->is_smr());
12312 }
12313 }
12314 {
12315 bufferlist bl;
12316 int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
12317 if (r >= 0) {
12318 auto p = bl.cbegin();
12319 decode(first_sequential_zone, p);
12320 dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
12321 << first_sequential_zone << std::dec << dendl;
12322 ceph_assert(bdev->is_smr());
12323 } else {
12324 ceph_assert(!bdev->is_smr());
12325 }
7c673cae 12326 }
9f95a23c
TL
12327
12328 _set_per_pool_omap();
12329
224ce89b 12330 _open_statfs();
7c673cae
FG
12331 _set_alloc_sizes();
12332 _set_throttle_params();
12333
12334 _set_csum();
12335 _set_compression();
12336 _set_blob_size();
12337
11fdf7f2 12338 _validate_bdev();
7c673cae
FG
12339 return 0;
12340}
12341
12342int BlueStore::_upgrade_super()
12343{
12344 dout(1) << __func__ << " from " << ondisk_format << ", latest "
12345 << latest_ondisk_format << dendl;
11fdf7f2
TL
12346 if (ondisk_format < latest_ondisk_format) {
12347 ceph_assert(ondisk_format > 0);
12348 ceph_assert(ondisk_format < latest_ondisk_format);
12349
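    // upgrades form a ladder: each block migrates format n to n+1 and
    // falls through to the next check, so an old store walks all the way
    // to latest_ondisk_format in one pass, committed atomically by the
    // single sync transaction below.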
1911f103 12350 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
12351 if (ondisk_format == 1) {
12352 // changes:
12353 // - super: added ondisk_format
12354 // - super: added min_readable_ondisk_format
12355 // - super: added min_compat_ondisk_format
12356 // - super: added min_alloc_size
12357 // - super: removed min_min_alloc_size
11fdf7f2
TL
12358 {
12359 bufferlist bl;
12360 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
12361 auto p = bl.cbegin();
12362 try {
12363 uint64_t val;
12364 decode(val, p);
12365 min_alloc_size = val;
f67539c2 12366 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
12367 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
12368 return -EIO;
12369 }
12370 t->set(PREFIX_SUPER, "min_alloc_size", bl);
12371 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 12372 }
11fdf7f2 12373 ondisk_format = 2;
7c673cae 12374 }
9f95a23c
TL
12375 if (ondisk_format == 2) {
12376 // changes:
f67539c2
TL
12377 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
12378 // onodes are using the per-pool prefix until a repair is run; at that
9f95a23c
TL
12379 // point the per_pool_omap=1 key will be set.
12380 // - super: added per_pool_omap key, which indicates that *all* objects
12381 // are using the new prefix and key format
12382 ondisk_format = 3;
1911f103
TL
12383 }
12384 if (ondisk_format == 3) {
12385 // changes:
12386 // - FreelistManager keeps meta within bdev label
12387 int r = _write_out_fm_meta(0);
9f95a23c 12388 ceph_assert(r == 0);
1911f103 12389 ondisk_format = 4;
9f95a23c 12390 }
1911f103
TL
12391 // This must be the last operation
12392 _prepare_ondisk_format_super(t);
12393 int r = db->submit_transaction_sync(t);
12394 ceph_assert(r == 0);
7c673cae 12395 }
7c673cae
FG
12396 // done
12397 dout(1) << __func__ << " done" << dendl;
12398 return 0;
12399}
12400
12401void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
12402{
224ce89b 12403 if (o->onode.nid) {
11fdf7f2 12404 ceph_assert(o->exists);
7c673cae 12405 return;
224ce89b 12406 }
7c673cae
FG
12407 uint64_t nid = ++nid_last;
12408 dout(20) << __func__ << " " << nid << dendl;
12409 o->onode.nid = nid;
12410 txc->last_nid = nid;
224ce89b 12411 o->exists = true;
7c673cae
FG
12412}
12413
12414uint64_t BlueStore::_assign_blobid(TransContext *txc)
12415{
12416 uint64_t bid = ++blobid_last;
12417 dout(20) << __func__ << " " << bid << dendl;
12418 txc->last_blobid = bid;
12419 return bid;
12420}
12421
12422void BlueStore::get_db_statistics(Formatter *f)
12423{
12424 db->get_statistics(f);
12425}
12426
11fdf7f2
TL
12427BlueStore::TransContext *BlueStore::_txc_create(
12428 Collection *c, OpSequencer *osr,
f67539c2
TL
12429 list<Context*> *on_commits,
12430 TrackedOpRef osd_op)
7c673cae 12431{
11fdf7f2 12432 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae 12433 txc->t = db->get_transaction();
f67539c2
TL
12434
12435#ifdef WITH_BLKIN
12436 if (osd_op && osd_op->pg_trace) {
12437 txc->trace.init("TransContext", &trace_endpoint,
12438 &osd_op->pg_trace);
12439 txc->trace.event("txc create");
12440 txc->trace.keyval("txc seq", txc->seq);
12441 }
12442#endif
12443
7c673cae
FG
12444 osr->queue_new(txc);
12445 dout(20) << __func__ << " osr " << osr << " = " << txc
12446 << " seq " << txc->seq << dendl;
12447 return txc;
12448}
12449
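// cost = (data aios + 1 io for the kv commit) * throttle_cost_per_io +
// bytes written. illustrative arithmetic (hypothetical numbers): 3 aios
// and 4096 bytes at throttle_cost_per_io = 4000 gives
// (3 + 1) * 4000 + 4096 = 20096.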
12450void BlueStore::_txc_calc_cost(TransContext *txc)
12451{
11fdf7f2
TL
12452 // one "io" for the kv commit
12453 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
12454 auto cost = throttle_cost_per_io.load();
12455 txc->cost = ios * cost + txc->bytes;
9f95a23c 12456 txc->ios = ios;
7c673cae
FG
12457 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
12458 << ios << " ios * " << cost << " + " << txc->bytes
12459 << " bytes)" << dendl;
12460}
12461
12462void BlueStore::_txc_update_store_statfs(TransContext *txc)
12463{
12464 if (txc->statfs_delta.is_empty())
12465 return;
12466
12467 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
12468 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
12469 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
12470 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
12471 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
12472
12473 bufferlist bl;
12474 txc->statfs_delta.encode(bl);
11fdf7f2
TL
12475 if (per_pool_stat_collection) {
12476 string key;
12477 get_pool_stat_key(txc->osd_pool_id, &key);
12478 txc->t->merge(PREFIX_STAT, key, bl);
12479
12480 std::lock_guard l(vstatfs_lock);
12481 auto& stats = osd_pools[txc->osd_pool_id];
12482 stats += txc->statfs_delta;
12483
12484 vstatfs += txc->statfs_delta; //non-persistent in this mode
12485
12486 } else {
12487 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 12488
11fdf7f2
TL
12489 std::lock_guard l(vstatfs_lock);
12490 vstatfs += txc->statfs_delta;
12491 }
7c673cae
FG
12492 txc->statfs_delta.reset();
12493}
12494
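// a txc advances through a linear state machine:
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE
// this proc is re-entered on each transition; states that must wait on io
// or the kv threads return early and are resumed by their completions.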
12495void BlueStore::_txc_state_proc(TransContext *txc)
12496{
12497 while (true) {
12498 dout(10) << __func__ << " txc " << txc
12499 << " " << txc->get_state_name() << dendl;
f67539c2 12500 switch (txc->get_state()) {
7c673cae 12501 case TransContext::STATE_PREPARE:
9f95a23c 12502 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae 12503 if (txc->ioc.has_pending_aios()) {
f67539c2
TL
12504 txc->set_state(TransContext::STATE_AIO_WAIT);
12505#ifdef WITH_BLKIN
12506 if (txc->trace) {
12507 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
12508 }
12509#endif
7c673cae
FG
12510 txc->had_ios = true;
12511 _txc_aio_submit(txc);
12512 return;
12513 }
12514 // ** fall-thru **
12515
12516 case TransContext::STATE_AIO_WAIT:
11fdf7f2 12517 {
9f95a23c
TL
12518 mono_clock::duration lat = throttle.log_state_latency(
12519 *txc, logger, l_bluestore_state_aio_wait_lat);
12520 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11fdf7f2
TL
12521 dout(0) << __func__ << " slow aio_wait, txc = " << txc
12522 << ", latency = " << lat
12523 << dendl;
12524 }
12525 }
12526
7c673cae
FG
12527 _txc_finish_io(txc); // may trigger blocked txc's too
12528 return;
12529
12530 case TransContext::STATE_IO_DONE:
11fdf7f2 12531 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
12532 if (txc->had_ios) {
12533 ++txc->osr->txc_with_unstable_io;
12534 }
9f95a23c 12535 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
f67539c2 12536 txc->set_state(TransContext::STATE_KV_QUEUED);
7c673cae
FG
12537 if (cct->_conf->bluestore_sync_submit_transaction) {
12538 if (txc->last_nid >= nid_max ||
12539 txc->last_blobid >= blobid_max) {
12540 dout(20) << __func__
12541 << " last_{nid,blobid} exceeds max, submit via kv thread"
12542 << dendl;
12543 } else if (txc->osr->kv_committing_serially) {
12544 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
12545 << dendl;
12546 // note: this is starvation-prone. once we have a txc in a busy
12547 // sequencer that is committing serially it is possible to keep
12548 // submitting new transactions fast enough that we get stuck doing
12549 // so. the alternative is to block here... fixme?
12550 } else if (txc->osr->txc_with_unstable_io) {
12551 dout(20) << __func__ << " prior txc(s) with unstable ios "
12552 << txc->osr->txc_with_unstable_io.load() << dendl;
12553 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
12554 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
12555 == 0) {
12556 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
12557 << dendl;
12558 } else {
9f95a23c 12559 _txc_apply_kv(txc, true);
7c673cae
FG
12560 }
12561 }
12562 {
11fdf7f2 12563 std::lock_guard l(kv_lock);
7c673cae 12564 kv_queue.push_back(txc);
9f95a23c
TL
12565 if (!kv_sync_in_progress) {
12566 kv_sync_in_progress = true;
12567 kv_cond.notify_one();
12568 }
f67539c2 12569 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
7c673cae
FG
12570 kv_queue_unsubmitted.push_back(txc);
12571 ++txc->osr->kv_committing_serially;
12572 }
31f18b77
FG
12573 if (txc->had_ios)
12574 kv_ios++;
12575 kv_throttle_costs += txc->cost;
7c673cae
FG
12576 }
12577 return;
12578 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
12579 _txc_committed_kv(txc);
12580 // ** fall-thru **
12581
12582 case TransContext::STATE_KV_DONE:
9f95a23c 12583 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae 12584 if (txc->deferred_txn) {
f67539c2 12585 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
7c673cae
FG
12586 _deferred_queue(txc);
12587 return;
12588 }
f67539c2 12589 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
12590 break;
12591
12592 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 12593 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
f67539c2 12594 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
12595 // ** fall-thru **
12596
12597 case TransContext::STATE_FINISHING:
9f95a23c 12598 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
7c673cae
FG
12599 _txc_finish(txc);
12600 return;
12601
12602 default:
12603 derr << __func__ << " unexpected txc " << txc
12604 << " state " << txc->get_state_name() << dendl;
11fdf7f2 12605 ceph_abort_msg("unexpected txc state");
7c673cae
FG
12606 return;
12607 }
12608 }
12609}
12610
12611void BlueStore::_txc_finish_io(TransContext *txc)
12612{
12613 dout(20) << __func__ << " " << txc << dendl;
12614
12615 /*
12616 * we need to preserve the order of kv transactions,
12617 * even though aio will complete in any order.
12618 */
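  // the scan below walks backward from this txc: if an earlier txc in the
  // sequencer is still below STATE_IO_DONE we must keep waiting; otherwise
  // the contiguous run of STATE_IO_DONE txcs is advanced in queue order.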
12619
12620 OpSequencer *osr = txc->osr.get();
11fdf7f2 12621 std::lock_guard l(osr->qlock);
f67539c2 12622 txc->set_state(TransContext::STATE_IO_DONE);
11fdf7f2 12623 txc->ioc.release_running_aios();
7c673cae
FG
12624 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
12625 while (p != osr->q.begin()) {
12626 --p;
f67539c2 12627 if (p->get_state() < TransContext::STATE_IO_DONE) {
7c673cae
FG
12628 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
12629 << p->get_state_name() << dendl;
12630 return;
12631 }
f67539c2 12632 if (p->get_state() > TransContext::STATE_IO_DONE) {
7c673cae
FG
12633 ++p;
12634 break;
12635 }
12636 }
12637 do {
12638 _txc_state_proc(&*p++);
12639 } while (p != osr->q.end() &&
f67539c2 12640 p->get_state() == TransContext::STATE_IO_DONE);
7c673cae 12641
11fdf7f2 12642 if (osr->kv_submitted_waiters) {
7c673cae
FG
12643 osr->qcond.notify_all();
12644 }
12645}
12646
12647void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
12648{
12649 dout(20) << __func__ << " txc " << txc
12650 << " onodes " << txc->onodes
12651 << " shared_blobs " << txc->shared_blobs
12652 << dendl;
12653
12654 // finalize onodes
12655 for (auto o : txc->onodes) {
11fdf7f2 12656 _record_onode(o, t);
7c673cae
FG
12657 o->flushing_count++;
12658 }
12659
12660 // objects we modified but didn't affect the onode
12661 auto p = txc->modified_objects.begin();
12662 while (p != txc->modified_objects.end()) {
12663 if (txc->onodes.count(*p) == 0) {
12664 (*p)->flushing_count++;
12665 ++p;
12666 } else {
12667 // remove dups with onodes list to avoid problems in _txc_finish
12668 p = txc->modified_objects.erase(p);
12669 }
12670 }
12671
12672 // finalize shared_blobs
12673 for (auto sb : txc->shared_blobs) {
12674 string key;
12675 auto sbid = sb->get_sbid();
12676 get_shared_blob_key(sbid, &key);
12677 if (sb->persistent->empty()) {
11fdf7f2
TL
12678 dout(20) << __func__ << " shared_blob 0x"
12679 << std::hex << sbid << std::dec
7c673cae
FG
12680 << " is empty" << dendl;
12681 t->rmkey(PREFIX_SHARED_BLOB, key);
12682 } else {
12683 bufferlist bl;
11fdf7f2
TL
12684 encode(*(sb->persistent), bl);
12685 dout(20) << __func__ << " shared_blob 0x"
12686 << std::hex << sbid << std::dec
31f18b77 12687 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
12688 t->set(PREFIX_SHARED_BLOB, key, bl);
12689 }
12690 }
12691}
12692
12693void BlueStore::BSPerfTracker::update_from_perfcounters(
12694 PerfCounters &logger)
12695{
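  // bluestore has no separate journal-apply stage, so both externally
  // visible latencies are fed from the same commit latency counter.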
11fdf7f2
TL
12696 os_commit_latency_ns.consume_next(
12697 logger.get_tavg_ns(
7c673cae 12698 l_bluestore_commit_lat));
11fdf7f2
TL
12699 os_apply_latency_ns.consume_next(
12700 logger.get_tavg_ns(
7c673cae
FG
12701 l_bluestore_commit_lat));
12702}
12703
12704void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
12705{
12706 dout(20) << __func__ << " txc " << txc << std::hex
12707 << " allocated 0x" << txc->allocated
12708 << " released 0x" << txc->released
12709 << std::dec << dendl;
12710
20effc67
TL
12711 if (!fm->is_null_manager())
12712 {
12713 // We have to handle the case where we allocate *and* deallocate the
12714 // same region in this transaction. The freelist doesn't like that.
12715 // (Actually, the only thing that cares is the BitmapFreelistManager
12716 // debug check. But that's important.)
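    // example: if this txc allocates [0x10000,0x20000) and also releases
    // [0x18000,0x20000), the overlap [0x18000,0x20000) is subtracted from
    // both sides, leaving a net allocate of [0x10000,0x18000) and no
    // release.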
12717 interval_set<uint64_t> tmp_allocated, tmp_released;
12718 interval_set<uint64_t> *pallocated = &txc->allocated;
12719 interval_set<uint64_t> *preleased = &txc->released;
12720 if (!txc->allocated.empty() && !txc->released.empty()) {
12721 interval_set<uint64_t> overlap;
12722 overlap.intersection_of(txc->allocated, txc->released);
12723 if (!overlap.empty()) {
12724 tmp_allocated = txc->allocated;
12725 tmp_allocated.subtract(overlap);
12726 tmp_released = txc->released;
12727 tmp_released.subtract(overlap);
12728 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
12729 << ", new allocated 0x" << tmp_allocated
12730 << " released 0x" << tmp_released << std::dec
12731 << dendl;
12732 pallocated = &tmp_allocated;
12733 preleased = &tmp_released;
12734 }
7c673cae 12735 }
7c673cae 12736
20effc67
TL
12737 // update freelist with non-overlap sets
12738 for (interval_set<uint64_t>::iterator p = pallocated->begin();
12739 p != pallocated->end();
12740 ++p) {
12741 fm->allocate(p.get_start(), p.get_len(), t);
12742 }
12743 for (interval_set<uint64_t>::iterator p = preleased->begin();
12744 p != preleased->end();
12745 ++p) {
12746 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
12747 << "~" << p.get_len() << std::dec << dendl;
12748 fm->release(p.get_start(), p.get_len(), t);
12749 }
7c673cae
FG
12750 }
12751
20effc67 12752#ifdef HAVE_LIBZBD
f67539c2 12753 if (bdev->is_smr()) {
20effc67
TL
12754 for (auto& i : txc->old_zone_offset_refs) {
12755 dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
12756 << " offset 0x" << i.second << std::dec
12757 << " -> " << i.first.first->oid << dendl;
12758 string key;
12759 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12760 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
12761 }
12762 for (auto& i : txc->new_zone_offset_refs) {
12763 // (zone, offset) -> oid
12764 dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
12765 << " offset 0x" << i.second << std::dec
12766 << " -> " << i.first.first->oid << dendl;
12767 string key;
12768 get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
12769 bufferlist v;
12770 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
12771 }
f67539c2 12772 }
20effc67 12773#endif
f67539c2 12774
7c673cae
FG
12775 _txc_update_store_statfs(txc);
12776}
12777
9f95a23c 12778void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 12779{
f67539c2 12780 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
9f95a23c
TL
12781 {
12782#if defined(WITH_LTTNG)
12783 auto start = mono_clock::now();
12784#endif
12785
f67539c2
TL
12786#ifdef WITH_BLKIN
12787 if (txc->trace) {
12788 txc->trace.event("db async submit");
12789 }
12790#endif
12791
9f95a23c
TL
12792 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
12793 ceph_assert(r == 0);
f67539c2 12794 txc->set_state(TransContext::STATE_KV_SUBMITTED);
9f95a23c
TL
12795 if (txc->osr->kv_submitted_waiters) {
12796 std::lock_guard l(txc->osr->qlock);
12797 txc->osr->qcond.notify_all();
12798 }
12799
12800#if defined(WITH_LTTNG)
12801 if (txc->tracing) {
12802 tracepoint(
12803 bluestore,
12804 transaction_kv_submit_latency,
12805 txc->osr->get_sequencer_id(),
12806 txc->seq,
12807 sync_submit_transaction,
12808 ceph::to_seconds<double>(mono_clock::now() - start));
12809 }
12810#endif
12811 }
12812
7c673cae
FG
12813 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
12814 for (auto& o : *ls) {
12815 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
12816 << dendl;
9f95a23c 12817 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 12818 std::lock_guard l(o->flush_lock);
7c673cae
FG
12819 o->flush_cond.notify_all();
12820 }
12821 }
12822 }
12823}
12824
12825void BlueStore::_txc_committed_kv(TransContext *txc)
12826{
12827 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 12828 throttle.complete_kv(*txc);
1adf2230 12829 {
11fdf7f2 12830 std::lock_guard l(txc->osr->qlock);
f67539c2 12831 txc->set_state(TransContext::STATE_KV_DONE);
11fdf7f2
TL
12832 if (txc->ch->commit_queue) {
12833 txc->ch->commit_queue->queue(txc->oncommits);
12834 } else {
12835 finisher.queue(txc->oncommits);
1adf2230 12836 }
7c673cae 12837 }
9f95a23c 12838 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
12839 log_latency_fn(
12840 __func__,
12841 l_bluestore_commit_lat,
9f95a23c 12842 mono_clock::now() - txc->start,
494da23a
TL
12843 cct->_conf->bluestore_log_op_age,
12844 [&](auto lat) {
12845 return ", txc = " + stringify(txc);
12846 }
11fdf7f2 12847 );
7c673cae
FG
12848}
12849
12850void BlueStore::_txc_finish(TransContext *txc)
12851{
12852 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
f67539c2 12853 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
7c673cae
FG
12854
12855 for (auto& sb : txc->shared_blobs_written) {
f64942e4 12856 sb->finish_write(txc->seq);
7c673cae
FG
12857 }
12858 txc->shared_blobs_written.clear();
12859
12860 while (!txc->removed_collections.empty()) {
12861 _queue_reap_collection(txc->removed_collections.front());
12862 txc->removed_collections.pop_front();
12863 }
12864
12865 OpSequencerRef osr = txc->osr;
7c673cae 12866 bool empty = false;
31f18b77 12867 bool submit_deferred = false;
7c673cae
FG
12868 OpSequencer::q_list_t releasing_txc;
12869 {
11fdf7f2 12870 std::lock_guard l(osr->qlock);
f67539c2 12871 txc->set_state(TransContext::STATE_DONE);
7c673cae
FG
12872 bool notify = false;
12873 while (!osr->q.empty()) {
12874 TransContext *txc = &osr->q.front();
12875 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
12876 << dendl;
f67539c2
TL
12877 if (txc->get_state() != TransContext::STATE_DONE) {
12878 if (txc->get_state() == TransContext::STATE_PREPARE &&
7c673cae
FG
12879 deferred_aggressive) {
12880 // for _osr_drain_preceding()
12881 notify = true;
12882 }
f67539c2 12883 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 12884 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
12885 submit_deferred = true;
12886 }
7c673cae
FG
12887 break;
12888 }
12889
7c673cae
FG
12890 osr->q.pop_front();
12891 releasing_txc.push_back(*txc);
7c673cae 12892 }
9f95a23c 12893
7c673cae
FG
12894 if (osr->q.empty()) {
12895 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
12896 empty = true;
12897 }
9f95a23c
TL
12898
12899 // only drain()/drain_preceding() need wakeup,
12900 // other cases use kv_submitted_waiters
12901 if (notify || empty) {
12902 osr->qcond.notify_all();
12903 }
7c673cae 12904 }
9f95a23c 12905
7c673cae
FG
12906 while (!releasing_txc.empty()) {
12907 // release to allocator only after all preceding txc's have also
12908 // finished any deferred writes that potentially land in these
12909 // blocks
12910 auto txc = &releasing_txc.front();
12911 _txc_release_alloc(txc);
12912 releasing_txc.pop_front();
9f95a23c
TL
12913 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
12914 throttle.complete(*txc);
7c673cae
FG
12915 delete txc;
12916 }
12917
31f18b77
FG
12918 if (submit_deferred) {
12919 // we're pinning memory; flush! we could be more fine-grained here but
12920 // i'm not sure it's worth the bother.
12921 deferred_try_submit();
7c673cae
FG
12922 }
12923
7c673cae 12924 if (empty && osr->zombie) {
11fdf7f2
TL
12925 std::lock_guard l(zombie_osr_lock);
12926 if (zombie_osr_set.erase(osr->cid)) {
12927 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12928 } else {
12929 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
12930 << dendl;
12931 }
7c673cae 12932 }
9f95a23c 12933}
7c673cae
FG
12934
12935void BlueStore::_txc_release_alloc(TransContext *txc)
12936{
a8e16298 12937 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
12938 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
12939 int r = 0;
12940 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12941 r = bdev->queue_discard(txc->released);
12942 if (r == 0) {
12943 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
12944 << txc->released << std::dec << dendl;
12945 goto out;
12946 }
12947 } else if (cct->_conf->bdev_enable_discard) {
12948 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
12949 bdev->discard(p.get_start(), p.get_len());
12950 }
12951 }
12952 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 12953 << txc->released << std::dec << dendl;
20effc67 12954 alloc->release(txc->released);
7c673cae
FG
12955 }
12956
11fdf7f2 12957out:
7c673cae
FG
12958 txc->allocated.clear();
12959 txc->released.clear();
12960}
12961
11fdf7f2
TL
12962void BlueStore::_osr_attach(Collection *c)
12963{
20effc67 12964 // note: caller has coll_lock
11fdf7f2
TL
12965 auto q = coll_map.find(c->cid);
12966 if (q != coll_map.end()) {
12967 c->osr = q->second->osr;
12968 ldout(cct, 10) << __func__ << " " << c->cid
12969 << " reusing osr " << c->osr << " from existing coll "
12970 << q->second << dendl;
12971 } else {
12972 std::lock_guard l(zombie_osr_lock);
12973 auto p = zombie_osr_set.find(c->cid);
12974 if (p == zombie_osr_set.end()) {
9f95a23c 12975 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11fdf7f2
TL
12976 ldout(cct, 10) << __func__ << " " << c->cid
12977 << " fresh osr " << c->osr << dendl;
12978 } else {
12979 c->osr = p->second;
12980 zombie_osr_set.erase(p);
12981 ldout(cct, 10) << __func__ << " " << c->cid
12982 << " resurrecting zombie osr " << c->osr << dendl;
12983 c->osr->zombie = false;
12984 }
12985 }
12986}
12987
12988void BlueStore::_osr_register_zombie(OpSequencer *osr)
12989{
12990 std::lock_guard l(zombie_osr_lock);
12991 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
12992 osr->zombie = true;
12993 auto i = zombie_osr_set.emplace(osr->cid, osr);
12994 // this is either a new insertion or the same osr is already there
12995 ceph_assert(i.second || i.first->second == osr);
12996}
12997
7c673cae
FG
12998void BlueStore::_osr_drain_preceding(TransContext *txc)
12999{
13000 OpSequencer *osr = txc->osr.get();
13001 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
13002 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13003 {
13004 // submit anything pending
f67539c2 13005 osr->deferred_lock.lock();
11fdf7f2 13006 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
13007 _deferred_submit_unlock(osr);
13008 } else {
f67539c2 13009 osr->deferred_lock.unlock();
7c673cae
FG
13010 }
13011 }
13012 {
13013 // wake up any previously finished deferred events
11fdf7f2 13014 std::lock_guard l(kv_lock);
9f95a23c
TL
13015 if (!kv_sync_in_progress) {
13016 kv_sync_in_progress = true;
13017 kv_cond.notify_one();
13018 }
7c673cae
FG
13019 }
13020 osr->drain_preceding(txc);
13021 --deferred_aggressive;
13022 dout(10) << __func__ << " " << osr << " done" << dendl;
13023}
13024
11fdf7f2
TL
13025void BlueStore::_osr_drain(OpSequencer *osr)
13026{
13027 dout(10) << __func__ << " " << osr << dendl;
13028 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
13029 {
13030 // submit anything pending
f67539c2 13031 osr->deferred_lock.lock();
11fdf7f2
TL
13032 if (osr->deferred_pending && !osr->deferred_running) {
13033 _deferred_submit_unlock(osr);
13034 } else {
f67539c2 13035 osr->deferred_lock.unlock();
11fdf7f2
TL
13036 }
13037 }
13038 {
13039 // wake up any previously finished deferred events
13040 std::lock_guard l(kv_lock);
9f95a23c
TL
13041 if (!kv_sync_in_progress) {
13042 kv_sync_in_progress = true;
13043 kv_cond.notify_one();
13044 }
11fdf7f2
TL
13045 }
13046 osr->drain();
13047 --deferred_aggressive;
13048 dout(10) << __func__ << " " << osr << " done" << dendl;
13049}
13050
7c673cae
FG
13051void BlueStore::_osr_drain_all()
13052{
13053 dout(10) << __func__ << dendl;
13054
13055 set<OpSequencerRef> s;
11fdf7f2
TL
13056 vector<OpSequencerRef> zombies;
13057 {
9f95a23c 13058 std::shared_lock l(coll_lock);
11fdf7f2
TL
13059 for (auto& i : coll_map) {
13060 s.insert(i.second->osr);
13061 }
13062 }
7c673cae 13063 {
11fdf7f2
TL
13064 std::lock_guard l(zombie_osr_lock);
13065 for (auto& i : zombie_osr_set) {
13066 s.insert(i.second);
13067 zombies.push_back(i.second);
13068 }
7c673cae
FG
13069 }
13070 dout(20) << __func__ << " osr_set " << s << dendl;
13071
13072 ++deferred_aggressive;
13073 {
13074 // submit anything pending
224ce89b 13075 deferred_try_submit();
7c673cae
FG
13076 }
13077 {
13078 // wake up any previously finished deferred events
11fdf7f2 13079 std::lock_guard l(kv_lock);
7c673cae
FG
13080 kv_cond.notify_one();
13081 }
31f18b77 13082 {
11fdf7f2 13083 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
13084 kv_finalize_cond.notify_one();
13085 }
7c673cae
FG
13086 for (auto osr : s) {
13087 dout(20) << __func__ << " drain " << osr << dendl;
13088 osr->drain();
13089 }
13090 --deferred_aggressive;
13091
7c673cae 13092 {
11fdf7f2
TL
13093 std::lock_guard l(zombie_osr_lock);
13094 for (auto& osr : zombies) {
13095 if (zombie_osr_set.erase(osr->cid)) {
13096 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
13097 ceph_assert(osr->q.empty());
13098 } else if (osr->zombie) {
13099 dout(10) << __func__ << " empty zombie osr " << osr
13100 << " already reaped" << dendl;
13101 ceph_assert(osr->q.empty());
13102 } else {
13103 dout(10) << __func__ << " empty zombie osr " << osr
13104 << " resurrected" << dendl;
13105 }
7c673cae
FG
13106 }
13107 }
11fdf7f2
TL
13108
13109 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
13110}
13111
11fdf7f2 13112
31f18b77
FG
13113void BlueStore::_kv_start()
13114{
13115 dout(10) << __func__ << dendl;
13116
11fdf7f2 13117 finisher.start();
31f18b77
FG
13118 kv_sync_thread.create("bstore_kv_sync");
13119 kv_finalize_thread.create("bstore_kv_final");
13120}
13121
13122void BlueStore::_kv_stop()
13123{
13124 dout(10) << __func__ << dendl;
13125 {
9f95a23c 13126 std::unique_lock l{kv_lock};
31f18b77
FG
13127 while (!kv_sync_started) {
13128 kv_cond.wait(l);
13129 }
13130 kv_stop = true;
13131 kv_cond.notify_all();
13132 }
13133 {
9f95a23c 13134 std::unique_lock l{kv_finalize_lock};
31f18b77
FG
13135 while (!kv_finalize_started) {
13136 kv_finalize_cond.wait(l);
13137 }
13138 kv_finalize_stop = true;
13139 kv_finalize_cond.notify_all();
13140 }
13141 kv_sync_thread.join();
13142 kv_finalize_thread.join();
11fdf7f2 13143 ceph_assert(removed_collections.empty());
31f18b77 13144 {
11fdf7f2 13145 std::lock_guard l(kv_lock);
31f18b77
FG
13146 kv_stop = false;
13147 }
13148 {
11fdf7f2 13149 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
13150 kv_finalize_stop = false;
13151 }
13152 dout(10) << __func__ << " stopping finishers" << dendl;
11fdf7f2
TL
13153 finisher.wait_for_empty();
13154 finisher.stop();
31f18b77
FG
13155 dout(10) << __func__ << " stopped" << dendl;
13156}
13157
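// each iteration of the kv sync loop batches everything queued since the
// last cycle, flushes the block device if needed so deferred aios become
// stable, folds nid/blobid ceiling bumps and deferred-key cleanup into a
// final transaction, commits it synchronously, and hands the batch to the
// finalize thread.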
7c673cae
FG
13158void BlueStore::_kv_sync_thread()
13159{
13160 dout(10) << __func__ << " start" << dendl;
11fdf7f2 13161 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 13162 std::unique_lock l{kv_lock};
11fdf7f2 13163 ceph_assert(!kv_sync_started);
31f18b77
FG
13164 kv_sync_started = true;
13165 kv_cond.notify_all();
adb31ebb
TL
13166
13167 auto t0 = mono_clock::now();
13168 timespan twait = ceph::make_timespan(0);
13169 size_t kv_submitted = 0;
13170
7c673cae 13171 while (true) {
adb31ebb
TL
13172 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
13173 auto observation_period =
13174 ceph::make_timespan(period);
13175 auto elapsed = mono_clock::now() - t0;
13176 if (period && elapsed >= observation_period) {
13177 dout(5) << __func__ << " utilization: idle "
13178 << twait << " of " << elapsed
13179 << ", submitted: " << kv_submitted
13180 <<dendl;
13181 t0 = mono_clock::now();
13182 twait = ceph::make_timespan(0);
13183 kv_submitted = 0;
13184 }
11fdf7f2 13185 ceph_assert(kv_committing.empty());
7c673cae
FG
13186 if (kv_queue.empty() &&
13187 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 13188 !deferred_aggressive)) {
7c673cae
FG
13189 if (kv_stop)
13190 break;
13191 dout(20) << __func__ << " sleep" << dendl;
adb31ebb 13192 auto t = mono_clock::now();
9f95a23c 13193 kv_sync_in_progress = false;
11fdf7f2 13194 kv_cond.wait(l);
adb31ebb
TL
13195 twait += mono_clock::now() - t;
13196
7c673cae
FG
13197 dout(20) << __func__ << " wake" << dendl;
13198 } else {
13199 deque<TransContext*> kv_submitting;
13200 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
13201 uint64_t aios = 0, costs = 0;
13202
7c673cae
FG
13203 dout(20) << __func__ << " committing " << kv_queue.size()
13204 << " submitting " << kv_queue_unsubmitted.size()
13205 << " deferred done " << deferred_done_queue.size()
13206 << " stable " << deferred_stable_queue.size()
13207 << dendl;
13208 kv_committing.swap(kv_queue);
13209 kv_submitting.swap(kv_queue_unsubmitted);
13210 deferred_done.swap(deferred_done_queue);
13211 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
13212 aios = kv_ios;
13213 costs = kv_throttle_costs;
13214 kv_ios = 0;
13215 kv_throttle_costs = 0;
7c673cae
FG
13216 l.unlock();
13217
13218 dout(30) << __func__ << " committing " << kv_committing << dendl;
13219 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
13220 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
13221 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
13222
11fdf7f2
TL
13223 auto start = mono_clock::now();
13224
7c673cae
FG
13225 bool force_flush = false;
13226 // if bluefs is sharing the same device as data (only), then we
13227 // can rely on the bluefs commit to flush the device and make
13228 // deferred aios stable. that means that if we have completed deferred
13229 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 13230 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 13231 if (aios) {
7c673cae 13232 force_flush = true;
11fdf7f2 13233 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
13234 force_flush = true; // there's nothing else to commit!
13235 } else if (deferred_aggressive) {
13236 force_flush = true;
13237 }
11fdf7f2
TL
13238 } else {
13239 if (aios || !deferred_done.empty()) {
13240 force_flush = true;
13241 } else {
13242 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
13243 }
13244 }
7c673cae
FG
13245
13246 if (force_flush) {
31f18b77 13247 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
13248 << " force_flush=" << (int)force_flush
13249 << ", flushing, deferred done->stable" << dendl;
13250 // flush/barrier on block device
13251 bdev->flush();
13252
13253 // if we flush then deferred done are now deferred stable
13254 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
13255 deferred_done.end());
13256 deferred_done.clear();
13257 }
11fdf7f2 13258 auto after_flush = mono_clock::now();
7c673cae
FG
13259
13260 // we will use one final transaction to force a sync
13261 KeyValueDB::Transaction synct = db->get_transaction();
13262
13263 // increase {nid,blobid}_max? note that this covers both the
13264 // case where we are approaching the max and the case we passed
13265 // it. in either case, we increase the max in the earlier txn
13266 // we submit.
13267 uint64_t new_nid_max = 0, new_blobid_max = 0;
13268 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
13269 KeyValueDB::Transaction t =
13270 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13271 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
13272 bufferlist bl;
11fdf7f2 13273 encode(new_nid_max, bl);
7c673cae
FG
13274 t->set(PREFIX_SUPER, "nid_max", bl);
13275 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
13276 }
13277 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
13278 KeyValueDB::Transaction t =
13279 kv_submitting.empty() ? synct : kv_submitting.front()->t;
13280 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
13281 bufferlist bl;
11fdf7f2 13282 encode(new_blobid_max, bl);
7c673cae
FG
13283 t->set(PREFIX_SUPER, "blobid_max", bl);
13284 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
13285 }
c07f9fc5
FG
13286
13287 for (auto txc : kv_committing) {
9f95a23c 13288 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
f67539c2 13289 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
adb31ebb 13290 ++kv_submitted;
9f95a23c 13291 _txc_apply_kv(txc, false);
c07f9fc5 13292 --txc->osr->kv_committing_serially;
c07f9fc5 13293 } else {
f67539c2 13294 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 13295 }
7c673cae
FG
13296 if (txc->had_ios) {
13297 --txc->osr->txc_with_unstable_io;
13298 }
7c673cae
FG
13299 }
13300
31f18b77
FG
13301 // release throttle *before* we commit. this allows new ops
13302 // to be prepared and enter pipeline while we are waiting on
13303 // the kv commit sync/flush. then hopefully on the next
13304 // iteration there will already be ops awake. otherwise, we
13305 // end up going to sleep, and then wake up when the very first
13306 // transaction is ready for commit.
9f95a23c 13307 throttle.release_kv_throttle(costs);
31f18b77 13308
7c673cae
FG
13309 // cleanup sync deferred keys
13310 for (auto b : deferred_stable) {
13311 for (auto& txc : b->txcs) {
13312 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 13313 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
13314 string key;
13315 get_deferred_key(wt.seq, &key);
13316 synct->rm_single_key(PREFIX_DEFERRED, key);
13317 }
13318 }
13319
9f95a23c
TL
13320#if defined(WITH_LTTNG)
13321 auto sync_start = mono_clock::now();
13322#endif
7c673cae 13323 // submit synct synchronously (block and wait for it to commit)
31f18b77 13324 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
13325 ceph_assert(r == 0);
13326
f67539c2
TL
13327#ifdef WITH_BLKIN
13328 for (auto txc : kv_committing) {
13329 if (txc->trace) {
13330 txc->trace.event("db sync submit");
13331 txc->trace.keyval("kv_committing size", kv_committing.size());
13332 }
13333 }
13334#endif
13335
9f95a23c
TL
13336 int committing_size = kv_committing.size();
13337 int deferred_size = deferred_stable.size();
13338
13339#if defined(WITH_LTTNG)
13340 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
13341 for (auto txc: kv_committing) {
13342 if (txc->tracing) {
13343 tracepoint(
13344 bluestore,
13345 transaction_kv_sync_latency,
13346 txc->osr->get_sequencer_id(),
13347 txc->seq,
13348 kv_committing.size(),
13349 deferred_done.size(),
13350 deferred_stable.size(),
13351 sync_latency);
13352 }
13353 }
13354#endif
13355
11fdf7f2 13356 {
9f95a23c 13357 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
13358 if (kv_committing_to_finalize.empty()) {
13359 kv_committing_to_finalize.swap(kv_committing);
13360 } else {
13361 kv_committing_to_finalize.insert(
13362 kv_committing_to_finalize.end(),
13363 kv_committing.begin(),
13364 kv_committing.end());
13365 kv_committing.clear();
13366 }
13367 if (deferred_stable_to_finalize.empty()) {
13368 deferred_stable_to_finalize.swap(deferred_stable);
13369 } else {
13370 deferred_stable_to_finalize.insert(
13371 deferred_stable_to_finalize.end(),
13372 deferred_stable.begin(),
13373 deferred_stable.end());
13374 deferred_stable.clear();
13375 }
9f95a23c
TL
13376 if (!kv_finalize_in_progress) {
13377 kv_finalize_in_progress = true;
13378 kv_finalize_cond.notify_one();
13379 }
11fdf7f2 13380 }
7c673cae
FG
13381
13382 if (new_nid_max) {
13383 nid_max = new_nid_max;
13384 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
13385 }
13386 if (new_blobid_max) {
13387 blobid_max = new_blobid_max;
13388 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
13389 }
13390
224ce89b 13391 {
11fdf7f2
TL
13392 auto finish = mono_clock::now();
13393 ceph::timespan dur_flush = after_flush - start;
13394 ceph::timespan dur_kv = finish - after_flush;
13395 ceph::timespan dur = finish - start;
9f95a23c
TL
13396 dout(20) << __func__ << " committed " << committing_size
13397 << " cleaned " << deferred_size
224ce89b
WB
13398 << " in " << dur
13399 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
13400 << dendl;
494da23a
TL
13401 log_latency("kv_flush",
13402 l_bluestore_kv_flush_lat,
13403 dur_flush,
13404 cct->_conf->bluestore_log_op_age);
13405 log_latency("kv_commit",
13406 l_bluestore_kv_commit_lat,
13407 dur_kv,
13408 cct->_conf->bluestore_log_op_age);
13409 log_latency("kv_sync",
13410 l_bluestore_kv_sync_lat,
13411 dur,
13412 cct->_conf->bluestore_log_op_age);
7c673cae 13413 }
31f18b77 13414
31f18b77
FG
13415 l.lock();
13416 // previously deferred "done" are now "stable" by virtue of this
13417 // commit cycle.
13418 deferred_stable_queue.swap(deferred_done);
13419 }
13420 }
13421 dout(10) << __func__ << " finish" << dendl;
13422 kv_sync_started = false;
13423}
13424
13425void BlueStore::_kv_finalize_thread()
13426{
13427 deque<TransContext*> kv_committed;
13428 deque<DeferredBatch*> deferred_stable;
13429 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
13430 std::unique_lock l(kv_finalize_lock);
13431 ceph_assert(!kv_finalize_started);
31f18b77
FG
13432 kv_finalize_started = true;
13433 kv_finalize_cond.notify_all();
13434 while (true) {
11fdf7f2
TL
13435 ceph_assert(kv_committed.empty());
13436 ceph_assert(deferred_stable.empty());
31f18b77
FG
13437 if (kv_committing_to_finalize.empty() &&
13438 deferred_stable_to_finalize.empty()) {
13439 if (kv_finalize_stop)
13440 break;
13441 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 13442 kv_finalize_in_progress = false;
31f18b77
FG
13443 kv_finalize_cond.wait(l);
13444 dout(20) << __func__ << " wake" << dendl;
13445 } else {
13446 kv_committed.swap(kv_committing_to_finalize);
13447 deferred_stable.swap(deferred_stable_to_finalize);
13448 l.unlock();
13449 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
13450 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
13451
11fdf7f2
TL
13452 auto start = mono_clock::now();
13453
31f18b77
FG
13454 while (!kv_committed.empty()) {
13455 TransContext *txc = kv_committed.front();
f67539c2 13456 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 13457 _txc_state_proc(txc);
31f18b77 13458 kv_committed.pop_front();
7c673cae 13459 }
31f18b77 13460
7c673cae
FG
13461 for (auto b : deferred_stable) {
13462 auto p = b->txcs.begin();
13463 while (p != b->txcs.end()) {
13464 TransContext *txc = &*p;
13465 p = b->txcs.erase(p); // unlink here because
13466 _txc_state_proc(txc); // this may destroy txc
13467 }
13468 delete b;
13469 }
31f18b77 13470 deferred_stable.clear();
7c673cae
FG
13471
13472 if (!deferred_aggressive) {
31f18b77 13473 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 13474 throttle.should_submit_deferred()) {
224ce89b 13475 deferred_try_submit();
7c673cae
FG
13476 }
13477 }
13478
13479 // this is as good a place as any ...
13480 _reap_collections();
13481
11fdf7f2 13482 logger->set(l_bluestore_fragmentation,
20effc67 13483 (uint64_t)(alloc->get_fragmentation() * 1000));
11fdf7f2 13484
494da23a
TL
13485 log_latency("kv_final",
13486 l_bluestore_kv_final_lat,
13487 mono_clock::now() - start,
13488 cct->_conf->bluestore_log_op_age);
11fdf7f2 13489
7c673cae 13490 l.lock();
7c673cae
FG
13491 }
13492 }
13493 dout(10) << __func__ << " finish" << dendl;
31f18b77 13494 kv_finalize_started = false;
7c673cae
FG
13495}
13496
20effc67
TL
13497#ifdef HAVE_LIBZBD
13498void BlueStore::_zoned_cleaner_start()
13499{
f67539c2 13500 dout(10) << __func__ << dendl;
f67539c2
TL
13501 zoned_cleaner_thread.create("bstore_zcleaner");
13502}
13503
20effc67
TL
13504void BlueStore::_zoned_cleaner_stop()
13505{
f67539c2
TL
13506 dout(10) << __func__ << dendl;
13507 {
13508 std::unique_lock l{zoned_cleaner_lock};
13509 while (!zoned_cleaner_started) {
13510 zoned_cleaner_cond.wait(l);
13511 }
13512 zoned_cleaner_stop = true;
13513 zoned_cleaner_cond.notify_all();
13514 }
13515 zoned_cleaner_thread.join();
13516 {
13517 std::lock_guard l{zoned_cleaner_lock};
13518 zoned_cleaner_stop = false;
13519 }
13520 dout(10) << __func__ << " done" << dendl;
13521}
13522
20effc67
TL
13523void BlueStore::_zoned_cleaner_thread()
13524{
f67539c2
TL
13525 dout(10) << __func__ << " start" << dendl;
13526 std::unique_lock l{zoned_cleaner_lock};
13527 ceph_assert(!zoned_cleaner_started);
13528 zoned_cleaner_started = true;
13529 zoned_cleaner_cond.notify_all();
20effc67
TL
13530 auto a = dynamic_cast<ZonedAllocator*>(alloc);
13531 ceph_assert(a);
13532 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
13533 ceph_assert(f);
f67539c2 13534 while (true) {
20effc67
TL
13535 // thresholds to trigger cleaning
13536 // FIXME
13537 float min_score = .05; // score: bytes saved / bytes moved
13538 uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning
13539 auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
13540 if (zone_to_clean < 0) {
f67539c2
TL
13541 if (zoned_cleaner_stop) {
13542 break;
13543 }
20effc67
TL
13544 auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
13545 dout(20) << __func__ << " sleep for " << period << dendl;
13546 zoned_cleaner_cond.wait_for(l, period);
f67539c2
TL
13547 dout(20) << __func__ << " wake" << dendl;
13548 } else {
f67539c2 13549 l.unlock();
20effc67
TL
13550 a->set_cleaning_zone(zone_to_clean);
13551 _zoned_clean_zone(zone_to_clean, a, f);
13552 a->clear_cleaning_zone(zone_to_clean);
f67539c2
TL
13553 l.lock();
13554 }
13555 }
13556 dout(10) << __func__ << " finish" << dendl;
13557 zoned_cleaner_started = false;
13558}
13559
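// cleaning a zone: walk the (zone, offset) -> object refs recorded under
// PREFIX_ZONED_CL_INFO, rewrite each object's live extents elsewhere via
// _clean_some, drain in-flight transactions, then reset the device zone
// and hand it back to the freelist and allocator.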
20effc67
TL
13560void BlueStore::_zoned_clean_zone(
13561 uint64_t zone,
13562 ZonedAllocator *a,
13563 ZonedFreelistManager *f
13564 )
13565{
13566 dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
13567
13568 KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
13569 std::string zone_start;
13570 get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
13571 for (it->lower_bound(zone_start); it->valid(); it->next()) {
13572 uint32_t z;
13573 uint64_t offset;
13574 ghobject_t oid;
13575 string k = it->key();
13576 int r = get_key_zone_offset_object(k, &z, &offset, &oid);
13577 if (r < 0) {
13578 derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
13579 << dendl;
13580 continue;
13581 }
13582 if (zone != z) {
13583 dout(10) << __func__ << " reached end of zone refs" << dendl;
13584 break;
13585 }
13586 dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
13587 << std::dec << " " << oid << dendl;
13588 _clean_some(oid, zone);
13589 }
13590
13591 if (a->get_live_bytes(zone) > 0) {
13592 derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
13593 << " live bytes" << std::dec << dendl;
13594 // should we do something else here to avoid a live-lock in the event of a problem?
13595 return;
13596 }
13597
13598 // make sure transactions flush/drain/commit (and data is all rewritten
13599 // safely elsewhere) before we blow away the cleaned zone
13600 _osr_drain_all();
13601
13602 // reset the device zone
13603 dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
13604 bdev->reset_zone(zone);
13605
13606 // record that we can now write there
13607 f->mark_zone_to_clean_free(zone, db);
13608 bdev->flush();
13609
13610 // then allow ourselves to start allocating there
13611 dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
13612 << dendl;
13613 a->reset_zone(zone);
13614}
13615
void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
{
  dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
           << dendl;

  CollectionRef cref = _get_collection_by_oid(oid);
  if (!cref) {
    dout(10) << __func__ << " can't find collection for " << oid << dendl;
    return;
  }
  Collection *c = cref.get();

  // serialize io dispatch vs other transactions
  std::lock_guard l(atomic_alloc_and_submit_lock);
  std::unique_lock l2(c->lock);

  auto o = c->get_onode(oid, false);
  if (!o) {
    dout(10) << __func__ << " can't find " << oid << dendl;
    return;
  }

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);

  // NOTE: This is a naive rewrite strategy.  If any blobs are
  // shared, they will be duplicated for each object that references
  // them.  That means any cloned/snapshotted objects will explode
  // their utilization.  This won't matter for RGW workloads, but
  // for RBD and CephFS it is completely unacceptable, and it's
  // entirely reasonable to have "archival" data workloads on SMR
  // for CephFS and (possibly/probably) RBD.
  //
  // At some point we need to replace this with something more
  // sophisticated that ensures that a shared blob gets moved once
  // and all referencing objects get updated to point to the new
  // location.

  map<uint32_t, uint32_t> to_move;
  for (auto& e : o->extent_map.extent_map) {
    bool touches_zone = false;
    for (auto& be : e.blob->get_blob().get_extents()) {
      if (be.is_valid()) {
        uint32_t z = be.offset / zone_size;
        if (z == zone) {
          touches_zone = true;
          break;
        }
      }
    }
    if (touches_zone) {
      to_move[e.logical_offset] = e.length;
    }
  }
  if (to_move.empty()) {
    dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
             << std::dec << " from " << oid << dendl;
    return;
  }

  dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
           << std::dec << dendl;
  OpSequencer *osr = c->osr.get();
  TransContext *txc = _txc_create(c, osr, nullptr);

  spg_t pgid;
  if (c->cid.is_pg(&pgid)) {
    txc->osd_pool_id = pgid.pool();
  }

  for (auto& [offset, length] : to_move) {
    bufferlist bl;
    int r = _do_read(c, o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    r = _do_write(txc, cref, o, offset, length, bl, 0);
    ceph_assert(r >= 0);
  }
  txc->write_onode(o);

  _txc_write_nodes(txc, txc->t);
  _txc_finalize_kv(txc, txc->t);
  _txc_state_proc(txc);
}
#endif // HAVE_LIBZBD

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, uint64_t len)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  logger->inc(l_bluestore_issued_deferred_writes);
  logger->inc(l_bluestore_issued_deferred_write_bytes, len);
  return &txc->deferred_txn->ops.back();
}

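// Queue a txc's deferred writes on its OpSequencer.  The deferred writes
// were already journaled in RocksDB (PREFIX_DEFERRED; see queue_transactions
// below) as part of the kv commit; here they are merely accumulated into the
// per-osr DeferredBatch so that many small IOs can later be coalesced and
// submitted together.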
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;

  DeferredBatch *tmp;
  txc->osr->deferred_lock.lock();
  {
    if (!txc->osr->deferred_pending) {
      tmp = new DeferredBatch(cct, txc->osr.get());
    } else {
      tmp = txc->osr->deferred_pending;
    }
  }

  tmp->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
    }
  }

  {
    ++deferred_queue_size;
    txc->osr->deferred_pending = tmp;
    // the condition "tmp->txcs.size() == 1" means deferred_pending was
    // originally empty, so we should add this osr to deferred_queue.
    if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
      deferred_lock.lock();
      deferred_queue.push_back(*txc->osr);
      deferred_lock.unlock();
    }

    if (deferred_aggressive &&
        !txc->osr->deferred_running) {
      _deferred_submit_unlock(txc->osr.get());
    } else {
      txc->osr->deferred_lock.unlock();
    }
  }
}

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  vector<OpSequencerRef> osrs;

  {
    std::lock_guard l(deferred_lock);
    osrs.reserve(deferred_queue.size());
    for (auto& osr : deferred_queue) {
      osrs.push_back(&osr);
    }
  }

  for (auto& osr : osrs) {
    osr->deferred_lock.lock();
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
      } else {
        osr->deferred_lock.unlock();
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      osr->deferred_lock.unlock();
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  {
    std::lock_guard l(deferred_lock);
    deferred_last_submitted = ceph_clock_now();
  }
}

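// Submit an OpSequencer's pending DeferredBatch.  The batch's iomap is keyed
// by disk offset, so an in-order walk can merge runs of contiguous extents
// into single aio_writes.  Roughly (illustrative offsets, 0x1000-byte IOs):
//
//   iomap: { 0x1000: A, 0x2000: B, 0x8000: C }
//   -> aio_write(0x1000, A+B); aio_write(0x8000, C)
//
// Called with osr->deferred_lock held; the lock is dropped before the data
// is assembled and submitted (hence the _unlock suffix).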
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  osr->deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_submitted_deferred_writes);
          logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};

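// Completion callback for a submitted DeferredBatch: detach it from the
// OpSequencer (or re-arm submission if more work arrived meanwhile), release
// the deferred throttle for the txcs it carried, and hand the batch to the
// kv thread via deferred_done_queue so the journaled records can be cleaned
// up after the next commit.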
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    osr->deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      {
        deferred_lock.lock();
        auto q = deferred_queue.iterator_to(*osr);
        deferred_queue.erase(q);
        deferred_lock.unlock();
      }
      osr->deferred_lock.unlock();
    } else {
      osr->deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
        costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}

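// Replay any deferred writes still journaled in RocksDB at mount time.
// Extents that have since been handed over to BlueFS are trimmed out first
// (see _eliminate_outdated_deferred below); replaying those would clobber
// live BlueFS data.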
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  interval_set<uint64_t> bluefs_extents;
  if (bluefs) {
    bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
  }
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
    if (has_some) {
      TransContext *txc = _txc_create(ch.get(), osr, nullptr);
      txc->deferred_txn = deferred_txn;
      txc->set_state(TransContext::STATE_KV_DONE);
      _txc_state_proc(txc);
    } else {
      delete deferred_txn;
    }
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}

bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
                                             interval_set<uint64_t>& bluefs_extents)
{
  bool has_some = false;
  dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
  auto it = deferred_txn->ops.begin();
  while (it != deferred_txn->ops.end()) {
    // We process a pair of _data_/_extents_ (here: it->data/it->extents)
    // by eliminating _extents_ that belong to bluefs, removing the relevant
    // parts of _data_.  Example:
    // +------------+---------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
    // | in bluefs? | no            | yes           | no            |
    // +------------+---------------+---------------+---------------+
    // result:
    // +------------+---------------+---------------+
    // | data       | aaaaaaaabbbbb | ddddeeeeeefff |
    // | extent     | 40000 - 44000 | 58000 - 60000 |
    // +------------+---------------+---------------+
    PExtentVector new_extents;
    ceph::buffer::list new_data;
    uint32_t data_offset = 0; // this tracks the location of extent 'e' inside it->data
    dout(30) << __func__ << " input extents: " << it->extents << dendl;
    for (auto& e: it->extents) {
      interval_set<uint64_t> region;
      region.insert(e.offset, e.length);

      auto mi = bluefs_extents.lower_bound(e.offset);
      if (mi != bluefs_extents.begin()) {
        --mi;
        if (mi.get_end() <= e.offset) {
          ++mi;
        }
      }
      while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
        // The interval_set does not like it (asserts) when we erase an
        // interval that does not exist.  Hence we implement (region - mi)
        // as ((region + mi) - mi).
        region.union_insert(mi.get_start(), mi.get_len());
        region.erase(mi.get_start(), mi.get_len());
        ++mi;
      }
      // 'region' is now a subset of e, without the parts used by bluefs.
      // We trim the corresponding parts from it->data (actually constructing
      // new_data / new_extents).
      for (auto ki = region.begin(); ki != region.end(); ki++) {
        ceph::buffer::list chunk;
        // A chunk from it->data; data_offset is the offset where 'e' was
        // located; 'ki.get_start() - e.offset' is the offset of ki inside 'e'.
        chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
        new_data.claim_append(chunk);
        new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
      }
      data_offset += e.length;
    }
    dout(30) << __func__ << " output extents: " << new_extents << dendl;
    if (it->data.length() != new_data.length()) {
      dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
    }
    if (new_extents.size() == 0) {
      it = deferred_txn->ops.erase(it);
    } else {
      has_some = true;
      std::swap(it->extents, new_extents);
      std::swap(it->data, new_data);
      ++it;
    }
  }
  return has_some;
}

// ---------------------------
// transactions

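// Entry point for all mutations: build a TransContext from the submitted
// Transactions, journal any deferred ops alongside the kv commit, take
// throttle budget, and kick the txc state machine.  A typical caller looks
// roughly like this (illustrative sketch only, not a real call site):
//
//   ObjectStore::Transaction t;
//   t.write(cid, oid, 0, bl.length(), bl);
//   vector<ObjectStore::Transaction> tls = { std::move(t) };
//   store->queue_transactions(ch, tls);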
int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
  // submission to happen atomically because if I/O submission happens in a
  // different order than I/O allocation, we end up issuing non-sequential
  // writes to the drive.  This is a temporary solution until ZONE APPEND
  // support matures in the kernel.  For more information please see:
  // https://www.usenix.org/conference/vault20/presentation/bjorling
  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.lock();
  }

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit, op);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc encode finished");
  }
#endif

  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.unlock();
  }

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc applied");
  }
#endif

  log_latency("submit_transact",
              l_bluestore_submit_lat,
              mono_clock::now() - start,
              cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
              l_bluestore_throttle_lat,
              tend - tstart,
              cct->_conf->bluestore_log_op_age);
  return 0;
}

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

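// Decode and apply every op of a Transaction against the txc: collection
// ops first, then object ops, which look up (and, for creating ops, make)
// the Onode before dispatching to the matching helper (_write, _zero,
// _truncate, ...).  Most errors are fatal by design; only the
// -ENOENT/-ENODATA cases whitelisted at 'endop' below are tolerated.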
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->hint);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}

// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

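// Pad a buffer out to chunk_size alignment on both ends, pulling *offset
// back as needed.  Worked example with chunk_size = 0x1000:
//
//   in:  *offset = 0x1234, bl.length() = 0x500
//   out: *offset = 0x1000, bl.length() = 0x1000
//        (0x234 zero bytes prepended, 0x8cc zero bytes appended)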
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}

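// Small-write path (length < min_alloc_size).  In rough order of preference:
//   1. write directly into unused-but-allocated space of a nearby mutable
//      blob;
//   2. perform a chunk-aligned deferred overwrite (read head/tail to fill
//      out the chunk, then journal the write);
//   3. reuse an existing blob for a fresh min_alloc_size unit;
//   4. fall back to a brand new blob.
// Candidate blobs are searched both forward and backward from 'offset'
// within one max blob size, as described in the comment below.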
void BlueStore::_do_write_small(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  ceph_assert(length < min_alloc_size);

  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = p2align<uint64_t>(offset, alloc_len);

  bool any_change;

  // search for a suitable extent in both forward and reverse direction in
  // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
  // then check if the blob can be reused via can_reuse_blob, or apply a
  // direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);

#ifdef HAVE_LIBZBD
  // On zoned devices, the first goal is to support non-overwrite workloads,
  // such as RGW, with large, aligned objects.  Therefore, for user writes
  // _do_write_small should not trigger.  OSDs, however, write and update a
  // tiny amount of metadata, such as OSD maps, to disk.  For those cases, we
  // temporarily just pad them to min_alloc_size and write them to a new place
  // on every update.
  if (bdev->is_smr()) {
    uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
    uint64_t b_off0 = b_off;
    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

    // Zero detection -- small block
    if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
      BlobRef b = c->new_blob();
      _pad_zeros(&bl, &b_off0, min_alloc_size);
      wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
    } else { // if (bl.is_zero())
      dout(20) << __func__ << " skip small zero block " << std::hex
               << " (0x" << b_off0 << "~" << bl.length() << ")"
               << " (0x" << b_off << "~" << length << ")"
               << std::dec << dendl;
      logger->inc(l_bluestore_write_small_skipped);
      logger->inc(l_bluestore_write_small_skipped_bytes, length);
    }

    return;
  }
#endif

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = end;
  if (ep != begin) {
    prev_ep = ep;
    --prev_ep;
  }

  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
  // We don't want to have more blobs than there are min_alloc_size units
  // in two max-size blobs.
  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
  bool above_blob_threshold = false;

  inspected_blobs.reserve(blob_threshold);

  uint64_t max_off = 0;
  auto start_ep = ep;
  auto end_ep = ep; // exclusively
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = p2phase(offset, chunk_size);
        tail_pad = p2nphase(end_offs, chunk_size);
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;

        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len < prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred" << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << " lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
                                                 b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);

          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;

        } else {
          head_read = tail_read = 0;
        }

        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
                             head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
                             tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          b->dirty_blob().calc_csum(b_off, bl);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data = std::move(bl);
            dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
                     << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
                                                 b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data to always
                                                    // fit into the reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent.  The rationale is that during GC two chunks
          // from garbage blobs (compressed?) can share logical space within the
          // same AU.  That in turn might be caused by an unaligned len in
          // clone_range2.  Hence the second write will fail in an attempt to
          // reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existing extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

            // Zero detection -- small block
            if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
              _pad_zeros(&bl, &b_off0, chunk_size);

              dout(20) << __func__ << " reuse blob " << *b << std::hex
                       << " (0x" << b_off0 << "~" << bl.length() << ")"
                       << " (0x" << b_off << "~" << length << ")"
                       << std::dec << dendl;

              wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                          false, false);
              logger->inc(l_bluestore_write_small_unused);
            } else { // if (bl.is_zero())
              dout(20) << __func__ << " skip small zero block " << std::hex
                       << " (0x" << b_off0 << "~" << bl.length() << ")"
                       << " (0x" << b_off << "~" << length << ")"
                       << std::dec << dendl;
              logger->inc(l_bluestore_write_small_skipped);
              logger->inc(l_bluestore_write_small_skipped_bytes, length);
            }

            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data to always
                                                  // fit into the reused blob
        // Need to check for pending writes desiring to
        // reuse the same pextent.  The rationale is that during GC two chunks
        // from garbage blobs (compressed?) can share logical space within the
        // same AU.  That in turn might be caused by an unaligned len in
        // clone_range2.  Hence the second write will fail in an attempt to
        // reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

          // Zero detection -- small block
          if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
            uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
          } else { // if (bl.is_zero())
            dout(20) << __func__ << " skip small zero block " << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;
            logger->inc(l_bluestore_write_small_skipped);
            logger->inc(l_bluestore_write_small_skipped_bytes, length);
          }

          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);

  // Zero detection -- small block
  if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
    // new blob.
    BlobRef b = c->new_blob();
    _pad_zeros(&bl, &b_off0, block_size);
    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                min_alloc_size != block_size, // use the 'unused' bitmap only when
                                              // alloc granularity doesn't match
                                              // the disk one
                true);
  } else { // if (bl.is_zero())
    dout(20) << __func__ << " skip small zero block " << std::hex
             << " (0x" << b_off0 << "~" << bl.length() << ")"
             << " (0x" << b_off << "~" << length << ")"
             << std::dec << dendl;
    logger->inc(l_bluestore_write_small_skipped);
    logger->inc(l_bluestore_write_small_skipped_bytes, length);
  }

  return;
}

bool BlueStore::has_null_fm()
{
  return fm->is_null_manager();
}

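// Helper state for deferring pieces of a "big" overwrite.  can_defer()
// checks whether the write at [offset, offset+l) lands in an existing
// mutable, fully allocated blob (expanded to chunk alignment via
// head_read/tail_read) and is small enough to journal, recording what
// apply_defer() needs to build the deferred extent list.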
bool BlueStore::BigDeferredWriteContext::can_defer(
    BlueStore::extent_map_t::iterator ep,
    uint64_t prefer_deferred_size,
    uint64_t block_size,
    uint64_t offset,
    uint64_t l)
{
  bool res = false;
  auto& blob = ep->blob->get_blob();
  if (offset >= ep->blob_start() &&
      blob.is_mutable()) {
    off = offset;
    b_off = offset - ep->blob_start();
    uint64_t chunk_size = blob.get_chunk_size(block_size);
    uint64_t ondisk = blob.get_ondisk_length();
    used = std::min(l, ondisk - b_off);

    // will read some data to fill out the chunk?
    head_read = p2phase<uint64_t>(b_off, chunk_size);
    tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
    b_off -= head_read;

    ceph_assert(b_off % chunk_size == 0);
    ceph_assert(blob_aligned_len() % chunk_size == 0);

    res = blob_aligned_len() < prefer_deferred_size &&
      blob_aligned_len() <= ondisk &&
      blob.is_allocated(b_off, blob_aligned_len());
    if (res) {
      blob_ref = ep->blob;
      blob_start = ep->blob_start();
    }
  }
  return res;
}

bool BlueStore::BigDeferredWriteContext::apply_defer()
{
  int r = blob_ref->get_blob().map(
    b_off, blob_aligned_len(),
    [&](const bluestore_pextent_t& pext,
        uint64_t offset,
        uint64_t length) {
      // Apply the deferred path only if the overwrite covers a pextent
      // partially; if it totally overlaps some pextent, fall back to a
      // regular write instead.
      if (pext.offset < offset ||
          pext.end() > offset + length) {
        res_extents.emplace_back(bluestore_pextent_t(offset, length));
        return 0;
      }
      return -1;
    });
  return r >= 0;
}

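// Execute one deferred big-write fragment set up by can_defer(): read the
// head/tail fill bytes, splice in the new data from blp, update the buffer
// cache, checksums and extent map, and queue the journaled disk write.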
void BlueStore::_do_write_big_apply_deferred(
    TransContext* txc,
    CollectionRef& c,
    OnodeRef o,
    BlueStore::BigDeferredWriteContext& dctx,
    bufferlist::iterator& blp,
    WriteContext* wctx)
{
  bufferlist bl;
  dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
           << " and tail 0x" << dctx.tail_read << std::dec << dendl;
  if (dctx.head_read) {
    int r = _do_read(c.get(), o,
                     dctx.off - dctx.head_read,
                     dctx.head_read,
                     bl,
                     0);
    ceph_assert(r >= 0 && r <= (int)dctx.head_read);
    size_t zlen = dctx.head_read - r;
    if (zlen) {
      bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  blp.copy(dctx.used, bl);

  if (dctx.tail_read) {
    bufferlist tail_bl;
    int r = _do_read(c.get(), o,
                     dctx.off + dctx.used, dctx.tail_read,
                     tail_bl, 0);
    ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
    size_t zlen = dctx.tail_read - r;
    if (zlen) {
      tail_bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    bl.claim_append(tail_bl);
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  auto& b0 = dctx.blob_ref;
  _buffer_cache_write(txc, b0, dctx.b_off, bl,
                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

  b0->dirty_blob().calc_csum(dctx.b_off, bl);

  Extent* le = o->extent_map.set_lextent(c, dctx.off,
    dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);

  // in fact this is a no-op for big writes, but it is left here to maintain
  // uniformity and to avoid it being missed after some future refactor.
  b0->dirty_blob().mark_used(le->blob_offset, le->length);
  txc->statfs_delta.stored() += le->length;

  if (!g_conf()->bluestore_debug_omit_block_device_write) {
    bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
    op->op = bluestore_deferred_op_t::OP_WRITE;
    op->extents.swap(dctx.res_extents);
    op->data = std::move(bl);
  }
}

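// Big-write path (whole allocation units).  For uncompressed writes, first
// try to turn the write into at most two deferred overwrites of existing
// adjacent blobs (BigDeferredWriteContext above), then look for a reusable
// blob, and only then allocate a new one.  For compressed writes, simply
// carve the input into target-blob-size chunks.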
7c673cae
FG
15171void BlueStore::_do_write_big(
15172 TransContext *txc,
15173 CollectionRef &c,
15174 OnodeRef o,
15175 uint64_t offset, uint64_t length,
15176 bufferlist::iterator& blp,
15177 WriteContext *wctx)
15178{
15179 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
15180 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
15181 << " compress " << (int)wctx->compress
15182 << dendl;
15183 logger->inc(l_bluestore_write_big);
15184 logger->inc(l_bluestore_write_big_bytes, length);
11fdf7f2 15185 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
f67539c2 15186 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
7c673cae
FG
15187 while (length > 0) {
15188 bool new_blob = false;
7c673cae
FG
15189 BlobRef b;
15190 uint32_t b_off = 0;
522d829b 15191 uint32_t l = 0;
7c673cae
FG
15192
15193 //attempting to reuse existing blob
15194 if (!wctx->compress) {
522d829b
TL
15195 // enforce target blob alignment with max_bsize
15196 l = max_bsize - p2phase(offset, max_bsize);
15197 l = std::min(uint64_t(l), length);
15198
7c673cae 15199 auto end = o->extent_map.extent_map.end();
f67539c2 15200
522d829b
TL
15201 dout(20) << __func__ << " may be defer: 0x" << std::hex
15202 << offset << "~" << l
15203 << std::dec << dendl;
15204
f67539c2
TL
15205 if (prefer_deferred_size_snapshot &&
15206 l <= prefer_deferred_size_snapshot * 2) {
15207 // Single write that spans two adjusted existing blobs can result
15208 // in up to two deferred blocks of 'prefer_deferred_size'
15209 // So we're trying to minimize the amount of resulting blobs
15210 // and preserve 2 blobs rather than inserting one more in between
15211 // E.g. write 0x10000~20000 over existing blobs
15212 // (0x0~20000 and 0x20000~20000) is better (from subsequent reading
15213 // performance point of view) to result in two deferred writes to
15214 // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
15215
15216 // look for an existing mutable blob we can write into
15217 auto ep = o->extent_map.seek_lextent(offset);
15218 auto ep_next = end;
15219 BigDeferredWriteContext head_info, tail_info;
15220
15221 bool will_defer = ep != end ?
15222 head_info.can_defer(ep,
15223 prefer_deferred_size_snapshot,
15224 block_size,
15225 offset,
15226 l) :
15227 false;
15228 auto offset_next = offset + head_info.used;
15229 auto remaining = l - head_info.used;
15230 if (will_defer && remaining) {
15231 will_defer = false;
15232 if (remaining <= prefer_deferred_size_snapshot) {
15233 ep_next = o->extent_map.seek_lextent(offset_next);
15234 // check if we can defer remaining totally
15235 will_defer = ep_next == end ?
15236 false :
15237 tail_info.can_defer(ep_next,
15238 prefer_deferred_size_snapshot,
15239 block_size,
15240 offset_next,
15241 remaining);
15242 will_defer = will_defer && remaining == tail_info.used;
15243 }
15244 }
15245 if (will_defer) {
15246 dout(20) << __func__ << " " << *(head_info.blob_ref)
15247 << " deferring big " << std::hex
15248 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
15249 << std::dec << " write via deferred"
15250 << dendl;
15251 if (remaining) {
15252 dout(20) << __func__ << " " << *(tail_info.blob_ref)
15253 << " deferring big " << std::hex
15254 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
15255 << std::dec << " write via deferred"
15256 << dendl;
15257 }
15258
15259 will_defer = head_info.apply_defer();
15260 if (!will_defer) {
15261 dout(20) << __func__
15262 << " deferring big fell back, head isn't continuous"
15263 << dendl;
15264 } else if (remaining) {
15265 will_defer = tail_info.apply_defer();
15266 if (!will_defer) {
15267 dout(20) << __func__
15268 << " deferring big fell back, tail isn't continuous"
15269 << dendl;
15270 }
15271 }
15272 }
15273 if (will_defer) {
15274 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
15275 if (remaining) {
15276 _do_write_big_apply_deferred(txc, c, o, tail_info,
15277 blp, wctx);
15278 }
522d829b
TL
15279 dout(20) << __func__ << " defer big: 0x" << std::hex
15280 << offset << "~" << l
15281 << std::dec << dendl;
f67539c2
TL
15282 offset += l;
15283 length -= l;
15284 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
15285 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
15286 continue;
15287 }
15288 }
522d829b 15289 dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
f67539c2
TL
15290
15291 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15292
15293 // seek again as punch_hole could invalidate ep
7c673cae 15294 auto ep = o->extent_map.seek_lextent(offset);
f67539c2
TL
15295 auto begin = o->extent_map.extent_map.begin();
15296 auto prev_ep = end;
15297 if (ep != begin) {
15298 prev_ep = ep;
7c673cae 15299 --prev_ep;
7c673cae 15300 }
f67539c2 15301
7c673cae
FG
15302 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
15303 // search suitable extent in both forward and reverse direction in
15304 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 15305 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
15306 bool any_change;
15307 do {
15308 any_change = false;
15309 if (ep != end && ep->logical_offset < offset + max_bsize) {
522d829b
TL
15310 dout(20) << __func__ << " considering " << *ep
15311 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
f67539c2
TL
15312
15313 if (offset >= ep->blob_start() &&
224ce89b 15314 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
15315 offset - ep->blob_start(),
15316 &l)) {
15317 b = ep->blob;
f67539c2 15318 b_off = offset - ep->blob_start();
7c673cae
FG
15319 prev_ep = end; // to avoid check below
15320 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 15321 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
15322 } else {
15323 ++ep;
15324 any_change = true;
15325 }
15326 }
15327
15328 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
522d829b
TL
15329 dout(20) << __func__ << " considering rev " << *prev_ep
15330 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
f67539c2 15331 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
15332 offset - prev_ep->blob_start(),
15333 &l)) {
15334 b = prev_ep->blob;
15335 b_off = offset - prev_ep->blob_start();
15336 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 15337 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
15338 } else if (prev_ep != begin) {
15339 --prev_ep;
15340 any_change = true;
15341 } else {
15342 prev_ep = end; // to avoid useless first extent re-check
15343 }
15344 }
15345 } while (b == nullptr && any_change);
f67539c2 15346 } else {
522d829b
TL
15347 // try to utilize as long a chunk as permitted in the compression case.
15348 l = std::min(max_bsize, length);
f67539c2
TL
15349 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15350 } // if (!wctx->compress)
15351
7c673cae
FG
15352 if (b == nullptr) {
15353 b = c->new_blob();
15354 b_off = 0;
15355 new_blob = true;
15356 }
7c673cae
FG
15357 bufferlist t;
15358 blp.copy(l, t);
20effc67
TL
15359
15360 // Zero detection -- big block
33c7a0ef 15361 if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) {
20effc67
TL
15362 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
15363
15364 dout(20) << __func__ << " schedule write big: 0x"
522d829b
TL
15365 << std::hex << offset << "~" << l << std::dec
15366 << (new_blob ? " new " : " reuse ")
15367 << *b << dendl;
20effc67
TL
15368
15369 logger->inc(l_bluestore_write_big_blobs);
15370 } else { // zero detection is enabled and t.is_zero()
15371 dout(20) << __func__ << " skip big zero block " << std::hex
15372 << " (0x" << b_off << "~" << t.length() << ")"
15373 << " (0x" << b_off << "~" << l << ")"
15374 << std::dec << dendl;
15375 logger->inc(l_bluestore_write_big_skipped_blobs);
15376 logger->inc(l_bluestore_write_big_skipped_bytes, l);
15377 }
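// note: when zero detection is enabled and the data is all zeroes, the
// punch_hole() above has already removed the old extents, so the range
// simply reads back as zeroes without consuming a new blob or disk space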
15378
7c673cae
FG
15379 offset += l;
15380 length -= l;
7c673cae
FG
15381 }
15382}
15383
15384int BlueStore::_do_alloc_write(
15385 TransContext *txc,
15386 CollectionRef coll,
15387 OnodeRef o,
15388 WriteContext *wctx)
15389{
15390 dout(20) << __func__ << " txc " << txc
15391 << " " << wctx->writes.size() << " blobs"
15392 << dendl;
3efd9988
FG
15393 if (wctx->writes.empty()) {
15394 return 0;
7c673cae
FG
15395 }
15396
7c673cae
FG
15397 CompressorRef c;
15398 double crr = 0;
15399 if (wctx->compress) {
15400 c = select_option(
15401 "compression_algorithm",
15402 compressor,
15403 [&]() {
15404 string val;
15405 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
15406 CompressorRef cp = compressor;
15407 if (!cp || cp->get_type_name() != val) {
15408 cp = Compressor::create(cct, val);
11fdf7f2
TL
15409 if (!cp) {
15410 if (_set_compression_alert(false, val.c_str())) {
15411 derr << __func__ << " unable to initialize " << val.c_str()
15412 << " compressor" << dendl;
15413 }
15414 }
7c673cae
FG
15415 }
15416 return boost::optional<CompressorRef>(cp);
15417 }
15418 return boost::optional<CompressorRef>();
15419 }
15420 );
15421
15422 crr = select_option(
15423 "compression_required_ratio",
15424 cct->_conf->bluestore_compression_required_ratio,
15425 [&]() {
15426 double val;
3efd9988 15427 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
15428 return boost::optional<double>(val);
15429 }
15430 return boost::optional<double>();
15431 }
15432 );
15433 }
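// select_option resolves per-pool overrides: e.g. a pool with
// compression_required_ratio set uses that value for crr, otherwise the
// global bluestore_compression_required_ratio applies; the same pattern
// is used for csum_type just below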
15434
15435 // checksum
11fdf7f2 15436 int64_t csum = csum_type.load();
7c673cae
FG
15437 csum = select_option(
15438 "csum_type",
15439 csum,
15440 [&]() {
11fdf7f2 15441 int64_t val;
3efd9988 15442 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 15443 return boost::optional<int64_t>(val);
7c673cae 15444 }
11fdf7f2 15445 return boost::optional<int64_t>();
7c673cae
FG
15446 }
15447 );
15448
3efd9988
FG
15449 // compress (as needed) and calc needed space
15450 uint64_t need = 0;
11fdf7f2 15451 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 15452 for (auto& wi : wctx->writes) {
3efd9988 15453 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 15454 auto start = mono_clock::now();
7c673cae
FG
15455
15456 // compress
11fdf7f2
TL
15457 ceph_assert(wi.b_off == 0);
15458 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 15459
7c673cae
FG
15460 // FIXME: memory alignment here is bad
15461 bufferlist t;
f67539c2
TL
15462 boost::optional<int32_t> compressor_message;
15463 int r = c->compress(wi.bl, t, compressor_message);
3efd9988 15464 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 15465 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
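      // worked example (assuming crr = 0.875 and min_alloc_size = 0x1000):
      // blob_length = 0x10000 gives want_len_raw = 0xe000 and want_len = 0xe000;
      // the compressed result is kept only if it rounds up to at most 0xe000
      // and is strictly smaller than the raw 0x10000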
a8e16298
TL
15466 bool rejected = false;
15467 uint64_t compressed_len = t.length();
15468 // do an approximate (fast) estimate of the resulting blob size
15469 // that doesn't take header overhead into account
11fdf7f2 15470 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
15471 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
15472 bluestore_compression_header_t chdr;
15473 chdr.type = c->get_type();
15474 chdr.length = t.length();
f67539c2 15475 chdr.compressor_message = compressor_message;
a8e16298
TL
15476 encode(chdr, wi.compressed_bl);
15477 wi.compressed_bl.claim_append(t);
15478
15479 compressed_len = wi.compressed_bl.length();
11fdf7f2 15480 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
15481 if (result_len <= want_len && result_len < wi.blob_length) {
15482 // Cool. We compressed at least as much as we were hoping to.
15483 // pad out to min_alloc_size
15484 wi.compressed_bl.append_zero(result_len - compressed_len);
15485 wi.compressed_len = compressed_len;
15486 wi.compressed = true;
15487 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
15488 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
15489 << " -> 0x" << compressed_len << " => 0x" << result_len
15490 << " with " << c->get_type()
15491 << std::dec << dendl;
15492 txc->statfs_delta.compressed() += compressed_len;
15493 txc->statfs_delta.compressed_original() += wi.blob_length;
15494 txc->statfs_delta.compressed_allocated() += result_len;
15495 logger->inc(l_bluestore_compress_success_count);
15496 need += result_len;
15497 } else {
15498 rejected = true;
15499 }
15500 } else if (r != 0) {
15501 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
15502 << " bytes compressed using " << c->get_type_name()
15503 << std::dec
15504 << " failed with errcode = " << r
15505 << ", leaving uncompressed"
15506 << dendl;
15507 logger->inc(l_bluestore_compress_rejected_count);
15508 need += wi.blob_length;
7c673cae 15509 } else {
a8e16298
TL
15510 rejected = true;
15511 }
15512
15513 if (rejected) {
3efd9988 15514 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 15515 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
15516 << " with " << c->get_type()
15517 << ", which is more than required 0x" << want_len_raw
7c673cae 15518 << " -> 0x" << want_len
3efd9988
FG
15519 << ", leaving uncompressed"
15520 << std::dec << dendl;
15521 logger->inc(l_bluestore_compress_rejected_count);
15522 need += wi.blob_length;
7c673cae 15523 }
494da23a
TL
15524 log_latency("compress@_do_alloc_write",
15525 l_bluestore_compress_lat,
15526 mono_clock::now() - start,
15527 cct->_conf->bluestore_log_op_age );
3efd9988
FG
15528 } else {
15529 need += wi.blob_length;
7c673cae 15530 }
3efd9988 15531 }
a8e16298 15532 PExtentVector prealloc;
3efd9988 15533 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 15534 int64_t prealloc_left = 0;
20effc67 15535 prealloc_left = alloc->allocate(
3efd9988
FG
15536 need, min_alloc_size, need,
15537 0, &prealloc);
eafe8130 15538 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 15539 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 15540 << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2 15541 << " min_alloc_size 0x" << min_alloc_size
20effc67 15542 << " available 0x" << alloc->get_free()
11fdf7f2
TL
15543 << std::dec << dendl;
15544 if (prealloc.size()) {
20effc67 15545 alloc->release(prealloc);
11fdf7f2 15546 }
a8e16298
TL
15547 return -ENOSPC;
15548 }
20effc67 15549 _collect_allocation_stats(need, min_alloc_size, prealloc);
f67539c2 15550
3efd9988
FG
15551 dout(20) << __func__ << " prealloc " << prealloc << dendl;
15552 auto prealloc_pos = prealloc.begin();
522d829b
TL
15553 ceph_assert(prealloc_pos != prealloc.end());
15554 uint64_t prealloc_pos_length = prealloc_pos->length;
3efd9988
FG
15555
15556 for (auto& wi : wctx->writes) {
522d829b 15557 bluestore_blob_t& dblob = wi.b->dirty_blob();
3efd9988
FG
15558 uint64_t b_off = wi.b_off;
15559 bufferlist *l = &wi.bl;
15560 uint64_t final_length = wi.blob_length;
15561 uint64_t csum_length = wi.blob_length;
3efd9988
FG
15562 if (wi.compressed) {
15563 final_length = wi.compressed_bl.length();
15564 csum_length = final_length;
adb31ebb 15565 unsigned csum_order = ctz(csum_length);
3efd9988
FG
15566 l = &wi.compressed_bl;
15567 dblob.set_compressed(wi.blob_length, wi.compressed_len);
adb31ebb 15568 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
15569 dout(20) << __func__
15570 << " initialize csum setting for compressed blob " << *wi.b
adb31ebb
TL
15571 << " csum_type " << Checksummer::get_csum_type_string(csum)
15572 << " csum_order " << csum_order
15573 << " csum_length 0x" << std::hex << csum_length
15574 << " blob_length 0x" << wi.blob_length
15575 << " compressed_length 0x" << wi.compressed_len << std::dec
15576 << dendl;
15577 dblob.init_csum(csum, csum_order, csum_length);
15578 }
3efd9988 15579 } else if (wi.new_blob) {
adb31ebb 15580 unsigned csum_order;
7c673cae 15581 // initialize newly created blob only
11fdf7f2 15582 ceph_assert(dblob.is_mutable());
7c673cae
FG
15583 if (l->length() != wi.blob_length) {
15584 // hrm, maybe we could do better here, but let's not bother.
15585 dout(20) << __func__ << " forcing csum_order to block_size_order "
15586 << block_size_order << dendl;
31f18b77 15587 csum_order = block_size_order;
7c673cae
FG
15588 } else {
15589 csum_order = std::min(wctx->csum_order, ctz(l->length()));
15590 }
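      // e.g. a full 0x8000 blob write with wctx->csum_order = 12 yields
      // csum_order = min(12, ctz(0x8000) = 15) = 12, i.e. 4 KiB csum chunks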
15591 // try to align blob with max_blob_size to improve
15592 // its reuse ratio, e.g. in case of reverse write
15593 uint32_t suggested_boff =
15594 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
15595 if ((suggested_boff % (1 << csum_order)) == 0 &&
15596 suggested_boff + final_length <= max_bsize &&
15597 suggested_boff > b_off) {
181888fb 15598 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 15599 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 15600 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
15601 csum_length += suggested_boff - b_off;
15602 b_off = suggested_boff;
15603 }
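      // e.g. for a reverse writer with max_bsize = 0x10000, a 0x1000 write at
      // logical offset 0xf000 is placed at blob offset 0xf000 instead of 0,
      // leaving [0, 0xf000) of the blob reusable by the writes that follow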
181888fb 15604 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
15605 dout(20) << __func__
15606 << " initialize csum setting for new blob " << *wi.b
181888fb
FG
15607 << " csum_type " << Checksummer::get_csum_type_string(csum)
15608 << " csum_order " << csum_order
15609 << " csum_length 0x" << std::hex << csum_length << std::dec
15610 << dendl;
15611 dblob.init_csum(csum, csum_order, csum_length);
15612 }
7c673cae
FG
15613 }
15614
a8e16298 15615 PExtentVector extents;
3efd9988 15616 int64_t left = final_length;
522d829b
TL
15617 bool has_chunk2defer = false;
15618 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
3efd9988 15619 while (left > 0) {
11fdf7f2 15620 ceph_assert(prealloc_left > 0);
522d829b 15621 has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
3efd9988
FG
15622 if (prealloc_pos->length <= left) {
15623 prealloc_left -= prealloc_pos->length;
15624 left -= prealloc_pos->length;
15625 txc->statfs_delta.allocated() += prealloc_pos->length;
15626 extents.push_back(*prealloc_pos);
15627 ++prealloc_pos;
522d829b
TL
15628 if (prealloc_pos != prealloc.end()) {
15629 prealloc_pos_length = prealloc_pos->length;
15630 }
3efd9988
FG
15631 } else {
15632 extents.emplace_back(prealloc_pos->offset, left);
15633 prealloc_pos->offset += left;
15634 prealloc_pos->length -= left;
15635 prealloc_left -= left;
15636 txc->statfs_delta.allocated() += left;
15637 left = 0;
15638 break;
15639 }
15640 }
7c673cae 15641 for (auto& p : extents) {
3efd9988 15642 txc->allocated.insert(p.offset, p.length);
7c673cae 15643 }
11fdf7f2 15644 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 15645
522d829b 15646 dout(20) << __func__ << " blob " << *wi.b << dendl;
181888fb 15647 if (dblob.has_csum()) {
7c673cae
FG
15648 dblob.calc_csum(b_off, *l);
15649 }
181888fb 15650
7c673cae 15651 if (wi.mark_unused) {
1911f103 15652 ceph_assert(!dblob.is_compressed());
7c673cae
FG
15653 auto b_end = b_off + wi.bl.length();
15654 if (b_off) {
15655 dblob.add_unused(0, b_off);
15656 }
1911f103
TL
15657 uint64_t llen = dblob.get_logical_length();
15658 if (b_end < llen) {
15659 dblob.add_unused(b_end, llen - b_end);
7c673cae
FG
15660 }
15661 }
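    // e.g. a 0x1000 write at b_off 0x3000 into a fresh 0x10000 blob marks
    // [0, 0x3000) and [0x4000, 0x10000) unused, so later small writes can
    // land there in place instead of allocating new space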
15662
15663 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
15664 b_off + (wi.b_off0 - wi.b_off),
15665 wi.length0,
15666 wi.b,
15667 nullptr);
15668 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
15669 txc->statfs_delta.stored() += le->length;
15670 dout(20) << __func__ << " lex " << *le << dendl;
15671 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
15672 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15673
15674 // queue io
11fdf7f2 15675 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 15676 if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
f67539c2 15677 dout(20) << __func__ << " deferring 0x" << std::hex
7c673cae 15678 << l->length() << std::dec << " write via deferred" << dendl;
522d829b 15679 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
7c673cae 15680 op->op = bluestore_deferred_op_t::OP_WRITE;
522d829b 15681 int r = wi.b->get_blob().map(
7c673cae
FG
15682 b_off, l->length(),
15683 [&](uint64_t offset, uint64_t length) {
15684 op->extents.emplace_back(bluestore_pextent_t(offset, length));
15685 return 0;
15686 });
11fdf7f2 15687 ceph_assert(r == 0);
7c673cae
FG
15688 op->data = *l;
15689 } else {
522d829b 15690 wi.b->get_blob().map_bl(
7c673cae
FG
15691 b_off, *l,
15692 [&](uint64_t offset, bufferlist& t) {
15693 bdev->aio_write(offset, t, &txc->ioc, false);
15694 });
f67539c2 15695 logger->inc(l_bluestore_write_new);
7c673cae
FG
15696 }
15697 }
15698 }
11fdf7f2
TL
15699 ceph_assert(prealloc_pos == prealloc.end());
15700 ceph_assert(prealloc_left == 0);
7c673cae
FG
15701 return 0;
15702}
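// The io-queueing rule above reduces to a small predicate; a sketch for
// illustration only (the name 'use_deferred' is hypothetical, and
// prefer_deferred_size mirrors bluestore_prefer_deferred_size):
//
//   static bool use_deferred(uint64_t io_len,
//                            uint64_t min_alloc_chunk_len,
//                            uint64_t prefer_deferred_size) {
//     // defer via the WAL only when the allocation contains a chunk small
//     // enough to benefit and the io itself is below the threshold
//     return min_alloc_chunk_len < prefer_deferred_size &&
//            io_len < prefer_deferred_size;
//   }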
15703
15704void BlueStore::_wctx_finish(
15705 TransContext *txc,
15706 CollectionRef& c,
15707 OnodeRef o,
31f18b77
FG
15708 WriteContext *wctx,
15709 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae 15710{
20effc67
TL
15711#ifdef HAVE_LIBZBD
15712 if (bdev->is_smr()) {
15713 for (auto& w : wctx->writes) {
15714 for (auto& e : w.b->get_blob().get_extents()) {
15715 if (!e.is_valid()) {
15716 continue;
15717 }
15718 uint32_t zone = e.offset / zone_size;
15719 if (!o->onode.zone_offset_refs.count(zone)) {
15720 uint64_t zoff = e.offset % zone_size;
15721 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
15722 << " offset 0x" << zoff << std::dec << dendl;
15723 txc->note_write_zone_offset(o, zone, zoff);
15724 }
15725 }
15726 }
15727 }
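  // e.g. with an (illustrative) zone_size of 0x10000000, an extent at
  // offset 0x234560000 maps to zone 0x23, in-zone offset 0x4560000; a
  // single ref per zone is enough for the cleaner's bookkeeping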
15728 set<uint32_t> zones_with_releases;
15729#endif
15730
7c673cae
FG
15731 auto oep = wctx->old_extents.begin();
15732 while (oep != wctx->old_extents.end()) {
15733 auto &lo = *oep;
15734 oep = wctx->old_extents.erase(oep);
15735 dout(20) << __func__ << " lex_old " << lo.e << dendl;
15736 BlobRef b = lo.e.blob;
15737 const bluestore_blob_t& blob = b->get_blob();
15738 if (blob.is_compressed()) {
15739 if (lo.blob_empty) {
15740 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
15741 }
15742 txc->statfs_delta.compressed_original() -= lo.e.length;
15743 }
15744 auto& r = lo.r;
15745 txc->statfs_delta.stored() -= lo.e.length;
15746 if (!r.empty()) {
f67539c2 15747 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
7c673cae
FG
15748 if (blob.is_shared()) {
15749 PExtentVector final;
15750 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
15751 bool unshare = false;
15752 bool* unshare_ptr =
15753 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 15754 for (auto e : r) {
31f18b77
FG
15755 b->shared_blob->put_ref(
15756 e.offset, e.length, &final,
11fdf7f2 15757 unshare_ptr);
20effc67
TL
15758#ifdef HAVE_LIBZBD
15759 // we also drop zone ref for shared blob extents
15760 if (bdev->is_smr() && e.is_valid()) {
15761 zones_with_releases.insert(e.offset / zone_size);
15762 }
15763#endif
11fdf7f2
TL
15764 }
15765 if (unshare) {
15766 ceph_assert(maybe_unshared_blobs);
15767 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
15768 }
15769 dout(20) << __func__ << " shared_blob release " << final
15770 << " from " << *b->shared_blob << dendl;
15771 txc->write_shared_blob(b->shared_blob);
15772 r.clear();
15773 r.swap(final);
15774 }
15775 }
15776 // we can't invalidate our logical extents as we drop them because
15777 // other lextents (either in our onode or others) may still
15778 // reference them. but we can throw out anything that is no
15779 // longer allocated. Note that this will leave behind edge bits
15780 // that are no longer referenced but not deallocated (until they
15781 // age out of the cache naturally).
15782 b->discard_unallocated(c.get());
15783 for (auto e : r) {
15784 dout(20) << __func__ << " release " << e << dendl;
15785 txc->released.insert(e.offset, e.length);
15786 txc->statfs_delta.allocated() -= e.length;
15787 if (blob.is_compressed()) {
15788 txc->statfs_delta.compressed_allocated() -= e.length;
15789 }
20effc67
TL
15790#ifdef HAVE_LIBZBD
15791 if (bdev->is_smr() && e.is_valid()) {
15792 zones_with_releases.insert(e.offset / zone_size);
15793 }
15794#endif
7c673cae 15795 }
9f95a23c
TL
15796
15797 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
7c673cae
FG
15798 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
15799 << dendl;
15800 o->extent_map.spanning_blob_map.erase(b->id);
15801 }
9f95a23c 15802 delete &lo;
7c673cae 15803 }
20effc67
TL
15804
15805#ifdef HAVE_LIBZBD
15806 if (!zones_with_releases.empty()) {
15807 // we need to fault the entire extent range in here to determine if we've dropped
15808 // all refs to a zone.
15809 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
15810 for (auto& b : o->extent_map.extent_map) {
15811 for (auto& e : b.blob->get_blob().get_extents()) {
15812 if (e.is_valid()) {
15813 zones_with_releases.erase(e.offset / zone_size);
15814 }
15815 }
15816 }
15817 for (auto zone : zones_with_releases) {
15818 auto p = o->onode.zone_offset_refs.find(zone);
15819 if (p != o->onode.zone_offset_refs.end()) {
15820 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
15821 << " offset 0x" << p->second << std::dec << dendl;
15822 txc->note_release_zone_offset(o, zone, p->second);
15823 }
15824 }
15825 }
15826#endif
7c673cae
FG
15827}
15828
15829void BlueStore::_do_write_data(
15830 TransContext *txc,
15831 CollectionRef& c,
15832 OnodeRef o,
15833 uint64_t offset,
15834 uint64_t length,
15835 bufferlist& bl,
15836 WriteContext *wctx)
15837{
15838 uint64_t end = offset + length;
15839 bufferlist::iterator p = bl.begin();
15840
15841 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
15842 (length != min_alloc_size)) {
15843 // we fall within the same block
15844 _do_write_small(txc, c, o, offset, length, p, wctx);
15845 } else {
15846 uint64_t head_offset, head_length;
15847 uint64_t middle_offset, middle_length;
15848 uint64_t tail_offset, tail_length;
15849
15850 head_offset = offset;
11fdf7f2 15851 head_length = p2nphase(offset, min_alloc_size);
7c673cae 15852
11fdf7f2
TL
15853 tail_offset = p2align(end, min_alloc_size);
15854 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
15855
15856 middle_offset = head_offset + head_length;
15857 middle_length = length - head_length - tail_length;
15858
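  // worked example (assuming min_alloc_size = 0x1000):
  // offset = 0x1800, length = 0x3000 -> end = 0x4800
  //   head:   0x1800~0x800  (p2nphase(0x1800, 0x1000) = 0x800)
  //   middle: 0x2000~0x2000 (block-aligned, handled by _do_write_big)
  //   tail:   0x4000~0x800  (p2phase(0x4800, 0x1000) = 0x800)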
15859 if (head_length) {
15860 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
15861 }
15862
f67539c2 15863 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
7c673cae
FG
15864
15865 if (tail_length) {
15866 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
15867 }
15868 }
15869}
15870
31f18b77
FG
15871void BlueStore::_choose_write_options(
15872 CollectionRef& c,
15873 OnodeRef o,
15874 uint32_t fadvise_flags,
15875 WriteContext *wctx)
7c673cae 15876{
7c673cae
FG
15877 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
15878 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 15879 wctx->buffered = true;
7c673cae
FG
15880 } else if (cct->_conf->bluestore_default_buffered_write &&
15881 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
15882 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
15883 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 15884 wctx->buffered = true;
7c673cae
FG
15885 }
15886
31f18b77
FG
15887 // apply basic csum block size
15888 wctx->csum_order = block_size_order;
7c673cae
FG
15889
15890 // compression parameters
15891 unsigned alloc_hints = o->onode.alloc_hint_flags;
15892 auto cm = select_option(
15893 "compression_mode",
31f18b77 15894 comp_mode.load(),
7c673cae
FG
15895 [&]() {
15896 string val;
11fdf7f2 15897 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
15898 return boost::optional<Compressor::CompressionMode>(
15899 Compressor::get_comp_mode_type(val));
7c673cae
FG
15900 }
15901 return boost::optional<Compressor::CompressionMode>();
15902 }
15903 );
31f18b77
FG
15904
15905 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
15906 ((cm == Compressor::COMP_FORCE) ||
15907 (cm == Compressor::COMP_AGGRESSIVE &&
15908 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
15909 (cm == Compressor::COMP_PASSIVE &&
15910 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
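  // e.g. with compression_mode = passive only writes hinted COMPRESSIBLE
  // are compressed; aggressive compresses everything not hinted
  // INCOMPRESSIBLE; force compresses unconditionally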
15911
15912 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
15913 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
15914 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
15915 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 15916 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 15917
7c673cae 15918 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 15919
7c673cae 15920 if (o->onode.expected_write_size) {
224ce89b 15921 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 15922 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 15923 } else {
224ce89b 15924 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
15925 }
15926
31f18b77
FG
15927 if (wctx->compress) {
15928 wctx->target_blob_size = select_option(
7c673cae 15929 "compression_max_blob_size",
31f18b77 15930 comp_max_blob_size.load(),
7c673cae 15931 [&]() {
11fdf7f2
TL
15932 int64_t val;
15933 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
15934 return boost::optional<uint64_t>((uint64_t)val);
15935 }
15936 return boost::optional<uint64_t>();
15937 }
15938 );
15939 }
15940 } else {
31f18b77
FG
15941 if (wctx->compress) {
15942 wctx->target_blob_size = select_option(
7c673cae 15943 "compression_min_blob_size",
31f18b77 15944 comp_min_blob_size.load(),
7c673cae 15945 [&]() {
11fdf7f2
TL
15946 int64_t val;
15947 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
15948 return boost::optional<uint64_t>((uint64_t)val);
15949 }
15950 return boost::optional<uint64_t>();
15951 }
15952 );
15953 }
15954 }
31f18b77 15955
7c673cae 15956 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
15957 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
15958 wctx->target_blob_size = max_bsize;
7c673cae 15959 }
31f18b77 15960
7c673cae
FG
15961 // set the min blob size floor at 2x the min_alloc_size, or else we
15962 // won't be able to allocate a smaller extent for the compressed
15963 // data.
31f18b77
FG
15964 if (wctx->compress &&
15965 wctx->target_blob_size < min_alloc_size * 2) {
15966 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 15967 }
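  // e.g. with min_alloc_size = 0x10000 a 0x10000 target blob compressed
  // 2:1 would need a 0x8000 extent, which cannot be allocated; with the
  // 0x20000 floor the compressed result fits a single 0x10000 unit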
31f18b77
FG
15968
15969 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
15970 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
15971 << " compress=" << (int)wctx->compress
15972 << " buffered=" << (int)wctx->buffered
31f18b77
FG
15973 << std::dec << dendl;
15974}
15975
15976int BlueStore::_do_gc(
15977 TransContext *txc,
15978 CollectionRef& c,
15979 OnodeRef o,
31f18b77
FG
15980 const WriteContext& wctx,
15981 uint64_t *dirty_start,
15982 uint64_t *dirty_end)
15983{
31f18b77 15984
1adf2230 15985 bool dirty_range_updated = false;
31f18b77 15986 WriteContext wctx_gc;
7c673cae 15987 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 15988
eafe8130 15989 auto & extents_to_collect = wctx.extents_to_gc;
31f18b77
FG
15990 for (auto it = extents_to_collect.begin();
15991 it != extents_to_collect.end();
15992 ++it) {
15993 bufferlist bl;
eafe8130
TL
15994 auto offset = (*it).first;
15995 auto length = (*it).second;
15996 dout(20) << __func__ << " processing 0x" << std::hex
15997 << offset << "~" << length << std::dec
15998 << dendl;
15999 int r = _do_read(c.get(), o, offset, length, bl, 0);
16000 ceph_assert(r == (int)length);
31f18b77 16001
eafe8130
TL
16002 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
16003 logger->inc(l_bluestore_gc_merged, length);
31f18b77 16004
eafe8130
TL
16005 if (*dirty_start > offset) {
16006 *dirty_start = offset;
1adf2230 16007 dirty_range_updated = true;
31f18b77
FG
16008 }
16009
eafe8130
TL
16010 if (*dirty_end < offset + length) {
16011 *dirty_end = offset + length;
1adf2230 16012 dirty_range_updated = true;
31f18b77
FG
16013 }
16014 }
1adf2230
AA
16015 if (dirty_range_updated) {
16016 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
16017 }
31f18b77
FG
16018
16019 dout(30) << __func__ << " alloc write" << dendl;
16020 int r = _do_alloc_write(txc, c, o, &wctx_gc);
16021 if (r < 0) {
16022 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
16023 << dendl;
16024 return r;
16025 }
16026
16027 _wctx_finish(txc, c, o, &wctx_gc);
16028 return 0;
16029}
16030
16031int BlueStore::_do_write(
16032 TransContext *txc,
16033 CollectionRef& c,
16034 OnodeRef o,
16035 uint64_t offset,
16036 uint64_t length,
16037 bufferlist& bl,
16038 uint32_t fadvise_flags)
16039{
16040 int r = 0;
16041
16042 dout(20) << __func__
16043 << " " << o->oid
16044 << " 0x" << std::hex << offset << "~" << length
16045 << " - have 0x" << o->onode.size
16046 << " (" << std::dec << o->onode.size << ")"
f67539c2
TL
16047 << " bytes" << std::hex
16048 << " fadvise_flags 0x" << fadvise_flags
16049 << " alloc_hint 0x" << o->onode.alloc_hint_flags
16050 << " expected_object_size " << o->onode.expected_object_size
16051 << " expected_write_size " << o->onode.expected_write_size
16052 << std::dec
31f18b77 16053 << dendl;
81eedcae 16054 _dump_onode<30>(cct, *o);
31f18b77
FG
16055
16056 if (length == 0) {
16057 return 0;
16058 }
16059
16060 uint64_t end = offset + length;
16061
16062 GarbageCollector gc(c->store->cct);
eafe8130 16063 int64_t benefit = 0;
31f18b77
FG
16064 auto dirty_start = offset;
16065 auto dirty_end = end;
16066
16067 WriteContext wctx;
16068 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
16069 o->extent_map.fault_range(db, offset, length);
16070 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
16071 r = _do_alloc_write(txc, c, o, &wctx);
16072 if (r < 0) {
16073 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
16074 << dendl;
16075 goto out;
16076 }
16077
eafe8130
TL
16078 if (wctx.extents_to_gc.empty() ||
16079 wctx.extents_to_gc.range_start() > offset ||
16080 wctx.extents_to_gc.range_end() < offset + length) {
16081 benefit = gc.estimate(offset,
16082 length,
16083 o->extent_map,
16084 wctx.old_extents,
16085 min_alloc_size);
16086 }
16087
31f18b77
FG
16088 // NB: _wctx_finish() will empty old_extents
16089 // so we must do gc estimation before that
7c673cae
FG
16090 _wctx_finish(txc, c, o, &wctx);
16091 if (end > o->onode.size) {
16092 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 16093 << std::dec << dendl;
7c673cae
FG
16094 o->onode.size = end;
16095 }
16096
11fdf7f2 16097 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
eafe8130
TL
16098 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
16099 dout(20) << __func__
16100 << " perform garbage collection for compressed extents, "
16101 << "expected benefit = " << benefit << " AUs" << dendl;
16102 }
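  // e.g. an estimated benefit of 8 allocation units triggers collection
  // whenever bluestore_gc_enable_total_threshold is 8 or lower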
16103 if (!wctx.extents_to_gc.empty()) {
16104 dout(20) << __func__ << " perform garbage collection" << dendl;
16105
16106 r = _do_gc(txc, c, o,
16107 wctx,
16108 &dirty_start, &dirty_end);
16109 if (r < 0) {
16110 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
16111 << dendl;
16112 goto out;
7c673cae 16113 }
eafe8130
TL
16114 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
16115 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 16116 }
7c673cae 16117 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
16118 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
16119
7c673cae
FG
16120 r = 0;
16121
16122 out:
16123 return r;
16124}
16125
16126int BlueStore::_write(TransContext *txc,
16127 CollectionRef& c,
16128 OnodeRef& o,
31f18b77
FG
16129 uint64_t offset, size_t length,
16130 bufferlist& bl,
16131 uint32_t fadvise_flags)
7c673cae
FG
16132{
16133 dout(15) << __func__ << " " << c->cid << " " << o->oid
16134 << " 0x" << std::hex << offset << "~" << length << std::dec
16135 << dendl;
35e4c445
FG
16136 int r = 0;
16137 if (offset + length >= OBJECT_MAX_SIZE) {
16138 r = -E2BIG;
16139 } else {
16140 _assign_nid(txc, o);
16141 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
16142 txc->write_onode(o);
16143 }
7c673cae
FG
16144 dout(10) << __func__ << " " << c->cid << " " << o->oid
16145 << " 0x" << std::hex << offset << "~" << length << std::dec
16146 << " = " << r << dendl;
16147 return r;
16148}
16149
16150int BlueStore::_zero(TransContext *txc,
16151 CollectionRef& c,
16152 OnodeRef& o,
16153 uint64_t offset, size_t length)
16154{
16155 dout(15) << __func__ << " " << c->cid << " " << o->oid
16156 << " 0x" << std::hex << offset << "~" << length << std::dec
16157 << dendl;
35e4c445
FG
16158 int r = 0;
16159 if (offset + length >= OBJECT_MAX_SIZE) {
16160 r = -E2BIG;
16161 } else {
16162 _assign_nid(txc, o);
16163 r = _do_zero(txc, c, o, offset, length);
16164 }
7c673cae
FG
16165 dout(10) << __func__ << " " << c->cid << " " << o->oid
16166 << " 0x" << std::hex << offset << "~" << length << std::dec
16167 << " = " << r << dendl;
16168 return r;
16169}
16170
16171int BlueStore::_do_zero(TransContext *txc,
16172 CollectionRef& c,
16173 OnodeRef& o,
16174 uint64_t offset, size_t length)
16175{
16176 dout(15) << __func__ << " " << c->cid << " " << o->oid
16177 << " 0x" << std::hex << offset << "~" << length << std::dec
16178 << dendl;
16179 int r = 0;
16180
81eedcae 16181 _dump_onode<30>(cct, *o);
7c673cae
FG
16182
16183 WriteContext wctx;
16184 o->extent_map.fault_range(db, offset, length);
16185 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 16186 o->extent_map.dirty_range(offset, length);
7c673cae
FG
16187 _wctx_finish(txc, c, o, &wctx);
16188
b32b8144 16189 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
16190 o->onode.size = offset + length;
16191 dout(20) << __func__ << " extending size to " << offset + length
16192 << dendl;
16193 }
16194 txc->write_onode(o);
16195
16196 dout(10) << __func__ << " " << c->cid << " " << o->oid
16197 << " 0x" << std::hex << offset << "~" << length << std::dec
16198 << " = " << r << dendl;
16199 return r;
16200}
16201
16202void BlueStore::_do_truncate(
31f18b77
FG
16203 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
16204 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
16205{
16206 dout(15) << __func__ << " " << c->cid << " " << o->oid
16207 << " 0x" << std::hex << offset << std::dec << dendl;
16208
81eedcae 16209 _dump_onode<30>(cct, *o);
7c673cae
FG
16210
16211 if (offset == o->onode.size)
31f18b77 16212 return;
7c673cae 16213
f67539c2 16214 WriteContext wctx;
7c673cae 16215 if (offset < o->onode.size) {
7c673cae
FG
16216 uint64_t length = o->onode.size - offset;
16217 o->extent_map.fault_range(db, offset, length);
16218 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 16219 o->extent_map.dirty_range(offset, length);
20effc67 16220
31f18b77 16221 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
16222
16223 // if we have shards past EOF, ask for a reshard
16224 if (!o->onode.extent_map_shards.empty() &&
16225 o->onode.extent_map_shards.back().offset >= offset) {
16226 dout(10) << __func__ << " request reshard past EOF" << dendl;
16227 if (offset) {
16228 o->extent_map.request_reshard(offset - 1, offset + length);
16229 } else {
16230 o->extent_map.request_reshard(0, length);
16231 }
16232 }
16233 }
16234
16235 o->onode.size = offset;
16236
16237 txc->write_onode(o);
16238}
16239
35e4c445 16240int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
16241 CollectionRef& c,
16242 OnodeRef& o,
16243 uint64_t offset)
16244{
16245 dout(15) << __func__ << " " << c->cid << " " << o->oid
16246 << " 0x" << std::hex << offset << std::dec
16247 << dendl;
20effc67
TL
16248
16249 auto start_time = mono_clock::now();
35e4c445
FG
16250 int r = 0;
16251 if (offset >= OBJECT_MAX_SIZE) {
16252 r = -E2BIG;
16253 } else {
16254 _do_truncate(txc, c, o, offset);
16255 }
20effc67
TL
16256 log_latency_fn(
16257 __func__,
16258 l_bluestore_truncate_lat,
16259 mono_clock::now() - start_time,
16260 cct->_conf->bluestore_log_op_age,
16261 [&](const ceph::timespan& lat) {
16262 ostringstream ostr;
16263 ostr << ", lat = " << timespan_str(lat)
16264 << " cid =" << c->cid
16265 << " oid =" << o->oid;
16266 return ostr.str();
16267 }
16268 );
35e4c445
FG
16269 dout(10) << __func__ << " " << c->cid << " " << o->oid
16270 << " 0x" << std::hex << offset << std::dec
16271 << " = " << r << dendl;
16272 return r;
7c673cae
FG
16273}
16274
16275int BlueStore::_do_remove(
16276 TransContext *txc,
16277 CollectionRef& c,
16278 OnodeRef o)
16279{
31f18b77 16280 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
16281 bool is_gen = !o->oid.is_no_gen();
16282 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
16283 if (o->onode.has_omap()) {
16284 o->flush();
9f95a23c 16285 _do_omap_clear(txc, o);
7c673cae
FG
16286 }
16287 o->exists = false;
16288 string key;
16289 for (auto &s : o->extent_map.shards) {
16290 dout(20) << __func__ << " removing shard 0x" << std::hex
16291 << s.shard_info->offset << std::dec << dendl;
16292 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
16293 [&](const string& final_key) {
16294 txc->t->rmkey(PREFIX_OBJ, final_key);
16295 }
16296 );
16297 }
16298 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 16299 txc->note_removed_object(o);
7c673cae
FG
16300 o->extent_map.clear();
16301 o->onode = bluestore_onode_t();
16302 _debug_obj_on_delete(o->oid);
31f18b77 16303
224ce89b
WB
16304 if (!is_gen || maybe_unshared_blobs.empty()) {
16305 return 0;
16306 }
31f18b77 16307
224ce89b
WB
16308 // see if we can unshare blobs still referenced by the head
16309 dout(10) << __func__ << " gen and maybe_unshared_blobs "
16310 << maybe_unshared_blobs << dendl;
16311 ghobject_t nogen = o->oid;
16312 nogen.generation = ghobject_t::NO_GEN;
f67539c2 16313 OnodeRef h = c->get_onode(nogen, false);
224ce89b
WB
16314
16315 if (!h || !h->exists) {
16316 return 0;
16317 }
16318
16319 dout(20) << __func__ << " checking for unshareable blobs on " << h
16320 << " " << h->oid << dendl;
16321 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
16322 for (auto& e : h->extent_map.extent_map) {
16323 const bluestore_blob_t& b = e.blob->get_blob();
16324 SharedBlob *sb = e.blob->shared_blob.get();
16325 if (b.is_shared() &&
16326 sb->loaded &&
16327 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
16328 if (b.is_compressed()) {
16329 expect[sb].get(0, b.get_ondisk_length());
16330 } else {
16331 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
16332 expect[sb].get(off, len);
16333 return 0;
16334 });
16335 }
224ce89b
WB
16336 }
16337 }
31f18b77 16338
224ce89b
WB
16339 vector<SharedBlob*> unshared_blobs;
16340 unshared_blobs.reserve(maybe_unshared_blobs.size());
16341 for (auto& p : expect) {
16342 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
16343 if (p.first->persistent->ref_map == p.second) {
16344 SharedBlob *sb = p.first;
16345 dout(20) << __func__ << " unsharing " << *sb << dendl;
16346 unshared_blobs.push_back(sb);
16347 txc->unshare_blob(sb);
16348 uint64_t sbid = c->make_blob_unshared(sb);
16349 string key;
16350 get_shared_blob_key(sbid, &key);
16351 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
16352 }
16353 }
16354
16355 if (unshared_blobs.empty()) {
16356 return 0;
16357 }
16358
224ce89b
WB
16359 for (auto& e : h->extent_map.extent_map) {
16360 const bluestore_blob_t& b = e.blob->get_blob();
16361 SharedBlob *sb = e.blob->shared_blob.get();
16362 if (b.is_shared() &&
16363 std::find(unshared_blobs.begin(), unshared_blobs.end(),
16364 sb) != unshared_blobs.end()) {
16365 dout(20) << __func__ << " unsharing " << e << dendl;
16366 bluestore_blob_t& blob = e.blob->dirty_blob();
16367 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 16368 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
16369 }
16370 }
224ce89b
WB
16371 txc->write_onode(h);
16372
7c673cae
FG
16373 return 0;
16374}
16375
16376int BlueStore::_remove(TransContext *txc,
16377 CollectionRef& c,
16378 OnodeRef &o)
16379{
11fdf7f2
TL
16380 dout(15) << __func__ << " " << c->cid << " " << o->oid
16381 << " onode " << o.get()
16382 << " txc "<< txc << dendl;
20effc67 16383 auto start_time = mono_clock::now();
7c673cae 16384 int r = _do_remove(txc, c, o);
20effc67 16385
adb31ebb
TL
16386 log_latency_fn(
16387 __func__,
16388 l_bluestore_remove_lat,
16389 mono_clock::now() - start_time,
16390 cct->_conf->bluestore_log_op_age,
16391 [&](const ceph::timespan& lat) {
16392 ostringstream ostr;
16393 ostr << ", lat = " << timespan_str(lat)
16394 << " cid =" << c->cid
16395 << " oid =" << o->oid;
16396 return ostr.str();
16397 }
16398 );
16399
7c673cae
FG
16400 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16401 return r;
16402}
16403
16404int BlueStore::_setattr(TransContext *txc,
16405 CollectionRef& c,
16406 OnodeRef& o,
16407 const string& name,
16408 bufferptr& val)
16409{
16410 dout(15) << __func__ << " " << c->cid << " " << o->oid
16411 << " " << name << " (" << val.length() << " bytes)"
16412 << dendl;
16413 int r = 0;
3efd9988
FG
16414 if (val.is_partial()) {
16415 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
16416 val.length());
f91f0fd5 16417 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
16418 } else {
16419 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 16420 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 16421 }
7c673cae
FG
16422 txc->write_onode(o);
16423 dout(10) << __func__ << " " << c->cid << " " << o->oid
16424 << " " << name << " (" << val.length() << " bytes)"
16425 << " = " << r << dendl;
16426 return r;
16427}
16428
16429int BlueStore::_setattrs(TransContext *txc,
16430 CollectionRef& c,
16431 OnodeRef& o,
16432 const map<string,bufferptr>& aset)
16433{
16434 dout(15) << __func__ << " " << c->cid << " " << o->oid
16435 << " " << aset.size() << " keys"
16436 << dendl;
16437 int r = 0;
16438 for (map<string,bufferptr>::const_iterator p = aset.begin();
16439 p != aset.end(); ++p) {
3efd9988
FG
16440 if (p->second.is_partial()) {
16441 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 16442 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 16443 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
16444 } else {
16445 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 16446 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 16447 }
7c673cae
FG
16448 }
16449 txc->write_onode(o);
16450 dout(10) << __func__ << " " << c->cid << " " << o->oid
16451 << " " << aset.size() << " keys"
16452 << " = " << r << dendl;
16453 return r;
16454}
16455
16456
16457int BlueStore::_rmattr(TransContext *txc,
16458 CollectionRef& c,
16459 OnodeRef& o,
16460 const string& name)
16461{
16462 dout(15) << __func__ << " " << c->cid << " " << o->oid
16463 << " " << name << dendl;
16464 int r = 0;
16465 auto it = o->onode.attrs.find(name.c_str());
16466 if (it == o->onode.attrs.end())
16467 goto out;
16468
16469 o->onode.attrs.erase(it);
16470 txc->write_onode(o);
16471
16472 out:
16473 dout(10) << __func__ << " " << c->cid << " " << o->oid
16474 << " " << name << " = " << r << dendl;
16475 return r;
16476}
16477
16478int BlueStore::_rmattrs(TransContext *txc,
16479 CollectionRef& c,
16480 OnodeRef& o)
16481{
16482 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16483 int r = 0;
16484
16485 if (o->onode.attrs.empty())
16486 goto out;
16487
16488 o->onode.attrs.clear();
16489 txc->write_onode(o);
16490
16491 out:
16492 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16493 return r;
16494}
16495
9f95a23c 16496void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 16497{
9f95a23c 16498 const string& omap_prefix = o->get_omap_prefix();
7c673cae 16499 string prefix, tail;
9f95a23c
TL
16500 o->get_omap_header(&prefix);
16501 o->get_omap_tail(&tail);
11fdf7f2 16502 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 16503 txc->t->rmkey(omap_prefix, tail);
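  // rm_range_keys drops the half-open range [header, tail), so the tail
  // sentinel key is removed separately with the explicit rmkey above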
20effc67 16504 o->onode.clear_omap_flag();
11fdf7f2
TL
16505 dout(20) << __func__ << " remove range start: "
16506 << pretty_binary_string(prefix) << " end: "
16507 << pretty_binary_string(tail) << dendl;
7c673cae
FG
16508}
16509
16510int BlueStore::_omap_clear(TransContext *txc,
16511 CollectionRef& c,
16512 OnodeRef& o)
16513{
16514 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
20effc67
TL
16515 auto t0 = mono_clock::now();
16516
7c673cae
FG
16517 int r = 0;
16518 if (o->onode.has_omap()) {
16519 o->flush();
9f95a23c 16520 _do_omap_clear(txc, o);
7c673cae
FG
16521 txc->write_onode(o);
16522 }
20effc67
TL
16523 logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);
16524
7c673cae
FG
16525 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16526 return r;
16527}
16528
16529int BlueStore::_omap_setkeys(TransContext *txc,
16530 CollectionRef& c,
16531 OnodeRef& o,
16532 bufferlist &bl)
16533{
16534 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16535 int r;
11fdf7f2 16536 auto p = bl.cbegin();
7c673cae
FG
16537 __u32 num;
16538 if (!o->onode.has_omap()) {
11fdf7f2 16539 if (o->oid.is_pgmeta()) {
9f95a23c
TL
16540 o->onode.set_omap_flags_pgmeta();
16541 } else {
522d829b 16542 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 16543 }
7c673cae 16544 txc->write_onode(o);
494da23a 16545
9f95a23c 16546 const string& prefix = o->get_omap_prefix();
494da23a
TL
16547 string key_tail;
16548 bufferlist tail;
9f95a23c 16549 o->get_omap_tail(&key_tail);
494da23a 16550 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
16551 } else {
16552 txc->note_modified_object(o);
16553 }
9f95a23c 16554 const string& prefix = o->get_omap_prefix();
7c673cae 16555 string final_key;
9f95a23c
TL
16556 o->get_omap_key(string(), &final_key);
16557 size_t base_key_len = final_key.size();
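  // omap keys are stored as <per-onode prefix> + user key; e.g. user key
  // "foo" becomes <prefix>"foo". resizing back to base_key_len below lets
  // the loop reuse the prefix for every key in the batch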
11fdf7f2 16558 decode(num, p);
7c673cae
FG
16559 while (num--) {
16560 string key;
16561 bufferlist value;
11fdf7f2
TL
16562 decode(key, p);
16563 decode(value, p);
9f95a23c 16564 final_key.resize(base_key_len); // keep prefix
7c673cae 16565 final_key += key;
11fdf7f2 16566 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 16567 << " <- " << key << dendl;
11fdf7f2 16568 txc->t->set(prefix, final_key, value);
7c673cae
FG
16569 }
16570 r = 0;
16571 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16572 return r;
16573}
16574
16575int BlueStore::_omap_setheader(TransContext *txc,
16576 CollectionRef& c,
16577 OnodeRef &o,
16578 bufferlist& bl)
16579{
16580 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16581 int r;
16582 string key;
16583 if (!o->onode.has_omap()) {
11fdf7f2 16584 if (o->oid.is_pgmeta()) {
9f95a23c
TL
16585 o->onode.set_omap_flags_pgmeta();
16586 } else {
522d829b 16587 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 16588 }
7c673cae 16589 txc->write_onode(o);
494da23a 16590
9f95a23c 16591 const string& prefix = o->get_omap_prefix();
494da23a
TL
16592 string key_tail;
16593 bufferlist tail;
9f95a23c 16594 o->get_omap_tail(&key_tail);
494da23a 16595 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
16596 } else {
16597 txc->note_modified_object(o);
16598 }
9f95a23c
TL
16599 const string& prefix = o->get_omap_prefix();
16600 o->get_omap_header(&key);
11fdf7f2 16601 txc->t->set(prefix, key, bl);
7c673cae
FG
16602 r = 0;
16603 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16604 return r;
16605}
16606
16607int BlueStore::_omap_rmkeys(TransContext *txc,
16608 CollectionRef& c,
16609 OnodeRef& o,
16610 bufferlist& bl)
16611{
16612 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16613 int r = 0;
11fdf7f2 16614 auto p = bl.cbegin();
7c673cae
FG
16615 __u32 num;
16616 string final_key;
16617
16618 if (!o->onode.has_omap()) {
16619 goto out;
16620 }
11fdf7f2 16621 {
9f95a23c
TL
16622 const string& prefix = o->get_omap_prefix();
16623 o->get_omap_key(string(), &final_key);
16624 size_t base_key_len = final_key.size();
11fdf7f2
TL
16625 decode(num, p);
16626 while (num--) {
16627 string key;
16628 decode(key, p);
9f95a23c 16629 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
16630 final_key += key;
16631 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
16632 << " <- " << key << dendl;
16633 txc->t->rmkey(prefix, final_key);
16634 }
7c673cae
FG
16635 }
16636 txc->note_modified_object(o);
16637
16638 out:
16639 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16640 return r;
16641}
16642
16643int BlueStore::_omap_rmkey_range(TransContext *txc,
16644 CollectionRef& c,
16645 OnodeRef& o,
16646 const string& first, const string& last)
16647{
16648 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
16649 string key_first, key_last;
16650 int r = 0;
16651 if (!o->onode.has_omap()) {
16652 goto out;
16653 }
11fdf7f2 16654 {
9f95a23c 16655 const string& prefix = o->get_omap_prefix();
11fdf7f2 16656 o->flush();
9f95a23c
TL
16657 o->get_omap_key(first, &key_first);
16658 o->get_omap_key(last, &key_last);
11fdf7f2
TL
16659 txc->t->rm_range_keys(prefix, key_first, key_last);
16660 dout(20) << __func__ << " remove range start: "
16661 << pretty_binary_string(key_first) << " end: "
16662 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
16663 }
16664 txc->note_modified_object(o);
16665
16666 out:
16667 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16668 return r;
16669}
16670
16671int BlueStore::_set_alloc_hint(
16672 TransContext *txc,
16673 CollectionRef& c,
16674 OnodeRef& o,
16675 uint64_t expected_object_size,
16676 uint64_t expected_write_size,
16677 uint32_t flags)
16678{
16679 dout(15) << __func__ << " " << c->cid << " " << o->oid
16680 << " object_size " << expected_object_size
16681 << " write_size " << expected_write_size
16682 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16683 << dendl;
16684 int r = 0;
16685 o->onode.expected_object_size = expected_object_size;
16686 o->onode.expected_write_size = expected_write_size;
16687 o->onode.alloc_hint_flags = flags;
16688 txc->write_onode(o);
16689 dout(10) << __func__ << " " << c->cid << " " << o->oid
16690 << " object_size " << expected_object_size
16691 << " write_size " << expected_write_size
16692 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16693 << " = " << r << dendl;
16694 return r;
16695}
16696
16697int BlueStore::_clone(TransContext *txc,
16698 CollectionRef& c,
16699 OnodeRef& oldo,
16700 OnodeRef& newo)
16701{
16702 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16703 << newo->oid << dendl;
16704 int r = 0;
16705 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
16706 derr << __func__ << " mismatched hash on " << oldo->oid
16707 << " and " << newo->oid << dendl;
16708 return -EINVAL;
16709 }
16710
7c673cae
FG
16711 _assign_nid(txc, newo);
16712
16713 // clone data
16714 oldo->flush();
16715 _do_truncate(txc, c, newo, 0);
16716 if (cct->_conf->bluestore_clone_cow) {
16717 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
16718 } else {
16719 bufferlist bl;
16720 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
16721 if (r < 0)
16722 goto out;
16723 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
16724 if (r < 0)
16725 goto out;
16726 }
16727
16728 // clone attrs
16729 newo->onode.attrs = oldo->onode.attrs;
16730
16731 // clone omap
16732 if (newo->onode.has_omap()) {
16733 dout(20) << __func__ << " clearing old omap data" << dendl;
16734 newo->flush();
9f95a23c 16735 _do_omap_clear(txc, newo);
7c673cae
FG
16736 }
16737 if (oldo->onode.has_omap()) {
16738 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 16739 if (newo->oid.is_pgmeta()) {
9f95a23c
TL
16740 newo->onode.set_omap_flags_pgmeta();
16741 } else {
522d829b 16742 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
7c673cae 16743 }
20effc67
TL
16744 // check that the omap key prefix is exactly the same size for both objects,
16745 // otherwise rewrite_omap_key would corrupt data
16746 ceph_assert(oldo->onode.flags == newo->onode.flags);
9f95a23c 16747 const string& prefix = newo->get_omap_prefix();
7c673cae 16748 string head, tail;
9f95a23c
TL
16749 oldo->get_omap_header(&head);
16750 oldo->get_omap_tail(&tail);
33c7a0ef 16751 KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
7c673cae
FG
16752 it->lower_bound(head);
16753 while (it->valid()) {
16754 if (it->key() >= tail) {
16755 dout(30) << __func__ << " reached tail" << dendl;
16756 break;
16757 } else {
16758 dout(30) << __func__ << " got header/data "
16759 << pretty_binary_string(it->key()) << dendl;
16760 string key;
9f95a23c 16761 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 16762 txc->t->set(prefix, key, it->value());
7c673cae
FG
16763 }
16764 it->next();
16765 }
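      // rewrite_omap_key swaps the source onode's key prefix for the
      // destination's, so each value is copied verbatim under the new object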
494da23a
TL
16766 string new_tail;
16767 bufferlist new_tail_value;
9f95a23c 16768 newo->get_omap_tail(&new_tail);
494da23a 16769 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
16770 }
16771
16772 txc->write_onode(newo);
16773 r = 0;
16774
16775 out:
16776 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16777 << newo->oid << " = " << r << dendl;
16778 return r;
16779}
16780
16781int BlueStore::_do_clone_range(
16782 TransContext *txc,
16783 CollectionRef& c,
16784 OnodeRef& oldo,
16785 OnodeRef& newo,
224ce89b
WB
16786 uint64_t srcoff,
16787 uint64_t length,
16788 uint64_t dstoff)
7c673cae
FG
16789{
16790 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16791 << newo->oid
16792 << " 0x" << std::hex << srcoff << "~" << length << " -> "
16793 << " 0x" << dstoff << "~" << length << std::dec << dendl;
16794 oldo->extent_map.fault_range(db, srcoff, length);
16795 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
16796 _dump_onode<30>(cct, *oldo);
16797 _dump_onode<30>(cct, *newo);
7c673cae 16798
11fdf7f2 16799 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
7c673cae 16800
20effc67
TL
16801#ifdef HAVE_LIBZBD
16802 if (bdev->is_smr()) {
16803 // duplicate the refs for the shared region.
16804 Extent dummy(dstoff);
16805 for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
16806 e != newo->extent_map.extent_map.end();
16807 ++e) {
16808 if (e->logical_offset >= dstoff + length) {
16809 break;
16810 }
16811 for (auto& ex : e->blob->get_blob().get_extents()) {
16812 // note that we may introduce a new extent reference that is
16813 // earlier than the first zone ref. we allow this since it is
16814 // a lot of work to avoid and has marginal impact on cleaning
16815 // performance.
16816 if (!ex.is_valid()) {
16817 continue;
16818 }
16819 uint32_t zone = ex.offset / zone_size;
16820 if (!newo->onode.zone_offset_refs.count(zone)) {
16821 uint64_t zoff = ex.offset % zone_size;
16822 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16823 << " offset 0x" << zoff << std::dec
16824 << " -> " << newo->oid << dendl;
16825 txc->note_write_zone_offset(newo, zone, zoff);
16826 }
16827 }
16828 }
16829 }
16830#endif
16831
16832 _dump_onode<30>(cct, *oldo);
16833 _dump_onode<30>(cct, *newo);
16834 return 0;
16835}
16836
16837int BlueStore::_clone_range(TransContext *txc,
16838 CollectionRef& c,
16839 OnodeRef& oldo,
16840 OnodeRef& newo,
16841 uint64_t srcoff, uint64_t length, uint64_t dstoff)
16842{
16843 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
7c673cae
FG
16844 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16845 << " to offset 0x" << dstoff << std::dec << dendl;
16846 int r = 0;
16847
35e4c445
FG
16848 if (srcoff + length >= OBJECT_MAX_SIZE ||
16849 dstoff + length >= OBJECT_MAX_SIZE) {
16850 r = -E2BIG;
16851 goto out;
16852 }
7c673cae
FG
16853 if (srcoff + length > oldo->onode.size) {
16854 r = -EINVAL;
16855 goto out;
16856 }
16857
7c673cae
FG
16858 _assign_nid(txc, newo);
16859
16860 if (length > 0) {
16861 if (cct->_conf->bluestore_clone_cow) {
16862 _do_zero(txc, c, newo, dstoff, length);
16863 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
16864 } else {
16865 bufferlist bl;
16866 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
16867 if (r < 0)
16868 goto out;
16869 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
16870 if (r < 0)
16871 goto out;
16872 }
16873 }
16874
16875 txc->write_onode(newo);
16876 r = 0;
16877
16878 out:
16879 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16880 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16881 << " to offset 0x" << dstoff << std::dec
16882 << " = " << r << dendl;
16883 return r;
16884}
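// How this is reached from the public API (editor's sketch; the handles and
// offsets here are illustrative and error handling is omitted):
//
//   ObjectStore::Transaction t;
//   t.clone_range(cid, old_oid, new_oid,
//                 0 /*srcoff*/, 0x1000 /*length*/, 0 /*dstoff*/);
//   store->queue_transaction(ch, std::move(t));
//
// When bluestore_clone_cow is enabled the overlapping blobs are shared via
// extent_map.dup() rather than copied; otherwise the range is read back and
// rewritten through the normal write path.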
16885
16886int BlueStore::_rename(TransContext *txc,
16887 CollectionRef& c,
16888 OnodeRef& oldo,
16889 OnodeRef& newo,
16890 const ghobject_t& new_oid)
16891{
16892 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16893 << new_oid << dendl;
16894 int r;
16895 ghobject_t old_oid = oldo->oid;
f91f0fd5 16896 mempool::bluestore_cache_meta::string new_okey;
7c673cae
FG
16897
16898 if (newo) {
16899 if (newo->exists) {
16900 r = -EEXIST;
16901 goto out;
16902 }
11fdf7f2 16903 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
16904 }
16905
16906 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
16907
16908 // rewrite shards
16909 {
16910 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
16911 get_object_key(cct, new_oid, &new_okey);
16912 string key;
16913 for (auto &s : oldo->extent_map.shards) {
16914 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
16915 [&](const string& final_key) {
16916 txc->t->rmkey(PREFIX_OBJ, final_key);
16917 }
16918 );
16919 s.dirty = true;
16920 }
16921 }
16922
16923 newo = oldo;
16924 txc->write_onode(newo);
16925
 16926 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
16927 // Onode in the old slot
16928 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
16929 r = 0;
16930
f64942e4
AA
16931 // hold a ref to new Onode in old name position, to ensure we don't drop
16932 // it from the cache before this txc commits (or else someone may come along
16933 // and read newo's metadata via the old name).
16934 txc->note_modified_object(oldo);
16935
20effc67
TL
16936#ifdef HAVE_LIBZBD
16937 if (bdev->is_smr()) {
16938 // adjust zone refs
16939 for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
16940 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
16941 << " offset 0x" << offset << std::dec
16942 << " -> " << oldo->oid << dendl;
16943 string key;
16944 get_zone_offset_object_key(zone, offset, oldo->oid, &key);
16945 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
16946
16947 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16948 << " offset 0x" << offset << std::dec
16949 << " -> " << newo->oid << dendl;
16950 get_zone_offset_object_key(zone, offset, newo->oid, &key);
16951 bufferlist v;
16952 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
16953 }
16954 }
16955#endif
16956
7c673cae
FG
16957 out:
16958 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
16959 << new_oid << " = " << r << dendl;
16960 return r;
16961}
16962
16963// collections
16964
16965int BlueStore::_create_collection(
16966 TransContext *txc,
16967 const coll_t &cid,
16968 unsigned bits,
16969 CollectionRef *c)
16970{
16971 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
16972 int r;
16973 bufferlist bl;
16974
16975 {
9f95a23c 16976 std::unique_lock l(coll_lock);
7c673cae
FG
16977 if (*c) {
16978 r = -EEXIST;
16979 goto out;
16980 }
11fdf7f2
TL
16981 auto p = new_coll_map.find(cid);
16982 ceph_assert(p != new_coll_map.end());
16983 *c = p->second;
7c673cae
FG
16984 (*c)->cnode.bits = bits;
16985 coll_map[cid] = *c;
11fdf7f2 16986 new_coll_map.erase(p);
7c673cae 16987 }
11fdf7f2 16988 encode((*c)->cnode, bl);
7c673cae
FG
16989 txc->t->set(PREFIX_COLL, stringify(cid), bl);
16990 r = 0;
16991
16992 out:
16993 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
16994 return r;
16995}
16996
16997int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
16998 CollectionRef *c)
16999{
17000 dout(15) << __func__ << " " << cid << dendl;
17001 int r;
17002
11fdf7f2 17003 (*c)->flush_all_but_last();
7c673cae 17004 {
9f95a23c 17005 std::unique_lock l(coll_lock);
7c673cae
FG
17006 if (!*c) {
17007 r = -ENOENT;
17008 goto out;
17009 }
17010 size_t nonexistent_count = 0;
11fdf7f2 17011 ceph_assert((*c)->exists);
adb31ebb 17012 if ((*c)->onode_map.map_any([&](Onode* o) {
f67539c2
TL
17013 if (o->exists) {
17014 dout(1) << __func__ << " " << o->oid << " " << o
17015 << " exists in onode_map" << dendl;
7c673cae 17016 return true;
f67539c2
TL
17017 }
17018 ++nonexistent_count;
17019 return false;
17020 })) {
7c673cae
FG
17021 r = -ENOTEMPTY;
17022 goto out;
17023 }
7c673cae
FG
17024 vector<ghobject_t> ls;
17025 ghobject_t next;
17026 // Enumerate onodes in db, up to nonexistent_count + 1
17027 // then check if all of them are marked as non-existent.
11fdf7f2 17028 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 17029 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 17030 nonexistent_count + 1, false, &ls, &next);
7c673cae 17031 if (r >= 0) {
11fdf7f2
TL
 17032 // If true, the collection has more objects than nonexistent_count,
 17033 // so bypass the check.
17034 bool exists = (!next.is_max());
7c673cae
FG
17035 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
17036 dout(10) << __func__ << " oid " << *it << dendl;
17037 auto onode = (*c)->onode_map.lookup(*it);
17038 exists = !onode || onode->exists;
17039 if (exists) {
494da23a 17040 dout(1) << __func__ << " " << *it
f67539c2
TL
17041 << " exists in db, "
17042 << (!onode ? "not present in ram" : "present in ram")
17043 << dendl;
7c673cae
FG
17044 }
17045 }
17046 if (!exists) {
f67539c2 17047 _do_remove_collection(txc, c);
7c673cae
FG
17048 r = 0;
17049 } else {
17050 dout(10) << __func__ << " " << cid
17051 << " is non-empty" << dendl;
f67539c2 17052 r = -ENOTEMPTY;
7c673cae
FG
17053 }
17054 }
17055 }
f67539c2 17056out:
7c673cae
FG
17057 dout(10) << __func__ << " " << cid << " = " << r << dendl;
17058 return r;
17059}
17060
11fdf7f2
TL
17061void BlueStore::_do_remove_collection(TransContext *txc,
17062 CollectionRef *c)
17063{
17064 coll_map.erase((*c)->cid);
17065 txc->removed_collections.push_back(*c);
17066 (*c)->exists = false;
17067 _osr_register_zombie((*c)->osr.get());
17068 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
17069 c->reset();
17070}
17071
7c673cae
FG
17072int BlueStore::_split_collection(TransContext *txc,
17073 CollectionRef& c,
17074 CollectionRef& d,
17075 unsigned bits, int rem)
17076{
17077 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
17078 << " bits " << bits << dendl;
9f95a23c
TL
17079 std::unique_lock l(c->lock);
17080 std::unique_lock l2(d->lock);
7c673cae
FG
17081 int r;
17082
17083 // flush all previous deferred writes on this sequencer. this is a bit
17084 // heavyweight, but we need to make sure all deferred writes complete
17085 // before we split as the new collection's sequencer may need to order
17086 // this after those writes, and we don't bother with the complexity of
17087 // moving those TransContexts over to the new osr.
17088 _osr_drain_preceding(txc);
17089
17090 // move any cached items (onodes and referenced shared blobs) that will
17091 // belong to the child collection post-split. leave everything else behind.
17092 // this may include things that don't strictly belong to the now-smaller
17093 // parent split, but the OSD will always send us a split for every new
17094 // child.
17095
17096 spg_t pgid, dest_pgid;
17097 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 17098 ceph_assert(is_pg);
7c673cae 17099 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 17100 ceph_assert(is_pg);
7c673cae
FG
17101
17102 // the destination should initially be empty.
11fdf7f2
TL
17103 ceph_assert(d->onode_map.empty());
17104 ceph_assert(d->shared_blob_set.empty());
17105 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
17106
17107 c->split_cache(d.get());
17108
17109 // adjust bits. note that this will be redundant for all but the first
17110 // split call for this parent (first child).
17111 c->cnode.bits = bits;
11fdf7f2 17112 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
17113 r = 0;
17114
17115 bufferlist bl;
11fdf7f2 17116 encode(c->cnode, bl);
7c673cae
FG
17117 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
17118
17119 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
17120 << " bits " << bits << " = " << r << dendl;
17121 return r;
17122}
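// Which cached onodes move to the child is decided inside split_cache() by
// the PG hash bits. Roughly (hedged sketch; ghobject_t::match() is the real
// helper, `o` is an onode in the cache, and the surrounding bookkeeping is
// elided):
//
//   // an object belongs to the child iff the low `bits` bits of its hash
//   // select the child's pg seed
//   bool in_child = o->oid.match(d->cnode.bits, dest_pgid.pgid.ps());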
17123
11fdf7f2
TL
17124int BlueStore::_merge_collection(
17125 TransContext *txc,
17126 CollectionRef *c,
17127 CollectionRef& d,
17128 unsigned bits)
17129{
17130 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
17131 << " bits " << bits << dendl;
9f95a23c
TL
17132 std::unique_lock l((*c)->lock);
17133 std::unique_lock l2(d->lock);
11fdf7f2
TL
17134 int r;
17135
17136 coll_t cid = (*c)->cid;
17137
17138 // flush all previous deferred writes on the source collection to ensure
17139 // that all deferred writes complete before we merge as the target collection's
17140 // sequencer may need to order new ops after those writes.
17141
17142 _osr_drain((*c)->osr.get());
17143
 17144 // move the cached items (onodes and referenced shared blobs) from the
 17145 // source collection into the target. after a merge the target's bits
 17146 // cover the whole source, so split_cache() will match and move
 17147 // everything, leaving nothing behind in the source.
 17148
17149
17150 spg_t pgid, dest_pgid;
17151 bool is_pg = cid.is_pg(&pgid);
17152 ceph_assert(is_pg);
17153 is_pg = d->cid.is_pg(&dest_pgid);
17154 ceph_assert(is_pg);
17155
17156 // adjust bits. note that this will be redundant for all but the first
17157 // merge call for the parent/target.
17158 d->cnode.bits = bits;
17159
 17160 // split_cache() behavior depends on the target's (d) bits, so do this after they are updated.
17161 (*c)->split_cache(d.get());
17162
17163 // remove source collection
17164 {
9f95a23c 17165 std::unique_lock l3(coll_lock);
11fdf7f2
TL
17166 _do_remove_collection(txc, c);
17167 }
17168
17169 r = 0;
17170
17171 bufferlist bl;
17172 encode(d->cnode, bl);
17173 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
17174
17175 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
17176 << " bits " << bits << " = " << r << dendl;
17177 return r;
17178}
17179
494da23a
TL
17180void BlueStore::log_latency(
17181 const char* name,
17182 int idx,
17183 const ceph::timespan& l,
17184 double lat_threshold,
17185 const char* info) const
17186{
17187 logger->tinc(idx, l);
17188 if (lat_threshold > 0.0 &&
17189 l >= make_timespan(lat_threshold)) {
17190 dout(0) << __func__ << " slow operation observed for " << name
17191 << ", latency = " << l
17192 << info
17193 << dendl;
17194 }
17195}
17196
11fdf7f2 17197void BlueStore::log_latency_fn(
494da23a 17198 const char* name,
11fdf7f2
TL
17199 int idx,
17200 const ceph::timespan& l,
494da23a
TL
17201 double lat_threshold,
17202 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 17203{
494da23a
TL
17204 logger->tinc(idx, l);
17205 if (lat_threshold > 0.0 &&
17206 l >= make_timespan(lat_threshold)) {
17207 dout(0) << __func__ << " slow operation observed for " << name
17208 << ", latency = " << l
17209 << fn(l)
17210 << dendl;
17211 }
11fdf7f2
TL
17212}
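// Typical call pattern (sketch modeled on existing callers; the counter
// index, config option and `txc` here are examples, not new API):
//
//   auto start = mono_clock::now();
//   // ... do the work ...
//   log_latency_fn(
//     __func__,
//     l_bluestore_commit_lat,                  // perf counter to bump
//     mono_clock::now() - start,               // measured latency
//     cct->_conf->bluestore_log_op_age,        // slow-op threshold, seconds
//     [&](const ceph::timespan& lat) {         // extra context, slow ops only
//       return ", txc = " + stringify(txc);
//     });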
17213
9f95a23c
TL
17214#if defined(WITH_LTTNG)
17215void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
17216 KeyValueDB &db,
17217 TransContext &txc,
17218 mono_clock::time_point start_throttle_acquire)
17219{
17220 pending_kv_ios += txc.ios;
17221 if (txc.deferred_txn) {
17222 pending_deferred_ios += txc.ios;
17223 }
17224
17225 uint64_t started = 0;
17226 uint64_t completed = 0;
17227 if (should_trace(&started, &completed)) {
17228 txc.tracing = true;
17229 uint64_t rocksdb_base_level,
17230 rocksdb_estimate_pending_compaction_bytes,
17231 rocksdb_cur_size_all_mem_tables,
17232 rocksdb_compaction_pending,
17233 rocksdb_mem_table_flush_pending,
17234 rocksdb_num_running_compactions,
17235 rocksdb_num_running_flushes,
17236 rocksdb_actual_delayed_write_rate;
17237 db.get_property(
17238 "rocksdb.base-level",
17239 &rocksdb_base_level);
17240 db.get_property(
17241 "rocksdb.estimate-pending-compaction-bytes",
17242 &rocksdb_estimate_pending_compaction_bytes);
17243 db.get_property(
17244 "rocksdb.cur-size-all-mem-tables",
17245 &rocksdb_cur_size_all_mem_tables);
17246 db.get_property(
17247 "rocksdb.compaction-pending",
17248 &rocksdb_compaction_pending);
17249 db.get_property(
17250 "rocksdb.mem-table-flush-pending",
17251 &rocksdb_mem_table_flush_pending);
17252 db.get_property(
17253 "rocksdb.num-running-compactions",
17254 &rocksdb_num_running_compactions);
17255 db.get_property(
17256 "rocksdb.num-running-flushes",
17257 &rocksdb_num_running_flushes);
17258 db.get_property(
17259 "rocksdb.actual-delayed-write-rate",
17260 &rocksdb_actual_delayed_write_rate);
17261
17262
17263 tracepoint(
17264 bluestore,
17265 transaction_initial_state,
17266 txc.osr->get_sequencer_id(),
17267 txc.seq,
17268 throttle_bytes.get_current(),
17269 throttle_deferred_bytes.get_current(),
17270 pending_kv_ios,
17271 pending_deferred_ios,
17272 started,
17273 completed,
17274 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
17275
17276 tracepoint(
17277 bluestore,
17278 transaction_initial_state_rocksdb,
17279 txc.osr->get_sequencer_id(),
17280 txc.seq,
17281 rocksdb_base_level,
17282 rocksdb_estimate_pending_compaction_bytes,
17283 rocksdb_cur_size_all_mem_tables,
17284 rocksdb_compaction_pending,
17285 rocksdb_mem_table_flush_pending,
17286 rocksdb_num_running_compactions,
17287 rocksdb_num_running_flushes,
17288 rocksdb_actual_delayed_write_rate);
17289 }
17290}
17291#endif
17292
17293mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
17294 TransContext &txc, PerfCounters *logger, int state)
17295{
17296 mono_clock::time_point now = mono_clock::now();
17297 mono_clock::duration lat = now - txc.last_stamp;
17298 logger->tinc(state, lat);
17299#if defined(WITH_LTTNG)
17300 if (txc.tracing &&
17301 state >= l_bluestore_state_prepare_lat &&
17302 state <= l_bluestore_state_done_lat) {
17303 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
17304 tracepoint(
17305 bluestore,
17306 transaction_state_duration,
17307 txc.osr->get_sequencer_id(),
17308 txc.seq,
17309 state,
17310 ceph::to_seconds<double>(lat));
17311 }
17312#endif
17313 txc.last_stamp = now;
17314 return lat;
17315}
17316
17317bool BlueStore::BlueStoreThrottle::try_start_transaction(
17318 KeyValueDB &db,
17319 TransContext &txc,
17320 mono_clock::time_point start_throttle_acquire)
17321{
17322 throttle_bytes.get(txc.cost);
17323
17324 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
17325 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17326 return true;
17327 } else {
17328 return false;
17329 }
17330}
17331
17332void BlueStore::BlueStoreThrottle::finish_start_transaction(
17333 KeyValueDB &db,
17334 TransContext &txc,
17335 mono_clock::time_point start_throttle_acquire)
17336{
17337 ceph_assert(txc.deferred_txn);
17338 throttle_deferred_bytes.get(txc.cost);
17339 emit_initial_tracepoint(db, txc, start_throttle_acquire);
17340}
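// Intended two-phase use for deferred transactions (hedged sketch of the
// caller's side; the deferred-flush call is a stand-in for the real
// queueing logic):
//
//   if (!throttle.try_start_transaction(*db, *txc, start)) {
//     // deferred throttle saturated: push deferred IO out first, then
//     // block until budget is available
//     _deferred_try_submit();                              // (illustrative)
//     throttle.finish_start_transaction(*db, *txc, start); // blocking get()
//   }
//
// try_start_transaction() always charges throttle_bytes; only the deferred
// throttle can fail, and finish_start_transaction() then blocks on it.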
17341
17342#if defined(WITH_LTTNG)
17343void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
17344{
17345 pending_kv_ios -= 1;
17346 ios_completed_since_last_traced++;
17347 if (txc.tracing) {
17348 tracepoint(
17349 bluestore,
17350 transaction_commit_latency,
17351 txc.osr->get_sequencer_id(),
17352 txc.seq,
17353 ceph::to_seconds<double>(mono_clock::now() - txc.start));
17354 }
17355}
17356#endif
17357
17358#if defined(WITH_LTTNG)
17359void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
17360{
17361 if (txc.deferred_txn) {
17362 pending_deferred_ios -= 1;
17363 }
17364 if (txc.tracing) {
17365 mono_clock::time_point now = mono_clock::now();
17366 mono_clock::duration lat = now - txc.start;
17367 tracepoint(
17368 bluestore,
17369 transaction_total_duration,
17370 txc.osr->get_sequencer_id(),
17371 txc.seq,
17372 ceph::to_seconds<double>(lat));
17373 }
17374}
17375#endif
11fdf7f2 17376
7c673cae
FG
17377const string prefix_onode = "o";
17378const string prefix_onode_shard = "x";
17379const string prefix_other = "Z";
7c673cae
FG
 17380// Iterates through the db and collects the stats
17381void BlueStore::generate_db_histogram(Formatter *f)
17382{
17383 //globals
17384 uint64_t num_onodes = 0;
17385 uint64_t num_shards = 0;
17386 uint64_t num_super = 0;
17387 uint64_t num_coll = 0;
17388 uint64_t num_omap = 0;
11fdf7f2 17389 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
17390 uint64_t num_deferred = 0;
17391 uint64_t num_alloc = 0;
17392 uint64_t num_stat = 0;
17393 uint64_t num_others = 0;
17394 uint64_t num_shared_shards = 0;
17395 size_t max_key_size =0, max_value_size = 0;
17396 uint64_t total_key_size = 0, total_value_size = 0;
17397 size_t key_size = 0, value_size = 0;
20effc67 17398 KeyValueHistogram hist;
7c673cae 17399
11fdf7f2 17400 auto start = coarse_mono_clock::now();
7c673cae 17401
11fdf7f2 17402 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
17403 iter->seek_to_first();
17404 while (iter->valid()) {
17405 dout(30) << __func__ << " Key: " << iter->key() << dendl;
17406 key_size = iter->key_size();
17407 value_size = iter->value_size();
17408 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
17409 max_key_size = std::max(max_key_size, key_size);
17410 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
17411 total_key_size += key_size;
17412 total_value_size += value_size;
17413
17414 pair<string,string> key(iter->raw_key());
17415
17416 if (key.first == PREFIX_SUPER) {
17417 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
17418 num_super++;
17419 } else if (key.first == PREFIX_STAT) {
17420 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
17421 num_stat++;
17422 } else if (key.first == PREFIX_COLL) {
17423 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
17424 num_coll++;
17425 } else if (key.first == PREFIX_OBJ) {
17426 if (key.second.back() == ONODE_KEY_SUFFIX) {
17427 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
17428 num_onodes++;
17429 } else {
17430 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
17431 num_shards++;
17432 }
17433 } else if (key.first == PREFIX_OMAP) {
17434 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
17435 num_omap++;
f67539c2
TL
17436 } else if (key.first == PREFIX_PERPOOL_OMAP) {
17437 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
17438 num_omap++;
17439 } else if (key.first == PREFIX_PERPG_OMAP) {
17440 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
17441 num_omap++;
11fdf7f2
TL
17442 } else if (key.first == PREFIX_PGMETA_OMAP) {
17443 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
17444 num_pgmeta_omap++;
7c673cae
FG
17445 } else if (key.first == PREFIX_DEFERRED) {
17446 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
17447 num_deferred++;
11fdf7f2 17448 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
17449 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
17450 num_alloc++;
17451 } else if (key.first == PREFIX_SHARED_BLOB) {
17452 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
17453 num_shared_shards++;
17454 } else {
17455 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
17456 num_others++;
17457 }
17458 iter->next();
17459 }
17460
11fdf7f2 17461 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
17462 f->open_object_section("rocksdb_key_value_stats");
17463 f->dump_unsigned("num_onodes", num_onodes);
17464 f->dump_unsigned("num_shards", num_shards);
17465 f->dump_unsigned("num_super", num_super);
17466 f->dump_unsigned("num_coll", num_coll);
17467 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 17468 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
17469 f->dump_unsigned("num_deferred", num_deferred);
17470 f->dump_unsigned("num_alloc", num_alloc);
17471 f->dump_unsigned("num_stat", num_stat);
17472 f->dump_unsigned("num_shared_shards", num_shared_shards);
17473 f->dump_unsigned("num_others", num_others);
17474 f->dump_unsigned("max_key_size", max_key_size);
17475 f->dump_unsigned("max_value_size", max_value_size);
17476 f->dump_unsigned("total_key_size", total_key_size);
17477 f->dump_unsigned("total_value_size", total_value_size);
17478 f->close_section();
17479
17480 hist.dump(f);
17481
17482 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
17483
17484}
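// This dump is normally reached through the OSD admin socket (assuming the
// asok command name is unchanged):
//
//   ceph daemon osd.0 calc_objectstore_db_histogram
//
// which emits the per-prefix key counts and the key/value size histogram
// as JSON via the Formatter.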
17485
f6b5b4d7 17486void BlueStore::_shutdown_cache()
7c673cae
FG
17487{
17488 dout(10) << __func__ << dendl;
9f95a23c
TL
17489 for (auto i : buffer_cache_shards) {
17490 i->flush();
11fdf7f2 17491 ceph_assert(i->empty());
7c673cae
FG
17492 }
17493 for (auto& p : coll_map) {
f6b5b4d7 17494 p.second->onode_map.clear();
3efd9988
FG
17495 if (!p.second->shared_blob_set.empty()) {
17496 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 17497 p.second->shared_blob_set.dump<0>(cct);
3efd9988 17498 }
11fdf7f2
TL
17499 ceph_assert(p.second->onode_map.empty());
17500 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
17501 }
17502 coll_map.clear();
f6b5b4d7
TL
17503 for (auto i : onode_cache_shards) {
17504 ceph_assert(i->empty());
17505 }
7c673cae
FG
17506}
17507
31f18b77
FG
 17508// For external callers.
 17509// We use a best-effort policy here rather than a strict one, e.g.,
 17510// we don't care if some pinned onodes/data are still in the cache
 17511// after this command completes.
11fdf7f2 17512int BlueStore::flush_cache(ostream *os)
31f18b77
FG
17513{
17514 dout(10) << __func__ << dendl;
9f95a23c
TL
17515 for (auto i : onode_cache_shards) {
17516 i->flush();
17517 }
17518 for (auto i : buffer_cache_shards) {
17519 i->flush();
31f18b77 17520 }
11fdf7f2
TL
17521
17522 return 0;
31f18b77
FG
17523}
17524
7c673cae
FG
17525void BlueStore::_apply_padding(uint64_t head_pad,
17526 uint64_t tail_pad,
7c673cae
FG
17527 bufferlist& padded)
17528{
7c673cae 17529 if (head_pad) {
224ce89b 17530 padded.prepend_zero(head_pad);
7c673cae
FG
17531 }
17532 if (tail_pad) {
17533 padded.append_zero(tail_pad);
17534 }
17535 if (head_pad || tail_pad) {
17536 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
17537 << " tail 0x" << tail_pad << std::dec << dendl;
17538 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
17539 }
17540}
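// The pads are computed by the small-write path from block alignment before
// this is called; roughly, using the p2* helpers from include/intarith.h
// (chunk_size stands in for the blob's chunk granularity):
//
//   uint64_t head_pad = p2phase(offset, chunk_size);           // fill down to chunk start
//   uint64_t tail_pad = p2nphase(offset + length, chunk_size); // fill up to chunk end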
17541
11fdf7f2
TL
17542void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
17543{
17544 // finalize extent_map shards
17545 o->extent_map.update(txn, false);
17546 if (o->extent_map.needs_reshard()) {
17547 o->extent_map.reshard(db, txn);
17548 o->extent_map.update(txn, true);
17549 if (o->extent_map.needs_reshard()) {
17550 dout(20) << __func__ << " warning: still wants reshard, check options?"
17551 << dendl;
17552 o->extent_map.clear_needs_reshard();
17553 }
17554 logger->inc(l_bluestore_onode_reshard);
17555 }
17556
17557 // bound encode
17558 size_t bound = 0;
17559 denc(o->onode, bound);
17560 o->extent_map.bound_encode_spanning_blobs(bound);
17561 if (o->onode.extent_map_shards.empty()) {
17562 denc(o->extent_map.inline_bl, bound);
17563 }
17564
17565 // encode
17566 bufferlist bl;
17567 unsigned onode_part, blob_part, extent_part;
17568 {
17569 auto p = bl.get_contiguous_appender(bound, true);
17570 denc(o->onode, p);
17571 onode_part = p.get_logical_offset();
17572 o->extent_map.encode_spanning_blobs(p);
17573 blob_part = p.get_logical_offset() - onode_part;
17574 if (o->onode.extent_map_shards.empty()) {
17575 denc(o->extent_map.inline_bl, p);
17576 }
17577 extent_part = p.get_logical_offset() - onode_part - blob_part;
17578 }
17579
17580 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
17581 << " (" << onode_part << " bytes onode + "
17582 << blob_part << " bytes spanning blobs + "
17583 << extent_part << " bytes inline extents)"
17584 << dendl;
17585
17586
17587 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
17588}
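// The bound-then-encode sequence above is the standard two-pass denc
// pattern: a dry run computes an upper bound so a single contiguous buffer
// can be reserved, then the real pass appends into it. A self-contained
// sketch of the idiom (obj is any denc-able type):
//
//   size_t bound = 0;
//   denc(obj, bound);                          // pass 1: size estimate only
//   bufferlist bl;
//   {
//     auto app = bl.get_contiguous_appender(bound, true);
//     denc(obj, app);                          // pass 2: actual encode
//   }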
17589
17590void BlueStore::_log_alerts(osd_alert_list_t& alerts)
17591{
17592 std::lock_guard l(qlock);
17593
522d829b
TL
17594 if (!spurious_read_errors_alert.empty() &&
17595 cct->_conf->bluestore_warn_on_spurious_read_errors) {
f67539c2
TL
17596 alerts.emplace(
17597 "BLUESTORE_SPURIOUS_READ_ERRORS",
17598 spurious_read_errors_alert);
17599 }
81eedcae
TL
17600 if (!disk_size_mismatch_alert.empty()) {
17601 alerts.emplace(
17602 "BLUESTORE_DISK_SIZE_MISMATCH",
17603 disk_size_mismatch_alert);
17604 }
17605 if (!legacy_statfs_alert.empty()) {
17606 alerts.emplace(
17607 "BLUESTORE_LEGACY_STATFS",
17608 legacy_statfs_alert);
17609 }
11fdf7f2
TL
17610 if (!spillover_alert.empty() &&
17611 cct->_conf->bluestore_warn_on_bluefs_spillover) {
17612 alerts.emplace(
17613 "BLUEFS_SPILLOVER",
17614 spillover_alert);
17615 }
f67539c2
TL
17616 if (!no_per_pg_omap_alert.empty()) {
17617 alerts.emplace(
17618 "BLUESTORE_NO_PER_PG_OMAP",
17619 no_per_pg_omap_alert);
17620 }
9f95a23c
TL
17621 if (!no_per_pool_omap_alert.empty()) {
17622 alerts.emplace(
17623 "BLUESTORE_NO_PER_POOL_OMAP",
17624 no_per_pool_omap_alert);
17625 }
11fdf7f2
TL
17626 string s0(failed_cmode);
17627
17628 if (!failed_compressors.empty()) {
17629 if (!s0.empty()) {
17630 s0 += ", ";
17631 }
17632 s0 += "unable to load:";
17633 bool first = true;
17634 for (auto& s : failed_compressors) {
17635 if (first) {
17636 first = false;
17637 } else {
17638 s0 += ", ";
17639 }
17640 s0 += s;
17641 }
17642 alerts.emplace(
17643 "BLUESTORE_NO_COMPRESSION",
17644 s0);
17645 }
17646}
17647
9f95a23c 17648void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
20effc67 17649 const PExtentVector& extents)
9f95a23c
TL
17650{
17651 alloc_stats_count++;
20effc67 17652 alloc_stats_fragments += extents.size();
9f95a23c 17653 alloc_stats_size += need;
20effc67
TL
17654
17655 for (auto& e : extents) {
17656 logger->hinc(l_bluestore_allocate_hist, e.length, need);
17657 }
9f95a23c
TL
17658}
17659
17660void BlueStore::_record_allocation_stats()
17661{
 17662 // we don't care about strict data consistency here:
 17663 // the fields may be partially modified while the tuple is being made
17664 auto t0 = std::make_tuple(
17665 alloc_stats_count.exchange(0),
17666 alloc_stats_fragments.exchange(0),
17667 alloc_stats_size.exchange(0));
17668
17669 dout(0) << " allocation stats probe "
17670 << probe_count << ":"
17671 << " cnt: " << std::get<0>(t0)
17672 << " frags: " << std::get<1>(t0)
17673 << " size: " << std::get<2>(t0)
17674 << dendl;
17675
17676
17677 //
17678 // Keep the history for probes from the power-of-two sequence:
17679 // -1, -2, -4, -8, -16
17680 //
17681 size_t base = 1;
17682 for (auto& t : alloc_stats_history) {
17683 dout(0) << " probe -"
17684 << base + (probe_count % base) << ": "
17685 << std::get<0>(t)
17686 << ", " << std::get<1>(t)
17687 << ", " << std::get<2>(t)
17688 << dendl;
17689 base <<= 1;
17690 }
17691 dout(0) << "------------" << dendl;
17692
f67539c2 17693 ++probe_count;
9f95a23c 17694
f67539c2
TL
17695 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
17696 if ((probe_count % (1 << i)) == 0) {
17697 alloc_stats_history[i] = alloc_stats_history[i - 1];
17698 }
9f95a23c
TL
17699 }
17700 alloc_stats_history[0].swap(t0);
17701}
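// Worked example of the history shift: with probe_count = 8 after the
// increment, 8 % 2, 8 % 4 and 8 % 8 are all 0, so slots 3..1 each inherit
// their left neighbour before slot 0 takes the fresh probe. Over time slot
// i therefore holds a probe roughly 2^i intervals old, which is the
// -1/-2/-4/-8/-16 sequence printed above.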
17702
7c673cae 17703// ===========================================
11fdf7f2
TL
17704// BlueStoreRepairer
17705
17706size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
17707 const interval_set<uint64_t>& extents)
17708{
17709 ceph_assert(granularity); // initialized
17710 // can't call for the second time
17711 ceph_assert(!was_filtered_out);
17712 ceph_assert(collections_bfs.size() == objects_bfs.size());
17713
17714 uint64_t prev_pos = 0;
17715 uint64_t npos = collections_bfs.size();
17716
17717 bloom_vector collections_reduced;
17718 bloom_vector objects_reduced;
17719
17720 for (auto e : extents) {
17721 if (e.second == 0) {
17722 continue;
17723 }
17724 uint64_t pos = max(e.first / granularity, prev_pos);
17725 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
17726 while (pos != npos && pos < end_pos) {
17727 ceph_assert( collections_bfs[pos].element_count() ==
17728 objects_bfs[pos].element_count());
17729 if (collections_bfs[pos].element_count()) {
17730 collections_reduced.push_back(std::move(collections_bfs[pos]));
17731 objects_reduced.push_back(std::move(objects_bfs[pos]));
17732 }
17733 ++pos;
17734 }
17735 prev_pos = end_pos;
17736 }
17737 collections_reduced.swap(collections_bfs);
17738 objects_reduced.swap(objects_bfs);
17739 was_filtered_out = true;
17740 return collections_bfs.size();
17741}
17742
17743bool BlueStoreRepairer::remove_key(KeyValueDB *db,
17744 const string& prefix,
17745 const string& key)
17746{
b3b6e05e 17747 std::lock_guard l(lock);
11fdf7f2
TL
17748 if (!remove_key_txn) {
17749 remove_key_txn = db->get_transaction();
17750 }
17751 ++to_repair_cnt;
17752 remove_key_txn->rmkey(prefix, key);
17753
17754 return true;
17755}
17756
f67539c2 17757void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
9f95a23c 17758{
b3b6e05e
TL
17759 std::lock_guard l(lock); // possibly redundant
17760 ceph_assert(fix_per_pool_omap_txn == nullptr);
9f95a23c
TL
17761 fix_per_pool_omap_txn = db->get_transaction();
17762 ++to_repair_cnt;
17763 bufferlist bl;
f67539c2 17764 bl.append(stringify(val));
9f95a23c
TL
17765 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
17766}
17767
11fdf7f2 17768bool BlueStoreRepairer::fix_shared_blob(
20effc67 17769 KeyValueDB::Transaction txn,
11fdf7f2 17770 uint64_t sbid,
20effc67
TL
17771 bluestore_extent_ref_map_t* ref_map,
17772 size_t repaired)
11fdf7f2 17773{
11fdf7f2
TL
17774 string key;
17775 get_shared_blob_key(sbid, &key);
20effc67
TL
17776 if (ref_map) {
17777 bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
17778 bufferlist bl;
17779 encode(persistent, bl);
17780 txn->set(PREFIX_SHARED_BLOB, key, bl);
11fdf7f2
TL
17781 } else {
17782 txn->rmkey(PREFIX_SHARED_BLOB, key);
17783 }
20effc67 17784 to_repair_cnt += repaired;
11fdf7f2
TL
17785 return true;
17786}
17787
17788bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
17789 const string& key,
17790 const store_statfs_t& new_statfs)
17791{
b3b6e05e 17792 std::lock_guard l(lock);
11fdf7f2
TL
17793 if (!fix_statfs_txn) {
17794 fix_statfs_txn = db->get_transaction();
17795 }
17796 BlueStore::volatile_statfs vstatfs;
17797 vstatfs = new_statfs;
17798 bufferlist bl;
17799 vstatfs.encode(bl);
17800 ++to_repair_cnt;
17801 fix_statfs_txn->set(PREFIX_STAT, key, bl);
17802 return true;
17803}
17804
17805bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
17806 FreelistManager* fm,
17807 uint64_t offset, uint64_t len)
17808{
b3b6e05e 17809 std::lock_guard l(lock);
20effc67
TL
17810 ceph_assert(!fm->is_null_manager());
17811
11fdf7f2
TL
17812 if (!fix_fm_leaked_txn) {
17813 fix_fm_leaked_txn = db->get_transaction();
17814 }
17815 ++to_repair_cnt;
17816 fm->release(offset, len, fix_fm_leaked_txn);
17817 return true;
17818}
17819bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
17820 FreelistManager* fm,
17821 uint64_t offset, uint64_t len)
17822{
b3b6e05e 17823 std::lock_guard l(lock);
20effc67
TL
17824 ceph_assert(!fm->is_null_manager());
17825
11fdf7f2
TL
17826 if (!fix_fm_false_free_txn) {
17827 fix_fm_false_free_txn = db->get_transaction();
17828 }
17829 ++to_repair_cnt;
17830 fm->allocate(offset, len, fix_fm_false_free_txn);
17831 return true;
17832}
17833
b3b6e05e
TL
17834bool BlueStoreRepairer::fix_spanning_blobs(
17835 KeyValueDB* db,
17836 std::function<void(KeyValueDB::Transaction)> f)
adb31ebb 17837{
b3b6e05e 17838 std::lock_guard l(lock);
adb31ebb
TL
17839 if (!fix_onode_txn) {
17840 fix_onode_txn = db->get_transaction();
17841 }
b3b6e05e 17842 f(fix_onode_txn);
adb31ebb 17843 ++to_repair_cnt;
b3b6e05e 17844 return true;
adb31ebb
TL
17845}
17846
11fdf7f2
TL
17847bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
17848{
b3b6e05e 17849 //NB: not for use in multithreading mode!!!
11fdf7f2
TL
17850 if (misreferenced_extents.size()) {
17851 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
17852 ceph_assert(n > 0);
17853 if (!fix_misreferences_txn) {
17854 fix_misreferences_txn = db->get_transaction();
17855 }
17856 return true;
17857 }
17858 return false;
17859}
17860
17861unsigned BlueStoreRepairer::apply(KeyValueDB* db)
17862{
b3b6e05e 17863 //NB: not for use in multithreading mode!!!
9f95a23c 17864 if (fix_per_pool_omap_txn) {
20effc67
TL
17865 auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
17866 ceph_assert(ok);
9f95a23c
TL
17867 fix_per_pool_omap_txn = nullptr;
17868 }
11fdf7f2 17869 if (fix_fm_leaked_txn) {
20effc67
TL
17870 auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
17871 ceph_assert(ok);
11fdf7f2
TL
17872 fix_fm_leaked_txn = nullptr;
17873 }
17874 if (fix_fm_false_free_txn) {
20effc67
TL
17875 auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
17876 ceph_assert(ok);
11fdf7f2
TL
17877 fix_fm_false_free_txn = nullptr;
17878 }
17879 if (remove_key_txn) {
20effc67
TL
17880 auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
17881 ceph_assert(ok);
11fdf7f2
TL
17882 remove_key_txn = nullptr;
17883 }
17884 if (fix_misreferences_txn) {
20effc67
TL
17885 auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
17886 ceph_assert(ok);
11fdf7f2
TL
17887 fix_misreferences_txn = nullptr;
17888 }
adb31ebb 17889 if (fix_onode_txn) {
20effc67
TL
17890 auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
17891 ceph_assert(ok);
adb31ebb
TL
17892 fix_onode_txn = nullptr;
17893 }
11fdf7f2 17894 if (fix_shared_blob_txn) {
20effc67
TL
17895 auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
17896 ceph_assert(ok);
11fdf7f2
TL
17897 fix_shared_blob_txn = nullptr;
17898 }
11fdf7f2 17899 if (fix_statfs_txn) {
20effc67
TL
17900 auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
17901 ceph_assert(ok);
11fdf7f2
TL
17902 fix_statfs_txn = nullptr;
17903 }
522d829b
TL
17904 if (need_compact) {
17905 db->compact();
17906 need_compact = false;
17907 }
11fdf7f2
TL
17908 unsigned repaired = to_repair_cnt;
17909 to_repair_cnt = 0;
17910 return repaired;
17911}
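// Typical fsck/repair flow (editor's sketch; `bad_key`, `statfs_key` and
// `corrected` stand in for whatever the scan found):
//
//   BlueStoreRepairer repairer;
//   repairer.remove_key(db, PREFIX_OBJ, bad_key);      // queue a key drop
//   repairer.fix_statfs(db, statfs_key, corrected);    // queue a statfs fix
//   unsigned repaired = repairer.apply(db);            // submit all txns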
17912
17913// =======================================================
9f95a23c
TL
17914// RocksDBBlueFSVolumeSelector
17915
17916uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
17917 ceph_assert(h != nullptr);
17918 uint64_t hint = reinterpret_cast<uint64_t>(h);
17919 uint8_t res;
17920 switch (hint) {
17921 case LEVEL_SLOW:
17922 res = BlueFS::BDEV_SLOW;
17923 if (db_avail4slow > 0) {
17924 // considering statically available db space vs.
17925 // - observed maximums on DB dev for DB/WAL/UNSORTED data
17926 // - observed maximum spillovers
17927 uint64_t max_db_use = 0; // max db usage we potentially observed
f6b5b4d7 17928 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
9f95a23c
TL
17929 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
17930 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
17931 // this could go to db hence using it in the estimation
17932 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
17933
17934 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
17935 uint64_t avail = min(
17936 db_avail4slow,
17937 max_db_use < db_total ? db_total - max_db_use : 0);
17938
17939 // considering current DB dev usage for SLOW data
17940 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
17941 res = BlueFS::BDEV_DB;
17942 }
17943 }
17944 break;
f6b5b4d7 17945 case LEVEL_LOG:
9f95a23c
TL
17946 case LEVEL_WAL:
17947 res = BlueFS::BDEV_WAL;
17948 break;
17949 case LEVEL_DB:
17950 default:
17951 res = BlueFS::BDEV_DB;
17952 break;
17953 }
17954 return res;
17955}
17956
17957void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
17958{
a4b75251
TL
17959 auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
17960 res.emplace_back(base, db_size);
17961 auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
17962 if (slow_size == 0) {
17963 slow_size = db_size;
17964 }
17965 res.emplace_back(base + ".slow", slow_size);
9f95a23c
TL
17966}
17967
b3b6e05e 17968void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
17969 uint8_t res = LEVEL_DB;
17970 if (dirname.length() > 5) {
 17971 // the "db.slow" and "db.wal" directory names are hard-coded to
17972 // match up with bluestore. the slow device is always the second
17973 // one (when a dedicated block.db device is present and used at
17974 // bdev 0). the wal device is always last.
17975 if (boost::algorithm::ends_with(dirname, ".slow")) {
17976 res = LEVEL_SLOW;
17977 }
17978 else if (boost::algorithm::ends_with(dirname, ".wal")) {
17979 res = LEVEL_WAL;
17980 }
17981 }
17982 return reinterpret_cast<void*>(res);
17983}
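// Resulting directory-name mapping (derived from the suffix checks above):
//
//   "db"       -> LEVEL_DB   (default)
//   "db.slow"  -> LEVEL_SLOW
//   "db.wal"   -> LEVEL_WAL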
17984
17985void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
17986 auto max_x = per_level_per_dev_usage.get_max_x();
17987 auto max_y = per_level_per_dev_usage.get_max_y();
17988 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
17989 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
17990 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
17991 << ", db_avail:" << db_avail4slow << std::endl
17992 << "Usage matrix:" << std::endl;
f6b5b4d7 17993 constexpr std::array<const char*, 8> names{ {
9f95a23c
TL
17994 "DEV/LEV",
17995 "WAL",
17996 "DB",
17997 "SLOW",
17998 "*",
17999 "*",
f6b5b4d7
TL
18000 "REAL",
18001 "FILES",
9f95a23c
TL
18002 } };
18003 const size_t width = 12;
18004 for (size_t i = 0; i < names.size(); ++i) {
18005 sout.setf(std::ios::left, std::ios::adjustfield);
18006 sout.width(width);
18007 sout << names[i];
18008 }
18009 sout << std::endl;
18010 for (size_t l = 0; l < max_y; l++) {
18011 sout.setf(std::ios::left, std::ios::adjustfield);
18012 sout.width(width);
18013 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
18014 case LEVEL_LOG:
18015 sout << "LOG"; break;
9f95a23c
TL
18016 case LEVEL_WAL:
18017 sout << "WAL"; break;
18018 case LEVEL_DB:
18019 sout << "DB"; break;
18020 case LEVEL_SLOW:
18021 sout << "SLOW"; break;
18022 case LEVEL_MAX:
18023 sout << "TOTALS"; break;
18024 }
f6b5b4d7 18025 for (size_t d = 0; d < max_x; d++) {
9f95a23c
TL
18026 sout.setf(std::ios::left, std::ios::adjustfield);
18027 sout.width(width);
18028 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
18029 }
18030 sout.setf(std::ios::left, std::ios::adjustfield);
18031 sout.width(width);
f6b5b4d7 18032 sout << stringify(per_level_files[l]) << std::endl;
9f95a23c
TL
18033 }
18034 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
18035 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
18036 sout << "MAXIMUMS:" << std::endl;
18037 for (size_t l = 0; l < max_y; l++) {
18038 sout.setf(std::ios::left, std::ios::adjustfield);
18039 sout.width(width);
18040 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
18041 case LEVEL_LOG:
18042 sout << "LOG"; break;
9f95a23c
TL
18043 case LEVEL_WAL:
18044 sout << "WAL"; break;
18045 case LEVEL_DB:
18046 sout << "DB"; break;
18047 case LEVEL_SLOW:
18048 sout << "SLOW"; break;
18049 case LEVEL_MAX:
18050 sout << "TOTALS"; break;
18051 }
18052 for (size_t d = 0; d < max_x - 1; d++) {
18053 sout.setf(std::ios::left, std::ios::adjustfield);
18054 sout.width(width);
18055 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
18056 }
18057 sout.setf(std::ios::left, std::ios::adjustfield);
18058 sout.width(width);
18059 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
18060 if (l < max_y - 1) {
18061 sout << std::endl;
18062 }
18063 }
18064}
11fdf7f2 18065
20effc67
TL
18066BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
18067 RocksDBBlueFSVolumeSelector* ns =
18068 new RocksDBBlueFSVolumeSelector(0, 0, 0,
18069 0, 0, 0,
18070 0, 0, false);
18071 return ns;
18072}
18073
18074bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
18075 RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
18076 ceph_assert(o);
18077 bool equal = true;
18078 for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
 18079 for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) {
18080 equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
18081 }
18082 }
18083 for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
18084 equal &= (per_level_files[t] == o->per_level_files[t]);
18085 }
18086 return equal;
18087}
18088
9f95a23c 18089// =======================================================
20effc67
TL
18090
18091//================================================================================================================
 18092// BlueStore commits all allocation information (alloc/release) into RocksDB before the client write is acknowledged.
 18093// This causes a delay in the write path and adds significant load to the CPU/Memory/Disk.
 18094// The reason for the RocksDB updates is that they allow Ceph to survive any failure without losing the allocation state.
 18095//
 18096// We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the allocator object
 18097// with all the OSD allocation state in a single step during umount().
 18098// This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
 18099// to losing allocation info in failure cases where we don't call umount.
 18100// We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, which is used in such failure cases.
 18101// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file
 18102// where we stored the allocation-map during umount().
18103//================================================================================================================
18104
18105#undef dout_prefix
18106#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
18107
18108static const std::string allocator_dir = "ALLOCATOR_NCB_DIR";
18109static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
18110static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
18111static uint32_t s_serial = 0x01;
18112
18113#if 1
18114#define CEPHTOH_32 le32toh
18115#define CEPHTOH_64 le64toh
18116#define HTOCEPH_32 htole32
18117#define HTOCEPH_64 htole64
18118#else
18119// help debug the encode/decode by forcing alien format
18120#define CEPHTOH_32 be32toh
18121#define CEPHTOH_64 be64toh
18122#define HTOCEPH_32 htobe32
18123#define HTOCEPH_64 htobe64
18124#endif
18125
 18126// 48-byte header for the on-disk allocator image
18127const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
18128struct allocator_image_header {
18129 uint32_t format_version; // 0x00
18130 uint32_t valid_signature; // 0x04
18131 utime_t timestamp; // 0x08
18132 uint32_t serial; // 0x10
18133 uint32_t pad[0x7]; // 0x14
18134
18135 allocator_image_header() {
18136 memset((char*)this, 0, sizeof(allocator_image_header));
18137 }
18138
18139 // create header in CEPH format
18140 allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
18141 this->format_version = format_version;
18142 this->timestamp = timestamp;
18143 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18144 this->serial = serial;
18145 memset(this->pad, 0, sizeof(this->pad));
18146 }
18147
18148 friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
18149 out << "format_version = " << header.format_version << std::endl;
18150 out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18151 out << "timestamp = " << header.timestamp << std::endl;
18152 out << "serial = " << header.serial << std::endl;
18153 for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
18154 if (header.pad[i]) {
18155 out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
18156 }
18157 }
18158 return out;
18159 }
18160
18161 DENC(allocator_image_header, v, p) {
18162 denc(v.format_version, p);
18163 denc(v.valid_signature, p);
18164 denc(v.timestamp.tv.tv_sec, p);
18165 denc(v.timestamp.tv.tv_nsec, p);
18166 denc(v.serial, p);
18167 for (auto& pad: v.pad) {
18168 denc(pad, p);
18169 }
18170 }
18171
18172
18173 int verify(CephContext* cct, const std::string &path) {
18174 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18175 for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
18176 if (this->pad[i]) {
18177 derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
18178 return -1;
18179 }
18180 }
18181 return 0;
18182 }
18183 else {
18184 derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18185 return -1;
18186 }
18187 }
18188};
18189WRITE_CLASS_DENC(allocator_image_header)
18190
 18191// 56-byte trailer for the on-disk allocator image
18192struct allocator_image_trailer {
18193 extent_t null_extent; // 0x00
18194
18195 uint32_t format_version; // 0x10
18196 uint32_t valid_signature; // 0x14
18197
18198 utime_t timestamp; // 0x18
18199
18200 uint32_t serial; // 0x20
18201 uint32_t pad; // 0x24
18202 uint64_t entries_count; // 0x28
18203 uint64_t allocation_size; // 0x30
18204
18205 // trailer is created in CEPH format
18206 allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
18207 memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
18208 this->format_version = format_version;
18209 this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
18210 this->timestamp = timestamp;
18211 this->serial = serial;
18212 this->pad = 0;
18213 this->entries_count = entries_count;
18214 this->allocation_size = allocation_size;
18215 }
18216
18217 allocator_image_trailer() {
18218 memset((char*)this, 0, sizeof(allocator_image_trailer));
18219 }
18220
18221 friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
18222 if (trailer.null_extent.offset || trailer.null_extent.length) {
18223 out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
18224 out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
18225 }
18226 out << "format_version = " << trailer.format_version << std::endl;
18227 out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
18228 out << "timestamp = " << trailer.timestamp << std::endl;
18229 out << "serial = " << trailer.serial << std::endl;
18230 if (trailer.pad) {
18231 out << "trailer.pad= " << trailer.pad << std::endl;
18232 }
18233 out << "entries_count = " << trailer.entries_count << std::endl;
18234 out << "allocation_size = " << trailer.allocation_size << std::endl;
18235 return out;
18236 }
18237
18238 int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
18239 if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
18240
 18241 // trailer must start with a null extent (both fields set to zero); no need to convert formats for zero
18242 if (null_extent.offset || null_extent.length) {
18243 derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
18244 return -1;
18245 }
18246
18247 if (serial != p_header->serial) {
18248 derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
18249 return -1;
18250 }
18251
18252 if (format_version != p_header->format_version) {
18253 derr << "Illegal trailer: header->format_version(" << p_header->format_version
18254 << ") != trailer->format_version(" << format_version << ")" << dendl;
18255 return -1;
18256 }
18257
18258 if (timestamp != p_header->timestamp) {
18259 derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
18260 << ") != trailer->timestamp(" << timestamp << ")" << dendl;
18261 return -1;
18262 }
18263
18264 if (this->entries_count != entries_count) {
18265 derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
18266 << this->entries_count << ")" << dendl;
18267 return -1;
18268 }
18269
18270 if (this->allocation_size != allocation_size) {
18271 derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
18272 << this->allocation_size << ")" << dendl;
18273 return -1;
18274 }
18275
18276 if (pad) {
18277 derr << "Illegal Trailer - pad="<< pad << dendl;
18278 return -1;
18279 }
18280
18281 // if arrived here -> trailer is valid !!
18282 return 0;
18283 } else {
18284 derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
18285 return -1;
18286 }
18287 }
18288
18289 DENC(allocator_image_trailer, v, p) {
18290 denc(v.null_extent.offset, p);
18291 denc(v.null_extent.length, p);
18292 denc(v.format_version, p);
18293 denc(v.valid_signature, p);
18294 denc(v.timestamp.tv.tv_sec, p);
18295 denc(v.timestamp.tv.tv_nsec, p);
18296 denc(v.serial, p);
18297 denc(v.pad, p);
18298 denc(v.entries_count, p);
18299 denc(v.allocation_size, p);
18300 }
18301};
18302WRITE_CLASS_DENC(allocator_image_trailer)
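// Resulting on-disk layout of the allocator file, derived from
// store_allocator() below (every section is followed by its own crc32c;
// the extent-payload crc chains across chunks):
//
//   +----------------------+
//   | header       | crc   |  format_version, signature, timestamp, serial
//   +----------------------+
//   | <=4K extents | crc   |  {offset,length} pairs, little-endian,
//   +----------------------+  repeated until all extents are written
//   | ...                  |
//   +----------------------+
//   | trailer      | crc   |  null extent marker + header fields +
//   +----------------------+  entries_count + allocation_size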
18303
18304
18305//-------------------------------------------------------------------------------------
 18306// invalidate the old allocation file if it exists so we will go directly to recovery after a failure
 18307// we can safely ignore a non-existing file
18308int BlueStore::invalidate_allocation_file_on_bluefs()
18309{
 18310 // mark that the allocation-file was invalidated and we should destage a new copy when closing the db
18311 need_to_destage_allocation_file = true;
18312 dout(10) << "need_to_destage_allocation_file was set" << dendl;
18313
18314 BlueFS::FileWriter *p_handle = nullptr;
18315 if (!bluefs->dir_exists(allocator_dir)) {
18316 dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
18317 // nothing to do -> return
18318 return 0;
18319 }
18320
18321 int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18322 if (ret != 0) {
18323 dout(5) << "allocator_file(" << allocator_file << ") doesn't exist" << dendl;
18324 // nothing to do -> return
18325 return 0;
18326 }
18327
18328
18329 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
18330 if (ret != 0) {
18331 derr << "Failed open_for_write with error-code " << ret << dendl;
18332 return -1;
18333 }
18334
18335 dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18336 ret = bluefs->truncate(p_handle, 0);
18337 if (ret != 0) {
18338 derr << "Failed truncate with error-code " << ret << dendl;
18339 bluefs->close_writer(p_handle);
18340 return -1;
18341 }
18342
18343 bluefs->fsync(p_handle);
18344 bluefs->close_writer(p_handle);
18345
18346 return 0;
18347}
18348
18349//-----------------------------------------------------------------------------------
18350// load bluefs extents into bluefs_extents_vec
18351int load_bluefs_extents(BlueFS *bluefs,
18352 bluefs_layout_t *bluefs_layout,
18353 CephContext* cct,
18354 const std::string &path,
18355 std::vector<extent_t> &bluefs_extents_vec,
18356 uint64_t min_alloc_size)
18357{
18358 if (! bluefs) {
18359 dout(5) << "No BlueFS device found!!" << dendl;
18360 return 0;
18361 }
18362
18363 interval_set<uint64_t> bluefs_extents;
18364 int ret = bluefs->get_block_extents(bluefs_layout->shared_bdev, &bluefs_extents);
18365 if (ret < 0) {
18366 derr << "failed bluefs->get_block_extents()!!" << dendl;
18367 return ret;
18368 }
18369
18370 for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); itr++) {
18371 extent_t e = { .offset = itr.get_start(), .length = itr.get_len() };
18372 bluefs_extents_vec.push_back(e);
18373 }
18374
18375 dout(5) << "BlueFS extent_count=" << bluefs_extents_vec.size() << dendl;
18376 return 0;
18377}
18378
18379//-----------------------------------------------------------------------------------
18380int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
18381{
18382 *p_num_entries = 0;
18383 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18384 (*p_num_entries)++;
18385 };
18386 src_alloc->dump(count_entries);
18387
18388 dout(5) << "count num_entries=" << *p_num_entries << dendl;
18389
 18390 // add 16K extra entries in case new allocations happened between the count and copy passes
18391 (*p_num_entries) += 16*1024;
18392 unique_ptr<extent_t[]> arr;
18393 try {
18394 arr = make_unique<extent_t[]>(*p_num_entries);
18395 } catch (std::bad_alloc&) {
18396 derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
18397 return -1;
18398 }
18399
18400 uint64_t idx = 0;
18401 auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
18402 if (extent_length > 0) {
18403 if (idx < *p_num_entries) {
18404 arr[idx] = {extent_offset, extent_length};
18405 }
18406 idx++;
18407 }
18408 else {
18409 derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
18410 }
18411 };
18412 src_alloc->dump(copy_entries);
18413
18414 dout(5) << "copy num_entries=" << idx << dendl;
18415 if (idx > *p_num_entries) {
18416 derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
18417 ceph_assert(idx <= *p_num_entries);
18418 }
18419
18420 *p_num_entries = idx;
18421
18422 for (idx = 0; idx < *p_num_entries; idx++) {
18423 const extent_t *p_extent = &arr[idx];
18424 dest_alloc->init_add_free(p_extent->offset, p_extent->length);
18425 }
18426
18427 return 0;
18428}
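// Both passes above are driven by Allocator::dump(), which invokes a
// callback per free extent; the same idiom works standalone (sketch):
//
//   uint64_t free_bytes = 0;
//   src_alloc->dump([&](uint64_t offset, uint64_t length) {
//     free_bytes += length;
//   });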
18429
18430//-----------------------------------------------------------------------------------
18431static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
18432{
18433 std::ptrdiff_t length = p_curr - buffer;
18434 p_handle->append(buffer, length);
18435
18436 crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
18437 uint32_t encoded_crc = HTOCEPH_32(crc);
18438 p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));
18439
18440 return crc;
18441}
18442
18443const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
18444// write the allocator to a flat bluefs file - 4K extents at a time
18445//-----------------------------------------------------------------------------------
18446int BlueStore::store_allocator(Allocator* src_allocator)
18447{
 18448 // when storing allocations to file we must be sure there are no background compactions
 18449 // the easiest way to achieve that is to make sure the db is closed
18450 ceph_assert(db == nullptr);
18451 utime_t start_time = ceph_clock_now();
18452 int ret = 0;
18453
18454 // create dir if doesn't exist already
18455 if (!bluefs->dir_exists(allocator_dir) ) {
18456 ret = bluefs->mkdir(allocator_dir);
18457 if (ret != 0) {
18458 derr << "Failed mkdir with error-code " << ret << dendl;
18459 return -1;
18460 }
18461 }
1d09f67e 18462 bluefs->compact_log();
20effc67
TL
 18463 // reuse the previous file allocation if it exists
18464 ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
18465 bool overwrite_file = (ret == 0);
20effc67
TL
18466 BlueFS::FileWriter *p_handle = nullptr;
18467 ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
18468 if (ret != 0) {
 18469 derr << __func__ << " Failed open_for_write with error-code " << ret << dendl;
18470 return -1;
18471 }
18472
18473 uint64_t file_size = p_handle->file->fnode.size;
18474 uint64_t allocated = p_handle->file->fnode.get_allocated();
1d09f67e 18475 dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
20effc67 18476
1d09f67e 18477 bluefs->sync_metadata(false);
20effc67
TL
18478 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
18479 if (!allocator) {
18480 bluefs->close_writer(p_handle);
18481 return -1;
18482 }
18483
18484 // store all extents (except for the bluefs extents we removed) in a single flat file
18485 utime_t timestamp = ceph_clock_now();
18486 uint32_t crc = -1;
18487 {
18488 allocator_image_header header(timestamp, s_format_version, s_serial);
18489 bufferlist header_bl;
18490 encode(header, header_bl);
18491 crc = header_bl.crc32c(crc);
18492 encode(crc, header_bl);
18493 p_handle->append(header_bl);
18494 }
18495
18496 crc = -1; // reset crc
18497 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18498 extent_t *p_curr = buffer;
18499 const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
18500 uint64_t extent_count = 0;
18501 uint64_t allocation_size = 0;
18502 auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
18503 if (extent_length == 0) {
 18504 derr << __func__ << "::zero-length extent at idx=" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
18505 ret = -1;
18506 return;
18507 }
18508 p_curr->offset = HTOCEPH_64(extent_offset);
18509 p_curr->length = HTOCEPH_64(extent_length);
18510 extent_count++;
18511 allocation_size += extent_length;
18512 p_curr++;
18513
18514 if (p_curr == p_end) {
18515 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18516 p_curr = buffer; // recycle the buffer
18517 }
18518 };
18519 allocator->dump(iterated_allocation);
 18520 // if we hit a zero-length extent -> fail the operation
18521 if (ret != 0) {
18522 derr << "Illegal extent, fail store operation" << dendl;
18523 derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
18524 bluefs->truncate(p_handle, 0);
18525 bluefs->close_writer(p_handle);
18526 return -1;
18527 }
18528
18529 // if we got any leftovers -> add crc and append to file
18530 if (p_curr > buffer) {
18531 crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
18532 }
18533
18534 {
18535 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18536 bufferlist trailer_bl;
18537 encode(trailer, trailer_bl);
18538 uint32_t crc = -1;
18539 crc = trailer_bl.crc32c(crc);
18540 encode(crc, trailer_bl);
18541 p_handle->append(trailer_bl);
18542 }
18543
18544 bluefs->fsync(p_handle);
18545 bluefs->truncate(p_handle, p_handle->pos);
18546 bluefs->fsync(p_handle);
18547
18548 utime_t duration = ceph_clock_now() - start_time;
1d09f67e 18549 dout(5) << "WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
 18550 dout(5) << "p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
18551
18552 bluefs->close_writer(p_handle);
18553 need_to_destage_allocation_file = false;
18554 return 0;
18555}
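
//-----------------------------------------------------------------------------------
// Resulting on-disk image (reconstructed from the write path above, for orientation):
//
//   allocator_image_header   + crc32 over the encoded header
//   <=4K extent_t records    + crc32 (repeated; running crc seeded with -1)
//   allocator_image_trailer  + crc32 (echoes extent_count and allocation_size)
//
// extent_t fields are stored little-endian via HTOCEPH_64 and converted back with
// CEPHTOH_64 on restore.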
18556
18557//-----------------------------------------------------------------------------------
18558Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
18559 // create allocator
18560 uint64_t alloc_size = min_alloc_size;
18561 Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
18562 zone_size, first_sequential_zone,
18563 "recovery");
18564 if (alloc) {
18565 return alloc;
18566 } else {
18567 derr << "Failed Allocator Creation" << dendl;
18568 return nullptr;
18569 }
18570}
18571
18572//-----------------------------------------------------------------------------------
18573size_t calc_allocator_image_header_size()
18574{
18575 utime_t timestamp = ceph_clock_now();
18576 allocator_image_header header(timestamp, s_format_version, s_serial);
18577 bufferlist header_bl;
18578 encode(header, header_bl);
18579 uint32_t crc = -1;
18580 crc = header_bl.crc32c(crc);
18581 encode(crc, header_bl);
18582
18583 return header_bl.length();
18584}
18585
18586//-----------------------------------------------------------------------------------
18587int calc_allocator_image_trailer_size()
18588{
18589 utime_t timestamp = ceph_clock_now();
18590 uint64_t extent_count = -1;
18591 uint64_t allocation_size = -1;
18592 uint32_t crc = -1;
18593 bufferlist trailer_bl;
18594 allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
18595
18596 encode(trailer, trailer_bl);
18597 crc = trailer_bl.crc32c(crc);
18598 encode(crc, trailer_bl);
18599 return trailer_bl.length();
18600}
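
//-----------------------------------------------------------------------------------
// Both helpers above obtain the on-disk record size by encoding a throwaway instance;
// this is only valid because the header/trailer encode to a fixed length regardless of
// field values. The same idiom for any fixed-size record type T (a sketch, assuming T
// is default-constructible):
#if 0
template <typename T>
static size_t encoded_size_with_crc()
{
  bufferlist bl;
  encode(T{}, bl);              // measure the encoded record
  uint32_t crc = bl.crc32c(-1); // same crc seed used throughout this file
  encode(crc, bl);
  return bl.length();
}
#endif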
18601
18602//-----------------------------------------------------------------------------------
18603int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
18604{
18605 utime_t start_time = ceph_clock_now();
18606 BlueFS::FileReader *p_temp_handle = nullptr;
18607 int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
18608 if (ret != 0) {
18609 derr << "Failed open_for_read with error-code " << ret << dendl;
18610 return -1;
18611 }
18612 unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
18613 uint64_t read_alloc_size = 0;
18614 uint64_t file_size = p_handle->file->fnode.size;
18615 dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
18616
18617 // make sure we were able to store a valid copy
18618 if (file_size == 0) {
18619 derr << "No Valid allocation info on disk (empty file)" << dendl;
18620 return -1;
18621 }
18622
18623 // first read the header
18624 size_t offset = 0;
18625 allocator_image_header header;
18626 int header_size = calc_allocator_image_header_size();
18627 {
18628 bufferlist header_bl,temp_bl;
18629 int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
18630 if (read_bytes != header_size) {
18631 derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
18632 return -1;
18633 }
18634
18635 offset += read_bytes;
18636
18637 header_bl.claim_append(temp_bl);
18638 auto p = header_bl.cbegin();
18639 decode(header, p);
18640 if (header.verify(cct, path) != 0 ) {
18641 derr << "header = \n" << header << dendl;
18642 return -1;
18643 }
18644
18645 uint32_t crc_calc = -1, crc;
18646 crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18647 decode(crc, p);
18648 if (crc != crc_calc) {
18649 derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18650 derr << "header = \n" << header << dendl;
18651 return -1;
18652 }
18653
 18654 // increment the serial for the next store
18655 s_serial = header.serial + 1;
18656 }
18657
18658 // then read the payload (extents list) using a recycled buffer
18659 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18660 uint32_t crc = -1;
18661 int trailer_size = calc_allocator_image_trailer_size();
18662 uint64_t extent_count = 0;
18663 uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
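 // Note: extents_bytes_left starts as the full payload (extent records plus the
 // per-chunk crc32 words) minus the one crc subtracted above; each iteration below
 // consumes one chunk of extents and, while more payload remains, its trailing crc
 // too, so the loop lands exactly on the trailer.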
18664 while (extents_bytes_left) {
18665 int req_bytes = std::min(extents_bytes_left, sizeof(buffer));
18666 int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
18667 if (read_bytes != req_bytes) {
18668 derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
18669 return -1;
18670 }
18671
18672 offset += read_bytes;
18673 extents_bytes_left -= read_bytes;
18674
18675 const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t);
18676 const extent_t *p_end = buffer + num_extent_in_buffer;
18677 for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
18678 uint64_t offset = CEPHTOH_64(p_ext->offset);
18679 uint64_t length = CEPHTOH_64(p_ext->length);
18680 read_alloc_size += length;
18681
18682 if (length > 0) {
18683 allocator->init_add_free(offset, length);
18684 extent_count ++;
18685 } else {
18686 derr << "extent with zero length at idx=" << extent_count << dendl;
18687 return -1;
18688 }
18689 }
18690
18691 uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
18692 read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
18693 if (read_bytes == sizeof(crc) ) {
18694 crc = CEPHTOH_32(crc);
18695 if (crc != calc_crc) {
18696 derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
18697 derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
18698 return -1;
18699 }
18700
18701 offset += read_bytes;
18702 if (extents_bytes_left) {
18703 extents_bytes_left -= read_bytes;
18704 }
18705 } else {
18706 derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
18707 return -1;
18708 }
18709
18710 }
18711
 18712 // finally, read the trailer and verify it is in good shape and that we got all the extents
18713 {
18714 bufferlist trailer_bl,temp_bl;
18715 int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
18716 if (read_bytes != trailer_size) {
18717 derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
18718 return -1;
18719 }
18720 offset += read_bytes;
18721
18722 trailer_bl.claim_append(temp_bl);
18723 uint32_t crc_calc = -1;
18724 uint32_t crc;
18725 allocator_image_trailer trailer;
18726 auto p = trailer_bl.cbegin();
18727 decode(trailer, p);
18728 if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
18729 derr << "trailer=\n" << trailer << dendl;
18730 return -1;
18731 }
18732
18733 crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18734 decode(crc, p);
18735 if (crc != crc_calc) {
18736 derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18737 derr << "trailer=\n" << trailer << dendl;
18738 return -1;
18739 }
18740 }
18741
18742 utime_t duration = ceph_clock_now() - start_time;
 18743 dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size="
 18744 << read_alloc_size << ", file_size=" << file_size << dendl;
1d09f67e 18745 dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
18746 *num = extent_count;
18747 *bytes = read_alloc_size;
18748 return 0;
18749}
18750
18751//-----------------------------------------------------------------------------------
18752int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
18753{
18754 utime_t start = ceph_clock_now();
18755 auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
18756 int ret = __restore_allocator(temp_allocator.get(), num, bytes);
18757 if (ret != 0) {
18758 return ret;
18759 }
18760
18761 uint64_t num_entries = 0;
18762 dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
18763 copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
18764 utime_t duration = ceph_clock_now() - start;
18765 dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
18766 return ret;
18767}
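
//-----------------------------------------------------------------------------------
// Note: restoring through a temporary bitmap allocator means dest_allocator is only
// touched after the on-disk image has fully passed header/crc/trailer validation; any
// failure inside __restore_allocator() leaves the destination untouched.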
18768
18769//-------------------------------------------------------------------------
18770void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
18771{
18772 [[maybe_unused]] auto cct = onode->c->store->cct;
18773 auto path = onode->c->store->path;
18774 if (shard_id < shards.size()) {
18775 auto p = &shards[shard_id];
18776 if (!p->loaded) {
18777 dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18778 p->extents = decode_some(v);
18779 p->loaded = true;
18780 dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18781 ceph_assert(p->dirty == false);
18782 ceph_assert(v.length() == p->shard_info->bytes);
18783 }
18784 } else {
18785 derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
18786 ceph_assert(shard_id < shards.size());
18787 }
18788}
18789
18790//-----------------------------------------------------------------------------------
18791void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
18792{
18793 ceph_assert((offset & min_alloc_size_mask) == 0);
18794 ceph_assert((length & min_alloc_size_mask) == 0);
18795 sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
18796}
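
//-----------------------------------------------------------------------------------
// Example with illustrative values: for min_alloc_size = 4096 (order 12), an extent
// at offset 0x13000 of length 0x2000 marks two bitmap slots:
//
//   set_allocation_in_simple_bmap(sbmap, 0x13000, 0x2000);
//   // -> sbmap->set(0x13000 >> 12, 0x2000 >> 12), i.e. sbmap->set(0x13, 2)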
18797
18798//---------------------------------------------------------
18799// Process all physical extents from a given Onode (including all its shards)
18800void BlueStore::read_allocation_from_single_onode(
18801 SimpleBitmap* sbmap,
18802 BlueStore::OnodeRef& onode_ref,
18803 read_alloc_stats_t& stats)
18804{
 18805 // create a map holding all physical-extents of this Onode to prevent the same extent from being added more than once
18806 std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
18807 unsigned blobs_count = 0;
18808 uint64_t pos = 0;
18809
18810 stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
18811 // first iterate over all logical-extents
18812 for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
18813 ceph_assert(l_extent.logical_offset >= pos);
18814
18815 pos = l_extent.logical_offset + l_extent.length;
18816 ceph_assert(l_extent.blob);
18817 const bluestore_blob_t& blob = l_extent.blob->get_blob();
18818 const PExtentVector& p_extent_vec = blob.get_extents();
18819 blobs_count++;
18820 if (blob.is_compressed()) {
18821 stats.compressed_blob_count++;
18822 }
18823
18824 if (blob.is_shared()) {
18825 stats.shared_blobs_count++;
18826 }
18827
18828 // process all physical extent in this blob
18829 for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
18830 auto offset = p_extent->offset;
18831 auto length = p_extent->length;
18832
 18833 // Offset of -1 means the extent was removed (it is only a placeholder) and can be safely skipped
18834 if (offset == (uint64_t)-1) {
18835 stats.skipped_illegal_extent++;
18836 continue;
18837 }
18838
18839 if (!blob.is_shared()) {
18840 // skip repeating extents
18841 auto lcl_itr = lcl_extnt_map.find(offset);
 18842 // extents using shared blobs might have different lengths
18843 if (lcl_itr != lcl_extnt_map.end() ) {
18844 // repeated extents must have the same length!
18845 ceph_assert(lcl_extnt_map[offset] == length);
18846 stats.skipped_repeated_extent++;
18847 } else {
18848 lcl_extnt_map[offset] = length;
18849 set_allocation_in_simple_bmap(sbmap, offset, length);
18850 stats.extent_count++;
18851 }
18852 } else {
 18853 // extents using shared blobs might have different lengths
18854 set_allocation_in_simple_bmap(sbmap, offset, length);
18855 stats.extent_count++;
18856 }
18857
18858 } // physical-extents loop
18859
18860 } // logical-extents loop
18861
18862 if (blobs_count < MAX_BLOBS_IN_ONODE) {
18863 stats.blobs_in_onode[blobs_count]++;
18864 } else {
18865 // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
18866 stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
18867 }
18868}
18869
18870//-------------------------------------------------------------------------
18871int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
18872{
 18873 // finally add all space taken by user data
18874 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
18875 if (!it) {
18876 // TBD - find a better error code
18877 derr << "failed db->get_iterator(PREFIX_OBJ)" << dendl;
18878 return -1;
18879 }
18880
18881 CollectionRef collection_ref;
18882 spg_t pgid;
18883 BlueStore::OnodeRef onode_ref;
18884 bool has_open_onode = false;
18885 uint32_t shard_id = 0;
18886 uint64_t kv_count = 0;
18887 uint64_t count_interval = 1'000'000;
18888 // iterate over all ONodes stored in RocksDB
18889 for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
 18890 // trace an event after every million processed objects (typically every 5-10 seconds)
18891 if (kv_count && (kv_count % count_interval == 0) ) {
18892 dout(5) << "processed objects count = " << kv_count << dendl;
18893 }
18894
18895 // Shards - Code
18896 // add the extents from the shards to the main Obj
18897 if (is_extent_shard_key(it->key())) {
18898 // shards must follow a valid main object
18899 if (has_open_onode) {
18900 // shards keys must start with the main object key
18901 if (it->key().find(onode_ref->key) == 0) {
18902 // shards count can't exceed declared shard-count in the main-object
18903 if (shard_id < onode_ref->extent_map.shards.size()) {
18904 onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id);
18905 stats.shard_count++;
18906 shard_id++;
18907 } else {
18908 derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18909 derr << "shard->key=" << pretty_binary_string(it->key()) << dendl;
18910 ceph_assert(shard_id < onode_ref->extent_map.shards.size());
18911 }
18912 } else {
18913 derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl;
18914 ceph_assert(it->key().find(onode_ref->key) == 0);
18915 }
18916 } else {
 18917 derr << "error::shard without a main object for key=" << pretty_binary_string(it->key()) << dendl;
18918 ceph_assert(has_open_onode);
18919 }
18920
18921 } else {
18922 // Main Object Code
18923
18924 if (has_open_onode) {
18925 // make sure we got all shards of this object
18926 if (shard_id == onode_ref->extent_map.shards.size()) {
18927 // We completed an Onode Object -> pass it to be processed
18928 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18929 } else {
18930 derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18931 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18932 }
18933 } else {
18934 // We opened a new Object
18935 has_open_onode = true;
18936 }
18937
18938 // The main Obj is always first in RocksDB so we can start with shard_id set to zero
18939 shard_id = 0;
18940 stats.onode_count++;
18941 ghobject_t oid;
18942 int ret = get_key_object(it->key(), &oid);
18943 if (ret < 0) {
18944 derr << "bad object key " << pretty_binary_string(it->key()) << dendl;
18945 ceph_assert(ret == 0);
18946 continue;
18947 }
18948
18949 // fill collection_ref if doesn't exist yet
 18950 // We process all the objects in a given collection and then move to the next collection
18951 // This means we only search once for every given collection
18952 if (!collection_ref ||
18953 oid.shard_id != pgid.shard ||
18954 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
18955 !collection_ref->contains(oid)) {
18956 stats.collection_search++;
18957 collection_ref = nullptr;
18958
18959 for (auto& p : coll_map) {
18960 if (p.second->contains(oid)) {
18961 collection_ref = p.second;
18962 break;
18963 }
18964 }
18965
18966 if (!collection_ref) {
18967 derr << "stray object " << oid << " not owned by any collection" << dendl;
18968 ceph_assert(collection_ref);
18969 continue;
18970 }
18971
18972 collection_ref->cid.is_pg(&pgid);
18973 }
18974 onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
18975 }
18976 }
18977
18978 // process the last object
18979 if (has_open_onode) {
18980 // make sure we got all shards of this object
18981 if (shard_id == onode_ref->extent_map.shards.size()) {
18982 // We completed an Onode Object -> pass it to be processed
18983 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18984 } else {
18985 derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18986 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18987 }
18988 }
 18989 dout(5) << "onode_count=" << stats.onode_count << ", shard_count=" << stats.shard_count << dendl;
18990
18991 return 0;
18992}
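
//-------------------------------------------------------------------------
// Key ordering the loop above relies on (an onode key strictly precedes its shard
// keys, which carry the onode key as a prefix), illustrated:
//
//   <onode key O1>          -> main object; shard_id reset to 0
//   <O1 + shard suffix #0>  -> provide_shard_info_to_onode(value, 0)
//   <O1 + shard suffix #1>  -> provide_shard_info_to_onode(value, 1)
//   <onode key O2>          -> O1 is complete and processed, then O2 is opened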
18993
18994//---------------------------------------------------------
18995int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
18996{
18997 // first set space used by superblock
18998 auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
18999 set_allocation_in_simple_bmap(sbmap, 0, super_length);
19000 stats.extent_count++;
19001
19002 // then set all space taken by Objects
19003 int ret = read_allocation_from_onodes(sbmap, stats);
19004 if (ret < 0) {
19005 derr << "failed read_allocation_from_onodes()" << dendl;
19006 return ret;
19007 }
19008
19009 return 0;
19010}
19011
19012//-----------------------------------------------------------------------------------
19013static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
19014{
19015 int alloc_size_shift = ctz(alloc_size);
19016 uint64_t offset = 0;
19017 extent_t ext = sbmap->get_next_clr_extent(offset);
19018 while (ext.length != 0) {
19019 dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
19020 offset = ext.offset + ext.length;
19021 ext = sbmap->get_next_clr_extent(offset);
19022 }
19023}
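
//-----------------------------------------------------------------------------------
// Example with illustrative values: for alloc_size = 4096 and bitmap bits 3..5 set
// (allocated), the clear-extent walk above re-inserts the surrounding free runs,
// scaled back from bitmap slots to bytes:
//
//   bits : 0 1 2 [3 4 5] 6 7 ...
//   free : {offset=0, length=3}, {offset=6, ...}
//   calls: init_add_free(0 << 12, 3 << 12); init_add_free(6 << 12, ...);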
19024
19025//---------------------------------------------------------
19026int BlueStore::read_allocation_from_drive_on_startup()
19027{
19028 int ret = 0;
19029
19030 ret = _open_collections();
19031 if (ret < 0) {
19032 return ret;
19033 }
19034 auto shutdown_cache = make_scope_guard([&] {
19035 _shutdown_cache();
19036 });
19037
19038 utime_t start = ceph_clock_now();
19039 read_alloc_stats_t stats = {};
1d09f67e 19040 SimpleBitmap sbmap(cct, (bdev->get_size() / min_alloc_size));
19041 ret = reconstruct_allocations(&sbmap, stats);
19042 if (ret != 0) {
19043 return ret;
19044 }
19045
19046 copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
19047
19048 utime_t duration = ceph_clock_now() - start;
19049 dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
19050 return ret;
19051}
19052
19053
19054
19055
19056// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
19057// Not meant to be run by customers
19058#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19059
19060#include <stdlib.h>
19061#include <algorithm>
19062//---------------------------------------------------------
19063int cmpfunc (const void * a, const void * b)
19064{
19065 if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) {
19066 return 1;
19067 }
19068 else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) {
19069 return -1;
19070 }
19071 else {
19072 return 0;
19073 }
19074}
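
//---------------------------------------------------------
// An equivalent type-safe form, should this ever migrate off qsort() (a sketch only,
// not what the tool currently uses):
#if 0
std::sort(arr, arr + count,
	  [](const extent_t &a, const extent_t &b) { return a.offset < b.offset; });
#endif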
19075
19076// compare the allocator built from Onodes with the system allocator (CF-B)
19077//---------------------------------------------------------
19078int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
19079{
19080 uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
19081 uint64_t extent_count = allocation_size/sizeof(extent_t);
19082 dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
19083
19084 unique_ptr<extent_t[]> arr1;
19085 unique_ptr<extent_t[]> arr2;
19086 try {
19087 arr1 = make_unique<extent_t[]>(extent_count);
19088 arr2 = make_unique<extent_t[]>(extent_count);
19089 } catch (std::bad_alloc&) {
19090 derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
19091 return -1;
19092 }
19093
19094 // copy the extents from the allocators into simple array and then compare them
19095 uint64_t size1 = 0, size2 = 0;
19096 uint64_t idx1 = 0, idx2 = 0;
19097 auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
19098 size1 += length;
19099 if (idx1 < extent_count) {
19100 arr1[idx1++] = {offset, length};
19101 }
19102 else if (idx1 == extent_count) {
 19103 derr << "(1)compare_allocators:: spillover" << dendl;
19104 idx1 ++;
19105 }
19106
19107 };
19108
19109 auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
19110 size2 += length;
19111 if (idx2 < extent_count) {
19112 arr2[idx2++] = {offset, length};
19113 }
19114 else if (idx2 == extent_count) {
19115 derr << "(2)compare_allocators:: spillover" << dendl;
19116 idx2 ++;
19117 }
19118 };
19119
19120 alloc1->dump(iterated_mapper1);
19121 alloc2->dump(iterated_mapper2);
19122
19123 qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
19124 qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
19125
19126 if (idx1 == idx2) {
19127 idx1 = idx2 = std::min(idx1, extent_count);
19128 if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
19129 return 0;
19130 }
19131 derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
19132 for (uint64_t i = 0; i < idx1; i++) {
19133 if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) {
19134 derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
19135 derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
19136 return -1;
19137 }
19138 }
19139 return 0;
19140 } else {
19141 derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
19142 return -1;
19143 }
19144}
19145
19146//---------------------------------------------------------
19147int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
19148{
19149 // then add space used by bluefs to store rocksdb
19150 unsigned extent_count = 0;
19151 if (bluefs) {
19152 interval_set<uint64_t> bluefs_extents;
19153 int ret = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
19154 if (ret < 0) {
19155 return ret;
19156 }
19157 for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); extent_count++, itr++) {
19158 allocator->init_rm_free(itr.get_start(), itr.get_len());
19159 stats.extent_count++;
19160 }
19161 }
19162
19163 dout(5) << "bluefs extent_count=" << extent_count << dendl;
19164 return 0;
19165}
19166
19167//---------------------------------------------------------
19168int BlueStore::read_allocation_from_drive_for_bluestore_tool()
19169{
19170 dout(5) << __func__ << dendl;
19171 int ret = 0;
19172 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19173 ret = _open_db_and_around(true, false);
19174 if (ret < 0) {
19175 return ret;
19176 }
19177
19178 ret = _open_collections();
19179 if (ret < 0) {
19180 _close_db_and_around();
19181 return ret;
19182 }
19183
19184 utime_t duration;
19185 read_alloc_stats_t stats = {};
19186 utime_t start = ceph_clock_now();
19187
19188 auto shutdown_cache = make_scope_guard([&] {
19189 dout(1) << "Allocation Recovery was completed in " << duration
19190 << " seconds; insert_count=" << stats.insert_count
19191 << "; extent_count=" << stats.extent_count << dendl;
19192 _shutdown_cache();
19193 _close_db_and_around();
19194 });
19195
19196 {
19197 auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
19198 //reconstruct allocations into a temp simple-bitmap and copy into allocator
19199 {
1d09f67e 19200 SimpleBitmap sbmap(cct, (bdev->get_size() / min_alloc_size));
19201 ret = reconstruct_allocations(&sbmap, stats);
19202 if (ret != 0) {
19203 return ret;
19204 }
19205 copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
19206 }
19207
19208 // add allocation space used by the bluefs itself
19209 ret = add_existing_bluefs_allocation(allocator.get(), stats);
19210 if (ret < 0) {
19211 return ret;
19212 }
19213
19214 duration = ceph_clock_now() - start;
19215 stats.insert_count = 0;
19216 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19217 stats.insert_count++;
19218 };
19219 allocator->dump(count_entries);
19220 ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
1d09f67e 19221 if (ret == 0) {
19222 dout(5) << "Allocator drive - file integrity check OK" << dendl;
19223 } else {
19224 derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
19225 }
19226 }
19227
1d09f67e 19228 dout(1) << stats << dendl;
19229 return ret;
19230}
19231
19232//---------------------------------------------------------
19233Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
19234{
19235 uint64_t bdev_size = bdev->get_size();
19236 Allocator* allocator = create_bitmap_allocator(bdev_size);
19237 if (allocator) {
19238 dout(5) << "bitmap-allocator=" << allocator << dendl;
19239 } else {
19240 derr << "****failed create_bitmap_allocator()" << dendl;
19241 return nullptr;
19242 }
19243
19244 uint64_t num_entries = 0;
19245 copy_allocator(src_allocator, allocator, &num_entries);
19246
 19247 // BlueFS stores its internal allocations outside RocksDB (FM) so we should not destage them to the allocator-file
 19248 // we are going to hide bluefs allocations during the allocator-destage as they are stored elsewhere
19249 {
19250 std::vector<extent_t> bluefs_extents_vec;
19251 // load current bluefs internal allocation into a vector
19252 load_bluefs_extents(bluefs, &bluefs_layout, cct, path, bluefs_extents_vec, min_alloc_size);
 19253 // then mark them free in the cloned allocator so they are excluded from the on-disk image (bluefs tracks them internally)
19254 for (auto itr = bluefs_extents_vec.begin(); itr != bluefs_extents_vec.end(); ++itr) {
19255 allocator->init_add_free(itr->offset, itr->length);
19256 }
19257 }
19258
19259 return allocator;
19260}
19261
19262//---------------------------------------------------------
19263static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
19264{
19265 dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
19266 KeyValueDB::Transaction t = db->get_transaction();
19267 t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
19268 db->submit_transaction_sync(t);
19269}
19270
19271//---------------------------------------------------------
19272void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
19273{
19274 unsigned max_txn = 1024;
19275 dout(5) << "max_transaction_submit=" << max_txn << dendl;
19276 uint64_t size = 0, idx = 0;
19277 KeyValueDB::Transaction txn = db->get_transaction();
19278 auto iterated_insert = [&](uint64_t offset, uint64_t length) {
19279 size += length;
19280 real_fm->release(offset, length, txn);
19281 if ((++idx % max_txn) == 0) {
19282 db->submit_transaction_sync(txn);
19283 txn = db->get_transaction();
19284 }
19285 };
19286 allocator->dump(iterated_insert);
19287 if (idx % max_txn != 0) {
19288 db->submit_transaction_sync(txn);
19289 }
19290 dout(5) << "size=" << size << ", num extents=" << idx << dendl;
19291}
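
//---------------------------------------------------------
// The same batched-transaction idiom, generalized (a sketch; ops and apply_op() are
// placeholders for whatever mutations are being staged):
#if 0
KeyValueDB::Transaction txn = db->get_transaction();
uint64_t n = 0;
for (auto &op : ops) {
  apply_op(op, txn);
  if ((++n % max_txn) == 0) {	// flush every full batch
    db->submit_transaction_sync(txn);
    txn = db->get_transaction();
  }
}
if (n % max_txn) {		// flush the partial tail, if any
  db->submit_transaction_sync(txn);
}
#endif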
19292
19293//---------------------------------------------------------
19294Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
19295{
19296 dout(5) << "real_fm->enumerate_next" << dendl;
19297 Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
19298 if (allocator2) {
19299 dout(5) << "bitmap-allocator=" << allocator2 << dendl;
19300 } else {
19301 return nullptr;
19302 }
19303
19304 uint64_t size2 = 0, idx2 = 0;
19305 real_fm->enumerate_reset();
19306 uint64_t offset, length;
19307 while (real_fm->enumerate_next(db, &offset, &length)) {
19308 allocator2->init_add_free(offset, length);
19309 ++idx2;
19310 size2 += length;
19311 }
19312 real_fm->enumerate_reset();
19313
19314 dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
19315 return allocator2;
19316}
19317
19318//---------------------------------------------------------
19319// close the active fm and open it in a new mode like makefs()
19320// but make sure to mark the full device space as allocated
 19321 // later we will mark all extents from the allocator as free
19322int BlueStore::reset_fm_for_restore()
19323{
19324 dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
19325 fm->shutdown();
19326 delete fm;
19327 fm = nullptr;
19328 freelist_type = "bitmap";
19329 KeyValueDB::Transaction t = db->get_transaction();
19330 // call _open_fm() with fm_restore set to TRUE
19331 // this will mark the full device space as allocated (and not just the reserved space)
19332 _open_fm(t, true, true);
19333 if (fm == nullptr) {
19334 derr << "Failed _open_fm()" << dendl;
19335 return -1;
19336 }
19337 db->submit_transaction_sync(t);
19338 ceph_assert(!fm->is_null_manager());
19339 dout(5) << "fm was reactivated in full mode" << dendl;
19340 return 0;
19341}
19342
19343
19344//---------------------------------------------------------
19345// create a temp allocator filled with allocation state from the fm
19346// and compare it to the base allocator passed in
19347int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
19348{
19349 dout(5) << "verify that alloc content is identical to FM" << dendl;
19350 // initialize from freelist
19351 Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
19352 if (temp_allocator == nullptr) {
19353 return -1;
19354 }
19355
19356 uint64_t insert_count = 0;
19357 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19358 insert_count++;
19359 };
19360 temp_allocator->dump(count_entries);
19361 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19362 int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);
19363
19364 delete temp_allocator;
19365
19366 if (ret == 0) {
19367 dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
19368 return 0;
19369 } else {
19370 derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
19371 return -1;
19372 }
19373}
19374
19375//---------------------------------------------------------
19376int BlueStore::db_cleanup(int ret)
19377{
19378 _shutdown_cache();
19379 _close_db_and_around();
19380 return ret;
19381}
19382
19383//---------------------------------------------------------
19384// convert back the system from null-allocator to using rocksdb to store allocation
19385int BlueStore::push_allocation_to_rocksdb()
19386{
19387 if (cct->_conf->bluestore_allocation_from_file) {
19388 derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
 19389 derr << "please set bluestore_allocation_from_file to false in the ceph.conf file" << dendl;
19390 return -1;
19391 }
19392
19393 dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
19394 int ret = _open_db_and_around(false);
19395 if (ret < 0) {
19396 return ret;
19397 }
19398
19399 if (!fm->is_null_manager()) {
19400 derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
19401 return db_cleanup(0);
19402 }
19403
19404 // start by creating a clone copy of the shared-allocator
19405 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
19406 if (!allocator) {
19407 return db_cleanup(-1);
19408 }
19409
19410 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19411 clear_allocation_objects_from_rocksdb(db, cct, path);
19412
 19413 // then open fm in the new mode with the full device marked as allocated
19414 if (reset_fm_for_restore() != 0) {
19415 return db_cleanup(-1);
19416 }
19417
19418 // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
19419 copy_allocator_content_to_fm(allocator.get(), fm);
19420
19421 // compare the allocator info with the info stored in the fm/rocksdb
19422 if (verify_rocksdb_allocations(allocator.get()) == 0) {
19423 // all is good -> we can commit to rocksdb allocator
19424 commit_to_real_manager();
19425 } else {
19426 return db_cleanup(-1);
19427 }
19428
19429 // can't be too paranoid :-)
19430 dout(5) << "Running full scale verification..." << dendl;
19431 // close db/fm/allocator and start fresh
19432 db_cleanup(0);
19433 dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
19434 ret = _open_db_and_around(true);
19435 if (ret < 0) {
19436 return db_cleanup(ret);
19437 }
19438 ceph_assert(!fm->is_null_manager());
19439 ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
19440
19441 return db_cleanup(ret);
19442}
19443
19444#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19445
19446//-------------------------------------------------------------------------------------
19447static int commit_freelist_type(KeyValueDB *db, const std::string& freelist_type, CephContext *cct, const std::string &path)
19448{
 19449 // When freelist_type is "bitmap" we store allocation info in RocksDB
19450 // When allocation-info is stored in a single file we set freelist_type to "null"
19451 // This will direct the startup code to read allocation from file and not RocksDB
19452 KeyValueDB::Transaction t = db->get_transaction();
19453 if (t == nullptr) {
19454 derr << "db->get_transaction() failed!!!" << dendl;
19455 return -1;
19456 }
19457
19458 bufferlist bl;
19459 bl.append(freelist_type);
19460 t->set(PREFIX_SUPER, "freelist_type", bl);
19461
19462 int ret = db->submit_transaction_sync(t);
19463 if (ret != 0) {
19464 derr << "Failed db->submit_transaction_sync(t)" << dendl;
19465 }
19466 return ret;
19467}
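
//-------------------------------------------------------------------------------------
// Startup reads the same key back to decide where allocation state lives (a sketch
// mirroring the comment above; the actual read-side code is elsewhere in this file):
#if 0
bufferlist bl;
db->get(PREFIX_SUPER, "freelist_type", &bl);
// "null"   -> allocation info is read from the flat BlueFS file (restore_allocator)
// "bitmap" -> allocation info is read from RocksDB via the FreelistManager
#endif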
19468
19469//-------------------------------------------------------------------------------------
19470int BlueStore::commit_to_null_manager()
19471{
19472 dout(5) << "Set FreelistManager to NULL FM..." << dendl;
19473 fm->set_null_manager();
19474 freelist_type = "null";
19475#if 1
19476 return commit_freelist_type(db, freelist_type, cct, path);
19477#else
 19478 // should check how long this step takes on a big configuration as deletes are expensive
19479 if (commit_freelist_type(db, freelist_type, cct, path) == 0) {
19480 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19481 clear_allocation_objects_from_rocksdb(db, cct, path);
19482 }
19483#endif
19484}
19485
19486
19487//-------------------------------------------------------------------------------------
19488int BlueStore::commit_to_real_manager()
19489{
19490 dout(5) << "Set FreelistManager to Real FM..." << dendl;
19491 ceph_assert(!fm->is_null_manager());
19492 freelist_type = "bitmap";
19493 int ret = commit_freelist_type(db, freelist_type, cct, path);
19494 if (ret == 0) {
19495 //remove the allocation_file
19496 invalidate_allocation_file_on_bluefs();
19497 ret = bluefs->unlink(allocator_dir, allocator_file);
19498 bluefs->sync_metadata(false);
19499 if (ret == 0) {
 19500 dout(5) << "Removed allocation file successfully" << dendl;
19501 }
19502 else {
 19503 derr << "Failed to remove allocation file, ret_code=" << ret << dendl;
19504 }
19505 }
19506
19507 return ret;
19508}
19509
19510//================================================================================================================
19511//================================================================================================================