// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <algorithm>

#include <boost/container/flat_set.hpp>
#include <boost/algorithm/string.hpp>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "simple_bitmap.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/url_escape.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"
#include "kv/KeyValueHistogram.h"

#ifdef HAVE_LIBZBD
#include "ZonedAllocator.h"
#include "ZonedFreelistManager.h"
#endif

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);

using std::byte;
using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::less;
using std::list;
using std::make_unique;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 SB id -> shared_blob_t

#ifdef HAVE_LIBZBD
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
#endif

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8 // has spanning blob id
#define BLOBID_SHIFT_BITS      4
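
// For example (illustrative): a raw encoded value of 0x35 decodes to
// blobid 0x35 >> BLOBID_SHIFT_BITS == 3 with low bits 0x5, i.e.
// CONTIGUOUS|SAMELENGTH: the extent starts where the previous one ended
// and reuses its length, so neither field needs to be stored explicitly.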

/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'
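
// For example (illustrative): an object named "foo" with an empty
// namespace, no locator key, snap 1 and generation 0 encodes as
//   <shard><pool+2^63><hash> "!" "foo!" "=" <snap=1> <gen=0> 'o'
// with the fixed-width integer fields written big-endian so that
// byte-wise key comparison matches numeric ordering.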

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
 */
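// For example (illustrative): append_escaped("a#b", &k) produces "a#23b!"
// ('#' is 0x23 and anything <= '#' is escaped), while "x~y" becomes
// "x~7ey!" ('~' is 0x7e); the trailing '!' terminates the string so that
// shorter strings sort before their extensions.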
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr - buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr - buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end, bool legacy)
{
  spg_t pgid;
  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
  // use different nspaces because we use different schemes when encoding
  // keys for listing objects
  const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > MAX_HASH) {
      // make sure end hobj is even greater than the maximum possible hobj
      end->hobj.set_bitwise_key_u32(MAX_HASH);
      temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
      end->hobj.nspace = MAX_NSPACE;
    } else {
      end->hobj.set_bitwise_key_u32(end_hash);
      temp_end->hobj.set_bitwise_key_u32(end_hash);
    }
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(MAX_HASH);
    end->hobj.nspace = MAX_NSPACE;
    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
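
// For example (illustrative): for a non-pg collection (e.g. the meta
// collection) the else branch above yields a range covering pool -1 from
// hash 0x00000000 through 0xffffffff with the maximal namespace, and the
// temp range collapses to [end, end) since there is no temp section.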

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
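// i.e. 1 byte of shard, 8 bytes of pool id, and 4 bytes of bit-reversed
// hash: exactly the fields written by _key_encode_prefix() above.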

static int _get_key_object(const char *p, ghobject_t *oid)
{
  int r;

  p = _key_decode_prefix(p, oid);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;
  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;
  const char *p = key.c_str();
  return _get_key_object(p, oid);
}

template<typename S>
static void _get_object_key(const ghobject_t& oid, S *key)
{
  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();
  _get_object_key(oid, key);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << " r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
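// For example (illustrative): all shard keys of one onode share its full
// onode key as a prefix and differ only in the trailing 5 bytes, so
// <onode_key> 0x00000000 'x' sorts right after the onode key itself and
// before <onode_key> 0x00001000 'x'.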
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

#ifdef HAVE_LIBZBD
static void get_zone_offset_object_key(
  uint32_t zone,
  uint64_t offset,
  ghobject_t oid,
  std::string *key)
{
  key->clear();
  _key_encode_u32(zone, key);
  _key_encode_u64(offset, key);
  _get_object_key(oid, key);
}

static int get_key_zone_offset_object(
  const string& key,
  uint32_t *zone,
  uint64_t *offset,
  ghobject_t *oid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
    return -1;
  p = _key_decode_u32(p, zone);
  p = _key_decode_u64(p, offset);
  int r = _get_key_object(p, oid);
  if (r < 0) {
    return r;
  }
  return 0;
}
#endif

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "  csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "  0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto& [zone, offset] : o.onode.zone_offset_refs) {
    dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
                    << " offset 0x" << offset << std::dec << dendl;
  }
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in key string encoding (see a comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may have wrong order.
 *
 * This is the iterator wrapper that fixes the keys order.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset &&
              end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          // don't need to allocate a new AU for compressed data since
          // another collocated uncompressed blob already exists
          --blob_info_counted->expected_allocations;
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {
      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
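
// For example (illustrative): if a compressed blob occupies 4 allocation
// units on disk and the current write leaves it with zero referenced
// bytes, collecting it is expected to release 4 AUs; if rewriting the
// still-needed neighbouring extents is expected to consume 1 new AU, the
// net benefit counted toward the estimate is 3 AUs.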

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
      o->cache_age_bin = age_bins.front();
      *(o->cache_age_bin) += 1;
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
             << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      *(o->cache_age_bin) -= 1;
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num="
             << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    *(o->cache_age_bin) -= 1;
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    o->cache_age_bin = age_bins.front();
    *(o->cache_age_bin) += 1;
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      *(o->cache_age_bin) -= 1;
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};
TL
1207// OnodeCacheShard
1208BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
1209 CephContext* cct,
1210 string type,
1211 PerfCounters *logger)
7c673cae 1212{
9f95a23c
TL
1213 BlueStore::OnodeCacheShard *c = nullptr;
1214 // Currently we only implement an LRU cache for onodes
1215 c = new LruOnodeCacheShard(cct);
1216 c->logger = logger;
1217 return c;
7c673cae
FG
1218}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    assert(*(b->cache_age_bin) >= b->length);
    *(b->cache_age_bin) -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert(*(b->cache_age_bin) + delta >= 0);
    *(b->cache_age_bin) += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    b->cache_age_bin = age_bins.front();
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
      assert(*(b->cache_age_bin) >= b->length);
      *(b->cache_age_bin) -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
      *(b->cache_age_bin) += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
      assert(*(b->cache_age_bin) + delta >= 0);
      *(b->cache_age_bin) += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    *(b->cache_age_bin) -= b->length;
    b->cache_age_bin = age_bins.front();
    *(b->cache_age_bin) += b->length;
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        assert(*(b->cache_age_bin) >= b->length);
        *(b->cache_age_bin) -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }
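
  // For example (illustrative): with max = 1 GiB and
  // bluestore_2q_cache_kin_ratio = 0.5, kin = khot = 512 MiB.  If the hot
  // list currently holds only 128 MiB, its unused 384 MiB of budget is
  // handed to warm_in above, so warm_in is trimmed to 896 MiB instead.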

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
                                        b->flags),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
                                    b->flags),
                  0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
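
// For example (illustrative): discarding [8K, 12K) against a cached
// buffer spanning [4K, 16K) takes the "drop middle" path above: the
// buffer is truncated to [4K, 8K) and a new buffer is created for the
// surviving tail [12K, 16K).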
1760
void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

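// Complete the write with sequence number 'seq': buffers on the
// 'writing' list with a matching seq either get dropped outright
// (FLAG_NOCACHE) or become STATE_CLEAN, have their data moved to the
// cache-data mempool, and are handed to the cache shard for LRU
// accounting.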
void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

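// Move all buffered data at or beyond byte 'pos' into 'r', rebasing
// offsets so that 'pos' maps to 0 and cutting any buffer that straddles
// the boundary.  Used by Blob::split(); there must be no writes in
// flight (see the assert on 'writing' below).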
void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, bl, p->second->flags),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, right, p->second->flags),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data, p->second->flags),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length, p->second->flags),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  cache->_trim();
  return o;
}

void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
  onode_map.erase(oid);
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      cache->logger->inc(l_bluestore_onode_misses);
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << " " << p->second->nref
                            << " " << p->second->cached
                            << " " << p->second->pinned
                            << dendl;
      // This will pin the onode and implicitly touch the cache when the
      // Onode eventually becomes unpinned.
      o = p->second;
      ceph_assert(!o->cached || o->pinned);

      cache->logger->inc(l_bluestore_onode_hits);
    }
  }

  return o;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // add at new position and fix oid, key.
  // This will pin 'o' and implicitly touch the cache
  // when it eventually becomes unpinned.
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
  cache->_trim();
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second.get())) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}

// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}

// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}

// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}

void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (get_blob().is_shared()) {
    return;
  }
  if (get_blob().is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        discard = true;
      } else {
        all_invalid = false;
      }
    }
    ceph_assert(discard == all_invalid); // in a compressed blob, either all
                                         // or none of the pextents are invalid
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0,
                              get_blob().get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        dout(20) << __func__ << " 0x" << std::hex << pos
                 << "~" << e.length
                 << std::dec << dendl;
        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (get_blob().can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(get_blob().get_ondisk_length());
      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
    }
  }
}

void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // The caller has to initialize the Blob's logical length before
  // incrementing references.  Otherwise we can neither determine the
  // required number of counters for per-au tracking nor obtain the
  // min_release_size for single-counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}

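// Decide whether a write of *length0 bytes at blob offset 'b_offset' can
// reuse this blob rather than allocating a new one.  May shrink *length0
// so the blob stays within target_blob_size; e.g. (hypothetical numbers)
// with blen=0x8000, target_blob_size=0x10000, b_offset=0x8000 and
// length=0x10000: new_blen=0x18000, overflow=0x8000, so the request is
// trimmed to 0x8000 and the caller writes the remainder elsewhere.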
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently, for the sake of simplicity, we omit blob reuse if data is
  // unaligned with the csum chunk.  Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data lies entirely beyond the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }

    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}

void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif

// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}

// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}

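// Clone the logical range [srcoff, srcoff+length) of 'oldo' into 'newo'
// at 'dstoff'.  Every source blob touched is made shared so both onodes
// can reference the same physical extents, and per-extent refs are
// bumped on the shared blob; a blob already cloned during this call is
// reused via last_encoded_id instead of being duplicated again.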
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}

void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << " encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      dout(20) << __func__ << " encoding key for shard 0x" << std::hex
               << it.shard->shard_info->offset << std::dec << dendl;
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}

bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find the next unused bid.
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}

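// Re-partition the extent map over [needs_reshard_begin,
// needs_reshard_end) into shards of roughly
// bluestore_extent_map_shard_target_size encoded bytes.  Boundaries that
// would cut through a blob are avoided where possible; a blob that still
// crosses a shard boundary is either split at that boundary or promoted
// to a "spanning" blob stored with the onode itself.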
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // We may need to fault in a larger interval later: we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note
          // that we may have to split it multiple times--potentially at
          // every shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
       (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}

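// Encode the extents in [offset, offset+length) into 'bl'.  Each entry
// is a varint blobid whose low bits are flags (spanning, contiguous with
// the previous extent, zero blob_offset, same length as the previous
// extent); any field implied by a flag is omitted, and a non-spanning
// blob's body is inlined only on its first reference.  Returns true if a
// blob escapes the range, in which case a reshard has been requested and
// nothing useful was encoded.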
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only.  Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1; // so it is always non-zero
        include_blob = true;
        blobid = 0; // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}

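// Inverse of encode_some(): rebuild the extents (and, on first
// reference, their blobs) from 'bl', re-establishing blob refs as we go.
// Returns the number of extents decoded.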
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only.  Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << " getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}

void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only.  Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only.  Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only.  Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}

void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}

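// Shards are loaded lazily.  fault_range() fetches and decodes every
// not-yet-loaded shard overlapping [offset, offset+length) from the
// PREFIX_OBJ namespace so callers may touch that logical range;
// already-loaded shards just bump the shard hit counter.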
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}

void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

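// Return an iterator to the first extent that overlaps or follows
// 'offset': lower_bound, then step back one if the previous extent still
// covers 'offset'.  E.g. with extents 0x0~0x1000 and 0x2000~0x1000,
// seek_lextent(0x800) yields the first extent while seek_lextent(0x1800)
// yields the second.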
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}

int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}

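// Drop the logical range [offset, offset+length) from the map, pushing
// the dereferenced pieces onto 'old_extents' for later cleanup.  Per
// affected extent there are three cases: fully inside the hole (remove
// it), hole fully inside it (split into head and tail), or overlap at
// one end only (trim the head or the tail).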
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}

BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need a completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent putting a reused blob into
  // the old_extents list if we overwrite the blob totally.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}

BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}

// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
{
  if (bluestore_onode_t::is_pgmeta_omap(flags)) {
    return PREFIX_PGMETA_OMAP;
  }
  if (bluestore_onode_t::is_perpg_omap(flags)) {
    return PREFIX_PERPG_OMAP;
  }
  if (bluestore_onode_t::is_perpool_omap(flags)) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

3607// '-' < '.' < '~'
3608void BlueStore::Onode::calc_omap_header(
3609 uint8_t flags,
3610 const Onode* o,
3611 std::string* out)
3612{
3613 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3614 if (bluestore_onode_t::is_perpg_omap(flags)) {
3615 _key_encode_u64(o->c->pool(), out);
3616 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3617 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3618 _key_encode_u64(o->c->pool(), out);
3619 }
3620 }
3621 _key_encode_u64(o->onode.nid, out);
3622 out->push_back('-');
3623}
3624
3625void BlueStore::Onode::calc_omap_key(uint8_t flags,
3626 const Onode* o,
3627 const std::string& key,
3628 std::string* out)
3629{
3630 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3631 if (bluestore_onode_t::is_perpg_omap(flags)) {
3632 _key_encode_u64(o->c->pool(), out);
3633 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3634 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3635 _key_encode_u64(o->c->pool(), out);
3636 }
3637 }
3638 _key_encode_u64(o->onode.nid, out);
3639 out->push_back('.');
3640 out->append(key);
3641}
3642
3643void BlueStore::Onode::calc_omap_tail(
3644 uint8_t flags,
3645 const Onode* o,
3646 std::string* out)
3647{
3648 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3649 if (bluestore_onode_t::is_perpg_omap(flags)) {
3650 _key_encode_u64(o->c->pool(), out);
3651 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3652 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3653 _key_encode_u64(o->c->pool(), out);
3654 }
3655 }
3656 _key_encode_u64(o->onode.nid, out);
3657 out->push_back('~');
3658}
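// Editor's sketch (not BlueStore code): a standalone check of the ASCII
// ordering the calc_omap_* helpers above rely on. The prefix below is a
// hypothetical stand-in for the encoded (pool/hash/)nid bytes; the point is
// that for one onode, header < every user key < tail, so the half-open range
// [header, tail) scans exactly that onode's omap rows.
#include <cassert>
#include <string>

static void omap_bounds_sketch()
{
  const std::string prefix = "AAAAAAAA";          // fake _key_encode_u64(nid)
  const std::string head = prefix + '-';          // calc_omap_header
  const std::string row  = prefix + '.' + "user"; // calc_omap_key("user")
  const std::string tail = prefix + '~';          // calc_omap_tail
  assert(head < row && row < tail);               // '-' < '.' < '~' in ASCII
}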
3659
f6b5b4d7 3660void BlueStore::Onode::get() {
adb31ebb
TL
3661 if (++nref >= 2 && !pinned) {
3662 OnodeCacheShard* ocs = c->get_onode_cache();
f67539c2
TL
3663 ocs->lock.lock();
3664 // It is possible that, while we waited for the lock, split_cache moved us to a different OnodeCacheShard.
3665 while (ocs != c->get_onode_cache()) {
3666 ocs->lock.unlock();
3667 ocs = c->get_onode_cache();
3668 ocs->lock.lock();
3669 }
adb31ebb
TL
3670 bool was_pinned = pinned;
3671 pinned = nref >= 2;
adb31ebb 3672 bool r = !was_pinned && pinned;
adb31ebb
TL
3673 if (cached && r) {
3674 ocs->_pin(this);
3675 }
f67539c2 3676 ocs->lock.unlock();
f6b5b4d7
TL
3677 }
3678}
3679void BlueStore::Onode::put() {
20effc67 3680 ++put_nref;
adb31ebb 3681 int n = --nref;
20effc67 3682 if (n == 1) {
adb31ebb 3683 OnodeCacheShard* ocs = c->get_onode_cache();
f67539c2
TL
3684 ocs->lock.lock();
3685 // It is possible that, while we waited for the lock, split_cache moved us to a different OnodeCacheShard.
3686 while (ocs != c->get_onode_cache()) {
3687 ocs->lock.unlock();
3688 ocs = c->get_onode_cache();
3689 ocs->lock.lock();
3690 }
adb31ebb 3691 bool need_unpin = pinned;
20effc67 3692 pinned = pinned && nref >= 2;
adb31ebb
TL
3693 need_unpin = need_unpin && !pinned;
3694 if (cached && need_unpin) {
3695 if (exists) {
3696 ocs->_unpin(this);
3697 } else {
3698 ocs->_unpin_and_rm(this);
20effc67 3699 // remove will also decrement nref
adb31ebb
TL
3700 c->onode_map._remove(oid);
3701 }
3702 }
f67539c2 3703 ocs->lock.unlock();
f6b5b4d7 3704 }
20effc67
TL
3705 auto pn = --put_nref;
3706 if (nref == 0 && pn == 0) {
f6b5b4d7
TL
3707 delete this;
3708 }
3709}
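// Editor's sketch (not BlueStore code): the relock pattern shared by get()
// and put() above, with a hypothetical accessor in place of
// c->get_onode_cache(). Locking a shard can race with split_cache() moving
// the onode to another shard, so after acquiring the lock we re-read the
// current shard and chase it until the two agree.
template <typename GetShard>
static auto lock_stable_shard(GetShard get_shard)
{
  auto* s = get_shard();
  s->lock.lock();
  while (s != get_shard()) {  // moved while we waited for the lock
    s->lock.unlock();
    s = get_shard();
    s->lock.lock();
  }
  return s;                   // locked; caller is responsible for unlock
}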
3710
eafe8130
TL
3711BlueStore::Onode* BlueStore::Onode::decode(
3712 CollectionRef c,
3713 const ghobject_t& oid,
3714 const string& key,
3715 const bufferlist& v)
3716{
3717 Onode* on = new Onode(c.get(), oid, key);
3718 on->exists = true;
3719 auto p = v.front().begin_deep();
3720 on->onode.decode(p);
3721 for (auto& i : on->onode.attrs) {
f91f0fd5 3722 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
eafe8130
TL
3723 }
3724
3725 // initialize extent_map
3726 on->extent_map.decode_spanning_blobs(p);
3727 if (on->onode.extent_map_shards.empty()) {
3728 denc(on->extent_map.inline_bl, p);
3729 on->extent_map.decode_some(on->extent_map.inline_bl);
3730 on->extent_map.inline_bl.reassign_to_mempool(
f91f0fd5 3731 mempool::mempool_bluestore_cache_data);
eafe8130
TL
3732 }
3733 else {
3734 on->extent_map.init_shards(false, false);
3735 }
3736 return on;
3737}
3738
7c673cae
FG
3739void BlueStore::Onode::flush()
3740{
3741 if (flushing_count.load()) {
3742 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3743 waiting_count++;
11fdf7f2 3744 std::unique_lock l(flush_lock);
7c673cae
FG
3745 while (flushing_count.load()) {
3746 flush_cond.wait(l);
3747 }
9f95a23c 3748 waiting_count--;
7c673cae
FG
3749 }
3750 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3751}
3752
9f95a23c
TL
3753void BlueStore::Onode::dump(Formatter* f) const
3754{
3755 onode.dump(f);
3756 extent_map.dump(f);
3757}
3758
9f95a23c
TL
3759void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3760{
f67539c2
TL
3761 if (!onode.is_pgmeta_omap()) {
3762 if (onode.is_perpg_omap()) {
3763 _key_encode_u64(c->pool(), out);
3764 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3765 } else if (onode.is_perpool_omap()) {
3766 _key_encode_u64(c->pool(), out);
3767 }
9f95a23c
TL
3768 }
3769 _key_encode_u64(onode.nid, out);
3770 out->append(old.c_str() + out->length(), old.size() - out->length());
3771}
3772
9f95a23c
TL
3773void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3774{
f67539c2
TL
3775 size_t pos = sizeof(uint64_t) + 1;
3776 if (!onode.is_pgmeta_omap()) {
3777 if (onode.is_perpg_omap()) {
3778 pos += sizeof(uint64_t) + sizeof(uint32_t);
3779 } else if (onode.is_perpool_omap()) {
3780 pos += sizeof(uint64_t);
3781 }
9f95a23c 3782 }
f67539c2 3783 *user_key = key.substr(pos);
9f95a23c
TL
3784}
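// Editor's note: the fixed-size spans skipped above, assuming keys built by
// the calc_omap_* helpers (sizes in bytes):
//   default/pgmeta: [nid u64]['.'][user key]                 -> pos = 8 + 1
//   per-pool:       [pool u64][nid u64]['.'][user key]       -> pos = 16 + 1
//   per-pg:         [pool u64][hash u32][nid u64]['.'][user] -> pos = 20 + 1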
3785
7c673cae
FG
3786// =======================================================
3787// WriteContext
3788
3789/// Checks for writes to the same pextent within a blob
3790bool BlueStore::WriteContext::has_conflict(
3791 BlobRef b,
3792 uint64_t loffs,
3793 uint64_t loffs_end,
3794 uint64_t min_alloc_size)
3795{
11fdf7f2
TL
3796 ceph_assert((loffs % min_alloc_size) == 0);
3797 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3798 for (auto w : writes) {
3799 if (b == w.b) {
11fdf7f2
TL
3800 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3801 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3802 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3803 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3804 return true;
3805 }
3806 }
3807 }
3808 return false;
3809}
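// Editor's note: a hypothetical worked example of the alignment above with
// min_alloc_size = 0x10000. A queued write at logical_offset 0x12345 with
// length0 0x100 occupies the aligned range
//   loffs2     = p2align(0x12345, 0x10000)           = 0x10000
//   loffs2_end = p2roundup(0x12345 + 0x100, 0x10000) = 0x20000
// so another write on the same blob touching [0x10000, 0x20000) conflicts.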
3810
3811// =======================================================
3812
3813// DeferredBatch
3814#undef dout_prefix
3815#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
9f95a23c
TL
3816#undef dout_context
3817#define dout_context cct
7c673cae
FG
3818
3819void BlueStore::DeferredBatch::prepare_write(
3820 CephContext *cct,
3821 uint64_t seq, uint64_t offset, uint64_t length,
3822 bufferlist::const_iterator& blp)
3823{
3824 _discard(cct, offset, length);
3825 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3826 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3827 i.first->second.seq = seq;
3828 blp.copy(length, i.first->second.bl);
31f18b77
FG
3829 i.first->second.bl.reassign_to_mempool(
3830 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3831 dout(20) << __func__ << " seq " << seq
3832 << " 0x" << std::hex << offset << "~" << length
3833 << " crc " << i.first->second.bl.crc32c(-1)
3834 << std::dec << dendl;
3835 seq_bytes[seq] += length;
3836#ifdef DEBUG_DEFERRED
3837 _audit(cct);
3838#endif
3839}
3840
3841void BlueStore::DeferredBatch::_discard(
3842 CephContext *cct, uint64_t offset, uint64_t length)
3843{
3844 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3845 << std::dec << dendl;
3846 auto p = iomap.lower_bound(offset);
3847 if (p != iomap.begin()) {
3848 --p;
3849 auto end = p->first + p->second.bl.length();
3850 if (end > offset) {
3851 bufferlist head;
3852 head.substr_of(p->second.bl, 0, offset - p->first);
3853 dout(20) << __func__ << " keep head " << p->second.seq
3854 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3855 << " -> 0x" << head.length() << std::dec << dendl;
3856 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3857 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3858 if (end > offset + length) {
3859 bufferlist tail;
3860 tail.substr_of(p->second.bl, offset + length - p->first,
3861 end - (offset + length));
3862 dout(20) << __func__ << " keep tail " << p->second.seq
3863 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3864 << " -> 0x" << tail.length() << std::dec << dendl;
3865 auto &n = iomap[offset + length];
3866 n.bl.swap(tail);
3867 n.seq = p->second.seq;
3868 i->second -= length;
3869 } else {
3870 i->second -= end - offset;
3871 }
11fdf7f2 3872 ceph_assert(i->second >= 0);
7c673cae
FG
3873 p->second.bl.swap(head);
3874 }
3875 ++p;
3876 }
3877 while (p != iomap.end()) {
3878 if (p->first >= offset + length) {
3879 break;
3880 }
3881 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3882 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3883 auto end = p->first + p->second.bl.length();
3884 if (end > offset + length) {
3885 unsigned drop_front = offset + length - p->first;
3886 unsigned keep_tail = end - (offset + length);
3887 dout(20) << __func__ << " truncate front " << p->second.seq
3888 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3889 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3890 << " to 0x" << (offset + length) << "~" << keep_tail
3891 << std::dec << dendl;
3892 auto &s = iomap[offset + length];
3893 s.seq = p->second.seq;
3894 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3895 i->second -= drop_front;
3896 } else {
3897 dout(20) << __func__ << " drop " << p->second.seq
3898 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3899 << std::dec << dendl;
3900 i->second -= p->second.bl.length();
3901 }
11fdf7f2 3902 ceph_assert(i->second >= 0);
7c673cae
FG
3903 p = iomap.erase(p);
3904 }
3905}
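// Editor's note: a hypothetical worked example of the trimming above. Given
// an iomap entry at 0x1000 holding 0x3000 bytes (so it ends at 0x4000) and a
// discard of [0x2000, 0x2800):
//   head: substr_of(bl, 0, 0x1000)      -> 0x1000 bytes stay keyed at 0x1000
//   tail: substr_of(bl, 0x1800, 0x1800) -> 0x1800 bytes re-keyed at 0x2800
// and seq_bytes for that entry's seq drops by the 0x800 discarded bytes.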
3906
3907void BlueStore::DeferredBatch::_audit(CephContext *cct)
3908{
3909 map<uint64_t,int> sb;
3910 for (auto p : seq_bytes) {
3911 sb[p.first] = 0; // make sure we have the same set of keys
3912 }
3913 uint64_t pos = 0;
3914 for (auto& p : iomap) {
11fdf7f2 3915 ceph_assert(p.first >= pos);
7c673cae
FG
3916 sb[p.second.seq] += p.second.bl.length();
3917 pos = p.first + p.second.bl.length();
3918 }
11fdf7f2 3919 ceph_assert(sb == seq_bytes);
7c673cae
FG
3920}
3921
3922
3923// Collection
3924
3925#undef dout_prefix
3926#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3927
9f95a23c
TL
3928BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3929 : CollectionImpl(store_->cct, cid),
11fdf7f2 3930 store(store_),
9f95a23c 3931 cache(bc),
7c673cae 3932 exists(true),
9f95a23c 3933 onode_map(oc),
11fdf7f2
TL
3934 commit_queue(nullptr)
3935{
3936}
3937
3938bool BlueStore::Collection::flush_commit(Context *c)
3939{
3940 return osr->flush_commit(c);
3941}
3942
3943void BlueStore::Collection::flush()
3944{
3945 osr->flush();
3946}
3947
3948void BlueStore::Collection::flush_all_but_last()
7c673cae 3949{
11fdf7f2 3950 osr->flush_all_but_last();
7c673cae
FG
3951}
3952
3953void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3954{
11fdf7f2 3955 ceph_assert(!b->shared_blob);
7c673cae
FG
3956 const bluestore_blob_t& blob = b->get_blob();
3957 if (!blob.is_shared()) {
3958 b->shared_blob = new SharedBlob(this);
3959 return;
3960 }
3961
3962 b->shared_blob = shared_blob_set.lookup(sbid);
3963 if (b->shared_blob) {
3964 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3965 << std::dec << " had " << *b->shared_blob << dendl;
3966 } else {
3967 b->shared_blob = new SharedBlob(sbid, this);
3968 shared_blob_set.add(this, b->shared_blob.get());
3969 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3970 << std::dec << " opened " << *b->shared_blob
3971 << dendl;
3972 }
3973}
3974
3975void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3976{
3977 if (!sb->is_loaded()) {
3978
3979 bufferlist v;
3980 string key;
3981 auto sbid = sb->get_sbid();
3982 get_shared_blob_key(sbid, &key);
3983 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3984 if (r < 0) {
3985 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3986 << std::dec << " not found at key "
3987 << pretty_binary_string(key) << dendl;
11fdf7f2 3988 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3989 }
3990
3991 sb->loaded = true;
3992 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3993 auto p = v.cbegin();
3994 decode(*(sb->persistent), p);
7c673cae
FG
3995 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3996 << std::dec << " loaded shared_blob " << *sb << dendl;
3997 }
3998}
3999
4000void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
4001{
7c673cae 4002 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 4003 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
4004
4005 // update blob
31f18b77 4006 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 4007 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
4008
4009 // update shared blob
4010 b->shared_blob->loaded = true;
4011 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
4012 shared_blob_set.add(this, b->shared_blob.get());
4013 for (auto p : blob.get_extents()) {
4014 if (p.is_valid()) {
4015 b->shared_blob->get_ref(
4016 p.offset,
4017 p.length);
4018 }
4019 }
4020 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
4021}
4022
31f18b77
FG
4023uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
4024{
4025 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 4026 ceph_assert(sb->is_loaded());
31f18b77
FG
4027
4028 uint64_t sbid = sb->get_sbid();
4029 shared_blob_set.remove(sb);
4030 sb->loaded = false;
4031 delete sb->persistent;
4032 sb->sbid_unloaded = 0;
4033 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
4034 return sbid;
4035}
4036
7c673cae
FG
4037BlueStore::OnodeRef BlueStore::Collection::get_onode(
4038 const ghobject_t& oid,
9f95a23c
TL
4039 bool create,
4040 bool is_createop)
7c673cae 4041{
9f95a23c 4042 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
7c673cae
FG
4043
4044 spg_t pgid;
4045 if (cid.is_pg(&pgid)) {
4046 if (!oid.match(cnode.bits, pgid.ps())) {
4047 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
4048 << pgid << " bits " << cnode.bits << dendl;
4049 ceph_abort();
4050 }
4051 }
4052
4053 OnodeRef o = onode_map.lookup(oid);
4054 if (o)
4055 return o;
4056
eafe8130 4057 string key;
7c673cae
FG
4058 get_object_key(store->cct, oid, &key);
4059
4060 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
4061 << pretty_binary_string(key) << dendl;
4062
4063 bufferlist v;
9f95a23c 4064 int r = -ENOENT;
7c673cae 4065 Onode *on;
9f95a23c
TL
4066 if (!is_createop) {
4067 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
4068 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4069 }
7c673cae 4070 if (v.length() == 0) {
11fdf7f2 4071 ceph_assert(r == -ENOENT);
f67539c2 4072 if (!create)
7c673cae
FG
4073 return OnodeRef();
4074
4075 // new object, new onode
4076 on = new Onode(this, oid, key);
4077 } else {
4078 // loaded
11fdf7f2 4079 ceph_assert(r >= 0);
eafe8130 4080 on = Onode::decode(this, oid, key, v);
7c673cae
FG
4081 }
4082 o.reset(on);
4083 return onode_map.add(oid, o);
4084}
4085
4086void BlueStore::Collection::split_cache(
4087 Collection *dest)
4088{
4089 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4090
f67539c2
TL
4091 auto *ocache = get_onode_cache();
4092 auto *ocache_dest = dest->get_onode_cache();
4093
4094 // lock cache shards
4095 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4096 std::lock_guard l(ocache->lock, std::adopt_lock);
4097 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4098 std::lock_guard l3(cache->lock, std::adopt_lock);
4099 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
7c673cae
FG
4100
4101 int destbits = dest->cnode.bits;
4102 spg_t destpg;
4103 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 4104 ceph_assert(is_pg);
7c673cae
FG
4105
4106 auto p = onode_map.onode_map.begin();
4107 while (p != onode_map.onode_map.end()) {
11fdf7f2 4108 OnodeRef o = p->second;
7c673cae
FG
4109 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4110 // onode does not belong to this child
11fdf7f2
TL
4111 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4112 << dendl;
7c673cae
FG
4113 ++p;
4114 } else {
7c673cae
FG
4115 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4116 << dendl;
4117
f6b5b4d7
TL
4118 // ensure that nref is always >= 2, and hence the onode stays pinned and
4119 // physically out of the cache during the transition
4120 OnodeRef o_pin = o;
4121 ceph_assert(o->pinned);
4122
7c673cae 4123 p = onode_map.onode_map.erase(p);
7c673cae 4124 dest->onode_map.onode_map[o->oid] = o;
adb31ebb 4125 if (o->cached) {
f6b5b4d7 4126 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
9f95a23c 4127 }
f6b5b4d7 4128 o->c = dest;
7c673cae
FG
4129
4130 // Move over shared blobs and buffers. Cover shared blobs from
4131 // both the extent map and the spanning blob map (the full extent
4132 // map may not be faulted in).
4133 vector<SharedBlob*> sbvec;
4134 for (auto& e : o->extent_map.extent_map) {
4135 sbvec.push_back(e.blob->shared_blob.get());
4136 }
4137 for (auto& b : o->extent_map.spanning_blob_map) {
4138 sbvec.push_back(b.second->shared_blob.get());
4139 }
4140 for (auto sb : sbvec) {
4141 if (sb->coll == dest) {
4142 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4143 << dendl;
4144 continue;
4145 }
4146 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
4147 if (sb->get_sbid()) {
4148 ldout(store->cct, 20) << __func__
4149 << " moving registration " << *sb << dendl;
4150 shared_blob_set.remove(sb);
4151 dest->shared_blob_set.add(dest, sb);
4152 }
3efd9988 4153 sb->coll = dest;
7c673cae 4154 if (dest->cache != cache) {
7c673cae
FG
4155 for (auto& i : sb->bc.buffer_map) {
4156 if (!i.second->is_writing()) {
4157 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4158 << dendl;
9f95a23c 4159 dest->cache->_move(cache, i.second.get());
7c673cae
FG
4160 }
4161 }
4162 }
4163 }
7c673cae
FG
4164 }
4165 }
9f95a23c 4166 dest->cache->_trim();
7c673cae
FG
4167}
4168
7c673cae
FG
4169// =======================================================
4170
91327a77
AA
4171// MempoolThread
4172
4173#undef dout_prefix
4174#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
9f95a23c
TL
4175#undef dout_context
4176#define dout_context store->cct
91327a77 4177
7c673cae
FG
4178void *BlueStore::MempoolThread::entry()
4179{
9f95a23c 4180 std::unique_lock l{lock};
11fdf7f2 4181
92f5a8d4 4182 uint32_t prev_config_change = store->config_changed.load();
eafe8130
TL
4183 uint64_t base = store->osd_memory_base;
4184 double fragmentation = store->osd_memory_expected_fragmentation;
4185 uint64_t target = store->osd_memory_target;
4186 uint64_t min = store->osd_memory_cache_min;
4187 uint64_t max = min;
4188
4189 // When setting the maximum amount of memory to use for cache, first
4190 // assume some base amount of memory for the OSD and then fudge in
4191 // some overhead for fragmentation that scales with cache usage.
4192 uint64_t ltarget = (1.0 - fragmentation) * target;
4193 if (ltarget > base + min) {
4194 max = ltarget - base;
11fdf7f2 4195 }
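  // Editor's note: a hypothetical worked example of the sizing above with
  // osd_memory_target = 4096 MiB, expected fragmentation = 0.15,
  // osd_memory_base = 768 MiB, and osd_memory_cache_min = 128 MiB:
  //   ltarget = 0.85 * 4096 MiB ~= 3481 MiB > 768 + 128 MiB,
  //   so max = 3481 - 768 = 2713 MiB is available for caches.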
31f18b77 4196
eafe8130 4197 binned_kv_cache = store->db->get_priority_cache();
f67539c2 4198 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
eafe8130
TL
4199 if (store->cache_autotune && binned_kv_cache != nullptr) {
4200 pcm = std::make_shared<PriorityCache::Manager>(
f67539c2 4201 store->cct, min, max, target, true, "bluestore-pricache");
eafe8130
TL
4202 pcm->insert("kv", binned_kv_cache, true);
4203 pcm->insert("meta", meta_cache, true);
4204 pcm->insert("data", data_cache, true);
f67539c2
TL
4205 if (binned_kv_onode_cache != nullptr) {
4206 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4207 }
eafe8130 4208 }
91327a77
AA
4209
4210 utime_t next_balance = ceph_clock_now();
4211 utime_t next_resize = ceph_clock_now();
20effc67 4212 utime_t next_bin_rotation = ceph_clock_now();
9f95a23c
TL
4213 utime_t next_deferred_force_submit = ceph_clock_now();
4214 utime_t alloc_stats_dump_clock = ceph_clock_now();
31f18b77 4215
91327a77 4216 bool interval_stats_trim = false;
91327a77 4217 while (!stop) {
92f5a8d4
TL
4218 // Update pcm cache settings if related configuration was changed
4219 uint32_t cur_config_change = store->config_changed.load();
4220 if (cur_config_change != prev_config_change) {
4221 _update_cache_settings();
4222 prev_config_change = cur_config_change;
4223 }
4224
20effc67
TL
4225 // define various intervals for background work
4226 double age_bin_interval = store->cache_age_bin_interval;
91327a77
AA
4227 double autotune_interval = store->cache_autotune_interval;
4228 double resize_interval = store->osd_memory_cache_resize_interval;
9f95a23c 4229 double max_defer_interval = store->max_defer_interval;
9f95a23c
TL
4230 double alloc_stats_dump_interval =
4231 store->cct->_conf->bluestore_alloc_stats_dump_interval;
91327a77 4232
20effc67 4233 // alloc stats dump
9f95a23c
TL
4234 if (alloc_stats_dump_interval > 0 &&
4235 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4236 store->_record_allocation_stats();
4237 alloc_stats_dump_clock = ceph_clock_now();
4238 }
20effc67
TL
4239 // cache age binning
4240 if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
4241 if (binned_kv_cache != nullptr) {
4242 binned_kv_cache->import_bins(store->kv_bins);
4243 }
4244 if (binned_kv_onode_cache != nullptr) {
4245 binned_kv_onode_cache->import_bins(store->kv_onode_bins);
4246 }
4247 meta_cache->import_bins(store->meta_bins);
4248 data_cache->import_bins(store->data_bins);
4249
4250 if (pcm != nullptr) {
4251 pcm->shift_bins();
4252 }
4253 next_bin_rotation = ceph_clock_now();
4254 next_bin_rotation += age_bin_interval;
4255 }
4256 // cache balancing
91327a77 4257 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
20effc67
TL
4258 if (binned_kv_cache != nullptr) {
4259 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4260 }
4261 if (binned_kv_onode_cache != nullptr) {
4262 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4263 }
4264 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4265 data_cache->set_cache_ratio(store->cache_data_ratio);
11fdf7f2 4266
91327a77 4267 // Log at debug level 5 instead of 20 when a balance happens.
91327a77 4268 interval_stats_trim = true;
eafe8130
TL
4269
4270 if (pcm != nullptr) {
4271 pcm->balance();
91327a77 4272 }
31f18b77 4273
91327a77
AA
4274 next_balance = ceph_clock_now();
4275 next_balance += autotune_interval;
4276 }
20effc67 4277 // memory resizing (i.e., autotuning)
91327a77 4278 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
eafe8130
TL
4279 if (ceph_using_tcmalloc() && pcm != nullptr) {
4280 pcm->tune_memory();
91327a77
AA
4281 }
4282 next_resize = ceph_clock_now();
4283 next_resize += resize_interval;
31f18b77 4284 }
20effc67 4285 // deferred force submit
9f95a23c
TL
4286 if (max_defer_interval > 0 &&
4287 next_deferred_force_submit < ceph_clock_now()) {
4288 if (store->get_deferred_last_submitted() + max_defer_interval <
4289 ceph_clock_now()) {
4290 store->deferred_try_submit();
4291 }
4292 next_deferred_force_submit = ceph_clock_now();
4293 next_deferred_force_submit += max_defer_interval/3;
4294 }
4295
4296 // Now resize the shards
4297 _resize_shards(interval_stats_trim);
91327a77 4298 interval_stats_trim = false;
31f18b77 4299
91327a77 4300 store->_update_cache_logger();
11fdf7f2
TL
4301 auto wait = ceph::make_timespan(
4302 store->cct->_conf->bluestore_cache_trim_interval);
4303 cond.wait_for(l, wait);
7c673cae 4304 }
9f95a23c
TL
4305 // do final dump
4306 store->_record_allocation_stats();
7c673cae 4307 stop = false;
f67539c2 4308 pcm = nullptr;
7c673cae
FG
4309 return NULL;
4310}
4311
9f95a23c 4312void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
91327a77 4313{
9f95a23c
TL
4314 size_t onode_shards = store->onode_cache_shards.size();
4315 size_t buffer_shards = store->buffer_cache_shards.size();
91327a77 4316 int64_t kv_used = store->db->get_cache_usage();
f67539c2 4317 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
11fdf7f2
TL
4318 int64_t meta_used = meta_cache->_get_used_bytes();
4319 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
4320
4321 uint64_t cache_size = store->cache_size;
4322 int64_t kv_alloc =
20effc67 4323 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
f67539c2
TL
4324 int64_t kv_onode_alloc =
4325 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
91327a77 4326 int64_t meta_alloc =
11fdf7f2 4327 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 4328 int64_t data_alloc =
11fdf7f2 4329 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 4330
eafe8130
TL
4331 if (pcm != nullptr && binned_kv_cache != nullptr) {
4332 cache_size = pcm->get_tuned_mem();
11fdf7f2
TL
4333 kv_alloc = binned_kv_cache->get_committed_size();
4334 meta_alloc = meta_cache->get_committed_size();
4335 data_alloc = data_cache->get_committed_size();
f67539c2
TL
4336 if (binned_kv_onode_cache != nullptr) {
4337 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4338 }
91327a77
AA
4339 }
4340
4341 if (interval_stats) {
9f95a23c 4342 dout(5) << __func__ << " cache_size: " << cache_size
91327a77
AA
4343 << " kv_alloc: " << kv_alloc
4344 << " kv_used: " << kv_used
f67539c2
TL
4345 << " kv_onode_alloc: " << kv_onode_alloc
4346 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4347 << " meta_alloc: " << meta_alloc
4348 << " meta_used: " << meta_used
4349 << " data_alloc: " << data_alloc
4350 << " data_used: " << data_used << dendl;
4351 } else {
9f95a23c 4352 dout(20) << __func__ << " cache_size: " << cache_size
91327a77
AA
4353 << " kv_alloc: " << kv_alloc
4354 << " kv_used: " << kv_used
f67539c2
TL
4355 << " kv_onode_alloc: " << kv_onode_alloc
4356 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4357 << " meta_alloc: " << meta_alloc
4358 << " meta_used: " << meta_used
4359 << " data_alloc: " << data_alloc
4360 << " data_used: " << data_used << dendl;
4361 }
4362
4363 uint64_t max_shard_onodes = static_cast<uint64_t>(
9f95a23c
TL
4364 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4365 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
91327a77 4366
9f95a23c 4367 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
91327a77
AA
4368 << " max_shard_buffer: " << max_shard_buffer << dendl;
4369
9f95a23c
TL
4370 for (auto i : store->onode_cache_shards) {
4371 i->set_max(max_shard_onodes);
4372 }
4373 for (auto i : store->buffer_cache_shards) {
4374 i->set_max(max_shard_buffer);
91327a77
AA
4375 }
4376}
4377
92f5a8d4
TL
4378void BlueStore::MempoolThread::_update_cache_settings()
4379{
4380 // Nothing to do if pcm is not used.
4381 if (pcm == nullptr) {
4382 return;
4383 }
4384
92f5a8d4
TL
4385 uint64_t target = store->osd_memory_target;
4386 uint64_t base = store->osd_memory_base;
4387 uint64_t min = store->osd_memory_cache_min;
4388 uint64_t max = min;
4389 double fragmentation = store->osd_memory_expected_fragmentation;
4390
4391 uint64_t ltarget = (1.0 - fragmentation) * target;
4392 if (ltarget > base + min) {
4393 max = ltarget - base;
4394 }
4395
4396 // set pcm cache levels
4397 pcm->set_target_memory(target);
4398 pcm->set_min_memory(min);
4399 pcm->set_max_memory(max);
4400
9f95a23c 4401 dout(5) << __func__ << " updated pcm target: " << target
92f5a8d4
TL
4402 << " pcm min: " << min
4403 << " pcm max: " << max
4404 << dendl;
4405}
4406
7c673cae
FG
4407// =======================================================
4408
31f18b77
FG
4409// OmapIteratorImpl
4410
4411#undef dout_prefix
4412#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4413
4414BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4415 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4416 : c(c), o(o), it(it)
4417{
9f95a23c 4418 std::shared_lock l(c->lock);
31f18b77 4419 if (o->onode.has_omap()) {
9f95a23c
TL
4420 o->get_omap_key(string(), &head);
4421 o->get_omap_tail(&tail);
31f18b77
FG
4422 it->lower_bound(head);
4423 }
4424}
4425
11fdf7f2
TL
4426string BlueStore::OmapIteratorImpl::_stringify() const
4427{
4428 stringstream s;
4429 s << " omap_iterator(cid = " << c->cid
4430 <<", oid = " << o->oid << ")";
4431 return s.str();
4432}
4433
31f18b77
FG
4434int BlueStore::OmapIteratorImpl::seek_to_first()
4435{
9f95a23c 4436 std::shared_lock l(c->lock);
11fdf7f2 4437 auto start1 = mono_clock::now();
31f18b77
FG
4438 if (o->onode.has_omap()) {
4439 it->lower_bound(head);
4440 } else {
4441 it = KeyValueDB::Iterator();
4442 }
494da23a
TL
4443 c->store->log_latency(
4444 __func__,
11fdf7f2
TL
4445 l_bluestore_omap_seek_to_first_lat,
4446 mono_clock::now() - start1,
494da23a 4447 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2 4448
31f18b77
FG
4449 return 0;
4450}
4451
4452int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4453{
9f95a23c 4454 std::shared_lock l(c->lock);
11fdf7f2 4455 auto start1 = mono_clock::now();
31f18b77
FG
4456 if (o->onode.has_omap()) {
4457 string key;
9f95a23c 4458 o->get_omap_key(after, &key);
31f18b77
FG
4459 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4460 << pretty_binary_string(key) << dendl;
4461 it->upper_bound(key);
4462 } else {
4463 it = KeyValueDB::Iterator();
4464 }
11fdf7f2 4465 c->store->log_latency_fn(
494da23a 4466 __func__,
11fdf7f2
TL
4467 l_bluestore_omap_upper_bound_lat,
4468 mono_clock::now() - start1,
494da23a 4469 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4470 [&] (const ceph::timespan& lat) {
494da23a 4471 return ", after = " + after +
11fdf7f2
TL
4472 _stringify();
4473 }
4474 );
31f18b77
FG
4475 return 0;
4476}
4477
4478int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4479{
9f95a23c 4480 std::shared_lock l(c->lock);
11fdf7f2 4481 auto start1 = mono_clock::now();
31f18b77
FG
4482 if (o->onode.has_omap()) {
4483 string key;
9f95a23c 4484 o->get_omap_key(to, &key);
31f18b77
FG
4485 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4486 << pretty_binary_string(key) << dendl;
4487 it->lower_bound(key);
4488 } else {
4489 it = KeyValueDB::Iterator();
4490 }
11fdf7f2 4491 c->store->log_latency_fn(
494da23a 4492 __func__,
11fdf7f2
TL
4493 l_bluestore_omap_lower_bound_lat,
4494 mono_clock::now() - start1,
494da23a 4495 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4496 [&] (const ceph::timespan& lat) {
494da23a 4497 return ", to = " + to +
11fdf7f2
TL
4498 _stringify();
4499 }
4500 );
31f18b77
FG
4501 return 0;
4502}
4503
4504bool BlueStore::OmapIteratorImpl::valid()
4505{
9f95a23c 4506 std::shared_lock l(c->lock);
31f18b77 4507 bool r = o->onode.has_omap() && it && it->valid() &&
494da23a 4508 it->raw_key().second < tail;
31f18b77
FG
4509 if (it && it->valid()) {
4510 ldout(c->store->cct,20) << __func__ << " is at "
4511 << pretty_binary_string(it->raw_key().second)
4512 << dendl;
4513 }
4514 return r;
4515}
4516
11fdf7f2 4517int BlueStore::OmapIteratorImpl::next()
31f18b77 4518{
11fdf7f2 4519 int r = -1;
9f95a23c 4520 std::shared_lock l(c->lock);
11fdf7f2 4521 auto start1 = mono_clock::now();
31f18b77
FG
4522 if (o->onode.has_omap()) {
4523 it->next();
11fdf7f2 4524 r = 0;
31f18b77 4525 }
494da23a
TL
4526 c->store->log_latency(
4527 __func__,
11fdf7f2
TL
4528 l_bluestore_omap_next_lat,
4529 mono_clock::now() - start1,
494da23a 4530 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2
TL
4531
4532 return r;
31f18b77
FG
4533}
4534
4535string BlueStore::OmapIteratorImpl::key()
4536{
9f95a23c 4537 std::shared_lock l(c->lock);
11fdf7f2 4538 ceph_assert(it->valid());
31f18b77
FG
4539 string db_key = it->raw_key().second;
4540 string user_key;
9f95a23c 4541 o->decode_omap_key(db_key, &user_key);
494da23a 4542
31f18b77
FG
4543 return user_key;
4544}
4545
4546bufferlist BlueStore::OmapIteratorImpl::value()
4547{
9f95a23c 4548 std::shared_lock l(c->lock);
11fdf7f2 4549 ceph_assert(it->valid());
31f18b77
FG
4550 return it->value();
4551}
4552
4553
4554// =====================================
4555
7c673cae
FG
4556#undef dout_prefix
4557#define dout_prefix *_dout << "bluestore(" << path << ") "
9f95a23c
TL
4558#undef dout_context
4559#define dout_context cct
7c673cae
FG
4560
4561
4562static void aio_cb(void *priv, void *priv2)
4563{
4564 BlueStore *store = static_cast<BlueStore*>(priv);
4565 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4566 c->aio_finish(store);
4567}
4568
11fdf7f2
TL
4569static void discard_cb(void *priv, void *priv2)
4570{
4571 BlueStore *store = static_cast<BlueStore*>(priv);
4572 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4573 store->handle_discard(*tmp);
4574}
4575
4576void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4577{
4578 dout(10) << __func__ << dendl;
20effc67
TL
4579 ceph_assert(alloc);
4580 alloc->release(to_release);
11fdf7f2
TL
4581}
4582
7c673cae 4583BlueStore::BlueStore(CephContext *cct, const string& path)
9f95a23c 4584 : BlueStore(cct, path, 0) {}
7c673cae
FG
4585
4586BlueStore::BlueStore(CephContext *cct,
4587 const string& path,
4588 uint64_t _min_alloc_size)
4589 : ObjectStore(cct, path),
9f95a23c 4590 throttle(cct),
11fdf7f2 4591 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4592 kv_sync_thread(this),
31f18b77 4593 kv_finalize_thread(this),
20effc67 4594#ifdef HAVE_LIBZBD
f67539c2 4595 zoned_cleaner_thread(this),
20effc67 4596#endif
7c673cae
FG
4597 min_alloc_size(_min_alloc_size),
4598 min_alloc_size_order(ctz(_min_alloc_size)),
4599 mempool_thread(this)
4600{
4601 _init_logger();
11fdf7f2 4602 cct->_conf.add_observer(this);
7c673cae 4603 set_cache_shards(1);
7c673cae
FG
4604}
4605
4606BlueStore::~BlueStore()
4607{
11fdf7f2 4608 cct->_conf.remove_observer(this);
7c673cae 4609 _shutdown_logger();
11fdf7f2
TL
4610 ceph_assert(!mounted);
4611 ceph_assert(db == NULL);
4612 ceph_assert(bluefs == NULL);
4613 ceph_assert(fsid_fd < 0);
4614 ceph_assert(path_fd < 0);
9f95a23c
TL
4615 for (auto i : onode_cache_shards) {
4616 delete i;
4617 }
4618 for (auto i : buffer_cache_shards) {
7c673cae
FG
4619 delete i;
4620 }
9f95a23c
TL
4621 onode_cache_shards.clear();
4622 buffer_cache_shards.clear();
7c673cae
FG
4623}
4624
4625const char **BlueStore::get_tracked_conf_keys() const
4626{
4627 static const char* KEYS[] = {
4628 "bluestore_csum_type",
4629 "bluestore_compression_mode",
4630 "bluestore_compression_algorithm",
4631 "bluestore_compression_min_blob_size",
4632 "bluestore_compression_min_blob_size_ssd",
4633 "bluestore_compression_min_blob_size_hdd",
4634 "bluestore_compression_max_blob_size",
4635 "bluestore_compression_max_blob_size_ssd",
4636 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4637 "bluestore_compression_required_ratio",
7c673cae
FG
4638 "bluestore_max_alloc_size",
4639 "bluestore_prefer_deferred_size",
181888fb
FG
4640 "bluestore_prefer_deferred_size_hdd",
4641 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4642 "bluestore_deferred_batch_ops",
4643 "bluestore_deferred_batch_ops_hdd",
4644 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4645 "bluestore_throttle_bytes",
4646 "bluestore_throttle_deferred_bytes",
4647 "bluestore_throttle_cost_per_io_hdd",
4648 "bluestore_throttle_cost_per_io_ssd",
4649 "bluestore_throttle_cost_per_io",
4650 "bluestore_max_blob_size",
4651 "bluestore_max_blob_size_ssd",
4652 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4653 "osd_memory_target",
4654 "osd_memory_target_cgroup_limit_ratio",
4655 "osd_memory_base",
4656 "osd_memory_cache_min",
92f5a8d4 4657 "osd_memory_expected_fragmentation",
11fdf7f2
TL
4658 "bluestore_cache_autotune",
4659 "bluestore_cache_autotune_interval",
20effc67
TL
4660 "bluestore_cache_age_bin_interval",
4661 "bluestore_cache_kv_age_bins",
4662 "bluestore_cache_kv_onode_age_bins",
4663 "bluestore_cache_meta_age_bins",
4664 "bluestore_cache_data_age_bins",
81eedcae 4665 "bluestore_warn_on_legacy_statfs",
9f95a23c 4666 "bluestore_warn_on_no_per_pool_omap",
20effc67 4667 "bluestore_warn_on_no_per_pg_omap",
9f95a23c 4668 "bluestore_max_defer_interval",
7c673cae
FG
4669 NULL
4670 };
4671 return KEYS;
4672}
4673
11fdf7f2 4674void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4675 const std::set<std::string> &changed)
4676{
eafe8130 4677 if (changed.count("bluestore_warn_on_legacy_statfs")) {
81eedcae
TL
4678 _check_legacy_statfs_alert();
4679 }
f67539c2
TL
4680 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4681 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4682 _check_no_per_pg_or_pool_omap_alert();
9f95a23c 4683 }
81eedcae 4684
7c673cae
FG
4685 if (changed.count("bluestore_csum_type")) {
4686 _set_csum();
4687 }
4688 if (changed.count("bluestore_compression_mode") ||
4689 changed.count("bluestore_compression_algorithm") ||
4690 changed.count("bluestore_compression_min_blob_size") ||
4691 changed.count("bluestore_compression_max_blob_size")) {
4692 if (bdev) {
4693 _set_compression();
4694 }
4695 }
4696 if (changed.count("bluestore_max_blob_size") ||
4697 changed.count("bluestore_max_blob_size_ssd") ||
4698 changed.count("bluestore_max_blob_size_hdd")) {
4699 if (bdev) {
4700 // only after startup
4701 _set_blob_size();
4702 }
4703 }
4704 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4705 changed.count("bluestore_prefer_deferred_size_hdd") ||
4706 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4707 changed.count("bluestore_max_alloc_size") ||
4708 changed.count("bluestore_deferred_batch_ops") ||
4709 changed.count("bluestore_deferred_batch_ops_hdd") ||
4710 changed.count("bluestore_deferred_batch_ops_ssd")) {
4711 if (bdev) {
4712 // only after startup
4713 _set_alloc_sizes();
4714 }
4715 }
4716 if (changed.count("bluestore_throttle_cost_per_io") ||
4717 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4718 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4719 if (bdev) {
4720 _set_throttle_params();
4721 }
4722 }
9f95a23c
TL
4723 if (changed.count("bluestore_throttle_bytes") ||
4724 changed.count("bluestore_throttle_deferred_bytes") ||
4725 changed.count("bluestore_throttle_trace_rate")) {
4726 throttle.reset_throttle(conf);
7c673cae 4727 }
9f95a23c
TL
4728 if (changed.count("bluestore_max_defer_interval")) {
4729 if (bdev) {
4730 _set_max_defer_interval();
4731 }
7c673cae 4732 }
92f5a8d4
TL
4733 if (changed.count("osd_memory_target") ||
4734 changed.count("osd_memory_base") ||
4735 changed.count("osd_memory_cache_min") ||
4736 changed.count("osd_memory_expected_fragmentation")) {
4737 _update_osd_memory_options();
4738 }
7c673cae
FG
4739}
4740
4741void BlueStore::_set_compression()
4742{
224ce89b
WB
4743 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4744 if (m) {
11fdf7f2 4745 _clear_compression_alert();
224ce89b
WB
4746 comp_mode = *m;
4747 } else {
4748 derr << __func__ << " unrecognized value '"
4749 << cct->_conf->bluestore_compression_mode
4750 << "' for bluestore_compression_mode, reverting to 'none'"
4751 << dendl;
4752 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4753 string s("unknown mode: ");
4754 s += cct->_conf->bluestore_compression_mode;
4755 _set_compression_alert(true, s.c_str());
224ce89b
WB
4756 }
4757
4758 compressor = nullptr;
4759
3efd9988
FG
4760 if (cct->_conf->bluestore_compression_min_blob_size) {
4761 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4762 } else {
11fdf7f2 4763 ceph_assert(bdev);
9f95a23c 4764 if (_use_rotational_settings()) {
7c673cae
FG
4765 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4766 } else {
4767 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4768 }
4769 }
4770
4771 if (cct->_conf->bluestore_compression_max_blob_size) {
4772 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4773 } else {
11fdf7f2 4774 ceph_assert(bdev);
9f95a23c 4775 if (_use_rotational_settings()) {
7c673cae
FG
4776 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4777 } else {
4778 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4779 }
4780 }
4781
7c673cae
FG
4782 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4783 if (!alg_name.empty()) {
4784 compressor = Compressor::create(cct, alg_name);
4785 if (!compressor) {
4786 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4787 << dendl;
11fdf7f2 4788 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4789 }
4790 }
4791
4792 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4793 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4794 << " min_blob " << comp_min_blob_size
4795 << " max_blob " << comp_max_blob_size
7c673cae
FG
4796 << dendl;
4797}
4798
4799void BlueStore::_set_csum()
4800{
4801 csum_type = Checksummer::CSUM_NONE;
4802 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4803 if (t > Checksummer::CSUM_NONE)
4804 csum_type = t;
4805
4806 dout(10) << __func__ << " csum_type "
4807 << Checksummer::get_csum_type_string(csum_type)
4808 << dendl;
4809}
4810
4811void BlueStore::_set_throttle_params()
4812{
4813 if (cct->_conf->bluestore_throttle_cost_per_io) {
4814 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4815 } else {
11fdf7f2 4816 ceph_assert(bdev);
9f95a23c 4817 if (_use_rotational_settings()) {
7c673cae
FG
4818 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4819 } else {
4820 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4821 }
4822 }
4823
4824 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4825 << dendl;
4826}
4827void BlueStore::_set_blob_size()
4828{
4829 if (cct->_conf->bluestore_max_blob_size) {
4830 max_blob_size = cct->_conf->bluestore_max_blob_size;
4831 } else {
11fdf7f2 4832 ceph_assert(bdev);
9f95a23c 4833 if (_use_rotational_settings()) {
7c673cae
FG
4834 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4835 } else {
4836 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4837 }
4838 }
4839 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4840 << std::dec << dendl;
4841}
4842
92f5a8d4
TL
4843void BlueStore::_update_osd_memory_options()
4844{
4845 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4846 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4847 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4848 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4849 config_changed++;
4850 dout(10) << __func__
4851 << " osd_memory_target " << osd_memory_target
4852 << " osd_memory_base " << osd_memory_base
4853 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4854 << " osd_memory_cache_min " << osd_memory_cache_min
4855 << dendl;
4856}
4857
11fdf7f2 4858int BlueStore::_set_cache_sizes()
1adf2230 4859{
11fdf7f2
TL
4860 ceph_assert(bdev);
4861 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4862 cache_autotune_interval =
11fdf7f2 4863 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
20effc67
TL
4864 cache_age_bin_interval =
4865 cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
4866 auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
4867 {
4868 std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
4869 std::istringstream interval_stream(intervals_str);
4870 std::copy(
4871 std::istream_iterator<uint64_t>(interval_stream),
4872 std::istream_iterator<uint64_t>(),
4873 std::back_inserter(*intervals));
4874 };
4875 _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
4876 _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
4877 _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
4878 _set_bin("bluestore_cache_age_bins_data", &data_bins);
4879
11fdf7f2
TL
4880 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4881 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4882 osd_memory_expected_fragmentation =
11fdf7f2
TL
4883 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4884 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4885 osd_memory_cache_resize_interval =
11fdf7f2 4886 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4887
224ce89b
WB
4888 if (cct->_conf->bluestore_cache_size) {
4889 cache_size = cct->_conf->bluestore_cache_size;
4890 } else {
4891 // choose global cache size based on backend type
9f95a23c 4892 if (_use_rotational_settings()) {
224ce89b
WB
4893 cache_size = cct->_conf->bluestore_cache_size_hdd;
4894 } else {
4895 cache_size = cct->_conf->bluestore_cache_size_ssd;
4896 }
4897 }
31f18b77 4898
f67539c2 4899 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
224ce89b 4900 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4901 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4902 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4903 return -EINVAL;
4904 }
91327a77 4905
f67539c2 4906 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
224ce89b 4907 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4908 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4909 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4910 return -EINVAL;
4911 }
91327a77 4912
f67539c2
TL
4913 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4914 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4915 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4916 << ") must be in range [0,1.0]" << dendl;
4917 return -EINVAL;
4918 }
4919
31f18b77 4920 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4921 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4922 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4923 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4924 << dendl;
31f18b77
FG
4925 return -EINVAL;
4926 }
91327a77 4927
f67539c2
TL
4928 cache_data_ratio = (double)1.0 -
4929 (double)cache_meta_ratio -
4930 (double)cache_kv_ratio -
4931 (double)cache_kv_onode_ratio;
31f18b77
FG
4932 if (cache_data_ratio < 0) {
4933 // deal with floating point imprecision
4934 cache_data_ratio = 0;
4935 }
91327a77 4936
224ce89b
WB
4937 dout(1) << __func__ << " cache_size " << cache_size
4938 << " meta " << cache_meta_ratio
31f18b77
FG
4939 << " kv " << cache_kv_ratio
4940 << " data " << cache_data_ratio
4941 << dendl;
4942 return 0;
4943}
4944
3efd9988
FG
4945int BlueStore::write_meta(const std::string& key, const std::string& value)
4946{
4947 bluestore_bdev_label_t label;
4948 string p = path + "/block";
4949 int r = _read_bdev_label(cct, p, &label);
4950 if (r < 0) {
4951 return ObjectStore::write_meta(key, value);
4952 }
4953 label.meta[key] = value;
4954 r = _write_bdev_label(cct, p, label);
11fdf7f2 4955 ceph_assert(r == 0);
3efd9988
FG
4956 return ObjectStore::write_meta(key, value);
4957}
4958
4959int BlueStore::read_meta(const std::string& key, std::string *value)
4960{
4961 bluestore_bdev_label_t label;
4962 string p = path + "/block";
4963 int r = _read_bdev_label(cct, p, &label);
4964 if (r < 0) {
4965 return ObjectStore::read_meta(key, value);
4966 }
4967 auto i = label.meta.find(key);
4968 if (i == label.meta.end()) {
4969 return ObjectStore::read_meta(key, value);
4970 }
4971 *value = i->second;
4972 return 0;
4973}
4974
7c673cae
FG
4975void BlueStore::_init_logger()
4976{
4977 PerfCountersBuilder b(cct, "bluestore",
4978 l_bluestore_first, l_bluestore_last);
20effc67
TL
4979
4980 // space utilization stats
4981 //****************************************
4982 b.add_u64(l_bluestore_allocated, "allocated",
4983 "Sum for allocated bytes",
4984 "al_b",
4985 PerfCountersBuilder::PRIO_CRITICAL,
4986 unit_t(UNIT_BYTES));
4987 b.add_u64(l_bluestore_stored, "stored",
4988 "Sum for stored bytes",
4989 "st_b",
4990 PerfCountersBuilder::PRIO_CRITICAL,
4991 unit_t(UNIT_BYTES));
4992 b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
4993 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4994 b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
4995 "allocation unit size in bytes",
4996 "au_b",
4997 PerfCountersBuilder::PRIO_CRITICAL,
4998 unit_t(UNIT_BYTES));
4999 //****************************************
5000
5001 // Update op processing state latencies
5002 //****************************************
7c673cae 5003 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
20effc67
TL
5004 "Average prepare state latency",
5005 "sprl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
5006 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
5007 "Average aio_wait state latency",
20effc67 5008 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 5009 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
20effc67
TL
5010 "Average io_done state latency",
5011 "sidl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5012 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
20effc67
TL
5013 "Average kv_queued state latency",
5014 "skql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5015 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
20effc67
TL
5016 "Average kv_commiting state latency",
5017 "skcl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5018 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
20effc67
TL
5019 "Average kv_done state latency",
5020 "skdl", PerfCountersBuilder::PRIO_USEFUL);
5021 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
5022 "Average finishing state latency",
5023 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
5024 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
5025 "Average done state latency",
5026 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5027 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
20effc67
TL
5028 "Average deferred_queued state latency",
5029 "sdql", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5030 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
20effc67
TL
5031 "Average aio_wait state latency",
5032 "sdal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5033 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
20effc67
TL
5034 "Average cleanup state latency",
5035 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
5036 //****************************************
5037
5038 // Update Transaction stats
5039 //****************************************
5040 b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
7c673cae
FG
5041 "Average submit throttle latency",
5042 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5043 b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
7c673cae
FG
5044 "Average submit latency",
5045 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67 5046 b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
7c673cae
FG
5047 "Average commit latency",
5048 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
20effc67
TL
5049 b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
5050 //****************************************
5051
5052 // Read op stats
5053 //****************************************
7c673cae 5054 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
20effc67
TL
5055 "Average read onode metadata latency",
5056 "roml", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5057 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
20effc67
TL
5058 "Average read I/O waiting latency",
5059 "rwal", PerfCountersBuilder::PRIO_USEFUL);
7c673cae 5060 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
20effc67
TL
5061 "Average checksum latency",
5062 "csml", PerfCountersBuilder::PRIO_USEFUL);
5063 b.add_u64_counter(l_bluestore_read_eio, "read_eio",
5064 "Read EIO errors propagated to high level callers");
5065 b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
5066 "Read operations that required at least one retry due to failed checksum validation",
5067 "rd_r", PerfCountersBuilder::PRIO_USEFUL);
5068 b.add_time_avg(l_bluestore_read_lat, "read_lat",
5069 "Average read latency",
5070 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
5071 //****************************************
5072
5073 // kv_thread latencies
5074 //****************************************
5075 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
5076 "Average kv_thread flush latency",
5077 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
5078 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
5079 "Average kv_thread commit latency",
5080 "kcol", PerfCountersBuilder::PRIO_USEFUL);
5081 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
5082 "Average kv_sync thread latency",
5083 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
5084 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
5085 "Average kv_finalize thread latency",
5086 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
5087 //****************************************

  // write op stats
  //****************************************
  b.add_u64_counter(l_bluestore_write_big, "write_big",
    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
    "Large aligned writes into fresh blobs (bytes)",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_big_deferred,
    "write_big_deferred",
    "Big overwrites using deferred");

  b.add_u64_counter(l_bluestore_write_small, "write_small",
    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
    "Small writes into existing or sparse small blobs (bytes)",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
    "write_small_unused",
    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
    "write_small_pre_read",
    "Small writes that required us to read some data (possibly "
    "cached) to fill out the block");

  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
    "Sum for write-op padded bytes",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
    "Sum for write penalty read ops");
  b.add_u64_counter(l_bluestore_write_new, "write_new",
    "Write into new blob");

  b.add_u64_counter(l_bluestore_issued_deferred_writes,
    "issued_deferred_writes",
    "Total deferred writes issued");
  b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
    "issued_deferred_write_bytes",
    "Total bytes in issued deferred writes",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_submitted_deferred_writes,
    "submitted_deferred_writes",
    "Total deferred writes submitted to disk");
  b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
    "submitted_deferred_write_bytes",
    "Total bytes submitted to disk by deferred writes",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
    "write_big_skipped_blobs",
    "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
  b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
    "write_big_skipped_bytes",
    "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
  b.add_u64_counter(l_bluestore_write_small_skipped,
    "write_small_skipped",
    "Small writes into existing or sparse small blobs skipped due to zero detection");
  b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
    "write_small_skipped_bytes",
    "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
  //****************************************

  // compression stats
  //****************************************
  b.add_u64(l_bluestore_compressed, "compressed",
    "Sum for stored compressed bytes",
    "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
    "Sum for bytes allocated for compressed data",
    "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "compressed_original",
    "Sum for original bytes that were compressed",
    "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
    "Average compress latency",
    "_cpl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
    "Average decompress latency",
    "dcpl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
    "Sum for compress ops rejected due to low net gain of space");
  //****************************************

  // onode cache stats
  //****************************************
  b.add_u64(l_bluestore_onodes, "onodes",
    "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
    "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
    "Count of onode cache lookup hits",
    "o_ht", PerfCountersBuilder::PRIO_USEFUL);
  b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
    "Count of onode cache lookup misses",
    "o_ms", PerfCountersBuilder::PRIO_USEFUL);
  b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
    "Count of onode shard cache lookup hits");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
    "onode_shard_misses",
    "Count of onode shard cache lookup misses");
  b.add_u64(l_bluestore_extents, "onode_extents",
    "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "onode_blobs",
    "Number of blobs in cache");
  //****************************************

  // buffer cache stats
  //****************************************
  b.add_u64(l_bluestore_buffers, "buffers",
    "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
    "Number of buffer bytes in cache",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
    "Sum for bytes of read hit in the cache",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
    "Sum for bytes of read missed in the cache",
    NULL,
    PerfCountersBuilder::PRIO_DEBUGONLY,
    unit_t(UNIT_BYTES));
  //****************************************

  // internal stats
  //****************************************
  b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "blob_split",
    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
    "Sum for extents that have been merged due to garbage "
    "collection");
  //****************************************

  // other client ops latencies
  //****************************************
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
    "Average omap iterator seek_to_first call latency",
    "osfl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
    "Average omap iterator upper_bound call latency",
    "oubl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
    "Average omap iterator lower_bound call latency",
    "olbl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
    "Average omap iterator next call latency",
    "onxl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
    "Average omap get_keys call latency",
    "ogkl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
    "Average omap get_values call latency",
    "ogvl", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
    "Average omap clear call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
    "Average collection listing latency",
    "cl_l", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
    "Average removal latency",
    "rm_l", PerfCountersBuilder::PRIO_USEFUL);
  b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
    "Average truncate latency",
    "tr_l", PerfCountersBuilder::PRIO_USEFUL);
  //****************************************

  // Given (allocated) size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
    "Given size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Given size in logarithmic scale
    0,                               ///< Start at 0
    4096,                            ///< Quantization unit
    13,                              ///< Enough to cover 4+M requests
  };
  // Requested size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
    "Request size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
    0,                               ///< Start at 0
    4096,                            ///< Quantization unit
    13,                              ///< Enough to cover 4+M requests
  };
  b.add_u64_counter_histogram(
    l_bluestore_allocate_hist, "allocate_histogram",
    alloc_hist_x_axis_config, alloc_hist_y_axis_config,
    "Histogram of requested block allocations vs. given ones");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
                                 const string &path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (ceph::buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
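
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the on-disk label block
// layout produced/consumed by the two functions above -- an encoded
// payload, followed by a crc32c over that payload, zero-padded out to
// BDEV_LABEL_BLOCK_SIZE. The crc32c function pointer is a stand-in for
// ceph's bufferlist::crc32c(); exact byte layout here is an assumption.
[[maybe_unused]] static bool example_label_crc_ok(
  const std::vector<unsigned char>& block,   // one BDEV_LABEL_BLOCK_SIZE block
  size_t payload_len,                        // bytes covered by the crc
  uint32_t (*crc32c)(const unsigned char*, size_t))
{
  if (block.size() < payload_len + sizeof(uint32_t)) {
    return false;                            // truncated label
  }
  uint32_t stored;
  memcpy(&stored, block.data() + payload_len, sizeof(stored));
  // recompute over the payload only; the zero padding past the crc is ignored
  return stored == crc32c(block.data(), payload_len);
}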

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

#ifdef HAVE_LIBZBD
  ceph_assert(bdev);
  if (bdev->is_smr()) {
    prefer_deferred_size = 0;
  } else
#endif
  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << (int)min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}
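
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the rough role of the
// prefer_deferred_size threshold chosen above. Writes smaller than the
// threshold are candidates for the deferred (journaled) path; larger
// writes go straight to their final location. The real decision in the
// write path involves more state (alignment, blob reuse, SMR); this is
// only the gist.
[[maybe_unused]] static bool example_prefer_deferred(
  uint64_t write_len, uint64_t prefer_deferred_size)
{
  // prefer_deferred_size == 0 (e.g. SMR above) disables the deferred path
  return prefer_deferred_size > 0 && write_len < prefer_deferred_size;
}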

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  // get block dev optimal io size
  optimal_io_size = bdev->get_optimal_io_size();

  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}
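
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the block_size /
// block_mask / block_size_order relationship set up in _open_bdev().
// For a power-of-two block size, masking rounds an offset down to a
// block boundary, and the count-of-trailing-zeros gives the shift for a
// cheap divide (ceph's ctz() is equivalent to the builtin used here).
[[maybe_unused]] static void example_block_math()
{
  const uint64_t block_size = 4096;              // must be a power of two
  const uint64_t block_mask = ~(block_size - 1); // ...1111000000000000
  const int order = __builtin_ctzll(block_size); // 12, i.e. log2(4096)

  uint64_t off = 10000;
  uint64_t aligned = off & block_mask;           // 8192: round down to a block
  uint64_t block_no = off >> order;              // 2: off / block_size
  (void)aligned; (void)block_no;
}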

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  uint64_t dev_size = bdev->get_size();
  ceph_assert(dev_size > _get_ondisk_reserved());
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_restore)
{
  int r;

  dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
  ceph_assert(fm == NULL);
  // fm_restore means we are transitioning from null-fm to bitmap-fm
  ceph_assert(!fm_restore || (freelist_type != "null"));
  // fm restore must pass in a valid transaction
  ceph_assert(!fm_restore || (t != nullptr));

  // When allocation-info is stored in a single file we set freelist_type to "null"
  bool set_null_freemap = false;
  if (freelist_type == "null") {
    // use BitmapFreelistManager with the null option to stop allocations from going to RocksDB
    // we will store the allocation info in a single file during umount()
    freelist_type = "bitmap";
    set_null_freemap = true;
  }
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (set_null_freemap) {
    fm->set_null_manager();
  }
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);

    uint64_t alloc_size = min_alloc_size;
#ifdef HAVE_LIBZBD
    if (bdev->is_smr()) {
      if (freelist_type != "zoned") {
        derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
             << dendl;
        return -EINVAL;
      }
    } else
#endif
    if (freelist_type == "zoned") {
      derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
           << dendl;
      return -EINVAL;
    }

    fm->create(bdev->get_size(), alloc_size,
               zone_size, first_sequential_zone,
               t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs doing that itself.
    auto reserved = _get_ondisk_reserved();
    if (fm_restore) {
      // we need to allocate the full space in the restore case,
      // as later we will add back the free space marked in the allocator file
      fm->allocate(0, bdev->get_size(), t);
    } else {
      fm->allocate(0, reserved, t);
    }
    // debug code - not needed for NULL FM
    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = p2align(l, min_alloc_size);
        }
        ceph_assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = p2roundup(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = p2align(u, min_alloc_size);
          stop = true;
        }
        ceph_assert(start + l + u <= end);

        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    r = _write_out_fm_meta(0);
    ceph_assert(r == 0);
  } else {
    r = fm->init(db, read_only,
      [&](const std::string& key, std::string* result) {
        return read_meta(key, result);
      });
    if (r < 0) {
      derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  // if the space size tracked by the freelist manager is higher than the
  // actual device size, we can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << "fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
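
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the pre-fragmentation
// math used by bluestore_debug_prefill above. For a target fill
// fraction f, r = f / (1 - f); pairing every free run of length l with
// an allocated run of u = r * l yields u / (l + u) = r / (1 + r) = f,
// i.e. the device ends up f-full with alternating free/used extents.
[[maybe_unused]] static double example_prefill_ratio(double f, uint64_t l)
{
  double r = f / (1.0 - f);        // e.g. f = 0.5  ->  r = 1.0
  double u = r * (double)l;        // used bytes paired with l free bytes
  return u / ((double)l + u);      // == f, the requested fill fraction
}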

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_write_out_fm_meta(uint64_t target_size)
{
  int r = 0;
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  for (auto& m : fm_meta) {
    r = write_meta(m.first, m.second);
    ceph_assert(r == 0);
  }
  return r;
}

int BlueStore::_create_alloc()
{
  ceph_assert(alloc == NULL);
  ceph_assert(shared_alloc.a == NULL);
  ceph_assert(bdev->get_size());

  uint64_t alloc_size = min_alloc_size;

  std::string allocator_type = cct->_conf->bluestore_allocator;

#ifdef HAVE_LIBZBD
  if (freelist_type == "zoned") {
    allocator_type = "zoned";
  }
#endif

  alloc = Allocator::create(
    cct, allocator_type,
    bdev->get_size(),
    alloc_size,
    zone_size,
    first_sequential_zone,
    "block");
  if (!alloc) {
    lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
               << dendl;
    return -EINVAL;
  }

#ifdef HAVE_LIBZBD
  if (freelist_type == "zoned") {
    Allocator *a = Allocator::create(
      cct, cct->_conf->bluestore_allocator,
      bdev->get_conventional_region_size(),
      alloc_size,
      0, 0,
      "zoned_block");
    if (!a) {
      lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
                 << " allocator" << dendl;
      delete alloc;
      return -EINVAL;
    }
    shared_alloc.set(a);
  } else
#endif
  {
    // BlueFS will share the same allocator
    shared_alloc.set(alloc);
  }

  return 0;
}

int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
{
  int r = _create_alloc();
  if (r < 0) {
    return r;
  }
  ceph_assert(alloc != NULL);

#ifdef HAVE_LIBZBD
  if (bdev->is_smr()) {
    auto a = dynamic_cast<ZonedAllocator*>(alloc);
    ceph_assert(a);
    auto f = dynamic_cast<ZonedFreelistManager*>(fm);
    ceph_assert(f);
    vector<uint64_t> wp = bdev->get_zones();
    vector<zone_state_t> zones = f->get_zone_states(db);
    ceph_assert(wp.size() == zones.size());

    // reconcile zone state
    auto num_zones = bdev->get_size() / zone_size;
    for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
      ceph_assert(wp[i] >= i * zone_size);
      ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
      uint64_t p = wp[i] - i * zone_size;
      if (zones[i].write_pointer > p) {
        derr << __func__ << " zone 0x" << std::hex << i
             << " bluestore write pointer 0x" << zones[i].write_pointer
             << " > device write pointer 0x" << p
             << std::dec << " -- VERY SUSPICIOUS!" << dendl;
      } else if (zones[i].write_pointer < p) {
        // this is "normal" in that it can happen after any crash (if we have a
        // write in flight but did not manage to commit the transaction)
        auto delta = p - zones[i].write_pointer;
        dout(1) << __func__ << " zone 0x" << std::hex << i
                << " device write pointer 0x" << p
                << " > bluestore pointer 0x" << zones[i].write_pointer
                << ", advancing 0x" << delta << std::dec << dendl;
        (*zone_adjustments)[zones[i].write_pointer] = delta;
        zones[i].num_dead_bytes += delta;
        zones[i].write_pointer = p;
      }
    }

    // start with conventional zone "free" (bluefs may adjust this when it starts up)
    auto reserved = _get_ondisk_reserved();
    // for now we require a conventional zone
    ceph_assert(bdev->get_conventional_region_size());
    ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
    shared_alloc.a->init_add_free(
      reserved,
      p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);

    // init sequential zone based on the device's write pointers
    a->init_from_zone_pointers(std::move(zones));
    dout(1) << __func__
            << " loaded zone pointers: "
            << std::hex
            << ", allocator type " << alloc->get_type()
            << ", capacity 0x" << alloc->get_capacity()
            << ", block size 0x" << alloc->get_block_size()
            << ", free 0x" << alloc->get_free()
            << ", fragmentation " << alloc->get_fragmentation()
            << std::dec << dendl;

    return 0;
  }
#endif

  uint64_t num = 0, bytes = 0;
  utime_t start_time = ceph_clock_now();
  if (!fm->is_null_manager()) {
    // This is the original path - loading the allocation map from RocksDB and feeding it into the allocator
    dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
    // initialize from freelist
    fm->enumerate_reset();
    uint64_t offset, length;
    while (fm->enumerate_next(db, &offset, &length)) {
      alloc->init_add_free(offset, length);
      ++num;
      bytes += length;
    }
    fm->enumerate_reset();

    utime_t duration = ceph_clock_now() - start_time;
    dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
      alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
  } else {
    // This is the new path reading the allocation map from a flat bluefs file and feeding it into the allocator

    if (!cct->_conf->bluestore_allocation_from_file) {
      derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
      derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
      return -ENOTSUP; // Operation not supported
    }

    if (restore_allocator(alloc, &num, &bytes) == 0) {
      dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
    } else {
      // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
      dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
      // if it failed we must recover from on-disk ONode internal state
      if (read_allocation_from_drive_on_startup() != 0) {
        derr << __func__ << "::NCB::Failed Recovery" << dendl;
        derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
        derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
        return -ENOTRECOVERABLE;
      }
    }
  }
  dout(1) << __func__
          << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
          << std::hex
          << ", allocator type " << alloc->get_type()
          << ", capacity 0x" << alloc->get_capacity()
          << ", block size 0x" << alloc->get_block_size()
          << ", free 0x" << alloc->get_free()
          << ", fragmentation " << alloc->get_fragmentation()
          << std::dec << dendl;

  return 0;
}
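
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the freelist-to-allocator
// hand-off done in _init_alloc(). The freelist manager enumerates free
// (offset, length) extents and the allocator ingests them; a std::map
// keyed by offset stands in for the real Allocator here.
[[maybe_unused]] static void example_feed_allocator(
  const std::vector<std::pair<uint64_t, uint64_t>>& free_extents,
  std::map<uint64_t, uint64_t>& allocator)   // offset -> length
{
  uint64_t total = 0;
  for (auto& [offset, length] : free_extents) {
    allocator[offset] = length;              // like init_add_free(offset, length)
    total += length;
  }
  (void)total;                               // reported as "free 0x..." above
}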

void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
{
#ifdef HAVE_LIBZBD
  assert(bdev->is_smr());
  dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
  auto f = dynamic_cast<ZonedFreelistManager*>(fm);
  ceph_assert(f);
  KeyValueDB::Transaction t = db->get_transaction();
  for (auto& i : zone_adjustments) {
    // allocate AND release since this gap is now dead space
    // note that the offset is imprecise, but we only need to select the zone
    f->allocate(i.first, i.second, t);
    f->release(i.first, i.second, t);
  }
  int r = db->submit_transaction_sync(t);
  ceph_assert(r == 0);
#endif
}

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(alloc);
  alloc->shutdown();
  delete alloc;

  ceph_assert(shared_alloc.a);
  if (alloc != shared_alloc.a) {
    shared_alloc.a->shutdown();
    delete shared_alloc.a;
  }

  shared_alloc.reset();
  alloc = nullptr;
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}

int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
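
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the fcntl(F_SETLK)
// advisory-lock pattern used by _lock_fsid(). With l_whence = SEEK_SET
// and l_start = l_len = 0 the write lock covers the whole file; it is
// dropped automatically when the process exits or closes the fd, which
// is what makes it a good "is another daemon running?" probe.
[[maybe_unused]] static bool example_try_whole_file_lock(int fd)
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;       // exclusive write lock
  l.l_whence = SEEK_SET;    // offsets relative to file start
  // l_start = 0 and l_len = 0 -> lock the entire file
  return ::fcntl(fd, F_SETLK, &l) == 0;   // false: another process holds it
}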

bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist). only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}

int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard,
      SUPER_RESERVED);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_DB),
        "bluefs db", create);
      if (r < 0) {
        derr << __func__
             << " check block device(" << bfn << ") label returned: "
             << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
                               0, // no need to provide valid 'reserved' for shared dev
                               &shared_alloc);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
         << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
                                 create && cct->_conf->bdev_enable_discard,
                                 BDEV_LABEL_BLOCK_SIZE);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL),
        "bluefs wal", create);
      if (r < 0) {
        derr << __func__ << " check block device(" << bfn
             << ") label returned: " << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }

    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}

int BlueStore::_open_bluefs(bool create, bool read_only)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  BlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;
    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
          *options.rbegin() != ',') {
        options += ',';
      }
      options += options_annex;
    }

    rocksdb::Options rocks_opts;
    r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }
    if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
      vselector = new FitToFastVolumeSelector(
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
    } else {
      double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
      vselector =
        new RocksDBBlueFSVolumeSelector(
          bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
          bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
          bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
          1024 * 1024 * 1024, //FIXME: set expected l0 size here
          rocks_opts.max_bytes_for_level_base,
          rocks_opts.max_bytes_for_level_multiplier,
          reserved_factor,
          cct->_conf->bluestore_volume_selection_reserved,
          cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
    }
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}

void BlueStore::_close_bluefs()
{
  bluefs->umount(db_was_opened_read_only);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}

int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
           << dendl;
      return -EIO;
    }
  }
  return 0;
}

/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
{
  dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
           << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  // SMR devices may require a freelist adjustment, but that can only happen after
  // the db is read-write. we'll stash pending changes here.
  std::map<uint64_t, uint64_t> zone_adjustments;

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  // GBH: can probably skip the open_db step in Read-Only mode when operating in NULL-FM mode
  // (might need to open if we failed to restore from file)

  // open in read-only first to read FM list and init allocator
  // as they might be needed for some BlueFS procedures
  r = _open_db(false, false, true);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0) {
    goto out_db;
  }

  r = _open_fm(nullptr, true);
  if (r < 0)
    goto out_db;

  r = _init_alloc(&zone_adjustments);
  if (r < 0)
    goto out_fm;

  // Re-open in the proper mode(s).

  // Can't simply bypass second open for read-only mode as we need to
  // load allocated extents from bluefs into allocator.
  // And now it's time to do that
  //
  _close_db();
  r = _open_db(false, to_repair, read_only);
  if (r < 0) {
    goto out_alloc;
  }

  if (!read_only && !zone_adjustments.empty()) {
    // for SMR devices that have freelist mismatch with device write pointers
    _post_init_alloc(zone_adjustments);
  }

  // when function is called in repair mode (to_repair=true) we skip db->open()/create()
  // we can't change bluestore allocation, so there's no need to invalidate the allocation file
  if (fm->is_null_manager() && !read_only && !to_repair) {
    // Now that we loaded the allocation map we need to invalidate the file, as new allocations won't be reflected.
    // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
    // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort
    // to recovery from RocksDB::ONodes
    r = invalidate_allocation_file_on_bluefs();
    if (r != 0) {
      derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
      goto out_alloc;
    }
  }

  // when function is called in repair mode (to_repair=true) we skip db->open()/create()
  if (!read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
#ifdef HAVE_LIBZBD
      && !bdev->is_smr()
#endif
    ) {
    dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
    commit_to_null_manager();
    need_to_destage_allocation_file = true;
    dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
  }

  return 0;

out_alloc:
  _close_alloc();
out_fm:
  _close_fm();
out_db:
  _close_db();
out_bdev:
  _close_bdev();
out_fsid:
  _close_fsid();
out_path:
  _close_path();
  return r;
}

void BlueStore::_close_db_and_around()
{
  if (db) {
    _close_db();
  }
  if (bluefs) {
    _close_bluefs();
  }
  _close_fm();
  _close_alloc();
  _close_bdev();
  _close_fsid();
  _close_path();
}

int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
{
  _kv_only = true;
  int r = _open_db_and_around(false, to_repair);
  if (r == 0) {
    *pdb = db;
  } else {
    *pdb = nullptr;
  }
  return r;
}

int BlueStore::close_db_environment()
{
  _close_db_and_around();
  return 0;
}

/* gets access to bluefs supporting RocksDB */
BlueFS* BlueStore::get_bluefs() {
  return bluefs;
}

int BlueStore::_prepare_db_environment(bool create, bool read_only,
                                       std::string* _fn, std::string* _kv_backend)
{
  int r;
  ceph_assert(!db);
  std::string& fn = *_fn;
  std::string& kv_backend = *_kv_backend;
  fn = path + "/db";
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create, read_only);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
        string cmd = "rm -rf " + path + "/db " +
          path + "/db.slow " +
          path + "/db.wal";
        int r = system(cmd.c_str());
        (void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    {
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
        if (!first) {
          db_paths << " ";
        }
        first = false;
        db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
        env->CreateDir(p.first);
      }
      // Selectors don't provide a wal path so far, hence create it explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
        kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << walfn
             << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
        kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
                          kv_backend,
                          fn,
                          kv_options,
                          static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs();
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db, freelist_type);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);
  return 0;
}

int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!(create && read_only));
  string options;
  string options_annex;
  stringstream err;
  string kv_dir_fn;
  string kv_backend;
  std::string sharding_def;
  // prevent write attempts to BlueFS in case we failed before BlueFS was opened
  db_was_opened_read_only = true;
  r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
  if (r < 0) {
    derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
    return -EIO;
  }
  // if we reached here then BlueFS is already opened
  db_was_opened_read_only = read_only;
  dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;
    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
          *options.rbegin() != ',') {
        options += ',';
      }
      options += options_annex;
    }

    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    r = db->create_and_open(err, sharding_def);
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, sharding_def) :
      db->open(err, sharding_def);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db();
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
          << " path " << kv_dir_fn << " options " << options << dendl;
  return 0;
}
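
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the option-string merge
// used twice above (_open_bluefs and _open_db) to append
// bluestore_rocksdb_options_annex to bluestore_rocksdb_options with a
// single comma separator, whether or not the base string already ends
// in one.
[[maybe_unused]] static std::string example_join_rocksdb_options(
  std::string options, const std::string& annex)
{
  if (!annex.empty()) {
    if (!options.empty() && *options.rbegin() != ',') {
      options += ',';               // add the separator only when needed
    }
    options += annex;
  }
  return options;                   // e.g. "a=1,b=2" + "c=3" -> "a=1,b=2,c=3"
}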

void BlueStore::_close_db_leave_bluefs()
{
  ceph_assert(db);
  delete db;
  db = nullptr;
}

void BlueStore::_close_db()
{
  dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
  _close_db_leave_bluefs();

  if (need_to_destage_allocation_file) {
    ceph_assert(fm && fm->is_null_manager());
    int ret = store_allocator(alloc);
    if (ret != 0) {
      derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
    }
  }

  if (bluefs) {
    _close_bluefs();
  }
}

void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    shared_alloc.a->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}
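
// ---------------------------------------------------------------------
// Illustrative sketch (not part of BlueStore): the rate-limit pattern
// used by _dump_alloc_on_failure() -- perform an expensive action at
// most once per interval by remembering the next time it is permitted.
[[maybe_unused]] static bool example_rate_limited(
  double now, double interval, double* next_allowed)
{
  if (interval > 0 && *next_allowed <= now) {
    *next_allowed = now + interval;   // earliest time of the next dump
    return true;                      // caller performs the action
  }
  return false;                       // suppressed until the interval passes
}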

int BlueStore::_open_collections()
{
  if (!coll_map.empty()) {
    // could be opened from another path
    dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
    return 0;
  }

  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  size_t load_cnt = 0;
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
        this,
        onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
        buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
        cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
        decode(c->cnode, p);
      } catch (ceph::buffer::error& e) {
        derr << __func__ << " failed to decode cnode, key:"
             << pretty_binary_string(it->key()) << dendl;
        return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
               << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;
      load_cnt++;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  dout(10) << __func__ << " collections loaded: " << load_cnt
           << dendl;
  return 0;
}

void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
        derr << __func__ << " unrecognized collection " << it->key() << dendl;
        if (errors) {
          (*errors)++;
        }
      }
    }
  }
}

void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = OMAP_BULK;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    auto s = bl.to_str();
    if (s == stringify(OMAP_PER_POOL)) {
      per_pool_omap = OMAP_PER_POOL;
    } else if (s == stringify(OMAP_PER_PG)) {
      per_pool_omap = OMAP_PER_PG;
    } else {
      ceph_assert(s == stringify(OMAP_BULK));
    }
    dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pg_or_pool_omap_alert();
}
6782
224ce89b 6783void BlueStore::_open_statfs()
31f18b77 6784{
11fdf7f2
TL
6785 osd_pools.clear();
6786 vstatfs.reset();
6787
31f18b77 6788 bufferlist bl;
11fdf7f2 6789 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6790 if (r >= 0) {
11fdf7f2 6791 per_pool_stat_collection = false;
31f18b77 6792 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6793 auto it = bl.cbegin();
31f18b77 6794 vstatfs.decode(it);
11fdf7f2 6795 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 6796 } else {
31f18b77
FG
6797 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6798 }
81eedcae 6799 _check_legacy_statfs_alert();
11fdf7f2
TL
6800 } else {
6801 per_pool_stat_collection = true;
6802 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
f67539c2 6803 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
6804 for (it->upper_bound(string());
6805 it->valid();
6806 it->next()) {
6807
6808 uint64_t pool_id;
6809 int r = get_key_pool_stat(it->key(), &pool_id);
6810 ceph_assert(r == 0);
6811
6812 bufferlist bl;
6813 bl = it->value();
6814 auto p = bl.cbegin();
6815 auto& st = osd_pools[pool_id];
6816 try {
6817 st.decode(p);
6818 vstatfs += st;
6819
6820 dout(30) << __func__ << " pool " << pool_id
6821 << " statfs " << st << dendl;
f67539c2 6822 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
6823 derr << __func__ << " failed to decode pool stats, key:"
6824 << pretty_binary_string(it->key()) << dendl;
6825 }
6826 }
31f18b77 6827 }
11fdf7f2
TL
6828 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6829
31f18b77
FG
6830}
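// [Editor's sketch] In per-pool mode above, the global vstatfs is simply the
// sum of all per-pool records (vstatfs += st in the loop). Stand-alone, with
// a plain struct in place of volatile_statfs (hypothetical fields):
struct example_statfs {
  uint64_t allocated = 0;
  uint64_t stored = 0;
  example_statfs& operator+=(const example_statfs& o) {
    allocated += o.allocated;
    stored += o.stored;
    return *this;
  }
};
static example_statfs example_sum_pool_statfs(
  const std::map<uint64_t, example_statfs>& pools)  // stand-in for osd_pools
{
  example_statfs total;
  for (auto& p : pools)
    total += p.second;
  return total;
}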
6831
7c673cae
FG
6832int BlueStore::_setup_block_symlink_or_file(
6833 string name,
6834 string epath,
6835 uint64_t size,
6836 bool create)
6837{
6838 dout(20) << __func__ << " name " << name << " path " << epath
6839 << " size " << size << " create=" << (int)create << dendl;
6840 int r = 0;
91327a77 6841 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6842 if (create)
6843 flags |= O_CREAT;
6844 if (epath.length()) {
6845 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6846 if (r < 0) {
6847 r = -errno;
6848 derr << __func__ << " failed to create " << name << " symlink to "
6849 << epath << ": " << cpp_strerror(r) << dendl;
6850 return r;
6851 }
6852
6853 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6854 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6855 if (fd < 0) {
6856 r = -errno;
6857 derr << __func__ << " failed to open " << epath << " file: "
6858 << cpp_strerror(r) << dendl;
6859 return r;
6860 }
11fdf7f2
TL
6861 // write the Transport ID of the NVMe device
6862 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6863 // where "0000:02:00.0" is the selector of a PCI device, see
6864 // the first column of "lspci -mm -n -D"
6865 string trid{"trtype:PCIe "};
6866 trid += "traddr:";
6867 trid += epath.substr(strlen(SPDK_PREFIX));
6868 r = ::write(fd, trid.c_str(), trid.size());
6869 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
6870 dout(1) << __func__ << " created " << name << " symlink to "
6871 << epath << dendl;
6872 VOID_TEMP_FAILURE_RETRY(::close(fd));
6873 }
6874 }
6875 if (size) {
6876 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6877 if (fd >= 0) {
6878 // block file is present
6879 struct stat st;
6880 int r = ::fstat(fd, &st);
6881 if (r == 0 &&
6882 S_ISREG(st.st_mode) && // if it is a regular file
6883 st.st_size == 0) { // and is 0 bytes
6884 r = ::ftruncate(fd, size);
6885 if (r < 0) {
6886 r = -errno;
6887 derr << __func__ << " failed to resize " << name << " file to "
6888 << size << ": " << cpp_strerror(r) << dendl;
6889 VOID_TEMP_FAILURE_RETRY(::close(fd));
6890 return r;
6891 }
6892
6893 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
6894 r = ::ceph_posix_fallocate(fd, 0, size);
6895 if (r > 0) {
7c673cae
FG
 6896 derr << __func__ << " failed to preallocate " << name << " file to "
6897 << size << ": " << cpp_strerror(r) << dendl;
6898 VOID_TEMP_FAILURE_RETRY(::close(fd));
6899 return -r;
6900 }
7c673cae
FG
6901 }
6902 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 6903 << byte_u_t(size) << dendl;
7c673cae
FG
6904 }
6905 VOID_TEMP_FAILURE_RETRY(::close(fd));
6906 } else {
6907 int r = -errno;
6908 if (r != -ENOENT) {
6909 derr << __func__ << " failed to open " << name << " file: "
6910 << cpp_strerror(r) << dendl;
6911 return r;
6912 }
6913 }
6914 }
6915 return 0;
6916}
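// [Editor's sketch] The size-handling branch above reduces to: open the block
// file and, only if it is a brand-new 0-byte regular file, grow it (sparsely)
// with ftruncate(). That guard in isolation, using the same POSIX calls:
static int example_size_block_file(int dir_fd, const char* name, off_t size)
{
  int fd = ::openat(dir_fd, name, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
  if (fd < 0)
    return -errno;
  int r = 0;
  struct stat st;
  if (::fstat(fd, &st) == 0 &&
      S_ISREG(st.st_mode) &&   // a regular file...
      st.st_size == 0) {       // ...that nothing has sized yet
    if (::ftruncate(fd, size) < 0)
      r = -errno;
  }
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}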
6917
6918int BlueStore::mkfs()
6919{
6920 dout(1) << __func__ << " path " << path << dendl;
6921 int r;
6922 uuid_d old_fsid;
f67539c2 6923 uint64_t reserved;
eafe8130
TL
6924 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6925 derr << __func__ << " osd_max_object_size "
6926 << cct->_conf->osd_max_object_size << " > bluestore max "
6927 << OBJECT_MAX_SIZE << dendl;
6928 return -EINVAL;
6929 }
6930
7c673cae
FG
6931 {
6932 string done;
6933 r = read_meta("mkfs_done", &done);
6934 if (r == 0) {
6935 dout(1) << __func__ << " already created" << dendl;
6936 if (cct->_conf->bluestore_fsck_on_mkfs) {
6937 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6938 if (r < 0) {
6939 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6940 << dendl;
6941 return r;
6942 }
6943 if (r > 0) {
6944 derr << __func__ << " fsck found " << r << " errors" << dendl;
6945 r = -EIO;
6946 }
6947 }
6948 return r; // idempotent
6949 }
6950 }
6951
6952 {
6953 string type;
6954 r = read_meta("type", &type);
6955 if (r == 0) {
6956 if (type != "bluestore") {
6957 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6958 return -EIO;
6959 }
6960 } else {
6961 r = write_meta("type", "bluestore");
6962 if (r < 0)
6963 return r;
6964 }
6965 }
6966
7c673cae
FG
6967 r = _open_path();
6968 if (r < 0)
6969 return r;
6970
6971 r = _open_fsid(true);
6972 if (r < 0)
6973 goto out_path_fd;
6974
6975 r = _lock_fsid();
6976 if (r < 0)
6977 goto out_close_fsid;
6978
6979 r = _read_fsid(&old_fsid);
6980 if (r < 0 || old_fsid.is_zero()) {
6981 if (fsid.is_zero()) {
6982 fsid.generate_random();
6983 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6984 } else {
6985 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6986 }
6987 // we'll write it later.
6988 } else {
6989 if (!fsid.is_zero() && fsid != old_fsid) {
6990 derr << __func__ << " on-disk fsid " << old_fsid
6991 << " != provided " << fsid << dendl;
6992 r = -EINVAL;
6993 goto out_close_fsid;
6994 }
6995 fsid = old_fsid;
6996 }
6997
6998 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6999 cct->_conf->bluestore_block_size,
7000 cct->_conf->bluestore_block_create);
7001 if (r < 0)
7002 goto out_close_fsid;
7003 if (cct->_conf->bluestore_bluefs) {
7004 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
7005 cct->_conf->bluestore_block_wal_size,
7006 cct->_conf->bluestore_block_wal_create);
7007 if (r < 0)
7008 goto out_close_fsid;
7009 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
7010 cct->_conf->bluestore_block_db_size,
7011 cct->_conf->bluestore_block_db_create);
7012 if (r < 0)
7013 goto out_close_fsid;
7014 }
7015
7016 r = _open_bdev(true);
7017 if (r < 0)
7018 goto out_close_fsid;
7019
20effc67
TL
7020 // choose freelist manager
7021#ifdef HAVE_LIBZBD
7022 if (bdev->is_smr()) {
7023 freelist_type = "zoned";
7024 zone_size = bdev->get_zone_size();
7025 first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
7026 bdev->reset_all_zones();
7027 } else
7028#endif
7029 {
7030 freelist_type = "bitmap";
7031 }
7032 dout(10) << " freelist_type " << freelist_type << dendl;
7033
3efd9988 7034 // choose min_alloc_size
20effc67
TL
7035 dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
7036 << " block_size: 0x" << block_size << std::dec << dendl;
7037 if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
 7038 dout(5) << __func__ << " using optimal_io_size 0x" << std::hex << optimal_io_size
 7039 << " as min_alloc_size" << std::dec << dendl;
7040 min_alloc_size = optimal_io_size;
7041 }
7042 else if (cct->_conf->bluestore_min_alloc_size) {
3efd9988
FG
7043 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
7044 } else {
11fdf7f2 7045 ceph_assert(bdev);
f67539c2 7046 if (_use_rotational_settings()) {
3efd9988
FG
7047 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
7048 } else {
7049 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
7050 }
7051 }
11fdf7f2 7052 _validate_bdev();
3efd9988
FG
7053
 7054 // make sure min_alloc_size is a power of 2.
11fdf7f2 7055 if (!isp2(min_alloc_size)) {
3efd9988
FG
7056 derr << __func__ << " min_alloc_size 0x"
7057 << std::hex << min_alloc_size << std::dec
7058 << " is not power of 2 aligned!"
7059 << dendl;
7060 r = -EINVAL;
7061 goto out_close_bdev;
7062 }
7063
20effc67
TL
 7064 // make sure min_alloc_size is a multiple of (and hence >=) block_size
7065 if (min_alloc_size % block_size != 0) {
7066 derr << __func__ << " min_alloc_size 0x"
7067 << std::hex << min_alloc_size
7068 << " is less or not aligned with block_size: 0x"
7069 << block_size << std::dec << dendl;
7070 r = -EINVAL;
7071 goto out_close_bdev;
7072 }
7073
f67539c2
TL
7074 r = _create_alloc();
7075 if (r < 0) {
7076 goto out_close_bdev;
7077 }
7078
7079 reserved = _get_ondisk_reserved();
20effc67 7080 alloc->init_add_free(reserved,
f67539c2 7081 p2align(bdev->get_size(), min_alloc_size) - reserved);
20effc67
TL
7082#ifdef HAVE_LIBZBD
7083 if (bdev->is_smr() && alloc != shared_alloc.a) {
7084 shared_alloc.a->init_add_free(reserved,
7085 p2align(bdev->get_conventional_region_size(),
7086 min_alloc_size) - reserved);
7087 }
7088#endif
f67539c2 7089
7c673cae
FG
7090 r = _open_db(true);
7091 if (r < 0)
f67539c2 7092 goto out_close_alloc;
7c673cae 7093
7c673cae
FG
7094 {
7095 KeyValueDB::Transaction t = db->get_transaction();
1911f103 7096 r = _open_fm(t, true);
11fdf7f2
TL
7097 if (r < 0)
7098 goto out_close_db;
7c673cae
FG
7099 {
7100 bufferlist bl;
11fdf7f2 7101 encode((uint64_t)0, bl);
7c673cae
FG
7102 t->set(PREFIX_SUPER, "nid_max", bl);
7103 t->set(PREFIX_SUPER, "blobid_max", bl);
7104 }
7105
7c673cae
FG
7106 {
7107 bufferlist bl;
11fdf7f2 7108 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
7109 t->set(PREFIX_SUPER, "min_alloc_size", bl);
7110 }
9f95a23c
TL
7111 {
7112 bufferlist bl;
a4b75251
TL
7113 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
7114 bl.append(stringify(OMAP_BULK));
7115 } else {
7116 bl.append(stringify(OMAP_PER_PG));
7117 }
9f95a23c
TL
7118 t->set(PREFIX_SUPER, "per_pool_omap", bl);
7119 }
20effc67
TL
7120
7121#ifdef HAVE_LIBZBD
7122 if (bdev->is_smr()) {
7123 {
7124 bufferlist bl;
7125 encode((uint64_t)zone_size, bl);
7126 t->set(PREFIX_SUPER, "zone_size", bl);
7127 }
7128 {
7129 bufferlist bl;
7130 encode((uint64_t)first_sequential_zone, bl);
7131 t->set(PREFIX_SUPER, "first_sequential_zone", bl);
7132 }
7133 }
7134#endif
7135
7c673cae
FG
7136 ondisk_format = latest_ondisk_format;
7137 _prepare_ondisk_format_super(t);
7138 db->submit_transaction_sync(t);
7139 }
7140
7c673cae
FG
7141 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
7142 if (r < 0)
224ce89b
WB
7143 goto out_close_fm;
7144
3efd9988 7145 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 7146 if (r < 0)
224ce89b 7147 goto out_close_fm;
7c673cae
FG
7148
7149 if (fsid != old_fsid) {
7150 r = _write_fsid();
7151 if (r < 0) {
7152 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 7153 goto out_close_fm;
7c673cae
FG
7154 }
7155 }
7156
7c673cae
FG
7157 out_close_fm:
7158 _close_fm();
7159 out_close_db:
20effc67 7160 _close_db();
f67539c2
TL
7161 out_close_alloc:
7162 _close_alloc();
7c673cae
FG
7163 out_close_bdev:
7164 _close_bdev();
7165 out_close_fsid:
7166 _close_fsid();
7167 out_path_fd:
7168 _close_path();
7169
7170 if (r == 0 &&
7171 cct->_conf->bluestore_fsck_on_mkfs) {
7172 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
7173 if (rc < 0)
7174 return rc;
7175 if (rc > 0) {
7176 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7177 r = -EIO;
7178 }
11fdf7f2
TL
7179 }
7180
7181 if (r == 0) {
7182 // indicate success by writing the 'mkfs_done' file
7183 r = write_meta("mkfs_done", "yes");
7184 }
7185
7186 if (r < 0) {
7187 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7188 } else {
7189 dout(0) << __func__ << " success" << dendl;
7190 }
7191 return r;
7192}
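// [Editor's sketch] mkfs() above enforces two properties of min_alloc_size:
// it must be a power of two (the isp2() check) and a multiple of the device
// block size. Both checks are one-liners:
static bool example_valid_min_alloc_size(uint64_t min_alloc, uint64_t block)
{
  bool pow2 = min_alloc != 0 && (min_alloc & (min_alloc - 1)) == 0;
  bool multiple = block != 0 && (min_alloc % block) == 0; // implies >= block
  return pow2 && multiple;
}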
7193
11fdf7f2
TL
7194int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
7195{
7196 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
7197 int r;
7198 ceph_assert(path_fd < 0);
7199
7200 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7201
7202 if (!cct->_conf->bluestore_bluefs) {
7203 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
7204 return -EIO;
7205 }
20effc67 7206 dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
f67539c2 7207 r = _open_db_and_around(true);
20effc67
TL
7208 if (r < 0) {
7209 return r;
7210 }
11fdf7f2 7211
11fdf7f2
TL
7212 if (id == BlueFS::BDEV_NEWWAL) {
7213 string p = path + "/block.wal";
7214 r = _setup_block_symlink_or_file("block.wal", dev_path,
7215 cct->_conf->bluestore_block_wal_size,
7216 true);
7217 ceph_assert(r == 0);
7218
7219 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
f67539c2
TL
7220 cct->_conf->bdev_enable_discard,
7221 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7222 ceph_assert(r == 0);
7223
7224 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7225 r = _check_or_set_bdev_label(
7226 p,
7227 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7228 "bluefs wal",
7229 true);
7230 ceph_assert(r == 0);
7231 }
7232
9f95a23c 7233 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7234 } else if (id == BlueFS::BDEV_NEWDB) {
7235 string p = path + "/block.db";
7236 r = _setup_block_symlink_or_file("block.db", dev_path,
7237 cct->_conf->bluestore_block_db_size,
7238 true);
7239 ceph_assert(r == 0);
7240
7241 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
f67539c2
TL
7242 cct->_conf->bdev_enable_discard,
7243 SUPER_RESERVED);
11fdf7f2
TL
7244 ceph_assert(r == 0);
7245
7246 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7247 r = _check_or_set_bdev_label(
7248 p,
7249 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7250 "bluefs db",
7251 true);
7252 ceph_assert(r == 0);
7253 }
9f95a23c
TL
7254 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7255 bluefs_layout.dedicated_db = true;
11fdf7f2 7256 }
11fdf7f2
TL
7257 bluefs->umount();
7258 bluefs->mount();
7259
9f95a23c 7260 r = bluefs->prepare_new_device(id, bluefs_layout);
11fdf7f2
TL
7261 ceph_assert(r == 0);
7262
7263 if (r < 0) {
7264 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
7265 } else {
7266 dout(0) << __func__ << " success" << dendl;
7267 }
7268
20effc67 7269 _close_db_and_around();
11fdf7f2
TL
7270 return r;
7271}
7272
7273int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
7274 int id)
7275{
7276 dout(10) << __func__ << " id:" << id << dendl;
7277 ceph_assert(path_fd < 0);
7278
7279 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
7280
7281 if (!cct->_conf->bluestore_bluefs) {
 7282 derr << __func__ << " bluefs isn't configured, can't migrate devices" << dendl;
7283 return -EIO;
7284 }
7285
f67539c2 7286 int r = _open_db_and_around(true);
20effc67
TL
7287 if (r < 0) {
7288 return r;
7289 }
7290 auto close_db = make_scope_guard([&] {
7291 _close_db_and_around();
7292 });
f67539c2 7293 uint64_t used_space = 0;
11fdf7f2 7294 for(auto src_id : devs_source) {
f67539c2 7295 used_space += bluefs->get_used(src_id);
11fdf7f2
TL
7296 }
7297 uint64_t target_free = bluefs->get_free(id);
f67539c2 7298 if (target_free < used_space) {
11fdf7f2
TL
7299 derr << __func__
7300 << " can't migrate, free space at target: " << target_free
7301 << " is less than required space: " << used_space
7302 << dendl;
20effc67 7303 return -ENOSPC;
11fdf7f2 7304 }
9f95a23c
TL
7305 if (devs_source.count(BlueFS::BDEV_DB)) {
7306 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7307 bluefs_layout.dedicated_db = false;
7308 }
7309 if (devs_source.count(BlueFS::BDEV_WAL)) {
7310 bluefs_layout.dedicated_wal = false;
7311 }
7312 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
11fdf7f2
TL
7313 if (r < 0) {
7314 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7315 return r;
11fdf7f2
TL
7316 }
7317
7318 if (devs_source.count(BlueFS::BDEV_DB)) {
7319 r = unlink(string(path + "/block.db").c_str());
7320 ceph_assert(r == 0);
7321 }
7322 if (devs_source.count(BlueFS::BDEV_WAL)) {
7323 r = unlink(string(path + "/block.wal").c_str());
7324 ceph_assert(r == 0);
7325 }
11fdf7f2
TL
7326 return r;
7327}
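// [Editor's sketch] The -ENOSPC guard above generalizes to: a migration may
// only proceed when the target's free space covers what the source devices
// currently use. Stand-alone (the per-device numbers would come from
// bluefs->get_used()/get_free() as in the function above):
static bool example_migration_fits(
  const std::vector<uint64_t>& used_per_source,
  uint64_t target_free)
{
  uint64_t used_space = 0;
  for (uint64_t u : used_per_source)
    used_space += u;
  return target_free >= used_space;
}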
7328
7329int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
7330 int id,
7331 const string& dev_path)
7332{
7333 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
11fdf7f2
TL
7334 ceph_assert(path_fd < 0);
7335
7336 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
7337
7338 if (!cct->_conf->bluestore_bluefs) {
 7339 derr << __func__ << " bluefs isn't configured, can't migrate devices" << dendl;
7340 return -EIO;
7341 }
7342
20effc67
TL
7343 int r = _open_db_and_around(true);
7344 if (r < 0) {
7345 return r;
7346 }
7347 auto close_db = make_scope_guard([&] {
7348 _close_db_and_around();
7349 });
11fdf7f2 7350
11fdf7f2
TL
7351 string link_db;
7352 string link_wal;
7353 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 7354 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 7355 link_db = path + "/block.db";
9f95a23c
TL
7356 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7357 bluefs_layout.dedicated_db = false;
11fdf7f2
TL
7358 }
7359 if (devs_source.count(BlueFS::BDEV_WAL)) {
7360 link_wal = path + "/block.wal";
9f95a23c 7361 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
7362 }
7363
20effc67 7364 size_t target_size = 0;
11fdf7f2
TL
7365 string target_name;
7366 if (id == BlueFS::BDEV_NEWWAL) {
7367 target_name = "block.wal";
7368 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 7369 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
7370
7371 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
f67539c2
TL
7372 cct->_conf->bdev_enable_discard,
7373 BDEV_LABEL_BLOCK_SIZE);
11fdf7f2
TL
7374 ceph_assert(r == 0);
7375
7376 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7377 r = _check_or_set_bdev_label(
7378 dev_path,
7379 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7380 "bluefs wal",
7381 true);
7382 ceph_assert(r == 0);
7383 }
11fdf7f2
TL
7384 } else if (id == BlueFS::BDEV_NEWDB) {
7385 target_name = "block.db";
7386 target_size = cct->_conf->bluestore_block_db_size;
9f95a23c
TL
7387 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7388 bluefs_layout.dedicated_db = true;
31f18b77 7389
11fdf7f2 7390 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
f67539c2
TL
7391 cct->_conf->bdev_enable_discard,
7392 SUPER_RESERVED);
11fdf7f2
TL
7393 ceph_assert(r == 0);
7394
7395 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7396 r = _check_or_set_bdev_label(
7397 dev_path,
7398 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7399 "bluefs db",
7400 true);
7401 ceph_assert(r == 0);
7402 }
31f18b77
FG
7403 }
7404
11fdf7f2
TL
7405 bluefs->umount();
7406 bluefs->mount();
7407
9f95a23c 7408 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 7409
7c673cae 7410 if (r < 0) {
11fdf7f2 7411 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
20effc67 7412 return r;
11fdf7f2
TL
7413 }
7414
7415 if (!link_db.empty()) {
7416 r = unlink(link_db.c_str());
7417 ceph_assert(r == 0);
7418 }
7419 if (!link_wal.empty()) {
7420 r = unlink(link_wal.c_str());
7421 ceph_assert(r == 0);
7422 }
7423 r = _setup_block_symlink_or_file(
7424 target_name,
7425 dev_path,
7426 target_size,
7427 true);
7428 ceph_assert(r == 0);
7429 dout(0) << __func__ << " success" << dendl;
7430
11fdf7f2
TL
7431 return r;
7432}
7433
7434string BlueStore::get_device_path(unsigned id)
7435{
7436 string res;
7437 if (id < BlueFS::MAX_BDEV) {
7438 switch (id) {
7439 case BlueFS::BDEV_WAL:
7440 res = path + "/block.wal";
7441 break;
7442 case BlueFS::BDEV_DB:
9f95a23c 7443 if (id == bluefs_layout.shared_bdev) {
11fdf7f2
TL
7444 res = path + "/block";
7445 } else {
7446 res = path + "/block.db";
7447 }
7448 break;
7449 case BlueFS::BDEV_SLOW:
7450 res = path + "/block";
7451 break;
7452 }
7453 }
7454 return res;
7455}
7456
f67539c2
TL
7457int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
7458{
7459 bluestore_bdev_label_t label;
7460 int r = _read_bdev_label(cct, path, &label);
7461 if (r < 0) {
7462 derr << "unable to read label for " << path << ": "
7463 << cpp_strerror(r) << dendl;
7464 } else {
7465 label.size = size;
7466 r = _write_bdev_label(cct, path, label);
7467 if (r < 0) {
7468 derr << "unable to write label for " << path << ": "
7469 << cpp_strerror(r) << dendl;
7470 }
7471 }
7472 return r;
7473}
7474
11fdf7f2
TL
7475int BlueStore::expand_devices(ostream& out)
7476{
f67539c2 7477 int r = _open_db_and_around(true);
11fdf7f2
TL
7478 ceph_assert(r == 0);
7479 bluefs->dump_block_extents(out);
1911f103 7480 out << "Expanding DB/WAL..." << std::endl;
11fdf7f2 7481 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 7482 if (devid == bluefs_layout.shared_bdev ) {
11fdf7f2
TL
7483 continue;
7484 }
7485 uint64_t size = bluefs->get_block_device_size(devid);
7486 if (size == 0) {
7487 // no bdev
7488 continue;
7489 }
7490
f67539c2
TL
7491 out << devid
7492 <<" : expanding " << " to 0x" << size << std::dec << std::endl;
7493 string p = get_device_path(devid);
7494 const char* path = p.c_str();
7495 if (path == nullptr) {
7496 derr << devid
7497 <<": can't find device path " << dendl;
7498 continue;
7499 }
7500 if (bluefs->bdev_support_label(devid)) {
7501 if (_set_bdev_label_size(p, size) >= 0) {
7502 out << devid
7503 << " : size label updated to " << size
7504 << std::endl;
11fdf7f2 7505 }
11fdf7f2
TL
7506 }
7507 }
7508 uint64_t size0 = fm->get_size();
7509 uint64_t size = bdev->get_size();
7510 if (size0 < size) {
9f95a23c 7511 out << bluefs_layout.shared_bdev
1911f103
TL
7512 << " : expanding " << " from 0x" << std::hex
7513 << size0 << " to 0x" << size << std::dec << std::endl;
f67539c2
TL
7514 _write_out_fm_meta(size);
7515 if (bdev->supported_bdev_label()) {
7516 if (_set_bdev_label_size(path, size) >= 0) {
7517 out << bluefs_layout.shared_bdev
7518 << " : size label updated to " << size
7519 << std::endl;
7520 }
7521 }
20effc67
TL
7522
7523 // we grow the allocation range, must reflect it in the allocation file
7524 alloc->init_add_free(size0, size - size0);
7525 need_to_destage_allocation_file = true;
7526
7527 _close_db_and_around();
1911f103
TL
7528
7529 // mount in read/write to sync expansion changes
f67539c2 7530 r = _mount();
11fdf7f2 7531 ceph_assert(r == 0);
1911f103
TL
7532 umount();
7533 } else {
20effc67 7534 _close_db_and_around();
7c673cae 7535 }
1911f103
TL
7536 return r;
7537}
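// [Editor's sketch] Growing the main device above ends with
// alloc->init_add_free(size0, size - size0): the newly visible range
// [size0, size) is handed to the allocator as free space. The same idea
// against a toy free-extent list of {offset, length} pairs:
static void example_grow_free_space(
  std::vector<std::pair<uint64_t, uint64_t>>& free_extents,
  uint64_t old_size, uint64_t new_size)
{
  if (new_size > old_size) {
    // everything past the old end of the device is free by definition
    free_extents.emplace_back(old_size, new_size - old_size);
  }
}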
7538
7539int BlueStore::dump_bluefs_sizes(ostream& out)
7540{
f67539c2 7541 int r = _open_db_and_around(true);
1911f103
TL
7542 ceph_assert(r == 0);
7543 bluefs->dump_block_extents(out);
20effc67 7544 _close_db_and_around();
7c673cae
FG
7545 return r;
7546}
7547
7548void BlueStore::set_cache_shards(unsigned num)
7549{
7550 dout(10) << __func__ << " " << num << dendl;
9f95a23c
TL
7551 size_t oold = onode_cache_shards.size();
7552 size_t bold = buffer_cache_shards.size();
7553 ceph_assert(num >= oold && num >= bold);
7554 onode_cache_shards.resize(num);
7555 buffer_cache_shards.resize(num);
7556 for (unsigned i = oold; i < num; ++i) {
7557 onode_cache_shards[i] =
7558 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7559 logger);
7560 }
7561 for (unsigned i = bold; i < num; ++i) {
7562 buffer_cache_shards[i] =
7563 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7564 logger);
7c673cae
FG
7565 }
7566}
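// [Editor's sketch] Collections are pinned to cache shards by hashing, as in
// cid.hash_to_shard(onode_cache_shards.size()) in _open_collections() above.
// The mapping amounts to "hash modulo shard count", so a given collection
// always lands on the same shard for a fixed shard count:
static size_t example_hash_to_shard(uint64_t cid_hash, size_t num_shards)
{
  return num_shards ? cid_hash % num_shards : 0;
}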
7567
f67539c2 7568int BlueStore::_mount()
7c673cae 7569{
20effc67 7570 dout(5) << __func__ << "::NCB:: path " << path << dendl;
f67539c2 7571 _kv_only = false;
7c673cae 7572 if (cct->_conf->bluestore_fsck_on_mount) {
20effc67 7573 dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
7c673cae
FG
7574 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7575 if (rc < 0)
7576 return rc;
7577 if (rc > 0) {
7578 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7579 return -EIO;
7580 }
7581 }
7582
eafe8130
TL
7583 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7584 derr << __func__ << " osd_max_object_size "
7585 << cct->_conf->osd_max_object_size << " > bluestore max "
7586 << OBJECT_MAX_SIZE << dendl;
7587 return -EINVAL;
7588 }
7589
20effc67 7590 dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
f67539c2 7591 int r = _open_db_and_around(false);
9f95a23c 7592 if (r < 0) {
f67539c2 7593 return r;
11fdf7f2 7594 }
20effc67
TL
7595 auto close_db = make_scope_guard([&] {
7596 if (!mounted) {
7597 _close_db_and_around();
7598 }
7599 });
7c673cae 7600
11fdf7f2
TL
7601 r = _upgrade_super();
7602 if (r < 0) {
20effc67 7603 return r;
11fdf7f2 7604 }
7c673cae 7605
20effc67 7606 // The recovery process for allocation-map needs to open collection early
7c673cae 7607 r = _open_collections();
20effc67
TL
7608 if (r < 0) {
7609 return r;
7610 }
7611 auto shutdown_cache = make_scope_guard([&] {
7612 if (!mounted) {
7613 _shutdown_cache();
7614 }
7615 });
7c673cae
FG
7616
7617 r = _reload_logger();
20effc67
TL
7618 if (r < 0) {
7619 return r;
7620 }
7c673cae 7621
31f18b77 7622 _kv_start();
20effc67
TL
7623 auto stop_kv = make_scope_guard([&] {
7624 if (!mounted) {
7625 _kv_stop();
7626 }
7627 });
7628
7629 r = _deferred_replay();
7630 if (r < 0) {
7631 return r;
7632 }
7c673cae 7633
20effc67 7634#ifdef HAVE_LIBZBD
f67539c2
TL
7635 if (bdev->is_smr()) {
7636 _zoned_cleaner_start();
7637 }
20effc67 7638#endif
7c673cae
FG
7639
7640 mempool_thread.init();
7641
f67539c2 7642 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
eafe8130 7643 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c 7644
f67539c2 7645 auto was_per_pool_omap = per_pool_omap;
9f95a23c 7646
eafe8130
TL
7647 dout(1) << __func__ << " quick-fix on mount" << dendl;
7648 _fsck_on_open(FSCK_SHALLOW, true);
7649
7650 //reread statfs
7651 //FIXME minor: replace with actual open/close?
7652 _open_statfs();
eafe8130 7653 _check_legacy_statfs_alert();
9f95a23c
TL
7654
7655 //set again as hopefully it has been fixed
f67539c2 7656 if (was_per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
7657 _set_per_pool_omap();
7658 }
eafe8130
TL
7659 }
7660
7c673cae
FG
7661 mounted = true;
7662 return 0;
7c673cae
FG
7663}
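// [Editor's sketch] _mount() above leans on make_scope_guard() so that every
// early "return r" tears down whatever was already started (db, cache, kv
// thread) unless "mounted" became true. A minimal guard of that shape,
// assuming C++17 and any callable F:
template <typename F>
struct example_scope_guard {
  F f;
  ~example_scope_guard() { f(); }  // runs on any scope exit
};
template <typename F>
example_scope_guard(F) -> example_scope_guard<F>;
// usage sketch: example_scope_guard g{[&] { if (!mounted) _close_db_and_around(); }};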
7664
7665int BlueStore::umount()
7666{
11fdf7f2 7667 ceph_assert(_kv_only || mounted);
7c673cae 7668 _osr_drain_all();
7c673cae 7669
7c673cae 7670 mounted = false;
20effc67
TL
7671
7672 ceph_assert(alloc);
7673
3efd9988
FG
7674 if (!_kv_only) {
7675 mempool_thread.shutdown();
20effc67 7676#ifdef HAVE_LIBZBD
f67539c2
TL
7677 if (bdev->is_smr()) {
7678 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7679 _zoned_cleaner_stop();
7680 }
20effc67 7681#endif
3efd9988
FG
7682 dout(20) << __func__ << " stopping kv thread" << dendl;
7683 _kv_stop();
f6b5b4d7 7684 _shutdown_cache();
3efd9988 7685 dout(20) << __func__ << " closing" << dendl;
3efd9988 7686 }
7c673cae 7687
20effc67 7688 _close_db_and_around();
7c673cae
FG
7689 if (cct->_conf->bluestore_fsck_on_umount) {
7690 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7691 if (rc < 0)
7692 return rc;
7693 if (rc > 0) {
7694 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7695 return -EIO;
7696 }
7697 }
7698 return 0;
7699}
7700
eafe8130
TL
7701int BlueStore::cold_open()
7702{
f67539c2 7703 return _open_db_and_around(true);
eafe8130 7704}
f67539c2 7705
eafe8130
TL
7706int BlueStore::cold_close()
7707{
20effc67 7708 _close_db_and_around();
eafe8130
TL
7709 return 0;
7710}
7711
9f95a23c
TL
7712// derr wrapper to limit enormous output and avoid log flooding.
7713// Of limited use where such output is expected for now
7714#define fsck_derr(err_cnt, threshold) \
7715 if (err_cnt <= threshold) { \
7716 bool need_skip_print = err_cnt == threshold; \
7717 derr
7718
7719#define fsck_dendl \
7720 dendl; \
7721 if (need_skip_print) \
7722 derr << "more error lines skipped..." << dendl; \
7c673cae 7723 }
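// [Editor's note] fsck_derr/fsck_dendl must always be used as a pair:
// fsck_derr opens an "if (err_cnt <= threshold) {" block that only
// fsck_dendl closes. A typical call site looks like
// fsck_derr(errors, MAX_FSCK_ERROR_LINES)
// << "fsck error: " << oid << " ..." << fsck_dendl;
// (see _fsck_check_object_omap() below for real call sites).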
7c673cae 7724
eafe8130
TL
7725int _fsck_sum_extents(
7726 const PExtentVector& extents,
7727 bool compressed,
7728 store_statfs_t& expected_statfs)
7729{
7730 for (auto e : extents) {
7731 if (!e.is_valid())
7732 continue;
7733 expected_statfs.allocated += e.length;
7734 if (compressed) {
7735 expected_statfs.data_compressed_allocated += e.length;
7736 }
7737 }
7738 return 0;
7739}
7740
7c673cae 7741int BlueStore::_fsck_check_extents(
20effc67 7742 std::string_view ctx_descr,
7c673cae
FG
7743 const PExtentVector& extents,
7744 bool compressed,
7745 mempool_dynamic_bitset &used_blocks,
b32b8144 7746 uint64_t granularity,
11fdf7f2 7747 BlueStoreRepairer* repairer,
eafe8130
TL
7748 store_statfs_t& expected_statfs,
7749 FSCKDepth depth)
7c673cae 7750{
20effc67 7751 dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
7c673cae
FG
7752 int errors = 0;
7753 for (auto e : extents) {
7754 if (!e.is_valid())
7755 continue;
7756 expected_statfs.allocated += e.length;
7757 if (compressed) {
11fdf7f2 7758 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7759 }
eafe8130
TL
7760 if (depth != FSCK_SHALLOW) {
7761 bool already = false;
9f95a23c 7762 apply_for_bitset_range(
eafe8130
TL
7763 e.offset, e.length, granularity, used_blocks,
7764 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7765 if (bs.test(pos)) {
7766 if (repairer) {
7767 repairer->note_misreference(
7768 pos * min_alloc_size, min_alloc_size, !already);
7769 }
7770 if (!already) {
20effc67 7771 derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7772 << " or a subset is already allocated (misreferenced)" << dendl;
7773 ++errors;
7774 already = true;
7775 }
11fdf7f2 7776 }
eafe8130
TL
7777 else
7778 bs.set(pos);
7779 });
11fdf7f2 7780
eafe8130 7781 if (e.end() > bdev->get_size()) {
20effc67 7782 derr << "fsck error: " << ctx_descr << ", extent " << e
eafe8130
TL
7783 << " past end of block device" << dendl;
7784 ++errors;
7785 }
7c673cae
FG
7786 }
7787 }
7788 return errors;
7789}
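// [Editor's sketch] The used_blocks bitset above catches misreferences: if
// marking a block finds it already marked, two extents claim the same space.
// The core of that walk, stand-alone over a plain vector<bool>:
static size_t example_mark_extent(
  std::vector<bool>& used_blocks,   // one flag per allocation unit
  uint64_t offset, uint64_t length, uint64_t granularity)
{
  size_t misreferenced = 0;
  uint64_t end = (offset + length + granularity - 1) / granularity;
  for (uint64_t pos = offset / granularity; pos < end; ++pos) {
    if (used_blocks[pos])
      ++misreferenced;            // already owned by an earlier extent
    else
      used_blocks[pos] = true;    // first (legitimate) owner
  }
  return misreferenced;
}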
7790
11fdf7f2
TL
7791void BlueStore::_fsck_check_pool_statfs(
7792 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7793 int64_t& errors,
7794 int64_t& warnings,
11fdf7f2
TL
7795 BlueStoreRepairer* repairer)
7796{
f67539c2 7797 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
7798 if (it) {
7799 for (it->lower_bound(string()); it->valid(); it->next()) {
7800 string key = it->key();
7801 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7802 if (repairer) {
eafe8130
TL
7803 ++errors;
7804 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7805 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7806 << dendl;
7807 }
7808 continue;
7809 }
11fdf7f2
TL
7810 uint64_t pool_id;
7811 if (get_key_pool_stat(key, &pool_id) < 0) {
7812 derr << "fsck error: bad key " << key
7813 << "in statfs namespece" << dendl;
7814 if (repairer) {
7815 repairer->remove_key(db, PREFIX_STAT, key);
7816 }
7817 ++errors;
7818 continue;
7819 }
7820
7821 volatile_statfs vstatfs;
7822 bufferlist bl = it->value();
7823 auto blp = bl.cbegin();
7824 try {
7825 vstatfs.decode(blp);
f67539c2 7826 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
7827 derr << "fsck error: failed to decode Pool StatFS record"
7828 << pretty_binary_string(key) << dendl;
7829 if (repairer) {
7830 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7831 << pretty_binary_string(key)
7832 << "', removing" << dendl;
7833 repairer->remove_key(db, PREFIX_STAT, key);
7834 }
7835 ++errors;
7836 vstatfs.reset();
7837 }
7838 auto stat_it = expected_pool_statfs.find(pool_id);
7839 if (stat_it == expected_pool_statfs.end()) {
7840 if (vstatfs.is_empty()) {
7841 // we don't consider that as an error since empty pool statfs
7842 // are left in DB for now
 7843 dout(20) << "fsck info: found empty stray Pool StatFS record for pool id 0x"
7844 << std::hex << pool_id << std::dec << dendl;
7845 if (repairer) {
7846 // but we need to increment error count in case of repair
7847 // to have proper counters at the end
7848 // (as repairer increments recovery counter anyway).
7849 ++errors;
7850 }
7851 } else {
7852 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7853 << std::hex << pool_id << std::dec << dendl;
7854 ++errors;
7855 }
7856 if (repairer) {
522d829b 7857 repairer->remove_key(db, PREFIX_STAT, key);
11fdf7f2
TL
7858 }
7859 continue;
7860 }
7861 store_statfs_t statfs;
7862 vstatfs.publish(&statfs);
7863 if (!(stat_it->second == statfs)) {
7864 derr << "fsck error: actual " << statfs
7865 << " != expected " << stat_it->second
7866 << " for pool "
7867 << std::hex << pool_id << std::dec << dendl;
7868 if (repairer) {
7869 repairer->fix_statfs(db, key, stat_it->second);
7870 }
7871 ++errors;
7872 }
7873 expected_pool_statfs.erase(stat_it);
7874 }
7875 } // if (it)
eafe8130
TL
7876 for (auto& s : expected_pool_statfs) {
7877 if (s.second.is_zero()) {
11fdf7f2
TL
7878 // we might lack empty statfs recs in DB
7879 continue;
7880 }
7881 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7882 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7883 if (repairer) {
7884 string key;
eafe8130
TL
7885 get_pool_stat_key(s.first, &key);
7886 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7887 }
7888 ++errors;
7889 }
eafe8130 7890 if (!per_pool_stat_collection &&
eafe8130
TL
7891 repairer) {
7892 // by virtue of running this method, we correct the top-level
7893 // error of having global stats
7894 repairer->inc_repaired();
7895 }
11fdf7f2
TL
7896}
7897
20effc67
TL
7898void BlueStore::_fsck_repair_shared_blobs(
7899 BlueStoreRepairer& repairer,
7900 shared_blob_2hash_tracker_t& sb_ref_counts,
7901 sb_info_space_efficient_map_t& sb_info)
7902{
7903 auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
7904 dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
7905 << sb_ref_mismatches << dendl;
 7906 if (!sb_ref_mismatches) // not expected to happen; bail out just in case
7907 return;
7908
7909
7910 auto foreach_shared_blob = [&](std::function<
7911 void (coll_t,
7912 ghobject_t,
7913 uint64_t,
7914 const bluestore_blob_t&)> cb) {
7915 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
7916 if (it) {
7917 CollectionRef c;
7918 spg_t pgid;
7919 for (it->lower_bound(string()); it->valid(); it->next()) {
7920 dout(30) << __func__ << " key "
7921 << pretty_binary_string(it->key())
7922 << dendl;
7923 if (is_extent_shard_key(it->key())) {
7924 continue;
7925 }
7926
7927 ghobject_t oid;
7928 int r = get_key_object(it->key(), &oid);
7929 if (r < 0) {
7930 continue;
7931 }
7932
7933 if (!c ||
7934 oid.shard_id != pgid.shard ||
7935 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7936 !c->contains(oid)) {
7937 c = nullptr;
7938 for (auto& p : coll_map) {
7939 if (p.second->contains(oid)) {
7940 c = p.second;
7941 break;
7942 }
7943 }
7944 if (!c) {
7945 continue;
7946 }
7947 }
7948 dout(20) << __func__
7949 << " inspecting shared blob refs for col:" << c->cid
7950 << " obj:" << oid
7951 << dendl;
7952
7953 OnodeRef o;
7954 o.reset(Onode::decode(c, oid, it->key(), it->value()));
7955 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7956
7957 _dump_onode<30>(cct, *o);
7958
7959 mempool::bluestore_fsck::set<BlobRef> passed_sbs;
7960 for (auto& e : o->extent_map.extent_map) {
7961 auto& b = e.blob->get_blob();
7962 if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
7963 auto sbid = e.blob->shared_blob->get_sbid();
7964 cb(c->cid, oid, sbid, b);
7965 passed_sbs.emplace(e.blob);
7966 }
7967 } // for ... extent_map
7968 } // for ... it->valid
7969 } //if (it(PREFIX_OBJ))
7970 }; //foreach_shared_blob fn declaration
7971
7972 mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
7973
7974 // first iteration over objects to identify all the broken sbids
7975 foreach_shared_blob( [&](coll_t cid,
7976 ghobject_t oid,
7977 uint64_t sbid,
7978 const bluestore_blob_t& b) {
7979 auto it = refs_map.lower_bound(sbid);
7980 if(it != refs_map.end() && it->first == sbid) {
7981 return;
7982 }
7983 for (auto& p : b.get_extents()) {
7984 if (p.is_valid() &&
7985 !sb_ref_counts.test_all_zero_range(sbid,
7986 p.offset,
7987 p.length)) {
7988 refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
7989 dout(20) << __func__
7990 << " broken shared blob found for col:" << cid
7991 << " obj:" << oid
7992 << " sbid 0x " << std::hex << sbid << std::dec
7993 << dendl;
7994 break;
7995 }
7996 }
7997 });
7998
7999 // second iteration over objects to build new ref map for the broken sbids
8000 foreach_shared_blob( [&](coll_t cid,
8001 ghobject_t oid,
8002 uint64_t sbid,
8003 const bluestore_blob_t& b) {
8004 auto it = refs_map.find(sbid);
8005 if(it == refs_map.end()) {
8006 return;
8007 }
8008 for (auto& p : b.get_extents()) {
8009 if (p.is_valid()) {
8010 it->second.get(p.offset, p.length);
8011 break;
8012 }
8013 }
8014 });
8015
8016 // update shared blob records
8017 auto ref_it = refs_map.begin();
8018 while (ref_it != refs_map.end()) {
8019 size_t cnt = 0;
8020 const size_t max_transactions = 4096;
8021 KeyValueDB::Transaction txn = db->get_transaction();
8022 for (cnt = 0;
8023 cnt < max_transactions && ref_it != refs_map.end();
8024 ref_it++) {
8025 auto sbid = ref_it->first;
8026 dout(20) << __func__ << " repaired shared_blob 0x"
8027 << std::hex << sbid << std::dec
8028 << ref_it->second << dendl;
8029 repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
8030 cnt++;
8031 }
8032 if (cnt) {
8033 db->submit_transaction_sync(txn);
8034 cnt = 0;
8035 }
8036 }
8037 // remove stray shared blob records
8038 size_t cnt = 0;
8039 const size_t max_transactions = 4096;
8040 KeyValueDB::Transaction txn = db->get_transaction();
8041 sb_info.foreach_stray([&](const sb_info_t& sbi) {
8042 auto sbid = sbi.get_sbid();
8043 dout(20) << __func__ << " removing stray shared_blob 0x"
8044 << std::hex << sbid << std::dec
8045 << dendl;
8046 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
8047 cnt++;
 8048 if (cnt >= max_transactions) {
 8049 db->submit_transaction_sync(txn);
 8050 txn = db->get_transaction();
 8051 cnt = 0;
 }
 8052 });
8053 if (cnt > 0) {
8054 db->submit_transaction_sync(txn);
8055 }
8056
8057 // amount of repairs to report to be equal to previously
8058 // determined error estimation, not the actual number of updated shared blobs
8059 repairer.inc_repaired(sb_ref_mismatches);
8060}
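// [Editor's sketch] Both repair loops above flush their KV transaction every
// max_transactions (4096) updates so a large repair never accumulates one
// giant transaction. The batching skeleton, with std::function (assumed
// available) standing in for the per-item update and the flush:
static void example_batched_updates(
  const std::vector<uint64_t>& items,
  size_t batch_limit,
  const std::function<void(uint64_t)>& apply_one,   // e.g. fix_shared_blob()
  const std::function<void()>& flush)               // submit_transaction_sync
{
  size_t cnt = 0;
  for (uint64_t item : items) {
    apply_one(item);
    if (++cnt >= batch_limit) {
      flush();   // seal the current transaction...
      cnt = 0;   // ...and start a fresh batch
    }
  }
  if (cnt > 0)
    flush();     // leftovers
}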
8061
eafe8130
TL
8062BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
8063 BlueStore::FSCKDepth depth,
8064 int64_t pool_id,
8065 BlueStore::CollectionRef c,
8066 const ghobject_t& oid,
8067 const string& key,
8068 const bufferlist& value,
9f95a23c 8069 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
8070 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
8071 const BlueStore::FSCK_ObjectCtx& ctx)
8072{
8073 auto& errors = ctx.errors;
8074 auto& num_objects = ctx.num_objects;
8075 auto& num_extents = ctx.num_extents;
8076 auto& num_blobs = ctx.num_blobs;
8077 auto& num_sharded_objects = ctx.num_sharded_objects;
8078 auto& num_spanning_blobs = ctx.num_spanning_blobs;
8079 auto used_blocks = ctx.used_blocks;
8080 auto sb_info_lock = ctx.sb_info_lock;
8081 auto& sb_info = ctx.sb_info;
20effc67 8082 auto& sb_ref_counts = ctx.sb_ref_counts;
eafe8130
TL
8083 auto repairer = ctx.repairer;
8084
8085 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
8086 &ctx.expected_pool_statfs[pool_id] :
8087 &ctx.expected_store_statfs;
8088
20effc67
TL
8089 map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
8090
eafe8130
TL
8091 dout(10) << __func__ << " " << oid << dendl;
8092 OnodeRef o;
8093 o.reset(Onode::decode(c, oid, key, value));
8094 ++num_objects;
7c673cae 8095
eafe8130 8096 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 8097
eafe8130
TL
8098 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8099 _dump_onode<30>(cct, *o);
8100 // shards
8101 if (!o->extent_map.shards.empty()) {
8102 ++num_sharded_objects;
8103 if (depth != FSCK_SHALLOW) {
9f95a23c 8104 ceph_assert(expecting_shards);
eafe8130
TL
8105 for (auto& s : o->extent_map.shards) {
8106 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 8107 expecting_shards->push_back(string());
eafe8130 8108 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 8109 &expecting_shards->back());
eafe8130
TL
8110 if (s.shard_info->offset >= o->onode.size) {
8111 derr << "fsck error: " << oid << " shard 0x" << std::hex
8112 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
8113 << std::dec << dendl;
8114 ++errors;
8115 }
8116 }
8117 }
8118 }
7c673cae 8119
eafe8130
TL
8120 // lextents
8121 uint64_t pos = 0;
8122 mempool::bluestore_fsck::map<BlobRef,
8123 bluestore_blob_use_tracker_t> ref_map;
8124 for (auto& l : o->extent_map.extent_map) {
8125 dout(20) << __func__ << " " << l << dendl;
8126 if (l.logical_offset < pos) {
8127 derr << "fsck error: " << oid << " lextent at 0x"
8128 << std::hex << l.logical_offset
8129 << " overlaps with the previous, which ends at 0x" << pos
8130 << std::dec << dendl;
8131 ++errors;
8132 }
8133 if (depth != FSCK_SHALLOW &&
8134 o->extent_map.spans_shard(l.logical_offset, l.length)) {
8135 derr << "fsck error: " << oid << " lextent at 0x"
8136 << std::hex << l.logical_offset << "~" << l.length
8137 << " spans a shard boundary"
8138 << std::dec << dendl;
8139 ++errors;
8140 }
8141 pos = l.logical_offset + l.length;
8142 res_statfs->data_stored += l.length;
8143 ceph_assert(l.blob);
8144 const bluestore_blob_t& blob = l.blob->get_blob();
8145
20effc67
TL
8146#ifdef HAVE_LIBZBD
8147 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8148 for (auto& e : blob.get_extents()) {
8149 if (e.is_valid()) {
8150 uint32_t zone = e.offset / zone_size;
8151 uint64_t offset = e.offset % zone_size;
8152 auto p = zone_first_offsets.find(zone);
8153 if (p == zone_first_offsets.end() || p->second > offset) {
 8154 // FIXME: use iterator for guided insert?
8155 zone_first_offsets[zone] = offset;
8156 }
8157 }
8158 }
8159 }
8160#endif
8161
8162 auto& ref = ref_map[l.blob];
8163 if (ref.is_empty()) {
8164 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
8165 uint32_t l = blob.get_logical_length();
8166 ref.init(l, min_release_size);
eafe8130
TL
8167 }
8168 ref.get(
8169 l.blob_offset,
8170 l.length);
8171 ++num_extents;
8172 if (depth != FSCK_SHALLOW &&
8173 blob.has_unused()) {
8174 ceph_assert(referenced);
8175 auto p = referenced->find(l.blob);
8176 bluestore_blob_t::unused_t* pu;
8177 if (p == referenced->end()) {
8178 pu = &(*referenced)[l.blob];
8179 }
8180 else {
8181 pu = &p->second;
8182 }
8183 uint64_t blob_len = blob.get_logical_length();
8184 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
8185 ceph_assert(l.blob_offset + l.length <= blob_len);
8186 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
8187 uint64_t start = l.blob_offset / chunk_size;
8188 uint64_t end =
8189 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
8190 for (auto i = start; i < end; ++i) {
8191 (*pu) |= (1u << i);
8192 }
8193 }
8194 } //for (auto& l : o->extent_map.extent_map)
8195
8196 for (auto& i : ref_map) {
8197 ++num_blobs;
8198 const bluestore_blob_t& blob = i.first->get_blob();
8199 bool equal =
8200 depth == FSCK_SHALLOW ? true :
8201 i.first->get_blob_use_tracker().equal(i.second);
8202 if (!equal) {
8203 derr << "fsck error: " << oid << " blob " << *i.first
8204 << " doesn't match expected ref_map " << i.second << dendl;
8205 ++errors;
8206 }
8207 if (blob.is_compressed()) {
8208 res_statfs->data_compressed += blob.get_compressed_payload_length();
8209 res_statfs->data_compressed_original +=
8210 i.first->get_referenced_bytes();
8211 }
20effc67
TL
8212 if (depth != FSCK_SHALLOW && repairer) {
8213 for (auto e : blob.get_extents()) {
8214 if (!e.is_valid())
8215 continue;
8216 repairer->set_space_used(e.offset, e.length, c->cid, oid);
8217 }
8218 }
eafe8130
TL
8219 if (blob.is_shared()) {
8220 if (i.first->shared_blob->get_sbid() > blobid_max) {
8221 derr << "fsck error: " << oid << " blob " << blob
8222 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
8223 << blobid_max << dendl;
8224 ++errors;
20effc67 8225 } else if (i.first->shared_blob->get_sbid() == 0) {
eafe8130
TL
8226 derr << "fsck error: " << oid << " blob " << blob
8227 << " marked as shared but has uninitialized sbid"
8228 << dendl;
8229 ++errors;
8230 }
8231 // the below lock is optional and provided in multithreading mode only
8232 if (sb_info_lock) {
8233 sb_info_lock->lock();
8234 }
20effc67
TL
8235 auto sbid = i.first->shared_blob->get_sbid();
8236 sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
8237 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
eafe8130 8238 sbi.pool_id == oid.hobj.get_logical_pool());
eafe8130 8239 sbi.pool_id = oid.hobj.get_logical_pool();
20effc67 8240 bool compressed = blob.is_compressed();
eafe8130
TL
8241 for (auto e : blob.get_extents()) {
8242 if (e.is_valid()) {
20effc67
TL
8243 if (compressed) {
8244 ceph_assert(sbi.allocated_chunks <= 0);
8245 sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
8246 } else {
8247 ceph_assert(sbi.allocated_chunks >= 0);
8248 sbi.allocated_chunks += (e.length >> min_alloc_size_order);
8249 }
8250 sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
eafe8130
TL
8251 }
8252 }
8253 if (sb_info_lock) {
8254 sb_info_lock->unlock();
8255 }
8256 } else if (depth != FSCK_SHALLOW) {
8257 ceph_assert(used_blocks);
20effc67
TL
8258 string ctx_descr = " oid " + stringify(oid);
8259 errors += _fsck_check_extents(ctx_descr,
8260 blob.get_extents(),
eafe8130
TL
8261 blob.is_compressed(),
8262 *used_blocks,
8263 fm->get_alloc_size(),
20effc67 8264 repairer,
eafe8130
TL
8265 *res_statfs,
8266 depth);
8267 } else {
8268 errors += _fsck_sum_extents(
8269 blob.get_extents(),
8270 blob.is_compressed(),
8271 *res_statfs);
8272 }
8273 } // for (auto& i : ref_map)
9f95a23c 8274
adb31ebb
TL
8275 {
8276 auto &sbm = o->extent_map.spanning_blob_map;
8277 size_t broken = 0;
8278 BlobRef first_broken;
8279 for (auto it = sbm.begin(); it != sbm.end();) {
8280 auto it1 = it++;
8281 if (ref_map.count(it1->second) == 0) {
8282 if (!broken) {
8283 first_broken = it1->second;
8284 ++errors;
8285 }
8286 broken++;
8287 if (repairer) {
8288 sbm.erase(it1);
8289 }
8290 }
8291 }
20effc67
TL
8292
8293#ifdef HAVE_LIBZBD
8294 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
8295 for (auto& [zone, first_offset] : zone_first_offsets) {
8296 auto p = (*ctx.zone_refs)[zone].find(oid);
8297 if (p != (*ctx.zone_refs)[zone].end()) {
8298 if (first_offset < p->second) {
8299 dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
8300 << " offset 0x" << p->second
8301 << " but first offset is 0x" << first_offset
8302 << "; this can happen due to clone_range"
8303 << dendl;
8304 } else {
8305 dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
8306 << " <= first offset 0x" << first_offset
8307 << std::dec << dendl;
8308 }
8309 (*ctx.zone_refs)[zone].erase(p);
8310 } else {
8311 derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
8312 << " but there is no zone ref" << std::dec << dendl;
8313 // FIXME: add repair
8314 ++errors;
8315 }
8316 }
8317 }
8318#endif
8319
adb31ebb
TL
8320 if (broken) {
8321 derr << "fsck error: " << oid << " - " << broken
8322 << " zombie spanning blob(s) found, the first one: "
8323 << *first_broken << dendl;
8324 if(repairer) {
b3b6e05e
TL
8325 repairer->fix_spanning_blobs(
8326 db,
8327 [&](KeyValueDB::Transaction txn) {
8328 _record_onode(o, txn);
8329 });
adb31ebb
TL
8330 }
8331 }
8332 }
8333
9f95a23c
TL
8334 if (o->onode.has_omap()) {
8335 _fsck_check_object_omap(depth, o, ctx);
8336 }
8337
eafe8130
TL
8338 return o;
8339}
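// [Editor's sketch] The "unused" accounting above splits a blob into as many
// equal chunks as unused_t has bits and sets one bit per chunk an lextent
// touches. The index math in isolation (a 16-bit map is assumed here; the
// real width is sizeof(unused_t) * 8):
static uint16_t example_mark_used_chunks(
  uint64_t blob_len,      // must be a multiple of the bit count
  uint64_t blob_offset, uint64_t length)
{
  const unsigned bits = sizeof(uint16_t) * 8;
  uint64_t chunk_size = blob_len / bits;
  uint16_t used = 0;
  uint64_t start = blob_offset / chunk_size;
  uint64_t end = (blob_offset + length + chunk_size - 1) / chunk_size;
  for (uint64_t i = start; i < end; ++i)
    used |= (uint16_t)(1u << i);   // this chunk is referenced
  return used;
}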
8340
8341#include "common/WorkQueue.h"
8342
8343class ShallowFSCKThreadPool : public ThreadPool
8344{
8345public:
8346 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
8347 ThreadPool(cct_, nm, tn, n) {
8348 }
8349 void worker(ThreadPool::WorkThread* wt) override {
8350 int next_wq = 0;
8351 while (!_stop) {
8352 next_wq %= work_queues.size();
8353 WorkQueue_ *wq = work_queues[next_wq++];
8354
8355 void* item = wq->_void_dequeue();
8356 if (item) {
8357 processing++;
8358 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
8359 wq->_void_process(item, tp_handle);
8360 processing--;
8361 }
8362 }
8363 }
8364 template <size_t BatchLen>
8365 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
8366 {
8367 struct Entry {
8368 int64_t pool_id;
8369 BlueStore::CollectionRef c;
8370 ghobject_t oid;
8371 string key;
8372 bufferlist value;
8373 };
8374 struct Batch {
8375 std::atomic<size_t> running = { 0 };
8376 size_t entry_count = 0;
8377 std::array<Entry, BatchLen> entries;
8378
8379 int64_t errors = 0;
8380 int64_t warnings = 0;
8381 uint64_t num_objects = 0;
8382 uint64_t num_extents = 0;
8383 uint64_t num_blobs = 0;
8384 uint64_t num_sharded_objects = 0;
8385 uint64_t num_spanning_blobs = 0;
8386 store_statfs_t expected_store_statfs;
8387 BlueStore::per_pool_statfs expected_pool_statfs;
8388 };
8389
8390 size_t batchCount;
8391 BlueStore* store = nullptr;
8392
eafe8130 8393 ceph::mutex* sb_info_lock = nullptr;
20effc67
TL
8394 sb_info_space_efficient_map_t* sb_info = nullptr;
8395 shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
eafe8130
TL
8396 BlueStoreRepairer* repairer = nullptr;
8397
8398 Batch* batches = nullptr;
8399 size_t last_batch_pos = 0;
8400 bool batch_acquired = false;
8401
8402 FSCKWorkQueue(std::string n,
8403 size_t _batchCount,
8404 BlueStore* _store,
eafe8130 8405 ceph::mutex* _sb_info_lock,
20effc67
TL
8406 sb_info_space_efficient_map_t& _sb_info,
8407 shared_blob_2hash_tracker_t& _sb_ref_counts,
eafe8130 8408 BlueStoreRepairer* _repairer) :
f67539c2 8409 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
8410 batchCount(_batchCount),
8411 store(_store),
eafe8130
TL
8412 sb_info_lock(_sb_info_lock),
8413 sb_info(&_sb_info),
20effc67 8414 sb_ref_counts(&_sb_ref_counts),
eafe8130
TL
8415 repairer(_repairer)
8416 {
8417 batches = new Batch[batchCount];
8418 }
8419 ~FSCKWorkQueue() {
8420 delete[] batches;
8421 }
8422
8423 /// Remove all work items from the queue.
8424 void _clear() override {
8425 //do nothing
8426 }
8427 /// Check whether there is anything to do.
8428 bool _empty() override {
8429 ceph_assert(false);
8430 }
8431
8432 /// Get the next work item to process.
8433 void* _void_dequeue() override {
8434 size_t pos = rand() % batchCount;
8435 size_t pos0 = pos;
8436 do {
8437 auto& batch = batches[pos];
8438 if (batch.running.fetch_add(1) == 0) {
8439 if (batch.entry_count) {
8440 return &batch;
8441 }
8442 }
8443 batch.running--;
8444 pos++;
8445 pos %= batchCount;
8446 } while (pos != pos0);
8447 return nullptr;
8448 }
8449 /** @brief Process the work item.
8450 * This function will be called several times in parallel
8451 * and must therefore be thread-safe. */
8452 void _void_process(void* item, TPHandle& handle) override {
8453 Batch* batch = (Batch*)item;
8454
8455 BlueStore::FSCK_ObjectCtx ctx(
8456 batch->errors,
8457 batch->warnings,
8458 batch->num_objects,
8459 batch->num_extents,
8460 batch->num_blobs,
8461 batch->num_sharded_objects,
8462 batch->num_spanning_blobs,
8463 nullptr, // used_blocks
9f95a23c 8464 nullptr, //used_omap_head
20effc67 8465 nullptr,
eafe8130
TL
8466 sb_info_lock,
8467 *sb_info,
20effc67 8468 *sb_ref_counts,
eafe8130
TL
8469 batch->expected_store_statfs,
8470 batch->expected_pool_statfs,
8471 repairer);
8472
8473 for (size_t i = 0; i < batch->entry_count; i++) {
8474 auto& entry = batch->entries[i];
8475
8476 store->fsck_check_objects_shallow(
8477 BlueStore::FSCK_SHALLOW,
8478 entry.pool_id,
8479 entry.c,
8480 entry.oid,
8481 entry.key,
8482 entry.value,
9f95a23c 8483 nullptr, // expecting_shards - this will need a protection if passed
eafe8130
TL
8484 nullptr, // referenced
8485 ctx);
8486 }
eafe8130
TL
8487 batch->entry_count = 0;
8488 batch->running--;
8489 }
8490 /** @brief Synchronously finish processing a work item.
8491 * This function is called after _void_process with the global thread pool lock held,
8492 * so at most one copy will execute simultaneously for a given thread pool.
8493 * It can be used for non-thread-safe finalization. */
8494 void _void_process_finish(void*) override {
8495 ceph_assert(false);
8496 }
8497
8498 bool queue(
8499 int64_t pool_id,
8500 BlueStore::CollectionRef c,
8501 const ghobject_t& oid,
8502 const string& key,
8503 const bufferlist& value) {
8504 bool res = false;
8505 size_t pos0 = last_batch_pos;
8506 if (!batch_acquired) {
8507 do {
8508 auto& batch = batches[last_batch_pos];
8509 if (batch.running.fetch_add(1) == 0) {
8510 if (batch.entry_count < BatchLen) {
8511 batch_acquired = true;
8512 break;
8513 }
8514 }
8515 batch.running.fetch_sub(1);
8516 last_batch_pos++;
8517 last_batch_pos %= batchCount;
8518 } while (last_batch_pos != pos0);
8519 }
8520 if (batch_acquired) {
8521 auto& batch = batches[last_batch_pos];
8522 ceph_assert(batch.running);
8523 ceph_assert(batch.entry_count < BatchLen);
8524
8525 auto& entry = batch.entries[batch.entry_count];
8526 entry.pool_id = pool_id;
8527 entry.c = c;
8528 entry.oid = oid;
8529 entry.key = key;
8530 entry.value = value;
8531
8532 ++batch.entry_count;
8533 if (batch.entry_count == BatchLen) {
8534 batch_acquired = false;
8535 batch.running.fetch_sub(1);
8536 last_batch_pos++;
8537 last_batch_pos %= batchCount;
8538 }
8539 res = true;
8540 }
8541 return res;
8542 }
8543
8544 void finalize(ThreadPool& tp,
8545 BlueStore::FSCK_ObjectCtx& ctx) {
8546 if (batch_acquired) {
8547 auto& batch = batches[last_batch_pos];
8548 ceph_assert(batch.running);
8549 batch.running.fetch_sub(1);
8550 }
8551 tp.stop();
8552
8553 for (size_t i = 0; i < batchCount; i++) {
8554 auto& batch = batches[i];
8555
8556 //process leftovers if any
8557 if (batch.entry_count) {
8558 TPHandle tp_handle(store->cct,
8559 nullptr,
8560 timeout_interval,
8561 suicide_interval);
8562 ceph_assert(batch.running == 0);
8563
8564 batch.running++; // just to be on-par with the regular call
8565 _void_process(&batch, tp_handle);
8566 }
8567 ceph_assert(batch.entry_count == 0);
8568
8569 ctx.errors += batch.errors;
8570 ctx.warnings += batch.warnings;
8571 ctx.num_objects += batch.num_objects;
8572 ctx.num_extents += batch.num_extents;
8573 ctx.num_blobs += batch.num_blobs;
8574 ctx.num_sharded_objects += batch.num_sharded_objects;
8575 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 8576
eafe8130
TL
8577 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8578
8579 for (auto it = batch.expected_pool_statfs.begin();
8580 it != batch.expected_pool_statfs.end();
8581 it++) {
8582 ctx.expected_pool_statfs[it->first].add(it->second);
8583 }
8584 }
8585 }
8586 };
8587};
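// [Editor's sketch] Batches above are claimed lock-free: running.fetch_add(1)
// returning 0 means the caller is the sole owner; any other value means a
// worker got there first and the increment must be undone. The claim in
// isolation (std::atomic assumed available):
static bool example_try_claim(std::atomic<size_t>& running)
{
  if (running.fetch_add(1) == 0)
    return true;            // exclusive ownership acquired
  running.fetch_sub(1);     // contended: back off
  return false;
}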
8588
9f95a23c
TL
8589void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8590 OnodeRef& o,
8591 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 8592{
9f95a23c
TL
8593 auto& errors = ctx.errors;
8594 auto& warnings = ctx.warnings;
8595 auto repairer = ctx.repairer;
8596
8597 ceph_assert(o->onode.has_omap());
8598 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 8599 if (per_pool_omap == OMAP_PER_POOL) {
9f95a23c
TL
8600 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8601 << "fsck error: " << o->oid
8602 << " has omap that is not per-pool or pgmeta"
8603 << fsck_dendl;
8604 ++errors;
8605 } else {
8606 const char* w;
8607 int64_t num;
8608 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8609 ++errors;
8610 num = errors;
8611 w = "error";
8612 } else {
8613 ++warnings;
8614 num = warnings;
8615 w = "warning";
8616 }
8617 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8618 << "fsck " << w << ": " << o->oid
8619 << " has omap that is not per-pool or pgmeta"
8620 << fsck_dendl;
8621 }
f67539c2
TL
8622 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
8623 if (per_pool_omap == OMAP_PER_PG) {
8624 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8625 << "fsck error: " << o->oid
8626 << " has omap that is not per-pg or pgmeta"
8627 << fsck_dendl;
8628 ++errors;
8629 } else {
8630 const char* w;
8631 int64_t num;
8632 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
8633 ++errors;
8634 num = errors;
8635 w = "error";
8636 } else {
8637 ++warnings;
8638 num = warnings;
8639 w = "warning";
8640 }
8641 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8642 << "fsck " << w << ": " << o->oid
8643 << " has omap that is not per-pg or pgmeta"
8644 << fsck_dendl;
8645 }
9f95a23c
TL
8646 }
8647 if (repairer &&
f67539c2 8648 !o->onode.is_perpg_omap() &&
9f95a23c 8649 !o->onode.is_pgmeta_omap()) {
f67539c2 8650 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
522d829b 8651 bufferlist header;
9f95a23c 8652 map<string, bufferlist> kv;
522d829b
TL
8653 {
8654 KeyValueDB::Transaction txn = db->get_transaction();
8655 uint64_t txn_cost = 0;
8656 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
8657 uint8_t new_flags = o->onode.flags |
8658 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8659 bluestore_onode_t::FLAG_PERPG_OMAP;
8660 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
8661
8662 KeyValueDB::Iterator it = db->get_iterator(prefix);
8663 string head, tail;
8664 o->get_omap_header(&head);
8665 o->get_omap_tail(&tail);
8666 it->lower_bound(head);
8667 // head
8668 if (it->valid() && it->key() == head) {
8669 dout(30) << __func__ << " got header" << dendl;
8670 header = it->value();
8671 if (header.length()) {
8672 string new_head;
8673 Onode::calc_omap_header(new_flags, o.get(), &new_head);
8674 txn->set(new_omap_prefix, new_head, header);
8675 txn_cost += new_head.length() + header.length();
8676 }
a4b75251 8677 it->next();
522d829b
TL
8678 }
8679 // tail
8680 {
8681 string new_tail;
8682 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
8683 bufferlist empty;
8684 txn->set(new_omap_prefix, new_tail, empty);
8685 txn_cost += new_tail.length(); // the value is an empty bufferlist
8686 }
8687 // values
8688 string final_key;
8689 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
8690 size_t base_key_len = final_key.size();
8691 while (it->valid() && it->key() < tail) {
8692 string user_key;
8693 o->decode_omap_key(it->key(), &user_key);
8694 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
8695 << " -> " << user_key << dendl;
8696
8697 final_key.resize(base_key_len);
a4b75251 8698 final_key += user_key;
522d829b
TL
8699 auto v = it->value();
8700 txn->set(new_omap_prefix, final_key, v);
8701 txn_cost += final_key.length() + v.length();
8702
8703 // submit a portion if cost exceeds 16MB
8704 if (txn_cost >= 16 * (1 << 20) ) {
8705 db->submit_transaction_sync(txn);
8706 txn = db->get_transaction();
8707 txn_cost = 0;
8708 }
8709 it->next();
8710 }
8711 if (txn_cost > 0) {
8712 db->submit_transaction_sync(txn);
8713 }
8714 }
8715 // finalize: remove legacy data
8716 {
9f95a23c
TL
8717 KeyValueDB::Transaction txn = db->get_transaction();
8718 // remove old keys
8719 const string& old_omap_prefix = o->get_omap_prefix();
8720 string old_head, old_tail;
8721 o->get_omap_header(&old_head);
8722 o->get_omap_tail(&old_tail);
8723 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8724 txn->rmkey(old_omap_prefix, old_tail);
8725 // set flag
f67539c2 8726 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c 8727 _record_onode(o, txn);
9f95a23c
TL
8728 db->submit_transaction_sync(txn);
8729 repairer->inc_repaired();
522d829b 8730 repairer->request_compaction();
9f95a23c 8731 }
eafe8130 8732 }
9f95a23c 8733}
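// A sketch of the cost-bounded transaction batching used by the omap
// conversion above: accumulate an approximate byte cost per set() and flush
// whenever the threshold is crossed. FakeDB/FakeTxn are hypothetical
// stand-ins, not the KeyValueDB interface.
#if 0
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct FakeTxn {
  std::vector<std::pair<std::string, std::string>> sets;
  void set(const std::string& k, const std::string& v) { sets.emplace_back(k, v); }
};
struct FakeDB {
  std::unique_ptr<FakeTxn> get_transaction() { return std::make_unique<FakeTxn>(); }
  void submit_sync(std::unique_ptr<FakeTxn>) { /* pretend to persist */ }
};

void copy_in_batches(FakeDB& db,
		     const std::vector<std::pair<std::string, std::string>>& kvs,
		     std::size_t max_cost = 16 * (1 << 20)) {
  auto txn = db.get_transaction();
  std::size_t cost = 0;
  for (const auto& [k, v] : kvs) {
    txn->set(k, v);
    cost += k.size() + v.size();
    if (cost >= max_cost) {             // submit a portion, start a fresh txn
      db.submit_sync(std::move(txn));
      txn = db.get_transaction();
      cost = 0;
    }
  }
  if (cost > 0)
    db.submit_sync(std::move(txn));     // flush the remainder
}
#endif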
eafe8130 8734
20effc67
TL
8735void BlueStore::_fsck_check_objects(
8736 FSCKDepth depth,
9f95a23c
TL
8737 BlueStore::FSCK_ObjectCtx& ctx)
8738{
eafe8130 8739 auto& errors = ctx.errors;
eafe8130
TL
8740 auto sb_info_lock = ctx.sb_info_lock;
8741 auto& sb_info = ctx.sb_info;
20effc67 8742 auto& sb_ref_counts = ctx.sb_ref_counts;
eafe8130
TL
8743 auto repairer = ctx.repairer;
8744
8745 uint64_t_btree_t used_nids;
8746
8747 size_t processed_myself = 0;
8748
f67539c2 8749 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
8750 mempool::bluestore_fsck::list<string> expecting_shards;
8751 if (it) {
8752 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8753 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8754 std::unique_ptr<WQ> wq(
8755 new WQ(
8756 "FSCKWorkQueue",
8757 (thread_count ? : 1) * 32,
8758 this,
eafe8130
TL
8759 sb_info_lock,
8760 sb_info,
20effc67 8761 sb_ref_counts,
eafe8130
TL
8762 repairer));
8763
8764 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8765
8766 thread_pool.add_work_queue(wq.get());
8767 if (depth == FSCK_SHALLOW && thread_count > 0) {
8768 // not the best place, but let's check anyway
8769 ceph_assert(sb_info_lock);
8770 thread_pool.start();
8771 }
8772
20effc67 8773 // fill global if not overriden below
eafe8130
TL
8774 CollectionRef c;
8775 int64_t pool_id = -1;
8776 spg_t pgid;
8777 for (it->lower_bound(string()); it->valid(); it->next()) {
8778 dout(30) << __func__ << " key "
8779 << pretty_binary_string(it->key()) << dendl;
8780 if (is_extent_shard_key(it->key())) {
8781 if (depth == FSCK_SHALLOW) {
8782 continue;
8783 }
8784 while (!expecting_shards.empty() &&
8785 expecting_shards.front() < it->key()) {
8786 derr << "fsck error: missing shard key "
8787 << pretty_binary_string(expecting_shards.front())
8788 << dendl;
8789 ++errors;
8790 expecting_shards.pop_front();
8791 }
8792 if (!expecting_shards.empty() &&
8793 expecting_shards.front() == it->key()) {
8794 // all good
8795 expecting_shards.pop_front();
8796 continue;
8797 }
8798
8799 uint32_t offset;
8800 string okey;
8801 get_key_extent_shard(it->key(), &okey, &offset);
8802 derr << "fsck error: stray shard 0x" << std::hex << offset
8803 << std::dec << dendl;
8804 if (expecting_shards.empty()) {
8805 derr << "fsck error: " << pretty_binary_string(it->key())
8806 << " is unexpected" << dendl;
8807 ++errors;
8808 continue;
8809 }
8810 while (expecting_shards.front() > it->key()) {
8811 derr << "fsck error: saw " << pretty_binary_string(it->key())
8812 << dendl;
8813 derr << "fsck error: exp "
8814 << pretty_binary_string(expecting_shards.front()) << dendl;
8815 ++errors;
8816 expecting_shards.pop_front();
8817 if (expecting_shards.empty()) {
8818 break;
8819 }
8820 }
8821 continue;
8822 }
8823
8824 ghobject_t oid;
8825 int r = get_key_object(it->key(), &oid);
8826 if (r < 0) {
8827 derr << "fsck error: bad object key "
8828 << pretty_binary_string(it->key()) << dendl;
8829 ++errors;
8830 continue;
8831 }
8832 if (!c ||
8833 oid.shard_id != pgid.shard ||
8834 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8835 !c->contains(oid)) {
8836 c = nullptr;
8837 for (auto& p : coll_map) {
8838 if (p.second->contains(oid)) {
8839 c = p.second;
8840 break;
8841 }
8842 }
8843 if (!c) {
8844 derr << "fsck error: stray object " << oid
8845 << " not owned by any collection" << dendl;
8846 ++errors;
8847 continue;
8848 }
8849 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8850 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8851 << dendl;
8852 }
8853
8854 if (depth != FSCK_SHALLOW &&
8855 !expecting_shards.empty()) {
8856 for (auto& k : expecting_shards) {
8857 derr << "fsck error: missing shard key "
8858 << pretty_binary_string(k) << dendl;
8859 }
8860 ++errors;
8861 expecting_shards.clear();
8862 }
8863
8864 bool queued = false;
8865 if (depth == FSCK_SHALLOW && thread_count > 0) {
8866 queued = wq->queue(
8867 pool_id,
8868 c,
8869 oid,
8870 it->key(),
8871 it->value());
8872 }
8873 OnodeRef o;
8874 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8875
8876 if (!queued) {
8877 ++processed_myself;
eafe8130
TL
8878 o = fsck_check_objects_shallow(
8879 depth,
8880 pool_id,
8881 c,
8882 oid,
8883 it->key(),
8884 it->value(),
9f95a23c 8885 &expecting_shards,
eafe8130
TL
8886 &referenced,
8887 ctx);
8888 }
8889
8890 if (depth != FSCK_SHALLOW) {
8891 ceph_assert(o != nullptr);
8892 if (o->onode.nid) {
8893 if (o->onode.nid > nid_max) {
8894 derr << "fsck error: " << oid << " nid " << o->onode.nid
8895 << " > nid_max " << nid_max << dendl;
8896 ++errors;
8897 }
8898 if (used_nids.count(o->onode.nid)) {
8899 derr << "fsck error: " << oid << " nid " << o->onode.nid
8900 << " already in use" << dendl;
8901 ++errors;
8902 continue; // go for next object
8903 }
8904 used_nids.insert(o->onode.nid);
8905 }
8906 for (auto& i : referenced) {
8907 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8908 << std::dec << " for " << *i.first << dendl;
8909 const bluestore_blob_t& blob = i.first->get_blob();
8910 if (i.second & blob.unused) {
8911 derr << "fsck error: " << oid << " blob claims unused 0x"
8912 << std::hex << blob.unused
8913 << " but extents reference 0x" << i.second << std::dec
8914 << " on blob " << *i.first << dendl;
8915 ++errors;
8916 }
8917 if (blob.has_csum()) {
8918 uint64_t blob_len = blob.get_logical_length();
8919 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8920 unsigned csum_count = blob.get_csum_count();
8921 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8922 for (unsigned p = 0; p < csum_count; ++p) {
8923 unsigned pos = p * csum_chunk_size;
8924 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8925 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8926 unsigned mask = 1u << firstbit;
8927 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8928 mask |= 1u << b;
8929 }
8930 if ((blob.unused & mask) == mask) {
8931 // this csum chunk region is marked unused
8932 if (blob.get_csum_item(p) != 0) {
8933 derr << "fsck error: " << oid
8934 << " blob claims csum chunk 0x" << std::hex << pos
8935 << "~" << csum_chunk_size
8936 << " is unused (mask 0x" << mask << " of unused 0x"
8937 << blob.unused << ") but csum is non-zero 0x"
8938 << blob.get_csum_item(p) << std::dec << " on blob "
8939 << *i.first << dendl;
8940 ++errors;
8941 }
8942 }
8943 }
8944 }
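	  // worked example (illustrative numbers): with blob_len = 64 KiB and a
	  // 16-bit 'unused' bitmap, unused_chunk_size = 4 KiB; a csum_chunk_size
	  // of 8 KiB then maps csum chunk p to bits [2p, 2p+1], so the chunk is
	  // treated as unused only when both bits are set in blob.unused.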
8945 }
8946 // omap
8947 if (o->onode.has_omap()) {
9f95a23c
TL
8948 ceph_assert(ctx.used_omap_head);
8949 if (ctx.used_omap_head->count(o->onode.nid)) {
8950 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8951 << " already in use" << dendl;
eafe8130
TL
8952 ++errors;
8953 } else {
9f95a23c 8954 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8955 }
9f95a23c 8956 } // if (o->onode.has_omap())
eafe8130
TL
8957 if (depth == FSCK_DEEP) {
8958 bufferlist bl;
8959 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8960 uint64_t offset = 0;
8961 do {
8962 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8963 int r = _do_read(c.get(), o, offset, l, bl,
8964 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8965 if (r < 0) {
8966 ++errors;
8967 derr << "fsck error: " << oid << std::hex
8968 << " error during read: "
8969 << " " << offset << "~" << l
8970 << " " << cpp_strerror(r) << std::dec
8971 << dendl;
8972 break;
8973 }
8974 offset += l;
8975 } while (offset < o->onode.size);
8976 } // deep
8977 } //if (depth != FSCK_SHALLOW)
8978 } // for (it->lower_bound(string()); it->valid(); it->next())
8979 if (depth == FSCK_SHALLOW && thread_count > 0) {
8980 wq->finalize(thread_pool, ctx);
8981 if (processed_myself) {
8982 // maybe it needs more threads?
8983 dout(0) << __func__ << " partial offload"
8984 << ", done myself " << processed_myself
8985 << " of " << ctx.num_objects
8986 << "objects, threads " << thread_count
8987 << dendl;
8988 }
8989 }
8990 } // if (it)
8991}
8992/**
8993An overview of the currently implemented repair logic,
8994performed in fsck in two stages: detection (+preparation) and commit.
8995Detection stage (in processing order):
8996 (Issue -> Repair action to schedule)
8997 - Detect undecodable keys for Shared Blobs -> Remove
8998 - Detect undecodable records for Shared Blobs -> Remove
8999 (might trigger missed Shared Blob detection below)
9000 - Detect stray records for Shared Blobs -> Remove
9001 - Detect misreferenced pextents -> Fix
9002 Prepare Bloom-like filter to track cid/oid -> pextent
9003 Prepare list of extents that are improperly referenced
9004 Enumerate Onode records that might use 'misreferenced' pextents
9005 (Bloom-like filter applied to reduce computation)
9006 For each questionable Onode enumerate all blobs and identify broken ones
9007 (i.e. blobs having 'misreferences')
9008 Rewrite each broken blob data by allocating another extents and
9009 copying data there
9010 If blob is shared - unshare it and mark corresponding Shared Blob
9011 for removal
9012 Release previously allocated space
9013 Update Extent Map
9014 - Detect missed Shared Blobs -> Recreate
9015 - Detect undecodable deferred transaction -> Remove
9016 - Detect Freelist Manager's 'false free' entries -> Mark as used
9017 - Detect Freelist Manager's leaked entries -> Mark as free
9018 - Detect statfs inconsistency - Update
9019 Commit stage (separate DB commit per step):
9020 - Apply leaked FM entries fix
9021 - Apply 'false free' FM entries fix
9022 - Apply 'Remove' actions
9023 - Apply fix for misreference pextents
9024 - Apply Shared Blob recreate
9025 (can be merged with the step above if misreferences were detected)
9026 - Apply StatFS update
9027*/
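// A rough sketch of the "Bloom-like filter" mentioned above: hash each object
// that may touch a misreferenced pextent into a fixed bitset, so the Onode
// enumeration can cheaply skip objects that definitely were not involved.
// The size and single-hash scheme are illustrative, not what BlueStoreRepairer
// actually uses.
#if 0
#include <bitset>
#include <cstddef>
#include <functional>
#include <string>

struct ExampleCandidateFilter {
  static constexpr std::size_t N = 1u << 20;   // ~1M bits, illustrative
  std::bitset<N> bits;

  void mark(const std::string& oid_key) {
    bits.set(std::hash<std::string>{}(oid_key) % N);
  }
  // false -> object definitely not involved; true -> maybe, inspect fully
  bool maybe(const std::string& oid_key) const {
    return bits.test(std::hash<std::string>{}(oid_key) % N);
  }
};
#endif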
9028int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
9029{
20effc67 9030 dout(5) << __func__
eafe8130
TL
9031 << (repair ? " repair" : " check")
9032 << (depth == FSCK_DEEP ? " (deep)" :
9033 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
9034 << dendl;
9035
9036 // in deep mode we need R/W write access to be able to replay deferred ops
20effc67 9037 const bool read_only = !(repair || depth == FSCK_DEEP);
f67539c2 9038 int r = _open_db_and_around(read_only);
20effc67 9039 if (r < 0) {
eafe8130 9040 return r;
20effc67
TL
9041 }
9042 auto close_db = make_scope_guard([&] {
9043 _close_db_and_around();
9044 });
7c673cae 9045
11fdf7f2
TL
9046 if (!read_only) {
9047 r = _upgrade_super();
9048 if (r < 0) {
20effc67 9049 return r;
11fdf7f2
TL
9050 }
9051 }
7c673cae 9052
20effc67 9053 // NullFreelistManager needs to open collection early
eafe8130 9054 r = _open_collections();
20effc67
TL
9055 if (r < 0) {
9056 return r;
9057 }
7c673cae
FG
9058
9059 mempool_thread.init();
20effc67
TL
9060 auto stop_mempool = make_scope_guard([&] {
9061 mempool_thread.shutdown();
9062 _shutdown_cache();
9063 });
11fdf7f2
TL
9064 // we need finisher and kv_{sync,finalize}_thread *just* for replay
9065 // enable in repair or deep mode modes only
9066 if (!read_only) {
9067 _kv_start();
9068 r = _deferred_replay();
9069 _kv_stop();
9070 }
eafe8130 9071
20effc67
TL
9072 if (r < 0) {
9073 return r;
9074 }
9075 return _fsck_on_open(depth, repair);
eafe8130
TL
9076}
9077
9078int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
9079{
20effc67
TL
9080 uint64_t sb_hash_size = uint64_t(
9081 cct->_conf.get_val<Option::size_t>("osd_memory_target") *
9082 cct->_conf.get_val<double>(
9083 "bluestore_fsck_shared_blob_tracker_size"));
9084
eafe8130
TL
9085 dout(1) << __func__
9086 << " <<<START>>>"
9087 << (repair ? " repair" : " check")
9088 << (depth == FSCK_DEEP ? " (deep)" :
9089 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
20effc67
TL
9090 << " start sb_tracker_hash_size:" << sb_hash_size
9091 << dendl;
eafe8130
TL
9092 int64_t errors = 0;
9093 int64_t warnings = 0;
9094 unsigned repaired = 0;
9095
9096 uint64_t_btree_t used_omap_head;
eafe8130
TL
9097 uint64_t_btree_t used_sbids;
9098
f67539c2 9099 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130
TL
9100 KeyValueDB::Iterator it;
9101 store_statfs_t expected_store_statfs, actual_statfs;
9102 per_pool_statfs expected_pool_statfs;
9103
20effc67
TL
9104 sb_info_space_efficient_map_t sb_info;
9105 shared_blob_2hash_tracker_t sb_ref_counts(
9106 sb_hash_size,
9107 min_alloc_size);
9108 size_t sb_ref_mismatches = 0;
9109
9110 /// map of oid -> (first_)offset for each zone
9111 std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs; // FIXME: this may be a lot of RAM!
eafe8130
TL
9112
9113 uint64_t num_objects = 0;
9114 uint64_t num_extents = 0;
9115 uint64_t num_blobs = 0;
9116 uint64_t num_spanning_blobs = 0;
9117 uint64_t num_shared_blobs = 0;
9118 uint64_t num_sharded_objects = 0;
9119 BlueStoreRepairer repairer;
9120
f67539c2
TL
9121 auto alloc_size = fm->get_alloc_size();
9122
eafe8130
TL
9123 utime_t start = ceph_clock_now();
9124
9125 _fsck_collections(&errors);
b32b8144 9126 used_blocks.resize(fm->get_alloc_units());
7c673cae
FG
9127
9128 if (bluefs) {
f67539c2 9129 interval_set<uint64_t> bluefs_extents;
11fdf7f2 9130
f67539c2
TL
9131 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
9132 ceph_assert(r == 0);
9133 for (auto [start, len] : bluefs_extents) {
9134 apply_for_bitset_range(start, len, alloc_size, used_blocks,
9135 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
9136 ceph_assert(pos < bs.size());
7c673cae 9137 bs.set(pos);
f67539c2
TL
9138 }
9139 );
9140 }
9141 }
9142
9143 bluefs_used_blocks = used_blocks;
9144
9145 apply_for_bitset_range(
9146 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
9147 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9148 bs.set(pos);
7c673cae 9149 }
f67539c2
TL
9150 );
9151
9152
9153 if (repair) {
b3b6e05e 9154 repairer.init_space_usage_tracker(
f67539c2
TL
9155 bdev->get_size(),
9156 min_alloc_size);
9157 }
9158
9159 if (bluefs) {
eafe8130 9160 int r = bluefs->fsck();
7c673cae 9161 if (r < 0) {
eafe8130 9162 return r;
7c673cae
FG
9163 }
9164 if (r > 0)
9165 errors += r;
9166 }
9167
eafe8130
TL
9168 if (!per_pool_stat_collection) {
9169 const char *w;
9170 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
9171 w = "error";
9172 ++errors;
9173 } else {
9174 w = "warning";
9175 ++warnings;
9176 }
9177 derr << "fsck " << w << ": store not yet converted to per-pool stats"
9178 << dendl;
9179 }
f67539c2 9180 if (per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
9181 const char *w;
9182 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
9183 w = "error";
9184 ++errors;
9185 } else {
9186 w = "warning";
9187 ++warnings;
9188 }
f67539c2 9189 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9f95a23c
TL
9190 << dendl;
9191 }
9192
11fdf7f2 9193 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
9194 // structs
9195 statfs(&actual_statfs);
11fdf7f2
TL
9196 actual_statfs.total = 0;
9197 actual_statfs.internally_reserved = 0;
9198 actual_statfs.available = 0;
9199 actual_statfs.internal_metadata = 0;
9200 actual_statfs.omap_allocated = 0;
9201
eafe8130
TL
9202 if (g_conf()->bluestore_debug_fsck_abort) {
9203 dout(1) << __func__ << " debug abort" << dendl;
9204 goto out_scan;
9205 }
20effc67
TL
9206
9207#ifdef HAVE_LIBZBD
9208 if (bdev->is_smr()) {
9209 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9210 ceph_assert(a);
9211 auto f = dynamic_cast<ZonedFreelistManager*>(fm);
9212 ceph_assert(f);
9213 vector<uint64_t> wp = bdev->get_zones();
9214 vector<zone_state_t> zones = f->get_zone_states(db);
9215 ceph_assert(wp.size() == zones.size());
9216 auto num_zones = bdev->get_size() / zone_size;
9217 for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
9218 uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
9219 if (zones[i].write_pointer > p &&
9220 zones[i].num_dead_bytes < zones[i].write_pointer) {
9221 derr << "fsck error: zone 0x" << std::hex << i
9222 << " bluestore write pointer 0x" << zones[i].write_pointer
9223 << " > device write pointer 0x" << p
9224 << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
9225 << std::dec << dendl;
9226 ++errors;
9227 }
9228 }
9229
9230 if (depth != FSCK_SHALLOW) {
9231 // load zone refs
9232 zone_refs.resize(bdev->get_size() / zone_size);
9233 it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
9234 if (it) {
9235 for (it->lower_bound(string());
9236 it->valid();
9237 it->next()) {
9238 uint32_t zone = 0;
9239 uint64_t offset = 0;
9240 ghobject_t oid;
9241 string key = it->key();
9242 int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
9243 if (r < 0) {
9244 derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
9245 << dendl;
9246 if (repair) {
9247 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9248 }
9249 ++errors;
9250 continue;
9251 }
9252 dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
9253 << " -> " << std::dec << oid << dendl;
9254 if (zone_refs[zone].count(oid)) {
9255 derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
9256 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
9257 if (repair) {
9258 repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
9259 }
9260 ++errors;
9261 continue;
9262 }
9263 zone_refs[zone][oid] = offset;
9264 }
9265 }
9266 }
9267 }
9268#endif
9269
9270 dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
9271 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
9272 if (it) {
9273 for (it->lower_bound(string()); it->valid(); it->next()) {
9274 string key = it->key();
9275 uint64_t sbid;
9276 if (get_key_shared_blob(key, &sbid) < 0) {
9277 // Failed to parse the key.
9278 // This is going to be handled in the second stage
9279 continue;
9280 }
9281 bluestore_shared_blob_t shared_blob(sbid);
9282 bufferlist bl = it->value();
9283 auto blp = bl.cbegin();
9284 try {
9285 decode(shared_blob, blp);
9286 }
9287 catch (ceph::buffer::error& e) {
9288 // this is going to be handled in the second stage
9289 continue;
9290 }
9291 dout(20) << __func__ << " " << shared_blob << dendl;
9292 auto& sbi = sb_info.add_maybe_stray(sbid);
9293
9294 // primarily to silence the 'unused' warning
9295 ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
9296
9297 for (auto& r : shared_blob.ref_map.ref_map) {
9298 sb_ref_counts.inc_range(
9299 sbid,
9300 r.first,
9301 r.second.length,
9302 -r.second.refs);
9303 }
9304 }
9305 } // if (it) //checking shared_blobs (phase1)
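  // The tracker acts like a signed counting filter: phase 1 above charges each
  // shared blob's ref_map negatively (-refs), while the per-object walk below
  // is expected to credit every actual reference positively, so any non-zero
  // residue reported by count_non_zero() afterwards indicates a mismatch.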
9306
7c673cae 9307 // walk PREFIX_OBJ
eafe8130
TL
9308 {
9309 dout(1) << __func__ << " walking object keyspace" << dendl;
9310 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
9311 BlueStore::FSCK_ObjectCtx ctx(
9312 errors,
9313 warnings,
9314 num_objects,
9315 num_extents,
9316 num_blobs,
9317 num_sharded_objects,
9318 num_spanning_blobs,
9319 &used_blocks,
9320 &used_omap_head,
20effc67 9321 &zone_refs,
9f95a23c
TL
9322 // no need for the below lock when in non-shallow mode as
9323 // there is no multithreading in this case
9324 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130 9325 sb_info,
20effc67 9326 sb_ref_counts,
eafe8130
TL
9327 expected_store_statfs,
9328 expected_pool_statfs,
9329 repair ? &repairer : nullptr);
9f95a23c
TL
9330
9331 _fsck_check_objects(depth, ctx);
eafe8130 9332 }
11fdf7f2 9333
20effc67
TL
9334#ifdef HAVE_LIBZBD
9335 if (bdev->is_smr() && depth != FSCK_SHALLOW) {
9336 dout(1) << __func__ << " checking for leaked zone refs" << dendl;
9337 for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
9338 for (auto& [oid, offset] : zone_refs[zone]) {
9339 derr << "fsck error: stray zone ref 0x" << std::hex << zone
9340 << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
9341 // FIXME: add repair
9342 ++errors;
9343 }
9344 }
9345 }
9346#endif
9347
9348 sb_ref_mismatches = sb_ref_counts.count_non_zero();
9349 if (sb_ref_mismatches != 0) {
9350 derr << "fsck error: shared blob references aren't matching, at least "
9351 << sb_ref_mismatches << " found" << dendl;
9352 errors += sb_ref_mismatches;
9353 }
9354
9355 if (depth != FSCK_SHALLOW && repair) {
9356 _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
9357 }
9358 dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
f67539c2 9359 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 9360 if (it) {
eafe8130
TL
9361 // FIXME minor: perhaps simplify for shallow mode?
9362 // fill global if not overridden below
9363 auto expected_statfs = &expected_store_statfs;
7c673cae
FG
9364 for (it->lower_bound(string()); it->valid(); it->next()) {
9365 string key = it->key();
9366 uint64_t sbid;
9367 if (get_key_shared_blob(key, &sbid)) {
3efd9988 9368 derr << "fsck error: bad key '" << key
20effc67 9369 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
9370 if (repair) {
9371 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9372 }
7c673cae
FG
9373 ++errors;
9374 continue;
9375 }
9376 auto p = sb_info.find(sbid);
9377 if (p == sb_info.end()) {
20effc67
TL
9378 if (sb_ref_mismatches > 0) {
9379 // most likely this has already been reported, ignoring...
9380 dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
9381 << std::hex << sbid << std::dec << dendl;
9382 } else {
9383 derr<< "fsck error: found stray shared blob data for sbid 0x"
9384 << std::hex << sbid << std::dec << dendl;
9385 ++errors;
9386 if (repair) {
9387 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9388 }
11fdf7f2 9389 }
7c673cae
FG
9390 } else {
9391 ++num_shared_blobs;
20effc67 9392 sb_info_t& sbi = *p;
7c673cae
FG
9393 bluestore_shared_blob_t shared_blob(sbid);
9394 bufferlist bl = it->value();
11fdf7f2
TL
9395 auto blp = bl.cbegin();
9396 try {
20effc67
TL
9397 decode(shared_blob, blp);
9398 }
9399 catch (ceph::buffer::error& e) {
7c673cae 9400 ++errors;
20effc67
TL
9401
9402 derr << "fsck error: failed to decode Shared Blob"
9403 << pretty_binary_string(key) << dendl;
9404 if (repair) {
9405 dout(20) << __func__ << " undecodable Shared Blob, key:'"
9406 << pretty_binary_string(key)
9407 << "', removing" << dendl;
9408 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
9409 }
9410 continue;
7c673cae 9411 }
20effc67 9412 dout(20) << __func__ << " " << shared_blob << dendl;
7c673cae 9413 PExtentVector extents;
20effc67 9414 for (auto& r : shared_blob.ref_map.ref_map) {
7c673cae
FG
9415 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
9416 }
20effc67
TL
9417 if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
9418 (per_pool_stat_collection || repair)) {
11fdf7f2
TL
9419 expected_statfs = &expected_pool_statfs[sbi.pool_id];
9420 }
20effc67
TL
9421 std::stringstream ss;
9422 ss << "sbid 0x" << std::hex << sbid << std::dec;
9423 errors += _fsck_check_extents(ss.str(),
9424 extents,
9425 sbi.allocated_chunks < 0,
9426 used_blocks,
9427 fm->get_alloc_size(),
9428 repair ? &repairer : nullptr,
9429 *expected_statfs,
9430 depth);
11fdf7f2
TL
9431 }
9432 }
20effc67 9433 } // if (it) /* checking shared_blobs (phase 2)*/
11fdf7f2
TL
9434
9435 if (repair && repairer.preprocess_misreference(db)) {
9436
9437 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
11fdf7f2
TL
9438 auto& misref_extents = repairer.get_misreferences();
9439 interval_set<uint64_t> to_release;
f67539c2 9440 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 9441 if (it) {
eafe8130
TL
9442 // fill global if not overridden below
9443 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
9444
9445 CollectionRef c;
9446 spg_t pgid;
9447 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
9448 bool bypass_rest = false;
9449 for (it->lower_bound(string()); it->valid() && !bypass_rest;
9450 it->next()) {
9451 dout(30) << __func__ << " key "
9452 << pretty_binary_string(it->key()) << dendl;
9453 if (is_extent_shard_key(it->key())) {
9454 continue;
9455 }
9456
9457 ghobject_t oid;
9458 int r = get_key_object(it->key(), &oid);
b3b6e05e 9459 if (r < 0 || !repairer.is_used(oid)) {
11fdf7f2
TL
9460 continue;
9461 }
9462
9463 if (!c ||
9464 oid.shard_id != pgid.shard ||
9465 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
9466 !c->contains(oid)) {
9467 c = nullptr;
9468 for (auto& p : coll_map) {
9469 if (p.second->contains(oid)) {
9470 c = p.second;
9471 break;
9472 }
9473 }
9474 if (!c) {
9475 continue;
9476 }
eafe8130
TL
9477 if (per_pool_stat_collection || repair) {
9478 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
9479 expected_statfs = &expected_pool_statfs[pool_id];
9480 }
9481 }
b3b6e05e 9482 if (!repairer.is_used(c->cid)) {
11fdf7f2
TL
9483 continue;
9484 }
9485
9486 dout(20) << __func__ << " check misreference for col:" << c->cid
9487 << " obj:" << oid << dendl;
9488
eafe8130
TL
9489 OnodeRef o;
9490 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
9491 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9492 mempool::bluestore_fsck::set<BlobRef> blobs;
9493
9494 for (auto& e : o->extent_map.extent_map) {
9495 blobs.insert(e.blob);
9496 }
9497 bool need_onode_update = false;
9498 bool first_dump = true;
9499 for(auto b : blobs) {
9500 bool broken_blob = false;
9501 auto& pextents = b->dirty_blob().dirty_extents();
9502 for (auto& e : pextents) {
9503 if (!e.is_valid()) {
9504 continue;
9505 }
9506 // for the sake of simplicity and proper shared blob handling
9507 // always rewrite the whole blob even when it's partially
9508 // misreferenced.
9509 if (misref_extents.intersects(e.offset, e.length)) {
9510 if (first_dump) {
9511 first_dump = false;
81eedcae 9512 _dump_onode<10>(cct, *o);
11fdf7f2
TL
9513 }
9514 broken_blob = true;
9515 break;
9516 }
9517 }
9518 if (!broken_blob)
9519 continue;
9520 bool compressed = b->get_blob().is_compressed();
9521 need_onode_update = true;
9522 dout(10) << __func__
9523 << " fix misreferences in oid:" << oid
9524 << " " << *b << dendl;
9525 uint64_t b_off = 0;
9526 PExtentVector pext_to_release;
9527 pext_to_release.reserve(pextents.size());
9528 // rewriting all valid pextents
9529 for (auto e = pextents.begin(); e != pextents.end();
a4b75251
TL
9530 e++) {
9531 auto b_off_cur = b_off;
9532 b_off += e->length;
11fdf7f2
TL
9533 if (!e->is_valid()) {
9534 continue;
9535 }
9536 PExtentVector exts;
20effc67 9537 dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
f67539c2 9538 int64_t alloc_len =
20effc67 9539 alloc->allocate(e->length, min_alloc_size,
f67539c2 9540 0, 0, &exts);
eafe8130 9541 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
9542 derr << __func__
9543 << " failed to allocate 0x" << std::hex << e->length
eafe8130 9544 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 9545 << " min_alloc_size 0x" << min_alloc_size
20effc67 9546 << " available 0x " << alloc->get_free()
11fdf7f2
TL
9547 << std::dec << dendl;
9548 if (alloc_len > 0) {
20effc67 9549 alloc->release(exts);
11fdf7f2
TL
9550 }
9551 bypass_rest = true;
9552 break;
9553 }
9554 expected_statfs->allocated += e->length;
9555 if (compressed) {
9556 expected_statfs->data_compressed_allocated += e->length;
9557 }
9558
9559 bufferlist bl;
20effc67 9560 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
11fdf7f2
TL
9561 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
9562 if (r < 0) {
9563 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
9564 <<"~" << e->length << std::dec << dendl;
9565 ceph_abort_msg("read failed, wtf");
9566 }
9567 pext_to_release.push_back(*e);
9568 e = pextents.erase(e);
9569 e = pextents.insert(e, exts.begin(), exts.end());
9570 b->get_blob().map_bl(
20effc67 9571 b_off_cur, bl,
11fdf7f2
TL
9572 [&](uint64_t offset, bufferlist& t) {
9573 int r = bdev->write(offset, t, false);
9574 ceph_assert(r == 0);
9575 });
9576 e += exts.size() - 1;
9577 for (auto& p : exts) {
9578 fm->allocate(p.offset, p.length, txn);
9579 }
9580 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
9581
9582 if (b->get_blob().is_shared()) {
9583 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
9584
20effc67
TL
9585 auto sbid = b->shared_blob->get_sbid();
9586 auto sb_it = sb_info.find(sbid);
11fdf7f2 9587 ceph_assert(sb_it != sb_info.end());
20effc67
TL
9588 sb_info_t& sbi = *sb_it;
9589
9590 if (sbi.allocated_chunks < 0) {
9591 // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
9592 // as we originally used that value while accumulating
9593 // expected_statfs
9594 expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9595 expected_statfs->data_compressed_allocated -=
9596 uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
9597 } else {
9598 expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
11fdf7f2 9599 }
20effc67
TL
9600 sbi.allocated_chunks = 0;
9601 repairer.fix_shared_blob(txn, sbid, nullptr, 0);
9602
11fdf7f2
TL
9603 // relying on blob's pextents to decide what to release.
9604 for (auto& p : pext_to_release) {
9605 to_release.union_insert(p.offset, p.length);
9606 }
9607 } else {
9608 for (auto& p : pext_to_release) {
9609 expected_statfs->allocated -= p.length;
9610 if (compressed) {
9611 expected_statfs->data_compressed_allocated -= p.length;
9612 }
9613 to_release.union_insert(p.offset, p.length);
9614 }
9615 }
9616 if (bypass_rest) {
9617 break;
9618 }
9619 } // for(auto b : blobs)
9620 if (need_onode_update) {
9621 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
9622 _record_onode(o, txn);
9623 }
9624 } // for (it->lower_bound(string()); it->valid(); it->next())
9625
9626 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
9627 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
9628 << "~" << it.get_len() << std::dec << dendl;
9629 fm->release(it.get_start(), it.get_len(), txn);
9630 }
20effc67 9631 alloc->release(to_release);
11fdf7f2
TL
9632 to_release.clear();
9633 } // if (it) {
9634 } //if (repair && repairer.preprocess_misreference()) {
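  // Usage sketch of the accumulate-then-release pattern above: interval_set
  // coalesces adjacent ranges on union_insert, so the final loop issues one
  // release per contiguous region instead of one per pextent. release() is a
  // placeholder here, not a BlueStore call.
#if 0
  interval_set<uint64_t> pending;
  pending.union_insert(0x1000, 0x1000);
  pending.union_insert(0x2000, 0x1000);   // merges into 0x1000~0x2000
  for (auto it = pending.begin(); it != pending.end(); ++it) {
    release(it.get_start(), it.get_len()); // one call for the merged range
  }
#endif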
11fdf7f2 9635 sb_info.clear();
20effc67 9636 sb_ref_counts.reset();
11fdf7f2 9637
eafe8130
TL
9638 // check global stats only if fscking (not repairing) w/o per-pool stats
9639 if (!per_pool_stat_collection &&
9640 !repair &&
9641 !(actual_statfs == expected_store_statfs)) {
9642 derr << "fsck error: actual " << actual_statfs
9643 << " != expected " << expected_store_statfs << dendl;
9644 if (repair) {
9645 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
9646 expected_store_statfs);
11fdf7f2 9647 }
eafe8130 9648 ++errors;
7c673cae
FG
9649 }
9650
eafe8130
TL
9651 dout(1) << __func__ << " checking pool_statfs" << dendl;
9652 _fsck_check_pool_statfs(expected_pool_statfs,
9653 errors, warnings, repair ? &repairer : nullptr);
9654
9655 if (depth != FSCK_SHALLOW) {
9f95a23c 9656 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 9657 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9658 if (it) {
9f95a23c 9659 uint64_t last_omap_head = 0;
eafe8130
TL
9660 for (it->lower_bound(string()); it->valid(); it->next()) {
9661 uint64_t omap_head;
f67539c2 9662
eafe8130 9663 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 9664
9f95a23c 9665 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9666 omap_head != last_omap_head) {
20effc67 9667 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9668 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9669 << "fsck error: found stray omap data on omap_head "
20effc67
TL
9670 << omap_head << " " << last_omap_head
9671 << " prefix/key: " << url_escape(rk.first)
9672 << " " << url_escape(rk.second)
9673 << fsck_dendl;
f67539c2
TL
9674 ++errors;
9675 last_omap_head = omap_head;
eafe8130 9676 }
7c673cae
FG
9677 }
9678 }
f67539c2 9679 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 9680 if (it) {
9f95a23c 9681 uint64_t last_omap_head = 0;
eafe8130
TL
9682 for (it->lower_bound(string()); it->valid(); it->next()) {
9683 uint64_t omap_head;
9684 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
9685 if (used_omap_head.count(omap_head) == 0 &&
9686 omap_head != last_omap_head) {
20effc67 9687 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9688 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9689 << "fsck error: found stray (pgmeta) omap data on omap_head "
20effc67
TL
9690 << omap_head << " " << last_omap_head
9691 << " prefix/key: " << url_escape(rk.first)
9692 << " " << url_escape(rk.second)
9693 << fsck_dendl;
9f95a23c 9694 last_omap_head = omap_head;
eafe8130
TL
9695 ++errors;
9696 }
11fdf7f2
TL
9697 }
9698 }
f67539c2 9699 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9f95a23c
TL
9700 if (it) {
9701 uint64_t last_omap_head = 0;
9702 for (it->lower_bound(string()); it->valid(); it->next()) {
9703 uint64_t pool;
9704 uint64_t omap_head;
9705 string k = it->key();
9706 const char *c = k.c_str();
9707 c = _key_decode_u64(c, &pool);
9708 c = _key_decode_u64(c, &omap_head);
9709 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 9710 omap_head != last_omap_head) {
20effc67 9711 pair<string,string> rk = it->raw_key();
9f95a23c
TL
9712 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9713 << "fsck error: found stray (per-pool) omap data on omap_head "
20effc67
TL
9714 << omap_head << " " << last_omap_head
9715 << " prefix/key: " << url_escape(rk.first)
9716 << " " << url_escape(rk.second)
9717 << fsck_dendl;
9f95a23c 9718 ++errors;
f67539c2
TL
9719 last_omap_head = omap_head;
9720 }
9721 }
9722 }
9723 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9724 if (it) {
9725 uint64_t last_omap_head = 0;
9726 for (it->lower_bound(string()); it->valid(); it->next()) {
9727 uint64_t pool;
9728 uint32_t hash;
9729 uint64_t omap_head;
9730 string k = it->key();
9731 const char* c = k.c_str();
9732 c = _key_decode_u64(c, &pool);
9733 c = _key_decode_u32(c, &hash);
9734 c = _key_decode_u64(c, &omap_head);
9735 if (used_omap_head.count(omap_head) == 0 &&
9736 omap_head != last_omap_head) {
9737 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
9738 << "fsck error: found stray (per-pg) omap data on omap_head "
20effc67 9739 << " key " << pretty_binary_string(it->key())
f67539c2
TL
9740 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
9741 ++errors;
9742 last_omap_head = omap_head;
9f95a23c
TL
9743 }
9744 }
9745 }
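    // omap key layouts scanned above, as decoded: bulk = u64 omap_head + key;
    // per-pool = u64 pool + u64 omap_head + ...; per-pg additionally carries
    // a u32 hash between the pool id and the omap_head.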
eafe8130 9746 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 9747 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
9748 if (it) {
9749 for (it->lower_bound(string()); it->valid(); it->next()) {
9750 bufferlist bl = it->value();
9751 auto p = bl.cbegin();
9752 bluestore_deferred_transaction_t wt;
9753 try {
9754 decode(wt, p);
f67539c2 9755 } catch (ceph::buffer::error& e) {
eafe8130
TL
9756 derr << "fsck error: failed to decode deferred txn "
9757 << pretty_binary_string(it->key()) << dendl;
9758 if (repair) {
9759 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
9760 << pretty_binary_string(it->key())
9761 << "', removing" << dendl;
9762 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9763 }
9764 continue;
9765 }
9766 dout(20) << __func__ << " deferred " << wt.seq
9767 << " ops " << wt.ops.size()
9768 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9769 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 9770 apply_for_bitset_range(
f67539c2 9771 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 9772 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
9773 bs.set(pos);
9774 }
9775 );
9776 }
7c673cae 9777 }
eafe8130
TL
9778 }
9779
20effc67
TL
9780 // skip freelist vs allocated compare when we have Null fm
9781 if (!fm->is_null_manager()) {
9782 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9783#ifdef HAVE_LIBZBD
9784 if (freelist_type == "zoned") {
9785 // verify per-zone state
9786 // - verify no allocations beyond write pointer
9787 // - verify num_dead_bytes count (neither allocated nor
9788 // free space past the write pointer)
9789 auto a = dynamic_cast<ZonedAllocator*>(alloc);
9790 auto num_zones = bdev->get_size() / zone_size;
9791
9792 // mark the free space past the write pointer
9793 for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
9794 auto wp = a->get_write_pointer(zone);
9795 uint64_t offset = zone_size * zone + wp;
9796 uint64_t length = zone_size - wp;
9797 if (!length) {
9798 continue;
9799 }
9800 bool intersects = false;
9801 dout(10) << " marking zone 0x" << std::hex << zone
9802 << " region after wp 0x" << offset << "~" << length
9803 << std::dec << dendl;
9804 apply_for_bitset_range(
9805 offset, length, alloc_size, used_blocks,
9806 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9807 if (bs.test(pos)) {
9808 derr << "fsck error: zone 0x" << std::hex << zone
9809 << " has used space at 0x" << pos * alloc_size
9810 << " beyond write pointer 0x" << wp
9811 << std::dec << dendl;
9812 intersects = true;
eafe8130 9813 } else {
20effc67 9814 bs.set(pos);
11fdf7f2 9815 }
20effc67
TL
9816 }
9817 );
9818 if (intersects) {
9819 ++errors;
9820 }
9821 }
9822
9823 used_blocks.flip();
9824
9825 // skip conventional zones
9826 uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
9827 pos = used_blocks.find_next(pos);
9828
9829 uint64_t zone_dead = 0;
9830 for (uint32_t zone = first_sequential_zone;
9831 zone < num_zones;
9832 ++zone, zone_dead = 0) {
9833 while (pos != decltype(used_blocks)::npos &&
9834 (pos * min_alloc_size) / zone_size == zone) {
9835 dout(40) << " zone 0x" << std::hex << zone
9836 << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
9837 << std::dec << dendl;
9838 zone_dead += min_alloc_size;
9839 pos = used_blocks.find_next(pos);
9840 }
9841 dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
9842 << std::dec << dendl;
9843 // cross-check dead bytes against zone state
9844 if (a->get_dead_bytes(zone) != zone_dead) {
9845 derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
9846 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
9847 << dendl;
9848 ++errors;
9849 // TODO: repair
9850 }
9851 }
9852 used_blocks.flip();
9853 } else
9854#endif
9855 {
9856 fm->enumerate_reset();
9857 uint64_t offset, length;
9858 while (fm->enumerate_next(db, &offset, &length)) {
9859 bool intersects = false;
9860 apply_for_bitset_range(
9861 offset, length, alloc_size, used_blocks,
9862 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9863 ceph_assert(pos < bs.size());
9864 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
9865 if (offset == SUPER_RESERVED &&
9866 length == min_alloc_size - SUPER_RESERVED) {
9867 // this is due to the change just after luminous to min_alloc_size
9868 // granularity allocations, and our baked in assumption at the top
9869 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9870 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9871 // since we will never allocate this region below min_alloc_size.
9872 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9873 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9874 << length << std::dec << dendl;
9875 } else {
9876 intersects = true;
9877 if (repair) {
9878 repairer.fix_false_free(db, fm,
9879 pos * min_alloc_size,
9880 min_alloc_size);
9881 }
9882 }
9883 } else {
9884 bs.set(pos);
eafe8130 9885 }
11fdf7f2 9886 }
20effc67
TL
9887 );
9888 if (intersects) {
9889 derr << "fsck error: free extent 0x" << std::hex << offset
9890 << "~" << length << std::dec
9891 << " intersects allocated blocks" << dendl;
9892 ++errors;
b5b8bbf5 9893 }
20effc67
TL
9894 }
9895 fm->enumerate_reset();
9896
9897 // check for leaked extents
9898 size_t count = used_blocks.count();
9899 if (used_blocks.size() != count) {
9900 ceph_assert(used_blocks.size() > count);
9901 used_blocks.flip();
9902 size_t start = used_blocks.find_first();
9903 while (start != decltype(used_blocks)::npos) {
9904 size_t cur = start;
9905 while (true) {
9906 size_t next = used_blocks.find_next(cur);
9907 if (next != cur + 1) {
9908 ++errors;
9909 derr << "fsck error: leaked extent 0x" << std::hex
9910 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9911 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9912 << dendl;
9913 if (repair) {
9914 repairer.fix_leaked(db,
9915 fm,
9916 start * min_alloc_size,
9917 (cur + 1 - start) * min_alloc_size);
9918 }
9919 start = next;
9920 break;
9921 }
9922 cur = next;
9923 }
9924 }
9925 used_blocks.flip();
9926 }
b5b8bbf5 9927 }
7c673cae
FG
9928 }
9929 }
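  // Sketch of the leaked-extent detection above: flip the used-block bitset so
  // allocated-but-unreferenced units become set bits, then walk them with
  // find_first/find_next, emitting one extent per maximal run of consecutive
  // bits. boost::dynamic_bitset offers the same find_first/find_next API.
#if 0
#include <boost/dynamic_bitset.hpp>
#include <cstdint>
#include <cstdio>

inline void report_leaked_runs(const boost::dynamic_bitset<>& bs, uint64_t unit) {
  std::size_t start = bs.find_first();
  while (start != boost::dynamic_bitset<>::npos) {
    std::size_t cur = start;
    std::size_t next = bs.find_next(cur);
    while (next == cur + 1) {            // extend the current run
      cur = next;
      next = bs.find_next(cur);
    }
    std::printf("leaked 0x%llx~0x%llx\n",
		(unsigned long long)(start * unit),
		(unsigned long long)((cur + 1 - start) * unit));
    start = next;
  }
}
#endif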
11fdf7f2 9930 if (repair) {
f67539c2
TL
9931 if (per_pool_omap != OMAP_PER_PG) {
9932 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
9933 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
9f95a23c
TL
9934 }
9935
11fdf7f2
TL
9936 dout(5) << __func__ << " applying repair results" << dendl;
9937 repaired = repairer.apply(db);
9938 dout(5) << __func__ << " repair applied" << dendl;
9939 }
7c673cae 9940
eafe8130 9941out_scan:
7c673cae
FG
9942 dout(2) << __func__ << " " << num_objects << " objects, "
9943 << num_sharded_objects << " of them sharded. "
9944 << dendl;
9945 dout(2) << __func__ << " " << num_extents << " extents to "
9946 << num_blobs << " blobs, "
9947 << num_spanning_blobs << " spanning, "
9948 << num_shared_blobs << " shared."
9949 << dendl;
9950
9951 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
9952 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9953 << warnings << " warnings, "
9954 << repaired << " repaired, "
9955 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 9956 << duration << " seconds" << dendl;
9f95a23c
TL
9957
9958 // In non-repair mode we should return the error count only, as
9959 // it indicates whether the store status is OK.
9960 // In repair mode both errors and warnings are taken into account
9961 // since the repaired counter relates to them both.
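  // e.g. (illustrative) 5 errors + 3 warnings - 7 repaired -> returns 1.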
9962 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
9963}
9964
9965/// methods to inject various errors fsck can repair
9966void BlueStore::inject_broken_shared_blob_key(const string& key,
9967 const bufferlist& bl)
9968{
9969 KeyValueDB::Transaction txn;
9970 txn = db->get_transaction();
9971 txn->set(PREFIX_SHARED_BLOB, key, bl);
9972 db->submit_transaction_sync(txn);
9973};
9974
a4b75251
TL
9975void BlueStore::inject_no_shared_blob_key()
9976{
9977 KeyValueDB::Transaction txn;
9978 txn = db->get_transaction();
9979 ceph_assert(blobid_last > 0);
9980 // kill the last used sbid, this can be broken due to blobid preallocation
9981 // in rare cases, leaving as-is for the sake of simplicity
9982 uint64_t sbid = blobid_last;
9983
9984 string key;
9985 dout(5) << __func__ << " " << sbid << dendl;
9986 get_shared_blob_key(sbid, &key);
9987 txn->rmkey(PREFIX_SHARED_BLOB, key);
9988 db->submit_transaction_sync(txn);
9989};
9990
20effc67 9991void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
11fdf7f2
TL
9992{
9993 KeyValueDB::Transaction txn;
9994 txn = db->get_transaction();
9995
20effc67
TL
9996 dout(5) << __func__ << " " << sbid << dendl;
9997
9998 string key;
9999 get_shared_blob_key(sbid, &key);
10000 bluestore_shared_blob_t persistent(sbid);
10001 persistent.ref_map.get(0xdead0000, 0x1000);
10002 bufferlist bl;
10003 encode(persistent, bl);
10004 dout(20) << __func__ << " sbid " << sbid
10005 << " takes " << bl.length() << " bytes, updating"
10006 << dendl;
10007
10008 txn->set(PREFIX_SHARED_BLOB, key, bl);
10009 db->submit_transaction_sync(txn);
10010};
10011
10012
10013void BlueStore::inject_leaked(uint64_t len)
10014{
11fdf7f2 10015 PExtentVector exts;
20effc67 10016 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
11fdf7f2 10017 min_alloc_size * 256, 0, &exts);
20effc67
TL
10018
10019 if (fm->is_null_manager()) {
10020 return;
10021 }
10022
10023 KeyValueDB::Transaction txn;
10024 txn = db->get_transaction();
10025
11fdf7f2
TL
10026 ceph_assert(alloc_len >= (int64_t)len);
10027 for (auto& p : exts) {
10028 fm->allocate(p.offset, p.length, txn);
10029 }
10030 db->submit_transaction_sync(txn);
10031}
10032
10033void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
10034{
20effc67
TL
10035 ceph_assert(!fm->is_null_manager());
10036
11fdf7f2
TL
10037 KeyValueDB::Transaction txn;
10038 OnodeRef o;
10039 CollectionRef c = _get_collection(cid);
10040 ceph_assert(c);
10041 {
9f95a23c 10042 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
10043 o = c->get_onode(oid, false);
10044 ceph_assert(o);
10045 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10046 }
10047
10048 bool injected = false;
10049 txn = db->get_transaction();
10050 auto& em = o->extent_map.extent_map;
10051 std::vector<const PExtentVector*> v;
10052 if (em.size()) {
10053 v.push_back(&em.begin()->blob->get_blob().get_extents());
10054 }
10055 if (em.size() > 1) {
10056 auto it = em.end();
10057 --it;
10058 v.push_back(&(it->blob->get_blob().get_extents()));
10059 }
10060 for (auto pext : v) {
10061 if (pext->size()) {
10062 auto p = pext->begin();
10063 while (p != pext->end()) {
10064 if (p->is_valid()) {
10065 dout(20) << __func__ << " release 0x" << std::hex << p->offset
10066 << "~" << p->length << std::dec << dendl;
10067 fm->release(p->offset, p->length, txn);
10068 injected = true;
10069 break;
10070 }
10071 ++p;
10072 }
10073 }
10074 }
10075 ceph_assert(injected);
10076 db->submit_transaction_sync(txn);
10077}
10078
9f95a23c
TL
10079void BlueStore::inject_legacy_omap()
10080{
10081 dout(1) << __func__ << dendl;
f67539c2 10082 per_pool_omap = OMAP_BULK;
9f95a23c
TL
10083 KeyValueDB::Transaction txn;
10084 txn = db->get_transaction();
10085 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
10086 db->submit_transaction_sync(txn);
10087}
10088
10089void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
10090{
10091 dout(1) << __func__ << " "
10092 << cid << " " << oid
10093 << dendl;
10094 KeyValueDB::Transaction txn;
10095 OnodeRef o;
10096 CollectionRef c = _get_collection(cid);
10097 ceph_assert(c);
10098 {
10099 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10100 o = c->get_onode(oid, false);
10101 ceph_assert(o);
10102 }
f67539c2
TL
10103 o->onode.clear_flag(
10104 bluestore_onode_t::FLAG_PERPG_OMAP |
10105 bluestore_onode_t::FLAG_PERPOOL_OMAP |
10106 bluestore_onode_t::FLAG_PGMETA_OMAP);
9f95a23c
TL
10107 txn = db->get_transaction();
10108 _record_onode(o, txn);
10109 db->submit_transaction_sync(txn);
10110}
10111
20effc67
TL
10112void BlueStore::inject_stray_omap(uint64_t head, const string& name)
10113{
10114 dout(1) << __func__ << dendl;
10115 KeyValueDB::Transaction txn = db->get_transaction();
10116
10117 string key;
10118 bufferlist bl;
10119 _key_encode_u64(head, &key);
10120 key.append(name);
10121 txn->set(PREFIX_OMAP, key, bl);
10122
10123 db->submit_transaction_sync(txn);
10124}
9f95a23c 10125
11fdf7f2
TL
10126void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
10127{
10128 BlueStoreRepairer repairer;
10129 repairer.fix_statfs(db, key, new_statfs);
10130 repairer.apply(db);
10131}
10132
eafe8130
TL
10133void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
10134{
10135 KeyValueDB::Transaction t = db->get_transaction();
10136 volatile_statfs v;
10137 v = new_statfs;
10138 bufferlist bl;
10139 v.encode(bl);
10140 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
10141 db->submit_transaction_sync(t);
10142}
10143
11fdf7f2
TL
10144void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
10145 coll_t cid2, ghobject_t oid2,
10146 uint64_t offset)
10147{
10148 OnodeRef o1;
10149 CollectionRef c1 = _get_collection(cid1);
10150 ceph_assert(c1);
10151 {
9f95a23c 10152 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
10153 o1 = c1->get_onode(oid1, false);
10154 ceph_assert(o1);
10155 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10156 }
10157 OnodeRef o2;
10158 CollectionRef c2 = _get_collection(cid2);
10159 ceph_assert(c2);
10160 {
9f95a23c 10161 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
10162 o2 = c2->get_onode(oid2, false);
10163 ceph_assert(o2);
10164 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
10165 }
10166 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
10167 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
10168
10169 // require onode/extent layout to be the same (and simple)
10170 // to make things easier
10171 ceph_assert(o1->onode.extent_map_shards.empty());
10172 ceph_assert(o2->onode.extent_map_shards.empty());
10173 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
10174 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
10175 ceph_assert(e1.logical_offset == e2.logical_offset);
10176 ceph_assert(e1.length == e2.length);
10177 ceph_assert(e1.blob_offset == e2.blob_offset);
10178
10179 KeyValueDB::Transaction txn;
10180 txn = db->get_transaction();
10181
10182 // along with misreference error this will create space leaks errors
10183 e2.blob->dirty_blob() = e1.blob->get_blob();
10184 o2->extent_map.dirty_range(offset, e2.length);
10185 o2->extent_map.update(txn, false);
10186
10187 _record_onode(o2, txn);
10188 db->submit_transaction_sync(txn);
7c673cae
FG
10189}
10190
adb31ebb
TL
10191void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
10192 int16_t blob_id)
10193{
10194 OnodeRef o;
10195 CollectionRef c = _get_collection(cid);
10196 ceph_assert(c);
10197 {
10198 std::unique_lock l{ c->lock }; // just to avoid internal asserts
10199 o = c->get_onode(oid, false);
10200 ceph_assert(o);
10201 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10202 }
10203
10204 BlobRef b = c->new_blob();
10205 b->id = blob_id;
10206 o->extent_map.spanning_blob_map[blob_id] = b;
10207
10208 KeyValueDB::Transaction txn;
10209 txn = db->get_transaction();
10210
10211 _record_onode(o, txn);
10212 db->submit_transaction_sync(txn);
10213}
10214
a4b75251
TL
10215void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
10216{
10217 ceph_assert(bluefs);
10218
10219 BlueFS::FileWriter* p_handle = nullptr;
10220 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
10221 ceph_assert(ret == 0);
10222
10223 std::string s(new_size, '0'); // new_size bytes of '0'
10224 bufferlist bl;
10225 bl.append(s);
10226 p_handle->append(bl);
10227
10228 bluefs->fsync(p_handle);
10229 bluefs->close_writer(p_handle);
10230}
10231
7c673cae
FG
10232void BlueStore::collect_metadata(map<string,string> *pm)
10233{
10234 dout(10) << __func__ << dendl;
10235 bdev->collect_metadata("bluestore_bdev_", pm);
10236 if (bluefs) {
10237 (*pm)["bluefs"] = "1";
9f95a23c
TL
10238 // this value is for backward compatibility only
10239 (*pm)["bluefs_single_shared_device"] = \
10240 stringify((int)bluefs_layout.single_shared_device());
10241 (*pm)["bluefs_dedicated_db"] = \
10242 stringify((int)bluefs_layout.dedicated_db);
10243 (*pm)["bluefs_dedicated_wal"] = \
10244 stringify((int)bluefs_layout.dedicated_wal);
10245 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
10246 } else {
10247 (*pm)["bluefs"] = "0";
10248 }
11fdf7f2
TL
10249
10250 // report numa mapping for underlying devices
10251 int node = -1;
10252 set<int> nodes;
10253 set<string> failed;
10254 int r = get_numa_node(&node, &nodes, &failed);
10255 if (r >= 0) {
10256 if (!failed.empty()) {
10257 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
10258 }
10259 if (!nodes.empty()) {
10260 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
10261 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
10262 }
10263 if (node >= 0) {
10264 (*pm)["objectstore_numa_node"] = stringify(node);
10265 }
10266 }
10267}
10268
10269int BlueStore::get_numa_node(
10270 int *final_node,
10271 set<int> *out_nodes,
10272 set<string> *out_failed)
10273{
10274 int node = -1;
10275 set<string> devices;
10276 get_devices(&devices);
10277 set<int> nodes;
10278 set<string> failed;
10279 for (auto& devname : devices) {
10280 int n;
10281 BlkDev bdev(devname);
10282 int r = bdev.get_numa_node(&n);
10283 if (r < 0) {
10284 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
10285 << dendl;
10286 failed.insert(devname);
10287 continue;
10288 }
10289 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
10290 << dendl;
10291 nodes.insert(n);
10292 if (node < 0) {
10293 node = n;
10294 }
10295 }
10296 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
10297 *final_node = node;
10298 }
10299 if (out_nodes) {
10300 *out_nodes = nodes;
10301 }
10302 if (out_failed) {
10303 *out_failed = failed;
10304 }
10305 return 0;
10306}
10307
10308int BlueStore::get_devices(set<string> *ls)
10309{
10310 if (bdev) {
10311 bdev->get_devices(ls);
10312 if (bluefs) {
10313 bluefs->get_devices(ls);
10314 }
10315 return 0;
10316 }
20effc67 10317
11fdf7f2 10318 // grumble, we haven't started up yet.
20effc67
TL
10319 if (int r = _open_path(); r < 0) {
10320 return r;
10321 }
10322 auto close_path = make_scope_guard([&] {
10323 _close_path();
10324 });
10325 if (int r = _open_fsid(false); r < 0) {
10326 return r;
10327 }
10328 auto close_fsid = make_scope_guard([&] {
10329 _close_fsid();
10330 });
10331 if (int r = _read_fsid(&fsid); r < 0) {
10332 return r;
10333 }
10334 if (int r = _lock_fsid(); r < 0) {
10335 return r;
10336 }
10337 if (int r = _open_bdev(false); r < 0) {
10338 return r;
10339 }
10340 auto close_bdev = make_scope_guard([&] {
10341 _close_bdev();
10342 });
10343 if (int r = _minimal_open_bluefs(false); r < 0) {
10344 return r;
10345 }
11fdf7f2
TL
10346 bdev->get_devices(ls);
10347 if (bluefs) {
10348 bluefs->get_devices(ls);
10349 }
11fdf7f2 10350 _minimal_close_bluefs();
20effc67 10351 return 0;
7c673cae
FG
10352}
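// Note on the cold path above: make_scope_guard runs its lambda when the
// guard leaves scope, so teardown fires in reverse declaration order
// (bluefs is closed explicitly, then bdev, fsid and path via the guards)
// on every early return. A minimal sketch of the idiom:
//
//   auto close_path = make_scope_guard([&] { _close_path(); });
//   // ... any subsequent `return r;` still triggers _close_path()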
10353
11fdf7f2 10354void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
10355{
10356 buf->reset();
11fdf7f2 10357
f67539c2
TL
10358 auto prefix = per_pool_omap == OMAP_BULK ?
10359 PREFIX_OMAP :
10360 per_pool_omap == OMAP_PER_POOL ?
10361 PREFIX_PERPOOL_OMAP :
10362 PREFIX_PERPG_OMAP;
9f95a23c 10363 buf->omap_allocated =
f67539c2 10364 db->estimate_prefix_size(prefix, string());
11fdf7f2 10365
20effc67 10366 uint64_t bfree = alloc->get_free();
7c673cae
FG
10367
10368 if (bluefs) {
f67539c2 10369 buf->internally_reserved = 0;
11fdf7f2 10370 // include dedicated db, too, if that isn't the shared device.
9f95a23c 10371 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 10372 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 10373 }
11fdf7f2
TL
10374 // call any non-omap bluefs space "internal metadata"
10375 buf->internal_metadata =
f67539c2 10376 bluefs->get_used()
11fdf7f2 10377 - buf->omap_allocated;
7c673cae
FG
10378 }
10379
11fdf7f2
TL
10380 uint64_t thin_total, thin_avail;
10381 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
10382 buf->total += thin_total;
10383
10384 // we are limited by both the size of the virtual device and the
10385 // underlying physical device.
10386 bfree = std::min(bfree, thin_avail);
10387
10388 buf->allocated = thin_total - thin_avail;
10389 } else {
10390 buf->total += bdev->get_size();
10391 }
10392 buf->available = bfree;
10393}
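// Worked example for the thin-provisioning branch above (numbers are
// illustrative): with thin_total = 100 GiB and thin_avail = 30 GiB,
// allocated becomes 70 GiB and the free count is clamped to
// min(bfree, 30 GiB), because the store is limited both by the virtual
// device size and by what is physically left underneath it.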
10394
10395int BlueStore::statfs(struct store_statfs_t *buf,
10396 osd_alert_list_t* alerts)
10397{
10398 if (alerts) {
10399 alerts->clear();
10400 _log_alerts(*alerts);
10401 }
10402 _get_statfs_overall(buf);
31f18b77 10403 {
11fdf7f2 10404 std::lock_guard l(vstatfs_lock);
31f18b77 10405 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
10406 buf->data_stored = vstatfs.stored();
10407 buf->data_compressed = vstatfs.compressed();
10408 buf->data_compressed_original = vstatfs.compressed_original();
10409 buf->data_compressed_allocated = vstatfs.compressed_allocated();
10410 }
10411
10412 dout(20) << __func__ << " " << *buf << dendl;
10413 return 0;
10414}
10415
9f95a23c
TL
10416int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
10417 bool *out_per_pool_omap)
11fdf7f2
TL
10418{
10419 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 10420
11fdf7f2
TL
10421 if (!per_pool_stat_collection) {
10422 dout(20) << __func__ << " not supported in legacy mode " << dendl;
10423 return -ENOTSUP;
7c673cae 10424 }
11fdf7f2 10425 buf->reset();
7c673cae 10426
11fdf7f2
TL
10427 {
10428 std::lock_guard l(vstatfs_lock);
10429 osd_pools[pool_id].publish(buf);
10430 }
9f95a23c
TL
10431
10432 string key_prefix;
10433 _key_encode_u64(pool_id, &key_prefix);
f67539c2
TL
10434 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
10435 if (*out_per_pool_omap) {
10436 auto prefix = per_pool_omap == OMAP_PER_POOL ?
10437 PREFIX_PERPOOL_OMAP :
10438 PREFIX_PERPG_OMAP;
10439 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
10440 }
9f95a23c 10441
11fdf7f2 10442 dout(10) << __func__ << " " << *buf << dendl;
7c673cae
FG
10443 return 0;
10444}
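// The per-pool omap estimate above scopes the RocksDB size probe with the
// encoded pool id as a key prefix; conceptually (a sketch, assuming the
// usual big-endian encoding of _key_encode_u64 so keys sort by pool id):
//
//   string key_prefix;
//   _key_encode_u64(pool_id, &key_prefix);  // 8-byte, ordered encoding
//   buf->omap_allocated =
//     db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, key_prefix);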
10445
81eedcae
TL
10446void BlueStore::_check_legacy_statfs_alert()
10447{
10448 string s;
10449 if (!per_pool_stat_collection &&
eafe8130 10450 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
10451 s = "legacy statfs reporting detected, "
10452 "suggest to run store repair to get consistent statistic reports";
10453 }
10454 std::lock_guard l(qlock);
10455 legacy_statfs_alert = s;
10456}
10457
f67539c2 10458void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9f95a23c 10459{
f67539c2
TL
10460 string per_pg, per_pool;
10461 if (per_pool_omap != OMAP_PER_PG) {
10462 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
10463 per_pg = "legacy (not per-pg) omap detected, "
10464 "suggest to run store repair to benefit from faster PG removal";
10465 }
10466 if (per_pool_omap != OMAP_PER_POOL) {
10467 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
10468 per_pool = "legacy (not per-pool) omap detected, "
10469 "suggest to run store repair to benefit from per-pool omap usage statistics";
10470 }
10471 }
9f95a23c
TL
10472 }
10473 std::lock_guard l(qlock);
f67539c2
TL
10474 no_per_pg_omap_alert = per_pg;
10475 no_per_pool_omap_alert = per_pool;
9f95a23c
TL
10476}
10477
7c673cae
FG
10478// ---------------
10479// cache
10480
10481BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
10482{
9f95a23c 10483 std::shared_lock l(coll_lock);
7c673cae
FG
10484 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
10485 if (cp == coll_map.end())
10486 return CollectionRef();
10487 return cp->second;
10488}
10489
20effc67
TL
10490BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
10491{
10492 std::shared_lock l(coll_lock);
10493
10494 // FIXME: we must replace this with something more efficient
10495
10496 for (auto& i : coll_map) {
10497 spg_t spgid;
10498 if (i.first.is_pg(&spgid) &&
10499 i.second->contains(oid)) {
10500 return i.second;
10501 }
10502 }
10503 return CollectionRef();
10504}
10505
7c673cae
FG
10506void BlueStore::_queue_reap_collection(CollectionRef& c)
10507{
10508 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
10509 // _reap_collections and this run in the same thread,
10510 // so no lock is needed.
7c673cae
FG
10511 removed_collections.push_back(c);
10512}
10513
10514void BlueStore::_reap_collections()
10515{
94b18763 10516
7c673cae
FG
10517 list<CollectionRef> removed_colls;
10518 {
94b18763
FG
10519 // _queue_reap_collection and this run in the same thread,
10520 // so no lock is needed.
10521 if (!removed_collections.empty())
10522 removed_colls.swap(removed_collections);
10523 else
10524 return;
7c673cae
FG
10525 }
10526
94b18763
FG
10527 list<CollectionRef>::iterator p = removed_colls.begin();
10528 while (p != removed_colls.end()) {
7c673cae
FG
10529 CollectionRef c = *p;
10530 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
adb31ebb 10531 if (c->onode_map.map_any([&](Onode* o) {
11fdf7f2 10532 ceph_assert(!o->exists);
7c673cae
FG
10533 if (o->flushing_count.load()) {
10534 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
10535 << " flush_txns " << o->flushing_count << dendl;
94b18763 10536 return true;
7c673cae 10537 }
94b18763 10538 return false;
7c673cae 10539 })) {
94b18763 10540 ++p;
7c673cae
FG
10541 continue;
10542 }
10543 c->onode_map.clear();
94b18763 10544 p = removed_colls.erase(p);
7c673cae
FG
10545 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
10546 }
94b18763 10547 if (removed_colls.empty()) {
7c673cae 10548 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
10549 } else {
10550 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
10551 }
10552}
10553
10554void BlueStore::_update_cache_logger()
10555{
10556 uint64_t num_onodes = 0;
9f95a23c 10557 uint64_t num_pinned_onodes = 0;
7c673cae
FG
10558 uint64_t num_extents = 0;
10559 uint64_t num_blobs = 0;
10560 uint64_t num_buffers = 0;
10561 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
10562 for (auto c : onode_cache_shards) {
10563 c->add_stats(&num_onodes, &num_pinned_onodes);
10564 }
10565 for (auto c : buffer_cache_shards) {
10566 c->add_stats(&num_extents, &num_blobs,
10567 &num_buffers, &num_buffer_bytes);
7c673cae
FG
10568 }
10569 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 10570 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
10571 logger->set(l_bluestore_extents, num_extents);
10572 logger->set(l_bluestore_blobs, num_blobs);
10573 logger->set(l_bluestore_buffers, num_buffers);
10574 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
10575}
10576
10577// ---------------
10578// read operations
10579
10580ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
10581{
10582 return _get_collection(cid);
10583}
10584
11fdf7f2
TL
10585ObjectStore::CollectionHandle BlueStore::create_new_collection(
10586 const coll_t& cid)
7c673cae 10587{
9f95a23c
TL
10588 std::unique_lock l{coll_lock};
10589 auto c = ceph::make_ref<Collection>(
11fdf7f2 10590 this,
9f95a23c
TL
10591 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
10592 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
10593 cid);
10594 new_coll_map[cid] = c;
9f95a23c 10595 _osr_attach(c.get());
11fdf7f2
TL
10596 return c;
10597}
10598
10599void BlueStore::set_collection_commit_queue(
10600 const coll_t& cid,
10601 ContextQueue *commit_queue)
10602{
10603 if (commit_queue) {
9f95a23c 10604 std::shared_lock l(coll_lock);
11fdf7f2
TL
10605 if (coll_map.count(cid)) {
10606 coll_map[cid]->commit_queue = commit_queue;
10607 } else if (new_coll_map.count(cid)) {
10608 new_coll_map[cid]->commit_queue = commit_queue;
10609 }
10610 }
7c673cae
FG
10611}
10612
11fdf7f2 10613
7c673cae
FG
10614bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
10615{
10616 Collection *c = static_cast<Collection *>(c_.get());
10617 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
10618 if (!c->exists)
10619 return false;
10620
10621 bool r = true;
10622
10623 {
9f95a23c 10624 std::shared_lock l(c->lock);
7c673cae
FG
10625 OnodeRef o = c->get_onode(oid, false);
10626 if (!o || !o->exists)
10627 r = false;
10628 }
10629
7c673cae
FG
10630 return r;
10631}
10632
7c673cae
FG
10633int BlueStore::stat(
10634 CollectionHandle &c_,
10635 const ghobject_t& oid,
10636 struct stat *st,
10637 bool allow_eio)
10638{
10639 Collection *c = static_cast<Collection *>(c_.get());
10640 if (!c->exists)
10641 return -ENOENT;
10642 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10643
10644 {
9f95a23c 10645 std::shared_lock l(c->lock);
7c673cae
FG
10646 OnodeRef o = c->get_onode(oid, false);
10647 if (!o || !o->exists)
10648 return -ENOENT;
10649 st->st_size = o->onode.size;
10650 st->st_blksize = 4096;
10651 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
10652 st->st_nlink = 1;
10653 }
10654
7c673cae
FG
10655 int r = 0;
10656 if (_debug_mdata_eio(oid)) {
10657 r = -EIO;
10658 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10659 }
10660 return r;
10661}
10662int BlueStore::set_collection_opts(
11fdf7f2 10663 CollectionHandle& ch,
7c673cae
FG
10664 const pool_opts_t& opts)
10665{
7c673cae 10666 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 10667 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
10668 if (!c->exists)
10669 return -ENOENT;
9f95a23c 10670 std::unique_lock l{c->lock};
7c673cae
FG
10671 c->pool_opts = opts;
10672 return 0;
10673}
10674
7c673cae
FG
10675int BlueStore::read(
10676 CollectionHandle &c_,
10677 const ghobject_t& oid,
10678 uint64_t offset,
10679 size_t length,
10680 bufferlist& bl,
224ce89b 10681 uint32_t op_flags)
7c673cae 10682{
11fdf7f2 10683 auto start = mono_clock::now();
7c673cae
FG
10684 Collection *c = static_cast<Collection *>(c_.get());
10685 const coll_t &cid = c->get_cid();
10686 dout(15) << __func__ << " " << cid << " " << oid
10687 << " 0x" << std::hex << offset << "~" << length << std::dec
10688 << dendl;
10689 if (!c->exists)
10690 return -ENOENT;
10691
10692 bl.clear();
10693 int r;
10694 {
9f95a23c 10695 std::shared_lock l(c->lock);
11fdf7f2 10696 auto start1 = mono_clock::now();
7c673cae 10697 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
10698 log_latency("get_onode@read",
10699 l_bluestore_read_onode_meta_lat,
10700 mono_clock::now() - start1,
10701 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10702 if (!o || !o->exists) {
10703 r = -ENOENT;
10704 goto out;
10705 }
10706
10707 if (offset == length && offset == 0)
10708 length = o->onode.size;
10709
10710 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
10711 if (r == -EIO) {
10712 logger->inc(l_bluestore_read_eio);
10713 }
7c673cae
FG
10714 }
10715
10716 out:
28e407b8 10717 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
10718 r = -EIO;
10719 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
10720 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10721 cct->_conf->bluestore_debug_random_read_err &&
10722 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10723 100.0)) == 0) {
224ce89b
WB
10724 dout(0) << __func__ << ": inject random EIO" << dendl;
10725 r = -EIO;
7c673cae
FG
10726 }
10727 dout(10) << __func__ << " " << cid << " " << oid
10728 << " 0x" << std::hex << offset << "~" << length << std::dec
10729 << " = " << r << dendl;
494da23a
TL
10730 log_latency(__func__,
10731 l_bluestore_read_lat,
10732 mono_clock::now() - start,
10733 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10734 return r;
10735}
10736
9f95a23c 10737void BlueStore::_read_cache(
7c673cae
FG
10738 OnodeRef o,
10739 uint64_t offset,
10740 size_t length,
9f95a23c
TL
10741 int read_cache_policy,
10742 ready_regions_t& ready_regions,
10743 blobs2read_t& blobs2read)
7c673cae 10744{
7c673cae 10745 // build a blob-wise list of the data to read (that isn't cached)
7c673cae
FG
10746 unsigned left = length;
10747 uint64_t pos = offset;
7c673cae
FG
10748 auto lp = o->extent_map.seek_lextent(offset);
10749 while (left > 0 && lp != o->extent_map.extent_map.end()) {
10750 if (pos < lp->logical_offset) {
10751 unsigned hole = lp->logical_offset - pos;
10752 if (hole >= left) {
9f95a23c 10753 break;
7c673cae
FG
10754 }
10755 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 10756 << std::dec << dendl;
7c673cae
FG
10757 pos += hole;
10758 left -= hole;
10759 }
94b18763 10760 BlobRef& bptr = lp->blob;
7c673cae
FG
10761 unsigned l_off = pos - lp->logical_offset;
10762 unsigned b_off = l_off + lp->blob_offset;
10763 unsigned b_len = std::min(left, lp->length - l_off);
10764
10765 ready_regions_t cache_res;
10766 interval_set<uint32_t> cache_interval;
10767 bptr->shared_blob->bc.read(
91327a77
AA
10768 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
10769 read_cache_policy);
7c673cae 10770 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
10771 << " need 0x" << b_off << "~" << b_len
10772 << " cache has 0x" << cache_interval
10773 << std::dec << dendl;
7c673cae
FG
10774
10775 auto pc = cache_res.begin();
11fdf7f2 10776 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
10777 while (b_len > 0) {
10778 unsigned l;
10779 if (pc != cache_res.end() &&
9f95a23c
TL
10780 pc->first == b_off) {
10781 l = pc->second.length();
f67539c2 10782 ready_regions[pos] = std::move(pc->second);
9f95a23c
TL
10783 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
10784 << b_off << "~" << l << std::dec << dendl;
10785 ++pc;
7c673cae 10786 } else {
9f95a23c
TL
10787 l = b_len;
10788 if (pc != cache_res.end()) {
10789 ceph_assert(pc->first > b_off);
10790 l = pc->first - b_off;
10791 }
10792 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
10793 << b_off << "~" << l << std::dec << dendl;
10794 // merge regions
10795 {
10796 uint64_t r_off = b_off;
10797 uint64_t r_len = l;
10798 uint64_t front = r_off % chunk_size;
10799 if (front) {
10800 r_off -= front;
10801 r_len += front;
10802 }
10803 unsigned tail = r_len % chunk_size;
10804 if (tail) {
10805 r_len += chunk_size - tail;
10806 }
10807 bool merged = false;
10808 regions2read_t& r2r = blobs2read[bptr];
10809 if (r2r.size()) {
10810 read_req_t& pre = r2r.back();
10811 if (r_off <= (pre.r_off + pre.r_len)) {
10812 front += (r_off - pre.r_off);
10813 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
10814 pre.regs.emplace_back(region_t(pos, b_off, l, front));
10815 merged = true;
10816 }
10817 }
10818 if (!merged) {
10819 read_req_t req(r_off, r_len);
10820 req.regs.emplace_back(region_t(pos, b_off, l, front));
10821 r2r.emplace_back(std::move(req));
10822 }
10823 }
7c673cae
FG
10824 }
10825 pos += l;
10826 b_off += l;
10827 left -= l;
10828 b_len -= l;
10829 }
10830 ++lp;
10831 }
9f95a23c 10832}
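// Worked example of the chunk alignment performed in the merge-regions
// block above, assuming chunk_size = 4096: a cache miss at b_off = 5000
// with l = 2000 gives front = 5000 % 4096 = 904, so r_off drops to 4096
// and r_len grows to 2904; tail = 2904 % 4096 then rounds r_len up to
// 4096. region_t keeps `front` so the caller can later carve the user's
// bytes back out of the aligned read.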
7c673cae 10833
9f95a23c
TL
10834int BlueStore::_prepare_read_ioc(
10835 blobs2read_t& blobs2read,
10836 vector<bufferlist>* compressed_blob_bls,
10837 IOContext* ioc)
10838{
7c673cae 10839 for (auto& p : blobs2read) {
94b18763 10840 const BlobRef& bptr = p.first;
11fdf7f2 10841 regions2read_t& r2r = p.second;
20effc67
TL
10842 dout(20) << __func__ << " blob " << *bptr << " need "
10843 << r2r << dendl;
7c673cae
FG
10844 if (bptr->get_blob().is_compressed()) {
10845 // read the whole thing
9f95a23c
TL
10846 if (compressed_blob_bls->empty()) {
10847 // ensure we avoid any reallocation on subsequent blobs
10848 compressed_blob_bls->reserve(blobs2read.size());
10849 }
10850 compressed_blob_bls->push_back(bufferlist());
10851 bufferlist& bl = compressed_blob_bls->back();
10852 auto r = bptr->get_blob().map(
10853 0, bptr->get_blob().get_ondisk_length(),
10854 [&](uint64_t offset, uint64_t length) {
10855 int r = bdev->aio_read(offset, length, &bl, ioc);
10856 if (r < 0)
7c673cae
FG
10857 return r;
10858 return 0;
9f95a23c 10859 });
b32b8144
FG
10860 if (r < 0) {
10861 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
10862 if (r == -EIO) {
10863 // propagate EIO to caller
10864 return r;
10865 }
11fdf7f2 10866 ceph_assert(r == 0);
b32b8144 10867 }
7c673cae
FG
10868 } else {
10869 // read the pieces
11fdf7f2 10870 for (auto& req : r2r) {
9f95a23c
TL
10871 dout(20) << __func__ << " region 0x" << std::hex
10872 << req.regs.front().logical_offset
10873 << ": 0x" << req.regs.front().blob_xoffset
10874 << " reading 0x" << req.r_off
10875 << "~" << req.r_len << std::dec
10876 << dendl;
7c673cae 10877
9f95a23c
TL
10878 // read it
10879 auto r = bptr->get_blob().map(
10880 req.r_off, req.r_len,
10881 [&](uint64_t offset, uint64_t length) {
10882 int r = bdev->aio_read(offset, length, &req.bl, ioc);
10883 if (r < 0)
7c673cae
FG
10884 return r;
10885 return 0;
9f95a23c 10886 });
b32b8144
FG
10887 if (r < 0) {
10888 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
10889 << dendl;
10890 if (r == -EIO) {
10891 // propagate EIO to caller
10892 return r;
10893 }
11fdf7f2 10894 ceph_assert(r == 0);
b32b8144 10895 }
9f95a23c 10896 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
10897 }
10898 }
10899 }
9f95a23c
TL
10900 return 0;
10901}
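// Read-issue recap for the loop above: compressed blobs are always fetched
// whole (0 .. ondisk_length) because decompression needs the entire
// payload, while uncompressed blobs are read piecewise per merged region.
// Both paths queue asynchronous reads on the shared IOContext, and only
// -EIO may propagate to the caller; any other failure asserts.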
11fdf7f2 10902
9f95a23c
TL
10903int BlueStore::_generate_read_result_bl(
10904 OnodeRef o,
10905 uint64_t offset,
10906 size_t length,
10907 ready_regions_t& ready_regions,
10908 vector<bufferlist>& compressed_blob_bls,
10909 blobs2read_t& blobs2read,
10910 bool buffered,
10911 bool* csum_error,
10912 bufferlist& bl)
10913{
10914 // enumerate and decompress desired blobs
7c673cae
FG
10915 auto p = compressed_blob_bls.begin();
10916 blobs2read_t::iterator b2r_it = blobs2read.begin();
10917 while (b2r_it != blobs2read.end()) {
94b18763 10918 const BlobRef& bptr = b2r_it->first;
11fdf7f2 10919 regions2read_t& r2r = b2r_it->second;
20effc67
TL
10920 dout(20) << __func__ << " blob " << *bptr << " need "
10921 << r2r << dendl;
7c673cae 10922 if (bptr->get_blob().is_compressed()) {
11fdf7f2 10923 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
10924 bufferlist& compressed_bl = *p++;
10925 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
10926 r2r.front().regs.front().logical_offset) < 0) {
10927 *csum_error = true;
10928 return -EIO;
7c673cae
FG
10929 }
10930 bufferlist raw_bl;
9f95a23c 10931 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 10932 if (r < 0)
9f95a23c 10933 return r;
7c673cae 10934 if (buffered) {
9f95a23c
TL
10935 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
10936 raw_bl);
7c673cae 10937 }
11fdf7f2
TL
10938 for (auto& req : r2r) {
10939 for (auto& r : req.regs) {
10940 ready_regions[r.logical_offset].substr_of(
10941 raw_bl, r.blob_xoffset, r.length);
10942 }
7c673cae
FG
10943 }
10944 } else {
11fdf7f2 10945 for (auto& req : r2r) {
9f95a23c
TL
10946 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
10947 req.regs.front().logical_offset) < 0) {
10948 *csum_error = true;
10949 return -EIO;
10950 }
10951 if (buffered) {
10952 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
10953 req.r_off, req.bl);
10954 }
7c673cae 10955
9f95a23c
TL
10956 // prune and keep result
10957 for (const auto& r : req.regs) {
10958 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 10959 }
7c673cae
FG
10960 }
10961 }
10962 ++b2r_it;
10963 }
10964
10965 // generate a resulting buffer
10966 auto pr = ready_regions.begin();
10967 auto pr_end = ready_regions.end();
9f95a23c 10968 uint64_t pos = 0;
7c673cae
FG
10969 while (pos < length) {
10970 if (pr != pr_end && pr->first == pos + offset) {
10971 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
10972 << ": data from 0x" << pr->first << "~" << pr->second.length()
10973 << std::dec << dendl;
7c673cae
FG
10974 pos += pr->second.length();
10975 bl.claim_append(pr->second);
10976 ++pr;
10977 } else {
10978 uint64_t l = length - pos;
10979 if (pr != pr_end) {
11fdf7f2 10980 ceph_assert(pr->first > pos + offset);
9f95a23c 10981 l = pr->first - (pos + offset);
7c673cae
FG
10982 }
10983 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
10984 << ": zeros for 0x" << (pos + offset) << "~" << l
10985 << std::dec << dendl;
7c673cae
FG
10986 bl.append_zero(l);
10987 pos += l;
10988 }
10989 }
11fdf7f2
TL
10990 ceph_assert(bl.length() == length);
10991 ceph_assert(pos == length);
10992 ceph_assert(pr == pr_end);
9f95a23c
TL
10993 return 0;
10994}
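// Assembly invariants for the loop above: ready_regions is keyed by
// logical offset, so the result is built in order, appending data where a
// region exists and zero-filling the holes in between; the trailing
// asserts verify that exactly `length` bytes were produced and that every
// prepared region was consumed.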
10995
10996int BlueStore::_do_read(
10997 Collection *c,
10998 OnodeRef o,
10999 uint64_t offset,
11000 size_t length,
11001 bufferlist& bl,
11002 uint32_t op_flags,
11003 uint64_t retry_count)
11004{
11005 FUNCTRACE(cct);
11006 int r = 0;
11007 int read_cache_policy = 0; // do not bypass clean or dirty cache
11008
11009 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11010 << " size 0x" << o->onode.size << " (" << std::dec
11011 << o->onode.size << ")" << dendl;
11012 bl.clear();
11013
11014 if (offset >= o->onode.size) {
11015 return r;
11016 }
11017
11018 // generally, don't buffer anything, unless the client explicitly requests
11019 // it.
11020 bool buffered = false;
11021 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11022 dout(20) << __func__ << " will do buffered read" << dendl;
11023 buffered = true;
11024 } else if (cct->_conf->bluestore_default_buffered_read &&
11025 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11026 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11027 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11028 buffered = true;
11029 }
11030
11031 if (offset + length > o->onode.size) {
11032 length = o->onode.size - offset;
11033 }
11034
11035 auto start = mono_clock::now();
11036 o->extent_map.fault_range(db, offset, length);
11037 log_latency(__func__,
11038 l_bluestore_read_onode_meta_lat,
11039 mono_clock::now() - start,
11040 cct->_conf->bluestore_log_op_age);
11041 _dump_onode<30>(cct, *o);
11042
11043 // for deep-scrub, we only read dirty cache and bypass clean cache in
11044 // order to read underlying block device in case there are silent disk errors.
11045 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
11046 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
11047 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
11048 }
11049
11050 // build a blob-wise list of the data to read (that isn't cached)
11051 ready_regions_t ready_regions;
11052 blobs2read_t blobs2read;
11053 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
11054
11055
11056 // read raw blob data.
11057 start = mono_clock::now(); // for simplicity, measure the whole
11058 // block below; the resulting
11059 // measurement error is negligible.
11060 vector<bufferlist> compressed_blob_bls;
20effc67 11061 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9f95a23c
TL
11062 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
11063 // we always issue aio for reading, so errors other than EIO are not allowed
11064 if (r < 0)
11065 return r;
11066
f67539c2 11067 int64_t num_ios = blobs2read.size();
9f95a23c 11068 if (ioc.has_pending_aios()) {
f67539c2 11069 num_ios = ioc.get_num_ios();
9f95a23c
TL
11070 bdev->aio_submit(&ioc);
11071 dout(20) << __func__ << " waiting for aio" << dendl;
11072 ioc.aio_wait();
11073 r = ioc.get_return_value();
11074 if (r < 0) {
11075 ceph_assert(r == -EIO); // no other errors allowed
11076 return -EIO;
11077 }
11078 }
11079 log_latency_fn(__func__,
11080 l_bluestore_read_wait_aio_lat,
11081 mono_clock::now() - start,
11082 cct->_conf->bluestore_log_op_age,
11083 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11084 );
11085
11086 bool csum_error = false;
11087 r = _generate_read_result_bl(o, offset, length, ready_regions,
11088 compressed_blob_bls, blobs2read,
20effc67
TL
11089 buffered && !ioc.skip_cache(),
11090 &csum_error, bl);
9f95a23c
TL
11091 if (csum_error) {
11092 // Handles spurious read errors caused by a kernel bug.
11093 // We sometimes get all-zero pages as a result of the read under
11094 // high memory pressure. Retrying the failing read succeeds in most
11095 // cases.
11096 // See also: http://tracker.ceph.com/issues/22464
11097 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11098 return -EIO;
11099 }
11100 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
11101 }
7c673cae 11102 r = bl.length();
f64942e4
AA
11103 if (retry_count) {
11104 logger->inc(l_bluestore_reads_with_retries);
11105 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
11106 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
f67539c2
TL
11107 stringstream s;
11108 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
11109 _set_spurious_read_errors_alert(s.str());
f64942e4 11110 }
7c673cae
FG
11111 return r;
11112}
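// Retry policy sketch: a checksum failure triggers a bounded recursive
// retry (_do_read with retry_count + 1) up to bluestore_retry_disk_reads
// attempts before returning -EIO; reads that eventually succeed are
// counted in l_bluestore_reads_with_retries and raise the
// spurious-read-errors health alert.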
11113
11114int BlueStore::_verify_csum(OnodeRef& o,
11115 const bluestore_blob_t* blob, uint64_t blob_xoffset,
11116 const bufferlist& bl,
11117 uint64_t logical_offset) const
11118{
11119 int bad;
11120 uint64_t bad_csum;
11fdf7f2 11121 auto start = mono_clock::now();
7c673cae 11122 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
11123 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
11124 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
11125 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
11126 bad = blob_xoffset;
11127 r = -1;
11128 bad_csum = 0xDEADBEEF;
11129 }
7c673cae
FG
11130 if (r < 0) {
11131 if (r == -1) {
11132 PExtentVector pex;
11133 blob->map(
11134 bad,
11135 blob->get_csum_chunk_size(),
11136 [&](uint64_t offset, uint64_t length) {
11137 pex.emplace_back(bluestore_pextent_t(offset, length));
11138 return 0;
11139 });
11140 derr << __func__ << " bad "
11141 << Checksummer::get_csum_type_string(blob->csum_type)
11142 << "/0x" << std::hex << blob->get_csum_chunk_size()
11143 << " checksum at blob offset 0x" << bad
11144 << ", got 0x" << bad_csum << ", expected 0x"
11145 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
11146 << ", device location " << pex
11147 << ", logical extent 0x" << std::hex
11148 << (logical_offset + bad - blob_xoffset) << "~"
11149 << blob->get_csum_chunk_size() << std::dec
11150 << ", object " << o->oid
11151 << dendl;
11152 } else {
11153 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
11154 }
11155 }
494da23a
TL
11156 log_latency(__func__,
11157 l_bluestore_csum_lat,
11158 mono_clock::now() - start,
11159 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
11160 if (cct->_conf->bluestore_ignore_data_csum) {
11161 return 0;
11162 }
7c673cae
FG
11163 return r;
11164}
11165
11166int BlueStore::_decompress(bufferlist& source, bufferlist* result)
11167{
11168 int r = 0;
11fdf7f2
TL
11169 auto start = mono_clock::now();
11170 auto i = source.cbegin();
7c673cae 11171 bluestore_compression_header_t chdr;
11fdf7f2 11172 decode(chdr, i);
7c673cae
FG
11173 int alg = int(chdr.type);
11174 CompressorRef cp = compressor;
11175 if (!cp || (int)cp->get_type() != alg) {
11176 cp = Compressor::create(cct, alg);
11177 }
11178
11179 if (!cp.get()) {
11180 // if the compressor isn't available we must fail: we cannot
11181 // return the decompressed data
11fdf7f2
TL
11182
11183 const char* alg_name = Compressor::get_comp_alg_name(alg);
11184 derr << __func__ << " can't load decompressor " << alg_name << dendl;
11185 _set_compression_alert(false, alg_name);
7c673cae
FG
11186 r = -EIO;
11187 } else {
f67539c2 11188 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
7c673cae
FG
11189 if (r < 0) {
11190 derr << __func__ << " decompression failed with exit code " << r << dendl;
11191 r = -EIO;
11192 }
11193 }
494da23a
TL
11194 log_latency(__func__,
11195 l_bluestore_decompress_lat,
11196 mono_clock::now() - start,
11197 cct->_conf->bluestore_log_op_age);
7c673cae
FG
11198 return r;
11199}
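// Framing consumed above: each compressed blob begins with an encoded
// bluestore_compression_header_t carrying the algorithm type, the expected
// decompressed length, and an optional compressor_message. A sketch of the
// decode side:
//
//   auto i = source.cbegin();
//   bluestore_compression_header_t chdr;
//   decode(chdr, i);                      // advances i past the header
//   cp->decompress(i, chdr.length, *result, chdr.compressor_message);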
11200
11201// this stores the fiemap result into an interval_set; the other
11202// variations use it internally
11203int BlueStore::_fiemap(
11204 CollectionHandle &c_,
11205 const ghobject_t& oid,
11206 uint64_t offset,
11207 size_t length,
11208 interval_set<uint64_t>& destset)
11209{
11210 Collection *c = static_cast<Collection *>(c_.get());
11211 if (!c->exists)
11212 return -ENOENT;
11213 {
9f95a23c 11214 std::shared_lock l(c->lock);
7c673cae
FG
11215
11216 OnodeRef o = c->get_onode(oid, false);
11217 if (!o || !o->exists) {
11218 return -ENOENT;
11219 }
81eedcae 11220 _dump_onode<30>(cct, *o);
7c673cae
FG
11221
11222 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11223 << " size 0x" << o->onode.size << std::dec << dendl;
11224
11225 boost::intrusive::set<Extent>::iterator ep, eend;
11226 if (offset >= o->onode.size)
11227 goto out;
11228
11229 if (offset + length > o->onode.size) {
11230 length = o->onode.size - offset;
11231 }
11232
11233 o->extent_map.fault_range(db, offset, length);
11234 eend = o->extent_map.extent_map.end();
11235 ep = o->extent_map.seek_lextent(offset);
11236 while (length > 0) {
11237 dout(20) << __func__ << " offset " << offset << dendl;
11238 if (ep != eend && ep->logical_offset + ep->length <= offset) {
11239 ++ep;
11240 continue;
11241 }
11242
11243 uint64_t x_len = length;
11244 if (ep != eend && ep->logical_offset <= offset) {
11245 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 11246 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
11247 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
11248 << x_len << std::dec << " blob " << ep->blob << dendl;
11249 destset.insert(offset, x_len);
11250 length -= x_len;
11251 offset += x_len;
11252 if (x_off + x_len == ep->length)
11253 ++ep;
11254 continue;
11255 }
11256 if (ep != eend &&
11257 ep->logical_offset > offset &&
11258 ep->logical_offset - offset < x_len) {
11259 x_len = ep->logical_offset - offset;
11260 }
11261 offset += x_len;
11262 length -= x_len;
11263 }
11264 }
9f95a23c
TL
11265
11266 out:
11267 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
11268 << " size = 0x(" << destset << ")" << std::dec << dendl;
11269 return 0;
11270}
11271
11272int BlueStore::fiemap(
11273 CollectionHandle &c_,
11274 const ghobject_t& oid,
11275 uint64_t offset,
11276 size_t length,
11277 bufferlist& bl)
11278{
11279 interval_set<uint64_t> m;
11280 int r = _fiemap(c_, oid, offset, length, m);
11281 if (r >= 0) {
11282 encode(m, bl);
11283 }
11284 return r;
11285}
11286
11287int BlueStore::fiemap(
11288 CollectionHandle &c_,
11289 const ghobject_t& oid,
11290 uint64_t offset,
11291 size_t length,
11292 map<uint64_t, uint64_t>& destmap)
11293{
11294 interval_set<uint64_t> m;
11295 int r = _fiemap(c_, oid, offset, length, m);
11296 if (r >= 0) {
11297 destmap = std::move(m).detach();
11298 }
11299 return r;
11300}
11301
11302int BlueStore::readv(
11303 CollectionHandle &c_,
11304 const ghobject_t& oid,
11305 interval_set<uint64_t>& m,
11306 bufferlist& bl,
11307 uint32_t op_flags)
11308{
11309 auto start = mono_clock::now();
11310 Collection *c = static_cast<Collection *>(c_.get());
11311 const coll_t &cid = c->get_cid();
11312 dout(15) << __func__ << " " << cid << " " << oid
11313 << " fiemap " << m
11314 << dendl;
11315 if (!c->exists)
11316 return -ENOENT;
11317
11318 bl.clear();
11319 int r;
11320 {
11321 std::shared_lock l(c->lock);
11322 auto start1 = mono_clock::now();
11323 OnodeRef o = c->get_onode(oid, false);
11324 log_latency("get_onode@read",
11325 l_bluestore_read_onode_meta_lat,
11326 mono_clock::now() - start1,
11327 cct->_conf->bluestore_log_op_age);
11328 if (!o || !o->exists) {
11329 r = -ENOENT;
11330 goto out;
11331 }
11332
11333 if (m.empty()) {
11334 r = 0;
11335 goto out;
11336 }
11337
11338 r = _do_readv(c, o, m, bl, op_flags);
11339 if (r == -EIO) {
11340 logger->inc(l_bluestore_read_eio);
11341 }
11342 }
11343
11344 out:
11345 if (r >= 0 && _debug_data_eio(oid)) {
11346 r = -EIO;
11347 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11348 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
11349 cct->_conf->bluestore_debug_random_read_err &&
11350 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
11351 100.0)) == 0) {
11352 dout(0) << __func__ << ": inject random EIO" << dendl;
11353 r = -EIO;
11354 }
11355 dout(10) << __func__ << " " << cid << " " << oid
11356 << " fiemap " << m << std::dec
11357 << " = " << r << dendl;
11358 log_latency(__func__,
11359 l_bluestore_read_lat,
11360 mono_clock::now() - start,
11361 cct->_conf->bluestore_log_op_age);
11362 return r;
11363}
11364
11365int BlueStore::_do_readv(
11366 Collection *c,
11367 OnodeRef o,
11368 const interval_set<uint64_t>& m,
11369 bufferlist& bl,
11370 uint32_t op_flags,
11371 uint64_t retry_count)
11372{
11373 FUNCTRACE(cct);
11374 int r = 0;
11375 int read_cache_policy = 0; // do not bypass clean or dirty cache
11376
11377 dout(20) << __func__ << " fiemap " << m << std::hex
11378 << " size 0x" << o->onode.size << " (" << std::dec
11379 << o->onode.size << ")" << dendl;
11380
11381 // generally, don't buffer anything, unless the client explicitly requests
11382 // it.
11383 bool buffered = false;
11384 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
11385 dout(20) << __func__ << " will do buffered read" << dendl;
11386 buffered = true;
11387 } else if (cct->_conf->bluestore_default_buffered_read &&
11388 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
11389 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
11390 dout(20) << __func__ << " defaulting to buffered read" << dendl;
11391 buffered = true;
11392 }
11393 // this method must be idempotent since we may call it several times
11394 // before we finally read the expected result.
11395 bl.clear();
11396
11397 // call fiemap first!
11398 ceph_assert(m.range_start() <= o->onode.size);
11399 ceph_assert(m.range_end() <= o->onode.size);
11400 auto start = mono_clock::now();
11401 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
11402 log_latency(__func__,
11403 l_bluestore_read_onode_meta_lat,
11404 mono_clock::now() - start,
11405 cct->_conf->bluestore_log_op_age);
11406 _dump_onode<30>(cct, *o);
11407
20effc67 11408 IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
9f95a23c
TL
11409 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
11410 raw_results.reserve(m.num_intervals());
11411 int i = 0;
11412 for (auto p = m.begin(); p != m.end(); p++, i++) {
11413 raw_results.push_back({});
11414 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
11415 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
11416 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
11417 // we always issue aio for reading, so errors other than EIO are not allowed
11418 if (r < 0)
11419 return r;
11420 }
11421
11422 auto num_ios = m.size();
11423 if (ioc.has_pending_aios()) {
11424 num_ios = ioc.get_num_ios();
11425 bdev->aio_submit(&ioc);
11426 dout(20) << __func__ << " waiting for aio" << dendl;
11427 ioc.aio_wait();
11428 r = ioc.get_return_value();
11429 if (r < 0) {
11430 ceph_assert(r == -EIO); // no other errors allowed
11431 return -EIO;
11432 }
11433 }
11434 log_latency_fn(__func__,
11435 l_bluestore_read_wait_aio_lat,
11436 mono_clock::now() - start,
11437 cct->_conf->bluestore_log_op_age,
11438 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
11439 );
11440
11441 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
11442 i = 0;
11443 for (auto p = m.begin(); p != m.end(); p++, i++) {
11444 bool csum_error = false;
11445 bufferlist t;
11446 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
11447 std::get<0>(raw_results[i]),
11448 std::get<1>(raw_results[i]),
11449 std::get<2>(raw_results[i]),
11450 buffered, &csum_error, t);
11451 if (csum_error) {
11452 // Handles spurious read errors caused by a kernel bug.
11453 // We sometimes get all-zero pages as a result of the read under
11454 // high memory pressure. Retrying the failing read succeeds in most
11455 // cases.
11456 // See also: http://tracker.ceph.com/issues/22464
11457 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
11458 return -EIO;
11459 }
11460 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
11461 }
11462 bl.claim_append(t);
11463 }
11464 if (retry_count) {
11465 logger->inc(l_bluestore_reads_with_retries);
11466 dout(5) << __func__ << " read fiemap " << m
11467 << " failed " << retry_count << " times before succeeding"
11468 << dendl;
11469 }
11470 return bl.length();
7c673cae
FG
11471}
11472
9f95a23c 11473int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 11474 const ghobject_t& oid,
9f95a23c
TL
11475 const string& section_name,
11476 Formatter *f)
7c673cae 11477{
9f95a23c
TL
11478 Collection *c = static_cast<Collection *>(c_.get());
11479 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11480 if (!c->exists)
11481 return -ENOENT;
7c673cae 11482
9f95a23c
TL
11483 int r;
11484 {
11485 std::shared_lock l(c->lock);
11486
11487 OnodeRef o = c->get_onode(oid, false);
11488 if (!o || !o->exists) {
11489 r = -ENOENT;
11490 goto out;
11491 }
11492 // FIXME minor: actually the next line isn't enough to
11493 // load shared blobs. Leaving as is for now.
11494 //
11495 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
11496
11497 _dump_onode<0>(cct, *o);
11498 f->open_object_section(section_name.c_str());
11499 o->dump(f);
11500 f->close_section();
11501 r = 0;
7c673cae 11502 }
9f95a23c
TL
11503 out:
11504 dout(10) << __func__ << " " << c->cid << " " << oid
11505 << " = " << r << dendl;
7c673cae
FG
11506 return r;
11507}
11508
7c673cae
FG
11509int BlueStore::getattr(
11510 CollectionHandle &c_,
11511 const ghobject_t& oid,
11512 const char *name,
11513 bufferptr& value)
11514{
11515 Collection *c = static_cast<Collection *>(c_.get());
11516 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
11517 if (!c->exists)
11518 return -ENOENT;
11519
11520 int r;
11521 {
9f95a23c 11522 std::shared_lock l(c->lock);
f91f0fd5 11523 mempool::bluestore_cache_meta::string k(name);
7c673cae
FG
11524
11525 OnodeRef o = c->get_onode(oid, false);
11526 if (!o || !o->exists) {
11527 r = -ENOENT;
11528 goto out;
11529 }
11530
11531 if (!o->onode.attrs.count(k)) {
11532 r = -ENODATA;
11533 goto out;
11534 }
11535 value = o->onode.attrs[k];
11536 r = 0;
11537 }
11538 out:
7c673cae
FG
11539 if (r == 0 && _debug_mdata_eio(oid)) {
11540 r = -EIO;
11541 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11542 }
11543 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
11544 << " = " << r << dendl;
11545 return r;
11546}
11547
7c673cae
FG
11548int BlueStore::getattrs(
11549 CollectionHandle &c_,
11550 const ghobject_t& oid,
20effc67 11551 map<string,bufferptr,less<>>& aset)
7c673cae
FG
11552{
11553 Collection *c = static_cast<Collection *>(c_.get());
11554 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
11555 if (!c->exists)
11556 return -ENOENT;
11557
11558 int r;
11559 {
9f95a23c 11560 std::shared_lock l(c->lock);
7c673cae
FG
11561
11562 OnodeRef o = c->get_onode(oid, false);
11563 if (!o || !o->exists) {
11564 r = -ENOENT;
11565 goto out;
11566 }
11567 for (auto& i : o->onode.attrs) {
11568 aset.emplace(i.first.c_str(), i.second);
11569 }
11570 r = 0;
11571 }
11572
11573 out:
7c673cae
FG
11574 if (r == 0 && _debug_mdata_eio(oid)) {
11575 r = -EIO;
11576 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11577 }
11578 dout(10) << __func__ << " " << c->cid << " " << oid
11579 << " = " << r << dendl;
11580 return r;
11581}
11582
11583int BlueStore::list_collections(vector<coll_t>& ls)
11584{
9f95a23c 11585 std::shared_lock l(coll_lock);
11fdf7f2 11586 ls.reserve(coll_map.size());
7c673cae
FG
11587 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
11588 p != coll_map.end();
11589 ++p)
11590 ls.push_back(p->first);
11591 return 0;
11592}
11593
11594bool BlueStore::collection_exists(const coll_t& c)
11595{
9f95a23c 11596 std::shared_lock l(coll_lock);
7c673cae
FG
11597 return coll_map.count(c);
11598}
11599
11fdf7f2 11600int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 11601{
11fdf7f2 11602 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
11603 vector<ghobject_t> ls;
11604 ghobject_t next;
11fdf7f2 11605 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
11606 &ls, &next);
11607 if (r < 0) {
11608 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
11609 << dendl;
11610 return r;
11611 }
11612 *empty = ls.empty();
11fdf7f2 11613 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
11614 return 0;
11615}
11616
11fdf7f2 11617int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 11618{
11fdf7f2
TL
11619 dout(15) << __func__ << " " << ch->cid << dendl;
11620 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 11621 std::shared_lock l(c->lock);
11fdf7f2 11622 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
11623 return c->cnode.bits;
11624}
11625
7c673cae
FG
11626int BlueStore::collection_list(
11627 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11628 vector<ghobject_t> *ls, ghobject_t *pnext)
11629{
11630 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 11631 c->flush();
7c673cae
FG
11632 dout(15) << __func__ << " " << c->cid
11633 << " start " << start << " end " << end << " max " << max << dendl;
11634 int r;
11635 {
9f95a23c 11636 std::shared_lock l(c->lock);
f91f0fd5
TL
11637 r = _collection_list(c, start, end, max, false, ls, pnext);
11638 }
11639
11640 dout(10) << __func__ << " " << c->cid
11641 << " start " << start << " end " << end << " max " << max
11642 << " = " << r << ", ls.size() = " << ls->size()
11643 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11644 return r;
11645}
11646
11647int BlueStore::collection_list_legacy(
11648 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
11649 vector<ghobject_t> *ls, ghobject_t *pnext)
11650{
11651 Collection *c = static_cast<Collection *>(c_.get());
11652 c->flush();
11653 dout(15) << __func__ << " " << c->cid
11654 << " start " << start << " end " << end << " max " << max << dendl;
11655 int r;
11656 {
11657 std::shared_lock l(c->lock);
11658 r = _collection_list(c, start, end, max, true, ls, pnext);
7c673cae
FG
11659 }
11660
7c673cae
FG
11661 dout(10) << __func__ << " " << c->cid
11662 << " start " << start << " end " << end << " max " << max
11663 << " = " << r << ", ls.size() = " << ls->size()
11664 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
11665 return r;
11666}
11667
11668int BlueStore::_collection_list(
11669 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
f91f0fd5 11670 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
7c673cae
FG
11671{
11672
11673 if (!c->exists)
11674 return -ENOENT;
11675
7c673cae 11676 ghobject_t static_next;
f91f0fd5
TL
11677 std::unique_ptr<CollectionListIterator> it;
11678 ghobject_t coll_range_temp_start, coll_range_temp_end;
11679 ghobject_t coll_range_start, coll_range_end;
f91f0fd5 11680 ghobject_t pend;
7c673cae
FG
11681 bool temp;
11682
11683 if (!pnext)
11684 pnext = &static_next;
11685
a4b75251
TL
11686 auto log_latency = make_scope_guard(
11687 [&, start_time = mono_clock::now(), func_name = __func__] {
11688 log_latency_fn(
11689 func_name,
11690 l_bluestore_remove_lat,
11691 mono_clock::now() - start_time,
11692 cct->_conf->bluestore_log_collection_list_age,
11693 [&](const ceph::timespan& lat) {
11694 ostringstream ostr;
11695 ostr << ", lat = " << timespan_str(lat)
11696 << " cid =" << c->cid
11697 << " start " << start << " end " << end
11698 << " max " << max;
11699 return ostr.str();
11700 });
11701 });
11702
11fdf7f2 11703 if (start.is_max() || start.hobj.is_max()) {
a4b75251
TL
11704 *pnext = ghobject_t::get_max();
11705 return 0;
7c673cae 11706 }
f91f0fd5 11707 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
a4b75251 11708 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
7c673cae 11709 dout(20) << __func__
f91f0fd5
TL
11710 << " range " << coll_range_temp_start
11711 << " to " << coll_range_temp_end
11712 << " and " << coll_range_start
11713 << " to " << coll_range_end
7c673cae 11714 << " start " << start << dendl;
f91f0fd5
TL
11715 if (legacy) {
11716 it = std::make_unique<SimpleCollectionListIterator>(
11717 cct, db->get_iterator(PREFIX_OBJ));
11718 } else {
11719 it = std::make_unique<SortedCollectionListIterator>(
11720 db->get_iterator(PREFIX_OBJ));
11721 }
7c673cae
FG
11722 if (start == ghobject_t() ||
11723 start.hobj == hobject_t() ||
11724 start == c->cid.get_min_hobj()) {
f91f0fd5 11725 it->upper_bound(coll_range_temp_start);
7c673cae
FG
11726 temp = true;
11727 } else {
7c673cae
FG
11728 if (start.hobj.is_temp()) {
11729 temp = true;
f91f0fd5 11730 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
7c673cae
FG
11731 } else {
11732 temp = false;
f91f0fd5 11733 ceph_assert(start >= coll_range_start && start < coll_range_end);
7c673cae 11734 }
f91f0fd5
TL
11735 dout(20) << __func__ << " temp=" << (int)temp << dendl;
11736 it->lower_bound(start);
7c673cae
FG
11737 }
11738 if (end.hobj.is_max()) {
f91f0fd5 11739 pend = temp ? coll_range_temp_end : coll_range_end;
7c673cae 11740 } else {
7c673cae 11741 if (end.hobj.is_temp()) {
a4b75251 11742 if (temp) {
f91f0fd5 11743 pend = end;
a4b75251
TL
11744 } else {
11745 *pnext = ghobject_t::get_max();
11746 return 0;
11747 }
7c673cae 11748 } else {
f91f0fd5 11749 pend = temp ? coll_range_temp_end : end;
7c673cae
FG
11750 }
11751 }
f91f0fd5 11752 dout(20) << __func__ << " pend " << pend << dendl;
7c673cae 11753 while (true) {
adb31ebb 11754 if (!it->valid() || it->is_ge(pend)) {
7c673cae
FG
11755 if (!it->valid())
11756 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
11757 else
f91f0fd5 11758 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
7c673cae
FG
11759 if (temp) {
11760 if (end.hobj.is_temp()) {
adb31ebb 11761 if (it->valid() && it->is_lt(coll_range_temp_end)) {
f91f0fd5 11762 *pnext = it->oid();
a4b75251 11763 return 0;
f91f0fd5 11764 }
7c673cae
FG
11765 break;
11766 }
11767 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
11768 temp = false;
f91f0fd5
TL
11769 it->upper_bound(coll_range_start);
11770 if (end.hobj.is_max())
11771 pend = coll_range_end;
11772 else
11773 pend = end;
11774 dout(30) << __func__ << " pend " << pend << dendl;
7c673cae
FG
11775 continue;
11776 }
adb31ebb 11777 if (it->valid() && it->is_lt(coll_range_end)) {
f91f0fd5 11778 *pnext = it->oid();
a4b75251 11779 return 0;
f91f0fd5 11780 }
7c673cae
FG
11781 break;
11782 }
f91f0fd5 11783 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
7c673cae
FG
11784 if (ls->size() >= (unsigned)max) {
11785 dout(20) << __func__ << " reached max " << max << dendl;
f91f0fd5 11786 *pnext = it->oid();
a4b75251 11787 return 0;
7c673cae 11788 }
f91f0fd5 11789 ls->push_back(it->oid());
7c673cae
FG
11790 it->next();
11791 }
a4b75251
TL
11792 *pnext = ghobject_t::get_max();
11793 return 0;
7c673cae
FG
11794}
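// Iteration order note: the listing walks the temp-namespace key range
// first and then falls through to the regular range (the "switch to
// non-temp namespace" branch above), so for a given collection temp
// objects are always returned before their non-temp counterparts.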
11795
7c673cae
FG
11796int BlueStore::omap_get(
11797 CollectionHandle &c_, ///< [in] Collection containing oid
11798 const ghobject_t &oid, ///< [in] Object containing omap
11799 bufferlist *header, ///< [out] omap header
11800 map<string, bufferlist> *out ///< [out] Key to value map
11801 )
11802{
11803 Collection *c = static_cast<Collection *>(c_.get());
9f95a23c
TL
11804 return _omap_get(c, oid, header, out);
11805}
11806
11807int BlueStore::_omap_get(
11808 Collection *c, ///< [in] Collection containing oid
11809 const ghobject_t &oid, ///< [in] Object containing omap
11810 bufferlist *header, ///< [out] omap header
11811 map<string, bufferlist> *out ///< [out] Key to value map
11812 )
11813{
7c673cae
FG
11814 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11815 if (!c->exists)
11816 return -ENOENT;
9f95a23c 11817 std::shared_lock l(c->lock);
7c673cae
FG
11818 int r = 0;
11819 OnodeRef o = c->get_onode(oid, false);
11820 if (!o || !o->exists) {
11821 r = -ENOENT;
11822 goto out;
11823 }
9f95a23c
TL
11824 r = _onode_omap_get(o, header, out);
11825 out:
11826 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11827 << dendl;
11828 return r;
11829}
11830
11831int BlueStore::_onode_omap_get(
11832 const OnodeRef &o, ///< [in] Object containing omap
11833 bufferlist *header, ///< [out] omap header
11834 map<string, bufferlist> *out ///< [out] Key to value map
11835)
11836{
11837 int r = 0;
11838 if (!o || !o->exists) {
11839 r = -ENOENT;
11840 goto out;
11841 }
7c673cae
FG
11842 if (!o->onode.has_omap())
11843 goto out;
11844 o->flush();
11845 {
9f95a23c 11846 const string& prefix = o->get_omap_prefix();
11fdf7f2 11847 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 11848 string head, tail;
9f95a23c
TL
11849 o->get_omap_header(&head);
11850 o->get_omap_tail(&tail);
7c673cae
FG
11851 it->lower_bound(head);
11852 while (it->valid()) {
11853 if (it->key() == head) {
9f95a23c
TL
11854 dout(30) << __func__ << " got header" << dendl;
11855 *header = it->value();
7c673cae 11856 } else if (it->key() >= tail) {
9f95a23c
TL
11857 dout(30) << __func__ << " reached tail" << dendl;
11858 break;
7c673cae 11859 } else {
9f95a23c
TL
11860 string user_key;
11861 o->decode_omap_key(it->key(), &user_key);
11862 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
11863 << " -> " << user_key << dendl;
11864 (*out)[user_key] = it->value();
7c673cae
FG
11865 }
11866 it->next();
11867 }
11868 }
9f95a23c 11869out:
7c673cae
FG
11870 return r;
11871}
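// Omap key layout assumed by the scan above: each object's keys share the
// onode's omap prefix, with one header key first, then the user keys, then
// a tail sentinel where the scan stops; conceptually:
//
//   <prefix><header-key>      -> omap header bufferlist
//   <prefix><user-key "foo">  -> value for "foo"
//   <prefix><tail-key>        -> end marker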
11872
7c673cae
FG
11873int BlueStore::omap_get_header(
11874 CollectionHandle &c_, ///< [in] Collection containing oid
11875 const ghobject_t &oid, ///< [in] Object containing omap
11876 bufferlist *header, ///< [out] omap header
11877 bool allow_eio ///< [in] don't assert on eio
11878 )
11879{
11880 Collection *c = static_cast<Collection *>(c_.get());
11881 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11882 if (!c->exists)
11883 return -ENOENT;
9f95a23c 11884 std::shared_lock l(c->lock);
7c673cae
FG
11885 int r = 0;
11886 OnodeRef o = c->get_onode(oid, false);
11887 if (!o || !o->exists) {
11888 r = -ENOENT;
11889 goto out;
11890 }
11891 if (!o->onode.has_omap())
11892 goto out;
11893 o->flush();
11894 {
11895 string head;
9f95a23c
TL
11896 o->get_omap_header(&head);
11897 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
7c673cae
FG
11898 dout(30) << __func__ << " got header" << dendl;
11899 } else {
11900 dout(30) << __func__ << " no header" << dendl;
11901 }
11902 }
11903 out:
11904 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11905 << dendl;
11906 return r;
11907}
11908
7c673cae
FG
11909int BlueStore::omap_get_keys(
11910 CollectionHandle &c_, ///< [in] Collection containing oid
11911 const ghobject_t &oid, ///< [in] Object containing omap
11912 set<string> *keys ///< [out] Keys defined on oid
11913 )
11914{
11915 Collection *c = static_cast<Collection *>(c_.get());
11916 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11917 if (!c->exists)
11918 return -ENOENT;
adb31ebb 11919 auto start1 = mono_clock::now();
9f95a23c 11920 std::shared_lock l(c->lock);
7c673cae
FG
11921 int r = 0;
11922 OnodeRef o = c->get_onode(oid, false);
11923 if (!o || !o->exists) {
11924 r = -ENOENT;
11925 goto out;
11926 }
11927 if (!o->onode.has_omap())
11928 goto out;
11929 o->flush();
11930 {
9f95a23c 11931 const string& prefix = o->get_omap_prefix();
11fdf7f2 11932 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 11933 string head, tail;
9f95a23c
TL
11934 o->get_omap_key(string(), &head);
11935 o->get_omap_tail(&tail);
7c673cae
FG
11936 it->lower_bound(head);
11937 while (it->valid()) {
11938 if (it->key() >= tail) {
11939 dout(30) << __func__ << " reached tail" << dendl;
11940 break;
11941 }
11942 string user_key;
9f95a23c 11943 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 11944 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
11945 << " -> " << user_key << dendl;
11946 keys->insert(user_key);
11947 it->next();
11fdf7f2
TL
11948 }
11949 }
11950 out:
adb31ebb
TL
11951 c->store->log_latency(
11952 __func__,
11953 l_bluestore_omap_get_keys_lat,
11954 mono_clock::now() - start1,
11955 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11956
11fdf7f2
TL
11957 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11958 << dendl;
11959 return r;
7c673cae
FG
11960}
11961
11962int BlueStore::omap_get_values(
11963 CollectionHandle &c_, ///< [in] Collection containing oid
11964 const ghobject_t &oid, ///< [in] Object containing omap
11965 const set<string> &keys, ///< [in] Keys to get
11966 map<string, bufferlist> *out ///< [out] Returned keys and values
11967 )
11968{
11969 Collection *c = static_cast<Collection *>(c_.get());
11970 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11971 if (!c->exists)
11972 return -ENOENT;
9f95a23c 11973 std::shared_lock l(c->lock);
adb31ebb 11974 auto start1 = mono_clock::now();
7c673cae
FG
11975 int r = 0;
11976 string final_key;
11977 OnodeRef o = c->get_onode(oid, false);
11978 if (!o || !o->exists) {
11979 r = -ENOENT;
11980 goto out;
11981 }
9f95a23c 11982 if (!o->onode.has_omap()) {
7c673cae 11983 goto out;
9f95a23c
TL
11984 }
11985 o->flush();
11fdf7f2 11986 {
9f95a23c
TL
11987 const string& prefix = o->get_omap_prefix();
11988 o->get_omap_key(string(), &final_key);
11989 size_t base_key_len = final_key.size();
11fdf7f2 11990 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 11991 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
11992 final_key += *p;
11993 bufferlist val;
11994 if (db->get(prefix, final_key, &val) >= 0) {
11995 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
11996 << " -> " << *p << dendl;
11997 out->insert(make_pair(*p, val));
11998 }
7c673cae
FG
11999 }
12000 }
12001 out:
adb31ebb
TL
12002 c->store->log_latency(
12003 __func__,
12004 l_bluestore_omap_get_values_lat,
12005 mono_clock::now() - start1,
12006 c->store->cct->_conf->bluestore_log_omap_iterator_age);
12007
7c673cae
FG
12008 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12009 << dendl;
12010 return r;
12011}
12012
9f95a23c
TL
12013#ifdef WITH_SEASTAR
12014int BlueStore::omap_get_values(
12015 CollectionHandle &c_, ///< [in] Collection containing oid
12016 const ghobject_t &oid, ///< [in] Object containing omap
12017 const std::optional<string> &start_after, ///< [in] Return keys after this one
12018 map<string, bufferlist> *output ///< [out] Returned keys and values
12019 )
12020{
12021 Collection *c = static_cast<Collection *>(c_.get());
12022 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
12023 if (!c->exists)
12024 return -ENOENT;
12025 std::shared_lock l(c->lock);
12026 int r = 0;
12027 OnodeRef o = c->get_onode(oid, false);
12028 if (!o || !o->exists) {
12029 r = -ENOENT;
12030 goto out;
12031 }
12032 if (!o->onode.has_omap()) {
12033 goto out;
12034 }
12035 o->flush();
12036 {
12037 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
12038 if (!iter) {
12039 r = -ENOENT;
12040 goto out;
12041 }
12042 iter->upper_bound(*start_after);
12043 for (; iter->valid(); iter->next()) {
12044 output->insert(make_pair(iter->key(), iter->value()));
12045 }
12046 }
12047
12048out:
12049 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
12050 << dendl;
12051 return r;
12052}
12053#endif

int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " have " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(*p);
      } else {
        dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,   ///< [in] collection
  const ghobject_t &oid   ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}

// -----------------
// write helpers

uint64_t BlueStore::_get_ondisk_reserved() const {
  ceph_assert(min_alloc_size);
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}
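// Worked example (editorial, with illustrative values): if SUPER_RESERVED is
// 8192 and min_alloc_size is 4096, max(8192, 4096) = 8192 is already 4 KiB
// aligned, so 8192 bytes stay reserved at the front of the device; with a
// 64 KiB min_alloc_size the reservation rounds up to 65536 bytes.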

void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}

int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
    } else {
      ceph_abort_msg("unsupported extent freelist manager");
    }
    dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (ceph::buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        ceph_assert(!r);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (ceph::buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(1) << __func__ << " ondisk_format " << ondisk_format
            << " compat_ondisk_format " << compat_ondisk_format
            << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      min_alloc_size_mask = min_alloc_size - 1;

      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
            << std::dec << dendl;
    logger->set(l_bluestore_alloc_unit, min_alloc_size);
  }

  // smr fields
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "zone_size", &bl);
    if (r >= 0) {
      auto p = bl.cbegin();
      decode(zone_size, p);
      dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
      ceph_assert(bdev->is_smr());
    } else {
      ceph_assert(!bdev->is_smr());
    }
  }
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
    if (r >= 0) {
      auto p = bl.cbegin();
      decode(first_sequential_zone, p);
      dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
              << first_sequential_zone << std::dec << dendl;
      ceph_assert(bdev->is_smr());
    } else {
      ceph_assert(!bdev->is_smr());
    }
  }

  _set_per_pool_omap();

  _open_statfs();
  _set_alloc_sizes();
  _set_throttle_params();

  _set_csum();
  _set_compression();
  _set_blob_size();

  _validate_bdev();
  return 0;
}
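// Editorial note: every PREFIX_SUPER key above follows the same pattern --
// db->get() into a bufferlist, then decode() under a try/catch for
// ceph::buffer::error -- so a short or corrupt value is reported as -EIO at
// mount time instead of escaping as an exception.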

int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (ceph::buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PERPOOL_OMAP.  Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This has to be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  // done
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
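// Editorial note on the upgrade ladder above: each if-block advances
// ondisk_format by exactly one version, so a v1 store walks 1 -> 2 -> 3 -> 4
// within a single call, and all rewritten super keys commit together in one
// synchronous transaction at the end.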

void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}

void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}

BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits,
  TrackedOpRef osd_op)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();

#ifdef WITH_BLKIN
  if (osd_op && osd_op->pg_trace) {
    txc->trace.init("TransContext", &trace_endpoint,
                    &osd_op->pg_trace);
    txc->trace.event("txc create");
    txc->trace.keyval("txc seq", txc->seq);
  }
#endif

  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}

void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  txc->ios = ios;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
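// Worked example (editorial, illustrative values): a txc carrying 2 data
// aios and 64 KiB of payload with throttle_cost_per_io = 670000 costs
//   (1 + 2) * 670000 + 65536 = 2075536
// throttle units; the extra "1" accounts for the kv commit io itself.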

void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; // non-persistent in this mode

  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}

void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->get_state()) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->set_state(TransContext::STATE_AIO_WAIT);
#ifdef WITH_BLKIN
        if (txc->trace) {
          txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
        }
#endif
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
        mono_clock::duration lat = throttle.log_state_latency(
          *txc, logger, l_bluestore_state_aio_wait_lat);
        if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
          dout(0) << __func__ << " slow aio_wait, txc = " << txc
                  << ", latency = " << lat
                  << dendl;
        }
      }

      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->set_state(TransContext::STATE_KV_QUEUED);
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone.  once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so.  the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          _txc_apply_kv(txc, true);
        }
      }
      {
        std::lock_guard l(kv_lock);
        kv_queue.push_back(txc);
        if (!kv_sync_in_progress) {
          kv_sync_in_progress = true;
          kv_cond.notify_one();
        }
        if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios)
          kv_ios++;
        kv_throttle_costs += txc->cost;
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
        _deferred_queue(txc);
        return;
      }
      txc->set_state(TransContext::STATE_FINISHING);
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->set_state(TransContext::STATE_FINISHING);
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}
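// Editorial summary of the state machine above: a txc normally walks
//   PREPARE -> [AIO_WAIT] -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED ->
//   KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP] -> FINISHING -> DONE
// where the bracketed detours are taken only if the txc had pending aios or
// carries a deferred_txn, respectively.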

void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->set_state(TransContext::STATE_IO_DONE);
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->get_state() < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->get_state() > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->get_state() == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}

void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}

void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}
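// Editorial note: both trackers above sample l_bluestore_commit_lat;
// BlueStore has no separate apply stage, so the "apply" metric is reported
// as commit latency.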

void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  if (!fm->is_null_manager())
  {
    // We have to handle the case where we allocate *and* deallocate the
    // same region in this transaction.  The freelist doesn't like that.
    // (Actually, the only thing that cares is the BitmapFreelistManager
    // debug check.  But that's important.)
    interval_set<uint64_t> tmp_allocated, tmp_released;
    interval_set<uint64_t> *pallocated = &txc->allocated;
    interval_set<uint64_t> *preleased = &txc->released;
    if (!txc->allocated.empty() && !txc->released.empty()) {
      interval_set<uint64_t> overlap;
      overlap.intersection_of(txc->allocated, txc->released);
      if (!overlap.empty()) {
        tmp_allocated = txc->allocated;
        tmp_allocated.subtract(overlap);
        tmp_released = txc->released;
        tmp_released.subtract(overlap);
        dout(20) << __func__ << " overlap 0x" << std::hex << overlap
                 << ", new allocated 0x" << tmp_allocated
                 << " released 0x" << tmp_released << std::dec
                 << dendl;
        pallocated = &tmp_allocated;
        preleased = &tmp_released;
      }
    }

    // update freelist with non-overlap sets
    for (interval_set<uint64_t>::iterator p = pallocated->begin();
         p != pallocated->end();
         ++p) {
      fm->allocate(p.get_start(), p.get_len(), t);
    }
    for (interval_set<uint64_t>::iterator p = preleased->begin();
         p != preleased->end();
         ++p) {
      dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
               << "~" << p.get_len() << std::dec << dendl;
      fm->release(p.get_start(), p.get_len(), t);
    }
  }

#ifdef HAVE_LIBZBD
  if (bdev->is_smr()) {
    for (auto& i : txc->old_zone_offset_refs) {
      dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
               << " offset 0x" << i.second << std::dec
               << " -> " << i.first.first->oid << dendl;
      string key;
      get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
      txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
    }
    for (auto& i : txc->new_zone_offset_refs) {
      // (zone, offset) -> oid
      dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
               << " offset 0x" << i.second << std::dec
               << " -> " << i.first.first->oid << dendl;
      string key;
      get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
      bufferlist v;
      txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
    }
  }
#endif

  _txc_update_store_statfs(txc);
}
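// Worked example (editorial) for the overlap handling above: if a txc
// allocated [0x0000, 0x2000) and released [0x1000, 0x3000), the overlap is
// [0x1000, 0x2000); the freelist is then told to allocate only
// [0x0000, 0x1000) and release only [0x2000, 0x3000), so no extent is both
// allocated and freed within a single transaction.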

void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

#ifdef WITH_BLKIN
    if (txc->trace) {
      txc->trace.event("db async submit");
    }
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->set_state(TransContext::STATE_KV_SUBMITTED);
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
        bluestore,
        transaction_kv_submit_latency,
        txc->osr->get_sequencer_id(),
        txc->seq,
        sync_submit_transaction,
        ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
        std::lock_guard l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}

void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->set_state(TransContext::STATE_KV_DONE);
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}

void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->set_state(TransContext::STATE_DONE);
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->get_state() != TransContext::STATE_DONE) {
        if (txc->get_state() == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
               << dendl;
    }
  }
}

void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}

void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has coll_lock
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}

void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}
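// Editorial note on the two helpers above: removing a collection parks its
// OpSequencer in zombie_osr_set so in-flight txcs can drain; re-creating the
// same cid resurrects the parked osr, which preserves submission ordering
// across a remove/create cycle.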

void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    osr->deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      osr->deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    osr->deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      osr->deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}

void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}

void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l{kv_lock};
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();

  auto t0 = mono_clock::now();
  timespan twait = ceph::make_timespan(0);
  size_t kv_submitted = 0;

  while (true) {
    auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
    auto observation_period =
      ceph::make_timespan(period);
    auto elapsed = mono_clock::now() - t0;
    if (period && elapsed >= observation_period) {
      dout(5) << __func__ << " utilization: idle "
              << twait << " of " << elapsed
              << ", submitted: " << kv_submitted
              << dendl;
      t0 = mono_clock::now();
      twait = ceph::make_timespan(0);
      kv_submitted = 0;
    }
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      auto t = mono_clock::now();
      kv_sync_in_progress = false;
      kv_cond.wait(l);
      twait += mono_clock::now() - t;

      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable.  that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs && bluefs_layout.single_shared_device()) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        if (aios || !deferred_done.empty()) {
          force_flush = true;
        } else {
          dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
        }
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
        if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
          ++kv_submitted;
          _txc_apply_kv(txc, false);
          --txc->osr->kv_committing_serially;
        } else {
          ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle.release_kv_throttle(costs);

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          ceph_assert(wt.released.empty()); // only kraken did this
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

#if defined(WITH_LTTNG)
      auto sync_start = mono_clock::now();
#endif
      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

#ifdef WITH_BLKIN
      for (auto txc : kv_committing) {
        if (txc->trace) {
          txc->trace.event("db sync submit");
          txc->trace.keyval("kv_committing size", kv_committing.size());
        }
      }
#endif

      int committing_size = kv_committing.size();
      int deferred_size = deferred_stable.size();

#if defined(WITH_LTTNG)
      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
      for (auto txc: kv_committing) {
        if (txc->tracing) {
          tracepoint(
            bluestore,
            transaction_kv_sync_latency,
            txc->osr->get_sequencer_id(),
            txc->seq,
            kv_committing.size(),
            deferred_done.size(),
            deferred_stable.size(),
            sync_latency);
        }
      }
#endif

      {
        std::unique_lock m{kv_finalize_lock};
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
            kv_committing_to_finalize.end(),
            kv_committing.begin(),
            kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
            deferred_stable_to_finalize.end(),
            deferred_stable.begin(),
            deferred_stable.end());
          deferred_stable.clear();
        }
        if (!kv_finalize_in_progress) {
          kv_finalize_in_progress = true;
          kv_finalize_cond.notify_one();
        }
      }

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        auto finish = mono_clock::now();
        ceph::timespan dur_flush = after_flush - start;
        ceph::timespan dur_kv = finish - after_flush;
        ceph::timespan dur = finish - start;
        dout(20) << __func__ << " committed " << committing_size
                 << " cleaned " << deferred_size
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        log_latency("kv_flush",
                    l_bluestore_kv_flush_lat,
                    dur_flush,
                    cct->_conf->bluestore_log_op_age);
        log_latency("kv_commit",
                    l_bluestore_kv_commit_lat,
                    dur_kv,
                    cct->_conf->bluestore_log_op_age);
        log_latency("kv_sync",
                    l_bluestore_kv_sync_lat,
                    dur,
                    cct->_conf->bluestore_log_op_age);
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
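// Worked example (editorial) of the prealloc check above: with
// bluestore_nid_prealloc = 1024, nid_last = 1000 and nid_max = 1024, the
// test 1000 + 512 > 1024 fires and nid_max is bumped to 1000 + 1024 = 2024
// in the earliest transaction of the cycle; assignment itself remains a
// lock-free ++nid_last.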

void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
                  (uint64_t)(alloc->get_fragmentation() * 1000));

      log_latency("kv_final",
                  l_bluestore_kv_final_lat,
                  mono_clock::now() - start,
                  cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
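// Editorial note: the kv work is deliberately split across two threads --
// _kv_sync_thread owns the device flush and the synchronous RocksDB commit,
// while _kv_finalize_thread advances committed txcs through the state
// machine and fires completions, keeping callback work off the commit
// critical path.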

#ifdef HAVE_LIBZBD
void BlueStore::_zoned_cleaner_start()
{
  dout(10) << __func__ << dendl;
  zoned_cleaner_thread.create("bstore_zcleaner");
}

void BlueStore::_zoned_cleaner_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{zoned_cleaner_lock};
    while (!zoned_cleaner_started) {
      zoned_cleaner_cond.wait(l);
    }
    zoned_cleaner_stop = true;
    zoned_cleaner_cond.notify_all();
  }
  zoned_cleaner_thread.join();
  {
    std::lock_guard l{zoned_cleaner_lock};
    zoned_cleaner_stop = false;
  }
  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_zoned_cleaner_thread()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  auto a = dynamic_cast<ZonedAllocator*>(alloc);
  ceph_assert(a);
  auto f = dynamic_cast<ZonedFreelistManager*>(fm);
  ceph_assert(f);
  while (true) {
    // thresholds to trigger cleaning
    // FIXME
    float min_score = .05;                // score: bytes saved / bytes moved
    uint64_t min_saved = zone_size / 32;  // min bytes saved to consider cleaning
    auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
    if (zone_to_clean < 0) {
      if (zoned_cleaner_stop) {
        break;
      }
      auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
      dout(20) << __func__ << " sleep for " << period << dendl;
      zoned_cleaner_cond.wait_for(l, period);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      l.unlock();
      a->set_cleaning_zone(zone_to_clean);
      _zoned_clean_zone(zone_to_clean, a, f);
      a->clear_cleaning_zone(zone_to_clean);
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}
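// Worked example (editorial) of the thresholds above: with a 256 MiB zone,
// min_saved = 256 MiB / 32 = 8 MiB, and min_score = .05 requires one byte
// reclaimed per twenty bytes rewritten -- e.g. moving 100 MiB of live data
// must free at least 5 MiB for the zone to qualify for cleaning.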

void BlueStore::_zoned_clean_zone(
  uint64_t zone,
  ZonedAllocator *a,
  ZonedFreelistManager *f
  )
{
  dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;

  KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
  std::string zone_start;
  get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
  for (it->lower_bound(zone_start); it->valid(); it->next()) {
    uint32_t z;
    uint64_t offset;
    ghobject_t oid;
    string k = it->key();
    int r = get_key_zone_offset_object(k, &z, &offset, &oid);
    if (r < 0) {
      derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
           << dendl;
      continue;
    }
    if (zone != z) {
      dout(10) << __func__ << " reached end of zone refs" << dendl;
      break;
    }
    dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
             << std::dec << " " << oid << dendl;
    _clean_some(oid, zone);
  }

  if (a->get_live_bytes(zone) > 0) {
    derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
         << " live bytes" << std::dec << dendl;
    // should we do something else here to avoid a live-lock in the event of a problem?
    return;
  }

  // make sure transactions flush/drain/commit (and data is all rewritten
  // safely elsewhere) before we blow away the cleaned zone
  _osr_drain_all();

  // reset the device zone
  dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
  bdev->reset_zone(zone);

  // record that we can now write there
  f->mark_zone_to_clean_free(zone, db);
  bdev->flush();

  // then allow ourselves to start allocating there
  dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
           << dendl;
  a->reset_zone(zone);
}

void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
{
  dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
           << dendl;

  CollectionRef cref = _get_collection_by_oid(oid);
  if (!cref) {
    dout(10) << __func__ << " can't find collection for " << oid << dendl;
    return;
  }
  Collection *c = cref.get();

  // serialize io dispatch vs other transactions
  std::lock_guard l(atomic_alloc_and_submit_lock);
  std::unique_lock l2(c->lock);

  auto o = c->get_onode(oid, false);
  if (!o) {
    dout(10) << __func__ << " can't find " << oid << dendl;
    return;
  }

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);

  // NOTE: This is a naive rewrite strategy.  If any blobs are
  // shared, they will be duplicated for each object that references
  // them.  That means any cloned/snapshotted objects will explode
  // their utilization.  This won't matter for RGW workloads, but
  // for RBD and CephFS it is completely unacceptable, and it's
  // entirely reasonable to have "archival" data workloads on SMR
  // for CephFS and (possibly/probably) RBD.
  //
  // At some point we need to replace this with something more
  // sophisticated that ensures that a shared blob gets moved once
  // and all referencing objects get updated to point to the new
  // location.

  map<uint32_t, uint32_t> to_move;
  for (auto& e : o->extent_map.extent_map) {
    bool touches_zone = false;
    for (auto& be : e.blob->get_blob().get_extents()) {
      if (be.is_valid()) {
        uint32_t z = be.offset / zone_size;
        if (z == zone) {
          touches_zone = true;
          break;
        }
      }
    }
    if (touches_zone) {
      to_move[e.logical_offset] = e.length;
    }
  }
  if (to_move.empty()) {
    dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
             << std::dec << " from " << oid << dendl;
    return;
  }

  dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
           << std::dec << dendl;
  OpSequencer *osr = c->osr.get();
  TransContext *txc = _txc_create(c, osr, nullptr);

  spg_t pgid;
  if (c->cid.is_pg(&pgid)) {
    txc->osd_pool_id = pgid.pool();
  }

  for (auto& [offset, length] : to_move) {
    bufferlist bl;
    int r = _do_read(c, o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    r = _do_write(txc, cref, o, offset, length, bl, 0);
    ceph_assert(r >= 0);
  }
  txc->write_onode(o);

  _txc_write_nodes(txc, txc->t);
  _txc_finalize_kv(txc, txc->t);
  _txc_state_proc(txc);
}
#endif
f67539c2 13665
7c673cae 13666bluestore_deferred_op_t *BlueStore::_get_deferred_op(
522d829b 13667 TransContext *txc, uint64_t len)
7c673cae
FG
13668{
13669 if (!txc->deferred_txn) {
13670 txc->deferred_txn = new bluestore_deferred_transaction_t;
13671 }
13672 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
20effc67
TL
13673 logger->inc(l_bluestore_issued_deferred_writes);
13674 logger->inc(l_bluestore_issued_deferred_write_bytes, len);
7c673cae
FG
13675 return &txc->deferred_txn->ops.back();
13676}
13677
13678void BlueStore::_deferred_queue(TransContext *txc)
13679{
13680 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
f67539c2
TL
13681
13682 DeferredBatch *tmp;
13683 txc->osr->deferred_lock.lock();
13684 {
13685 if (!txc->osr->deferred_pending) {
13686 tmp = new DeferredBatch(cct, txc->osr.get());
13687 } else {
13688 tmp = txc->osr->deferred_pending;
13689 }
7c673cae 13690 }
f67539c2
TL
13691
13692 tmp->txcs.push_back(*txc);
7c673cae
FG
13693 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
13694 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
13695 const auto& op = *opi;
11fdf7f2 13696 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
7c673cae
FG
13697 bufferlist::const_iterator p = op.data.begin();
13698 for (auto e : op.extents) {
f67539c2 13699 tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
7c673cae
FG
13700 }
13701 }
f67539c2
TL
13702
13703 {
13704 ++deferred_queue_size;
13705 txc->osr->deferred_pending = tmp;
 13706 // the condition "tmp->txcs.size() == 1" means deferred_pending was
 13707 // originally empty, so we should add this osr to deferred_queue.
13708 if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
13709 deferred_lock.lock();
13710 deferred_queue.push_back(*txc->osr);
13711 deferred_lock.unlock();
13712 }
13713
13714 if (deferred_aggressive &&
13715 !txc->osr->deferred_running) {
13716 _deferred_submit_unlock(txc->osr.get());
13717 } else {
13718 txc->osr->deferred_lock.unlock();
13719 }
7c673cae 13720 }
f67539c2 13721 }
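// Batch lifecycle implied by the code above and below (a summary sketch,
// not new behavior): a txc first lands in osr->deferred_pending, where the
// batch accumulates prepare_write() ranges; _deferred_submit_unlock() moves
// the batch to osr->deferred_running and issues its aio_writes; and
// _deferred_aio_finish() retires it to deferred_done_queue. At most one
// batch per OpSequencer is running at a time, while a second batch may
// keep accumulating in deferred_pending.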
7c673cae 13722
224ce89b 13723void BlueStore::deferred_try_submit()
7c673cae
FG
13724{
13725 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
13726 << deferred_queue_size << " txcs" << dendl;
224ce89b 13727 vector<OpSequencerRef> osrs;
f67539c2
TL
13728
13729 {
13730 std::lock_guard l(deferred_lock);
13731 osrs.reserve(deferred_queue.size());
13732 for (auto& osr : deferred_queue) {
13733 osrs.push_back(&osr);
13734 }
224ce89b 13735 }
f67539c2 13736
224ce89b 13737 for (auto& osr : osrs) {
f67539c2 13738 osr->deferred_lock.lock();
181888fb
FG
13739 if (osr->deferred_pending) {
13740 if (!osr->deferred_running) {
13741 _deferred_submit_unlock(osr.get());
181888fb 13742 } else {
f67539c2 13743 osr->deferred_lock.unlock();
181888fb
FG
13744 dout(20) << __func__ << " osr " << osr << " already has running"
13745 << dendl;
13746 }
13747 } else {
f67539c2 13748 osr->deferred_lock.unlock();
181888fb 13749 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
7c673cae
FG
13750 }
13751 }
9f95a23c 13752
f67539c2
TL
13753 {
13754 std::lock_guard l(deferred_lock);
13755 deferred_last_submitted = ceph_clock_now();
13756 }
7c673cae
FG
13757}
13758
224ce89b 13759void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
13760{
13761 dout(10) << __func__ << " osr " << osr
13762 << " " << osr->deferred_pending->iomap.size() << " ios pending "
13763 << dendl;
11fdf7f2
TL
13764 ceph_assert(osr->deferred_pending);
13765 ceph_assert(!osr->deferred_running);
7c673cae
FG
13766
13767 auto b = osr->deferred_pending;
13768 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 13769 ceph_assert(deferred_queue_size >= 0);
7c673cae
FG
13770
13771 osr->deferred_running = osr->deferred_pending;
13772 osr->deferred_pending = nullptr;
13773
f67539c2 13774 osr->deferred_lock.unlock();
11fdf7f2
TL
13775
13776 for (auto& txc : b->txcs) {
9f95a23c 13777 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
11fdf7f2 13778 }
7c673cae
FG
13779 uint64_t start = 0, pos = 0;
13780 bufferlist bl;
13781 auto i = b->iomap.begin();
13782 while (true) {
13783 if (i == b->iomap.end() || i->first != pos) {
13784 if (bl.length()) {
13785 dout(20) << __func__ << " write 0x" << std::hex
13786 << start << "~" << bl.length()
13787 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 13788 if (!g_conf()->bluestore_debug_omit_block_device_write) {
20effc67
TL
13789 logger->inc(l_bluestore_submitted_deferred_writes);
13790 logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
7c673cae 13791 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 13792 ceph_assert(r == 0);
7c673cae
FG
13793 }
13794 }
13795 if (i == b->iomap.end()) {
13796 break;
13797 }
13798 start = 0;
13799 pos = i->first;
13800 bl.clear();
13801 }
13802 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
13803 << std::hex << pos << "~" << i->second.bl.length() << std::dec
13804 << dendl;
13805 if (!bl.length()) {
13806 start = pos;
13807 }
13808 pos += i->second.bl.length();
13809 bl.claim_append(i->second.bl);
13810 ++i;
13811 }
224ce89b 13812
7c673cae
FG
13813 bdev->aio_submit(&b->ioc);
13814}
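// The while-loop above merges physically contiguous iomap entries into a
// single aio_write. A minimal standalone sketch of the same run-merging
// idea (hypothetical types and a hypothetical submit() helper, for
// illustration only -- not BlueStore code):
//
//   std::map<uint64_t, std::string> iomap;  // disk offset -> payload
//   uint64_t start = 0, pos = 0;
//   std::string run;
//   for (auto i = iomap.begin(); ; ++i) {
//     if (i == iomap.end() || i->first != pos) {
//       if (!run.empty()) {
//         submit(start, run);  // one write per physically contiguous run
//       }
//       if (i == iomap.end()) break;
//       start = pos = i->first;
//       run.clear();
//     }
//     pos += i->second.size();
//     run += i->second;
//   }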
13815
3efd9988
FG
13816struct C_DeferredTrySubmit : public Context {
13817 BlueStore *store;
13818 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
13819 void finish(int r) {
13820 store->deferred_try_submit();
13821 }
13822};
13823
7c673cae
FG
13824void BlueStore::_deferred_aio_finish(OpSequencer *osr)
13825{
13826 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 13827 ceph_assert(osr->deferred_running);
7c673cae
FG
13828 DeferredBatch *b = osr->deferred_running;
13829
13830 {
f67539c2 13831 osr->deferred_lock.lock();
11fdf7f2 13832 ceph_assert(osr->deferred_running == b);
7c673cae
FG
13833 osr->deferred_running = nullptr;
13834 if (!osr->deferred_pending) {
181888fb 13835 dout(20) << __func__ << " dequeueing" << dendl;
f67539c2
TL
13836 {
13837 deferred_lock.lock();
13838 auto q = deferred_queue.iterator_to(*osr);
13839 deferred_queue.erase(q);
13840 deferred_lock.unlock();
13841 }
13842 osr->deferred_lock.unlock();
181888fb 13843 } else {
f67539c2 13844 osr->deferred_lock.unlock();
9f95a23c
TL
13845 if (deferred_aggressive) {
13846 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
13847 finisher.queue(new C_DeferredTrySubmit(this));
13848 } else {
13849 dout(20) << __func__ << " leaving queued, more pending" << dendl;
13850 }
7c673cae
FG
13851 }
13852 }
13853
13854 {
31f18b77 13855 uint64_t costs = 0;
11fdf7f2 13856 {
11fdf7f2
TL
13857 for (auto& i : b->txcs) {
13858 TransContext *txc = &i;
9f95a23c 13859 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
f67539c2 13860 txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
11fdf7f2
TL
13861 costs += txc->cost;
13862 }
7c673cae 13863 }
9f95a23c 13864 throttle.release_deferred_throttle(costs);
7c673cae
FG
13865 }
13866
9f95a23c 13867 {
11fdf7f2 13868 std::lock_guard l(kv_lock);
9f95a23c
TL
13869 deferred_done_queue.emplace_back(b);
13870
13871 // in the normal case, do not bother waking up the kv thread; it will
13872 // catch us on the next commit anyway.
13873 if (deferred_aggressive && !kv_sync_in_progress) {
13874 kv_sync_in_progress = true;
13875 kv_cond.notify_one();
13876 }
7c673cae
FG
13877 }
13878}
13879
13880int BlueStore::_deferred_replay()
13881{
13882 dout(10) << __func__ << " start" << dendl;
7c673cae
FG
13883 int count = 0;
13884 int r = 0;
11fdf7f2
TL
13885 CollectionRef ch = _get_collection(coll_t::meta());
13886 bool fake_ch = false;
13887 if (!ch) {
13888 // hmm, replaying initial mkfs?
13889 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
13890 fake_ch = true;
13891 }
13892 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
7c673cae
FG
13893 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
13894 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
13895 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
13896 << dendl;
13897 bluestore_deferred_transaction_t *deferred_txn =
13898 new bluestore_deferred_transaction_t;
13899 bufferlist bl = it->value();
11fdf7f2 13900 auto p = bl.cbegin();
7c673cae 13901 try {
11fdf7f2 13902 decode(*deferred_txn, p);
f67539c2 13903 } catch (ceph::buffer::error& e) {
7c673cae
FG
13904 derr << __func__ << " failed to decode deferred txn "
13905 << pretty_binary_string(it->key()) << dendl;
13906 delete deferred_txn;
13907 r = -EIO;
13908 goto out;
13909 }
11fdf7f2 13910 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
7c673cae 13911 txc->deferred_txn = deferred_txn;
f67539c2 13912 txc->set_state(TransContext::STATE_KV_DONE);
7c673cae
FG
13913 _txc_state_proc(txc);
13914 }
13915 out:
13916 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 13917 _osr_register_zombie(osr);
7c673cae 13918 _osr_drain_all();
11fdf7f2
TL
13919 if (fake_ch) {
13920 new_coll_map.clear();
13921 }
7c673cae
FG
13922 dout(10) << __func__ << " completed " << count << " events" << dendl;
13923 return r;
13924}
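// Replay sketch (summarizing the code above): each PREFIX_DEFERRED key
// holds an encoded bluestore_deferred_transaction_t. On startup we decode
// it, wrap it in a fresh TransContext that starts at STATE_KV_DONE (the KV
// commit already happened before the shutdown/crash), and let
// _txc_state_proc re-issue just the deferred block-device writes.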
13925
13926// ---------------------------
13927// transactions
13928
13929int BlueStore::queue_transactions(
11fdf7f2
TL
13930 CollectionHandle& ch,
13931 vector<Transaction>& tls,
13932 TrackedOpRef op,
13933 ThreadPool::TPHandle *handle)
13934{
13935 FUNCTRACE(cct);
13936 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 13937 ObjectStore::Transaction::collect_contexts(
11fdf7f2 13938 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae 13939
11fdf7f2
TL
13940 auto start = mono_clock::now();
13941
13942 Collection *c = static_cast<Collection*>(ch.get());
13943 OpSequencer *osr = c->osr.get();
13944 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
7c673cae 13945
f67539c2
TL
13946 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
13947 // submission to happen atomically because if I/O submission happens in a
13948 // different order than I/O allocation, we end up issuing non-sequential
13949 // writes to the drive. This is a temporary solution until ZONE APPEND
13950 // support matures in the kernel. For more information please see:
13951 // https://www.usenix.org/conference/vault20/presentation/bjorling
13952 if (bdev->is_smr()) {
13953 atomic_alloc_and_submit_lock.lock();
13954 }
20effc67
TL
13955
13956 // prepare
13957 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
13958 &on_commit, op);
13959
7c673cae 13960 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
7c673cae
FG
13961 txc->bytes += (*p).get_num_bytes();
13962 _txc_add_transaction(txc, &(*p));
13963 }
13964 _txc_calc_cost(txc);
13965
13966 _txc_write_nodes(txc, txc->t);
13967
13968 // journal deferred items
13969 if (txc->deferred_txn) {
13970 txc->deferred_txn->seq = ++deferred_seq;
13971 bufferlist bl;
11fdf7f2 13972 encode(*txc->deferred_txn, bl);
7c673cae
FG
13973 string key;
13974 get_deferred_key(txc->deferred_txn->seq, &key);
13975 txc->t->set(PREFIX_DEFERRED, key, bl);
13976 }
13977
13978 _txc_finalize_kv(txc, txc->t);
f67539c2
TL
13979
13980#ifdef WITH_BLKIN
13981 if (txc->trace) {
13982 txc->trace.event("txc encode finished");
13983 }
13984#endif
13985
7c673cae
FG
13986 if (handle)
13987 handle->suspend_tp_timeout();
13988
11fdf7f2 13989 auto tstart = mono_clock::now();
9f95a23c
TL
13990
13991 if (!throttle.try_start_transaction(
13992 *db,
13993 *txc,
13994 tstart)) {
7c673cae 13995 // ensure we do not block here because of deferred writes
9f95a23c
TL
13996 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
13997 << dendl;
13998 ++deferred_aggressive;
13999 deferred_try_submit();
14000 {
14001 // wake up any previously finished deferred events
14002 std::lock_guard l(kv_lock);
14003 if (!kv_sync_in_progress) {
14004 kv_sync_in_progress = true;
3efd9988
FG
14005 kv_cond.notify_one();
14006 }
9f95a23c
TL
14007 }
14008 throttle.finish_start_transaction(*db, *txc, tstart);
14009 --deferred_aggressive;
7c673cae 14010 }
11fdf7f2 14011 auto tend = mono_clock::now();
7c673cae
FG
14012
14013 if (handle)
14014 handle->reset_tp_timeout();
14015
14016 logger->inc(l_bluestore_txc);
14017
14018 // execute (start)
14019 _txc_state_proc(txc);
14020
f67539c2
TL
14021 if (bdev->is_smr()) {
14022 atomic_alloc_and_submit_lock.unlock();
14023 }
14024
11fdf7f2
TL
14025 // we're immediately readable (unlike FileStore)
14026 for (auto c : on_applied_sync) {
14027 c->complete(0);
14028 }
14029 if (!on_applied.empty()) {
14030 if (c->commit_queue) {
14031 c->commit_queue->queue(on_applied);
14032 } else {
14033 finisher.queue(on_applied);
14034 }
14035 }
14036
f67539c2
TL
14037#ifdef WITH_BLKIN
14038 if (txc->trace) {
14039 txc->trace.event("txc applied");
14040 }
14041#endif
14042
494da23a
TL
14043 log_latency("submit_transact",
14044 l_bluestore_submit_lat,
14045 mono_clock::now() - start,
14046 cct->_conf->bluestore_log_op_age);
14047 log_latency("throttle_transact",
14048 l_bluestore_throttle_lat,
14049 tend - tstart,
14050 cct->_conf->bluestore_log_op_age);
7c673cae
FG
14051 return 0;
14052}
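// queue_transactions pipeline, as implemented above (summary only):
//   1. create a TransContext and translate each Transaction via
//      _txc_add_transaction;
//   2. persist onode/blob metadata (_txc_write_nodes) and journal any
//      deferred payload under PREFIX_DEFERRED;
//   3. finalize the KV batch (_txc_finalize_kv), throttle, then kick the
//      state machine with _txc_state_proc;
//   4. complete on_applied/on_applied_sync right away -- BlueStore data is
//      readable as soon as the transaction is prepared.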
14053
14054void BlueStore::_txc_aio_submit(TransContext *txc)
14055{
14056 dout(10) << __func__ << " txc " << txc << dendl;
14057 bdev->aio_submit(&txc->ioc);
14058}
14059
14060void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
14061{
14062 Transaction::iterator i = t->begin();
14063
81eedcae 14064 _dump_transaction<30>(cct, t);
7c673cae
FG
14065
14066 vector<CollectionRef> cvec(i.colls.size());
14067 unsigned j = 0;
14068 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
14069 ++p, ++j) {
14070 cvec[j] = _get_collection(*p);
7c673cae 14071 }
11fdf7f2 14072
7c673cae
FG
14073 vector<OnodeRef> ovec(i.objects.size());
14074
14075 for (int pos = 0; i.have_op(); ++pos) {
14076 Transaction::Op *op = i.decode_op();
14077 int r = 0;
14078
14079 // no coll or obj
14080 if (op->op == Transaction::OP_NOP)
14081 continue;
14082
11fdf7f2 14083
7c673cae
FG
14084 // collection operations
14085 CollectionRef &c = cvec[op->cid];
11fdf7f2
TL
14086
14087 // initialize osd_pool_id and do a smoke test that all collections belong
14088 // to the same pool
14089 spg_t pgid;
14090 if (!!c ? c->cid.is_pg(&pgid) : false) {
14091 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
14092 txc->osd_pool_id == pgid.pool());
14093 txc->osd_pool_id = pgid.pool();
14094 }
14095
7c673cae
FG
14096 switch (op->op) {
14097 case Transaction::OP_RMCOLL:
14098 {
14099 const coll_t &cid = i.get_cid(op->cid);
14100 r = _remove_collection(txc, cid, &c);
14101 if (!r)
14102 continue;
14103 }
14104 break;
14105
14106 case Transaction::OP_MKCOLL:
14107 {
11fdf7f2 14108 ceph_assert(!c);
7c673cae
FG
14109 const coll_t &cid = i.get_cid(op->cid);
14110 r = _create_collection(txc, cid, op->split_bits, &c);
14111 if (!r)
14112 continue;
14113 }
14114 break;
14115
14116 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 14117 ceph_abort_msg("deprecated");
7c673cae
FG
14118 break;
14119
14120 case Transaction::OP_SPLIT_COLLECTION2:
14121 {
14122 uint32_t bits = op->split_bits;
14123 uint32_t rem = op->split_rem;
14124 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
14125 if (!r)
14126 continue;
14127 }
14128 break;
14129
11fdf7f2
TL
14130 case Transaction::OP_MERGE_COLLECTION:
14131 {
14132 uint32_t bits = op->split_bits;
14133 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
14134 if (!r)
14135 continue;
14136 }
14137 break;
14138
7c673cae
FG
14139 case Transaction::OP_COLL_HINT:
14140 {
f67539c2 14141 uint32_t type = op->hint;
7c673cae
FG
14142 bufferlist hint;
14143 i.decode_bl(hint);
11fdf7f2 14144 auto hiter = hint.cbegin();
7c673cae
FG
14145 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
14146 uint32_t pg_num;
14147 uint64_t num_objs;
11fdf7f2
TL
14148 decode(pg_num, hiter);
14149 decode(num_objs, hiter);
7c673cae
FG
14150 dout(10) << __func__ << " collection hint objects is a no-op, "
14151 << " pg_num " << pg_num << " num_objects " << num_objs
14152 << dendl;
14153 } else {
14154 // Ignore the hint
14155 dout(10) << __func__ << " unknown collection hint " << type << dendl;
14156 }
14157 continue;
14158 }
14159 break;
14160
14161 case Transaction::OP_COLL_SETATTR:
14162 r = -EOPNOTSUPP;
14163 break;
14164
14165 case Transaction::OP_COLL_RMATTR:
14166 r = -EOPNOTSUPP;
14167 break;
14168
14169 case Transaction::OP_COLL_RENAME:
11fdf7f2 14170 ceph_abort_msg("not implemented");
7c673cae
FG
14171 break;
14172 }
14173 if (r < 0) {
14174 derr << __func__ << " error " << cpp_strerror(r)
14175 << " not handled on operation " << op->op
14176 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 14177 _dump_transaction<0>(cct, t);
11fdf7f2 14178 ceph_abort_msg("unexpected error");
7c673cae
FG
14179 }
14180
 14181 // these operations implicitly create the object
14182 bool create = false;
14183 if (op->op == Transaction::OP_TOUCH ||
9f95a23c 14184 op->op == Transaction::OP_CREATE ||
7c673cae
FG
14185 op->op == Transaction::OP_WRITE ||
14186 op->op == Transaction::OP_ZERO) {
14187 create = true;
14188 }
14189
14190 // object operations
9f95a23c 14191 std::unique_lock l(c->lock);
7c673cae
FG
14192 OnodeRef &o = ovec[op->oid];
14193 if (!o) {
14194 ghobject_t oid = i.get_oid(op->oid);
9f95a23c 14195 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
7c673cae
FG
14196 }
14197 if (!create && (!o || !o->exists)) {
14198 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
14199 << i.get_oid(op->oid) << dendl;
14200 r = -ENOENT;
14201 goto endop;
14202 }
14203
14204 switch (op->op) {
9f95a23c 14205 case Transaction::OP_CREATE:
7c673cae
FG
14206 case Transaction::OP_TOUCH:
14207 r = _touch(txc, c, o);
14208 break;
14209
14210 case Transaction::OP_WRITE:
14211 {
14212 uint64_t off = op->off;
14213 uint64_t len = op->len;
14214 uint32_t fadvise_flags = i.get_fadvise_flags();
14215 bufferlist bl;
14216 i.decode_bl(bl);
14217 r = _write(txc, c, o, off, len, bl, fadvise_flags);
14218 }
14219 break;
14220
14221 case Transaction::OP_ZERO:
14222 {
14223 uint64_t off = op->off;
14224 uint64_t len = op->len;
14225 r = _zero(txc, c, o, off, len);
14226 }
14227 break;
14228
14229 case Transaction::OP_TRIMCACHE:
14230 {
14231 // deprecated, no-op
14232 }
14233 break;
14234
14235 case Transaction::OP_TRUNCATE:
14236 {
14237 uint64_t off = op->off;
35e4c445 14238 r = _truncate(txc, c, o, off);
7c673cae
FG
14239 }
14240 break;
14241
14242 case Transaction::OP_REMOVE:
14243 {
14244 r = _remove(txc, c, o);
14245 }
14246 break;
14247
14248 case Transaction::OP_SETATTR:
14249 {
14250 string name = i.decode_string();
14251 bufferptr bp;
14252 i.decode_bp(bp);
14253 r = _setattr(txc, c, o, name, bp);
14254 }
14255 break;
14256
14257 case Transaction::OP_SETATTRS:
14258 {
14259 map<string, bufferptr> aset;
14260 i.decode_attrset(aset);
14261 r = _setattrs(txc, c, o, aset);
14262 }
14263 break;
14264
14265 case Transaction::OP_RMATTR:
14266 {
14267 string name = i.decode_string();
14268 r = _rmattr(txc, c, o, name);
14269 }
14270 break;
14271
14272 case Transaction::OP_RMATTRS:
14273 {
14274 r = _rmattrs(txc, c, o);
14275 }
14276 break;
14277
14278 case Transaction::OP_CLONE:
14279 {
14280 OnodeRef& no = ovec[op->dest_oid];
14281 if (!no) {
14282 const ghobject_t& noid = i.get_oid(op->dest_oid);
14283 no = c->get_onode(noid, true);
14284 }
14285 r = _clone(txc, c, o, no);
14286 }
14287 break;
14288
14289 case Transaction::OP_CLONERANGE:
11fdf7f2 14290 ceph_abort_msg("deprecated");
7c673cae
FG
14291 break;
14292
14293 case Transaction::OP_CLONERANGE2:
14294 {
14295 OnodeRef& no = ovec[op->dest_oid];
14296 if (!no) {
14297 const ghobject_t& noid = i.get_oid(op->dest_oid);
14298 no = c->get_onode(noid, true);
14299 }
14300 uint64_t srcoff = op->off;
14301 uint64_t len = op->len;
14302 uint64_t dstoff = op->dest_off;
14303 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
14304 }
14305 break;
14306
14307 case Transaction::OP_COLL_ADD:
11fdf7f2 14308 ceph_abort_msg("not implemented");
7c673cae
FG
14309 break;
14310
14311 case Transaction::OP_COLL_REMOVE:
11fdf7f2 14312 ceph_abort_msg("not implemented");
7c673cae
FG
14313 break;
14314
14315 case Transaction::OP_COLL_MOVE:
11fdf7f2 14316 ceph_abort_msg("deprecated");
7c673cae
FG
14317 break;
14318
14319 case Transaction::OP_COLL_MOVE_RENAME:
14320 case Transaction::OP_TRY_RENAME:
14321 {
11fdf7f2 14322 ceph_assert(op->cid == op->dest_cid);
7c673cae
FG
14323 const ghobject_t& noid = i.get_oid(op->dest_oid);
14324 OnodeRef& no = ovec[op->dest_oid];
14325 if (!no) {
14326 no = c->get_onode(noid, false);
14327 }
14328 r = _rename(txc, c, o, no, noid);
14329 }
14330 break;
14331
14332 case Transaction::OP_OMAP_CLEAR:
14333 {
14334 r = _omap_clear(txc, c, o);
14335 }
14336 break;
14337 case Transaction::OP_OMAP_SETKEYS:
14338 {
14339 bufferlist aset_bl;
14340 i.decode_attrset_bl(&aset_bl);
14341 r = _omap_setkeys(txc, c, o, aset_bl);
14342 }
14343 break;
14344 case Transaction::OP_OMAP_RMKEYS:
14345 {
14346 bufferlist keys_bl;
14347 i.decode_keyset_bl(&keys_bl);
14348 r = _omap_rmkeys(txc, c, o, keys_bl);
14349 }
14350 break;
14351 case Transaction::OP_OMAP_RMKEYRANGE:
14352 {
14353 string first, last;
14354 first = i.decode_string();
14355 last = i.decode_string();
14356 r = _omap_rmkey_range(txc, c, o, first, last);
14357 }
14358 break;
14359 case Transaction::OP_OMAP_SETHEADER:
14360 {
14361 bufferlist bl;
14362 i.decode_bl(bl);
14363 r = _omap_setheader(txc, c, o, bl);
14364 }
14365 break;
14366
14367 case Transaction::OP_SETALLOCHINT:
14368 {
14369 r = _set_alloc_hint(txc, c, o,
14370 op->expected_object_size,
14371 op->expected_write_size,
f67539c2 14372 op->hint);
7c673cae
FG
14373 }
14374 break;
14375
14376 default:
11fdf7f2 14377 derr << __func__ << " bad op " << op->op << dendl;
7c673cae
FG
14378 ceph_abort();
14379 }
14380
14381 endop:
14382 if (r < 0) {
14383 bool ok = false;
14384
14385 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
14386 op->op == Transaction::OP_CLONE ||
14387 op->op == Transaction::OP_CLONERANGE2 ||
14388 op->op == Transaction::OP_COLL_ADD ||
14389 op->op == Transaction::OP_SETATTR ||
14390 op->op == Transaction::OP_SETATTRS ||
14391 op->op == Transaction::OP_RMATTR ||
14392 op->op == Transaction::OP_OMAP_SETKEYS ||
14393 op->op == Transaction::OP_OMAP_RMKEYS ||
14394 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
14395 op->op == Transaction::OP_OMAP_SETHEADER))
14396 // -ENOENT is usually okay
14397 ok = true;
14398 if (r == -ENODATA)
14399 ok = true;
14400
14401 if (!ok) {
14402 const char *msg = "unexpected error code";
14403
14404 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
14405 op->op == Transaction::OP_CLONE ||
14406 op->op == Transaction::OP_CLONERANGE2))
14407 msg = "ENOENT on clone suggests osd bug";
14408
14409 if (r == -ENOSPC)
14410 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
14411 // by partially applying transactions.
14412 msg = "ENOSPC from bluestore, misconfigured cluster";
14413
14414 if (r == -ENOTEMPTY) {
14415 msg = "ENOTEMPTY suggests garbage data in osd data dir";
14416 }
14417
14418 derr << __func__ << " error " << cpp_strerror(r)
14419 << " not handled on operation " << op->op
14420 << " (op " << pos << ", counting from 0)"
14421 << dendl;
14422 derr << msg << dendl;
81eedcae 14423 _dump_transaction<0>(cct, t);
11fdf7f2 14424 ceph_abort_msg("unexpected error");
7c673cae
FG
14425 }
14426 }
14427 }
14428}
14429
14430
14431
14432// -----------------
14433// write operations
14434
14435int BlueStore::_touch(TransContext *txc,
14436 CollectionRef& c,
14437 OnodeRef &o)
14438{
14439 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14440 int r = 0;
7c673cae
FG
14441 _assign_nid(txc, o);
14442 txc->write_onode(o);
14443 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14444 return r;
14445}
14446
7c673cae
FG
14447void BlueStore::_pad_zeros(
14448 bufferlist *bl, uint64_t *offset,
14449 uint64_t chunk_size)
14450{
14451 auto length = bl->length();
14452 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
14453 << " chunk_size 0x" << chunk_size << std::dec << dendl;
14454 dout(40) << "before:\n";
14455 bl->hexdump(*_dout);
14456 *_dout << dendl;
14457 // front
14458 size_t front_pad = *offset % chunk_size;
14459 size_t back_pad = 0;
14460 size_t pad_count = 0;
14461 if (front_pad) {
11fdf7f2 14462 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
f67539c2 14463 bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
224ce89b 14464 z.zero(0, front_pad, false);
7c673cae 14465 pad_count += front_pad;
9f95a23c 14466 bl->begin().copy(front_copy, z.c_str() + front_pad);
7c673cae
FG
14467 if (front_copy + front_pad < chunk_size) {
14468 back_pad = chunk_size - (length + front_pad);
224ce89b 14469 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
14470 pad_count += back_pad;
14471 }
14472 bufferlist old, t;
14473 old.swap(*bl);
14474 t.substr_of(old, front_copy, length - front_copy);
14475 bl->append(z);
14476 bl->claim_append(t);
14477 *offset -= front_pad;
224ce89b 14478 length += pad_count;
7c673cae
FG
14479 }
14480
14481 // back
14482 uint64_t end = *offset + length;
14483 unsigned back_copy = end % chunk_size;
14484 if (back_copy) {
11fdf7f2 14485 ceph_assert(back_pad == 0);
7c673cae 14486 back_pad = chunk_size - back_copy;
11fdf7f2 14487 ceph_assert(back_copy <= length);
7c673cae 14488 bufferptr tail(chunk_size);
9f95a23c 14489 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
224ce89b 14490 tail.zero(back_copy, back_pad, false);
7c673cae
FG
14491 bufferlist old;
14492 old.swap(*bl);
14493 bl->substr_of(old, 0, length - back_copy);
14494 bl->append(tail);
14495 length += back_pad;
14496 pad_count += back_pad;
14497 }
14498 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
14499 << back_pad << " on front/back, now 0x" << *offset << "~"
14500 << length << std::dec << dendl;
14501 dout(40) << "after:\n";
14502 bl->hexdump(*_dout);
14503 *_dout << dendl;
14504 if (pad_count)
14505 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 14506 ceph_assert(bl->length() == length);
7c673cae
FG
14507}
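// _pad_zeros by example (illustrative numbers, assuming chunk_size 0x1000):
// a 0x500-byte write at *offset 0x1a00 becomes one 0x1000-byte chunk at
// offset 0x1000 -- 0xa00 zero bytes in front, the 0x500-byte payload, then
// 0x100 zero bytes behind -- so the caller can issue chunk-aligned,
// checksummable I/O.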
14508
14509void BlueStore::_do_write_small(
14510 TransContext *txc,
14511 CollectionRef &c,
14512 OnodeRef o,
14513 uint64_t offset, uint64_t length,
14514 bufferlist::iterator& blp,
14515 WriteContext *wctx)
14516{
14517 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
14518 << std::dec << dendl;
11fdf7f2 14519 ceph_assert(length < min_alloc_size);
f67539c2 14520
7c673cae
FG
14521 uint64_t end_offs = offset + length;
14522
14523 logger->inc(l_bluestore_write_small);
14524 logger->inc(l_bluestore_write_small_bytes, length);
14525
14526 bufferlist bl;
14527 blp.copy(length, bl);
14528
81eedcae
TL
14529 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
14530 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
14531 uint32_t alloc_len = min_alloc_size;
14532 auto offset0 = p2align<uint64_t>(offset, alloc_len);
14533
14534 bool any_change;
14535
 14536 // search for a suitable extent in both the forward and reverse directions
 14537 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
 14538 // range, then check whether the blob can be reused via the can_reuse_blob
 14539 // func or whether a direct/deferred write applies (the latter only for
 14540 // extents covering 'offset' or above).
14541 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
14542
20effc67 14543#ifdef HAVE_LIBZBD
f67539c2
TL
14544 // On zoned devices, the first goal is to support non-overwrite workloads,
 14545 // such as RGW, with large, aligned objects. Therefore, _do_write_small
 14546 // should not trigger for user writes. OSDs, however, write and update a tiny
14547 // amount of metadata, such as OSD maps, to disk. For those cases, we
14548 // temporarily just pad them to min_alloc_size and write them to a new place
14549 // on every update.
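 // For instance (an assumed illustration, not a fixed size): a 0x400-byte
 // osdmap update would be padded by _pad_zeros() to min_alloc_size (say
 // 0x10000 on an SMR drive), written to a brand-new blob, and the old
 // location left for zone cleaning to reclaim.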
14550 if (bdev->is_smr()) {
f67539c2
TL
14551 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
14552 uint64_t b_off0 = b_off;
f67539c2 14553 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
20effc67
TL
14554
14555 // Zero detection -- small block
14556 if (!bl.is_zero()) {
14557 BlobRef b = c->new_blob();
14558 _pad_zeros(&bl, &b_off0, min_alloc_size);
14559 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
14560 } else { // if (bl.is_zero())
14561 dout(20) << __func__ << " skip small zero block " << std::hex
14562 << " (0x" << b_off0 << "~" << bl.length() << ")"
14563 << " (0x" << b_off << "~" << length << ")"
14564 << std::dec << dendl;
14565 logger->inc(l_bluestore_write_small_skipped);
14566 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14567 }
14568
f67539c2
TL
14569 return;
14570 }
20effc67 14571#endif
f67539c2 14572
7c673cae
FG
14573 // Look for an existing mutable blob we can use.
14574 auto begin = o->extent_map.extent_map.begin();
14575 auto end = o->extent_map.extent_map.end();
14576 auto ep = o->extent_map.seek_lextent(offset);
14577 if (ep != begin) {
14578 --ep;
14579 if (ep->blob_end() <= offset) {
14580 ++ep;
14581 }
14582 }
f67539c2
TL
14583 auto prev_ep = end;
14584 if (ep != begin) {
14585 prev_ep = ep;
7c673cae 14586 --prev_ep;
7c673cae
FG
14587 }
14588
eafe8130
TL
14589 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
 14590 // We don't want more blobs than the number of min_alloc_size units
 14591 // that fit into 2 max-size blobs
14592 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
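 // e.g. (illustrative): max_blob_size 0x10000 and min_alloc_size 0x1000
 // give blob_threshold = 16 * 2 + 1 = 33 inspected blobs before the range
 // is queued for GC.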
14593 bool above_blob_threshold = false;
14594
14595 inspected_blobs.reserve(blob_threshold);
14596
14597 uint64_t max_off = 0;
14598 auto start_ep = ep;
14599 auto end_ep = ep; // exclusively
7c673cae
FG
14600 do {
14601 any_change = false;
14602
14603 if (ep != end && ep->logical_offset < offset + max_bsize) {
14604 BlobRef b = ep->blob;
eafe8130
TL
14605 if (!above_blob_threshold) {
14606 inspected_blobs.insert(&b->get_blob());
14607 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
14608 }
14609 max_off = ep->logical_end();
7c673cae 14610 auto bstart = ep->blob_start();
eafe8130 14611
7c673cae
FG
14612 dout(20) << __func__ << " considering " << *b
14613 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
14614 if (bstart >= end_offs) {
14615 dout(20) << __func__ << " ignoring distant " << *b << dendl;
14616 } else if (!b->get_blob().is_mutable()) {
14617 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
14618 } else if (ep->logical_offset % min_alloc_size !=
14619 ep->blob_offset % min_alloc_size) {
14620 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
14621 } else {
14622 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
14623 // can we pad our head/tail out with zeros?
14624 uint64_t head_pad, tail_pad;
11fdf7f2
TL
14625 head_pad = p2phase(offset, chunk_size);
14626 tail_pad = p2nphase(end_offs, chunk_size);
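 // e.g. (illustrative): with chunk_size 0x1000, offset 0x1234 and
 // end_offs 0x2345, head_pad = 0x1234 % 0x1000 = 0x234 (distance back to
 // the previous chunk boundary) and tail_pad = 0x1000 - 0x345 = 0xcbb
 // (distance forward to the next boundary).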
7c673cae
FG
14627 if (head_pad || tail_pad) {
14628 o->extent_map.fault_range(db, offset - head_pad,
14629 end_offs - offset + head_pad + tail_pad);
14630 }
14631 if (head_pad &&
a4b75251 14632 o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
7c673cae
FG
14633 head_pad = 0;
14634 }
14635 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
14636 tail_pad = 0;
14637 }
14638
14639 uint64_t b_off = offset - head_pad - bstart;
14640 uint64_t b_len = length + head_pad + tail_pad;
14641
14642 // direct write into unused blocks of an existing mutable blob?
14643 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
14644 b->get_blob().get_ondisk_length() >= b_off + b_len &&
14645 b->get_blob().is_unused(b_off, b_len) &&
14646 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 14647 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
14648
14649 dout(20) << __func__ << " write to unused 0x" << std::hex
14650 << b_off << "~" << b_len
14651 << " pad 0x" << head_pad << " + 0x" << tail_pad
14652 << std::dec << " of mutable " << *b << dendl;
224ce89b 14653 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
14654 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14655
11fdf7f2 14656 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 14657 if (b_len < prefer_deferred_size) {
7c673cae
FG
14658 dout(20) << __func__ << " deferring small 0x" << std::hex
14659 << b_len << std::dec << " unused write via deferred" << dendl;
522d829b 14660 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
7c673cae
FG
14661 op->op = bluestore_deferred_op_t::OP_WRITE;
14662 b->get_blob().map(
14663 b_off, b_len,
14664 [&](uint64_t offset, uint64_t length) {
14665 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14666 return 0;
14667 });
224ce89b 14668 op->data = bl;
7c673cae
FG
14669 } else {
14670 b->get_blob().map_bl(
224ce89b 14671 b_off, bl,
7c673cae
FG
14672 [&](uint64_t offset, bufferlist& t) {
14673 bdev->aio_write(offset, t,
14674 &txc->ioc, wctx->buffered);
14675 });
14676 }
14677 }
224ce89b 14678 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
14679 dout(20) << __func__ << " lex old " << *ep << dendl;
14680 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
14681 b,
14682 &wctx->old_extents);
14683 b->dirty_blob().mark_used(le->blob_offset, le->length);
f67539c2 14684
7c673cae
FG
14685 txc->statfs_delta.stored() += le->length;
14686 dout(20) << __func__ << " lex " << *le << dendl;
14687 logger->inc(l_bluestore_write_small_unused);
14688 return;
14689 }
14690 // read some data to fill out the chunk?
11fdf7f2
TL
14691 uint64_t head_read = p2phase(b_off, chunk_size);
14692 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
7c673cae
FG
14693 if ((head_read || tail_read) &&
14694 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
14695 head_read + tail_read < min_alloc_size) {
14696 b_off -= head_read;
14697 b_len += head_read + tail_read;
14698
14699 } else {
14700 head_read = tail_read = 0;
14701 }
14702
14703 // chunk-aligned deferred overwrite?
14704 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
14705 b_off % chunk_size == 0 &&
14706 b_len % chunk_size == 0 &&
14707 b->get_blob().is_allocated(b_off, b_len)) {
14708
224ce89b 14709 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
14710
14711 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
14712 << " and tail 0x" << tail_read << std::dec << dendl;
14713 if (head_read) {
14714 bufferlist head_bl;
14715 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
14716 head_bl, 0);
11fdf7f2 14717 ceph_assert(r >= 0 && r <= (int)head_read);
7c673cae
FG
14718 size_t zlen = head_read - r;
14719 if (zlen) {
14720 head_bl.append_zero(zlen);
14721 logger->inc(l_bluestore_write_pad_bytes, zlen);
14722 }
11fdf7f2
TL
14723 head_bl.claim_append(bl);
14724 bl.swap(head_bl);
7c673cae
FG
14725 logger->inc(l_bluestore_write_penalty_read_ops);
14726 }
14727 if (tail_read) {
14728 bufferlist tail_bl;
14729 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
14730 tail_bl, 0);
11fdf7f2 14731 ceph_assert(r >= 0 && r <= (int)tail_read);
7c673cae
FG
14732 size_t zlen = tail_read - r;
14733 if (zlen) {
14734 tail_bl.append_zero(zlen);
14735 logger->inc(l_bluestore_write_pad_bytes, zlen);
14736 }
224ce89b 14737 bl.claim_append(tail_bl);
7c673cae
FG
14738 logger->inc(l_bluestore_write_penalty_read_ops);
14739 }
f67539c2 14740 logger->inc(l_bluestore_write_small_pre_read);
7c673cae 14741
224ce89b 14742 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
14743 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14744
f67539c2 14745 b->dirty_blob().calc_csum(b_off, bl);
11fdf7f2
TL
14746
14747 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 14748 bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
11fdf7f2
TL
14749 op->op = bluestore_deferred_op_t::OP_WRITE;
14750 int r = b->get_blob().map(
14751 b_off, b_len,
14752 [&](uint64_t offset, uint64_t length) {
14753 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14754 return 0;
14755 });
14756 ceph_assert(r == 0);
f67539c2 14757 op->data = std::move(bl);
11fdf7f2
TL
14758 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
14759 << b_len << std::dec << " of mutable " << *b
14760 << " at " << op->extents << dendl;
14761 }
14762
7c673cae
FG
14763 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
14764 b, &wctx->old_extents);
14765 b->dirty_blob().mark_used(le->blob_offset, le->length);
14766 txc->statfs_delta.stored() += le->length;
14767 dout(20) << __func__ << " lex " << *le << dendl;
7c673cae
FG
14768 return;
14769 }
224ce89b
WB
14770 // try to reuse blob if we can
14771 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
14772 max_bsize,
14773 offset0 - bstart,
14774 &alloc_len)) {
11fdf7f2 14775 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
 14776 // to fit into the reused blob
 14777 // Need to check for pending writes that want to
 14778 // reuse the same pextent. The rationale is that during GC two chunks
 14779 // from garbage (compressed?) blobs can share logical space within the same
 14780 // AU, which in turn might be caused by an unaligned len in clone_range2.
 14781 // Hence the second write would fail when attempting to reuse the blob in
 14782 // do_alloc_write().
14783 if (!wctx->has_conflict(b,
14784 offset0,
14785 offset0 + alloc_len,
14786 min_alloc_size)) {
14787
14788 // we can't reuse pad_head/pad_tail since they might be truncated
 14789 // due to existing extents
14790 uint64_t b_off = offset - bstart;
14791 uint64_t b_off0 = b_off;
20effc67 14792 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
7c673cae 14793
20effc67
TL
14794 // Zero detection -- small block
14795 if (!bl.is_zero()) {
14796 _pad_zeros(&bl, &b_off0, chunk_size);
14797
14798 dout(20) << __func__ << " reuse blob " << *b << std::hex
14799 << " (0x" << b_off0 << "~" << bl.length() << ")"
14800 << " (0x" << b_off << "~" << length << ")"
14801 << std::dec << dendl;
14802
14803 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
14804 false, false);
14805 logger->inc(l_bluestore_write_small_unused);
14806 } else { // if (bl.is_zero())
14807 dout(20) << __func__ << " skip small zero block " << std::hex
14808 << " (0x" << b_off0 << "~" << bl.length() << ")"
14809 << " (0x" << b_off << "~" << length << ")"
14810 << std::dec << dendl;
14811 logger->inc(l_bluestore_write_small_skipped);
14812 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14813 }
7c673cae 14814
7c673cae
FG
14815 return;
14816 }
14817 }
14818 }
14819 ++ep;
eafe8130 14820 end_ep = ep;
7c673cae
FG
14821 any_change = true;
14822 } // if (ep != end && ep->logical_offset < offset + max_bsize)
14823
14824 // check extent for reuse in reverse order
14825 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
14826 BlobRef b = prev_ep->blob;
eafe8130
TL
14827 if (!above_blob_threshold) {
14828 inspected_blobs.insert(&b->get_blob());
14829 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
14830 }
14831 start_ep = prev_ep;
7c673cae
FG
14832 auto bstart = prev_ep->blob_start();
14833 dout(20) << __func__ << " considering " << *b
14834 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 14835 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
14836 max_bsize,
14837 offset0 - bstart,
14838 &alloc_len)) {
11fdf7f2 14839 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
 14840 // to fit into the reused blob
 14841 // Need to check for pending writes that want to
 14842 // reuse the same pextent. The rationale is that during GC two chunks
 14843 // from garbage (compressed?) blobs can share logical space within the same
 14844 // AU, which in turn might be caused by an unaligned len in clone_range2.
 14845 // Hence the second write would fail when attempting to reuse the blob in
 14846 // do_alloc_write().
14847 if (!wctx->has_conflict(b,
14848 offset0,
14849 offset0 + alloc_len,
14850 min_alloc_size)) {
14851
7c673cae
FG
14852 uint64_t b_off = offset - bstart;
14853 uint64_t b_off0 = b_off;
20effc67 14854 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
7c673cae 14855
20effc67
TL
14856 // Zero detection -- small block
14857 if (!bl.is_zero()) {
14858 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
14859 _pad_zeros(&bl, &b_off0, chunk_size);
14860
14861 dout(20) << __func__ << " reuse blob " << *b << std::hex
14862 << " (0x" << b_off0 << "~" << bl.length() << ")"
14863 << " (0x" << b_off << "~" << length << ")"
14864 << std::dec << dendl;
14865
14866 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
14867 false, false);
14868 logger->inc(l_bluestore_write_small_unused);
14869 } else { // if (bl.is_zero())
14870 dout(20) << __func__ << " skip small zero block " << std::hex
14871 << " (0x" << b_off0 << "~" << bl.length() << ")"
14872 << " (0x" << b_off << "~" << length << ")"
14873 << std::dec << dendl;
14874 logger->inc(l_bluestore_write_small_skipped);
14875 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14876 }
7c673cae 14877
7c673cae
FG
14878 return;
14879 }
14880 }
14881 if (prev_ep != begin) {
14882 --prev_ep;
14883 any_change = true;
14884 } else {
14885 prev_ep = end; // to avoid useless first extent re-check
14886 }
14887 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
14888 } while (any_change);
14889
eafe8130
TL
14890 if (above_blob_threshold) {
14891 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
14892 << " " << std::hex << min_off << "~" << max_off << std::dec
14893 << dendl;
14894 ceph_assert(start_ep != end_ep);
14895 for (auto ep = start_ep; ep != end_ep; ++ep) {
14896 dout(20) << __func__ << " inserting for GC "
14897 << std::hex << ep->logical_offset << "~" << ep->length
14898 << std::dec << dendl;
14899
14900 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
14901 }
14902 // insert newly written extent to GC
14903 wctx->extents_to_gc.union_insert(offset, length);
14904 dout(20) << __func__ << " inserting (last) for GC "
14905 << std::hex << offset << "~" << length
14906 << std::dec << dendl;
14907 }
11fdf7f2 14908 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
7c673cae 14909 uint64_t b_off0 = b_off;
7c673cae 14910 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
20effc67
TL
14911
14912 // Zero detection -- small block
14913 if (!bl.is_zero()) {
14914 // new blob.
14915 BlobRef b = c->new_blob();
14916 _pad_zeros(&bl, &b_off0, block_size);
14917 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
 14918 min_alloc_size != block_size, // use the 'unused' bitmap only when the
 14919 // alloc granularity differs from the disk block size
14920 true);
14921 } else { // if (bl.is_zero())
14922 dout(20) << __func__ << " skip small zero block " << std::hex
14923 << " (0x" << b_off0 << "~" << bl.length() << ")"
14924 << " (0x" << b_off << "~" << length << ")"
14925 << std::dec << dendl;
14926 logger->inc(l_bluestore_write_small_skipped);
14927 logger->inc(l_bluestore_write_small_skipped_bytes, length);
14928 }
7c673cae
FG
14929
14930 return;
14931}
14932
20effc67
TL
14933bool BlueStore::has_null_fm()
14934{
14935 return fm->is_null_manager();
14936}
14937
f67539c2
TL
14938bool BlueStore::BigDeferredWriteContext::can_defer(
14939 BlueStore::extent_map_t::iterator ep,
14940 uint64_t prefer_deferred_size,
14941 uint64_t block_size,
14942 uint64_t offset,
14943 uint64_t l)
14944{
14945 bool res = false;
14946 auto& blob = ep->blob->get_blob();
14947 if (offset >= ep->blob_start() &&
14948 blob.is_mutable()) {
14949 off = offset;
14950 b_off = offset - ep->blob_start();
14951 uint64_t chunk_size = blob.get_chunk_size(block_size);
14952 uint64_t ondisk = blob.get_ondisk_length();
14953 used = std::min(l, ondisk - b_off);
14954
14955 // will read some data to fill out the chunk?
14956 head_read = p2phase<uint64_t>(b_off, chunk_size);
14957 tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
14958 b_off -= head_read;
14959
14960 ceph_assert(b_off % chunk_size == 0);
14961 ceph_assert(blob_aligned_len() % chunk_size == 0);
14962
522d829b 14963 res = blob_aligned_len() < prefer_deferred_size &&
f67539c2
TL
14964 blob_aligned_len() <= ondisk &&
14965 blob.is_allocated(b_off, blob_aligned_len());
14966 if (res) {
14967 blob_ref = ep->blob;
14968 blob_start = ep->blob_start();
14969 }
14970 }
14971 return res;
14972}
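// can_defer() by example (illustrative, and assuming blob_aligned_len()
// returns head_read + used + tail_read): chunk_size 0x1000, blob_start 0,
// offset 0x1200, l 0x1c00 -> used 0x1c00, head_read 0x200, tail_read 0x200,
// so the chunk-aligned span is 0x1000~0x2000; the write is deferred only
// if that span is below prefer_deferred_size, within the ondisk length,
// and fully allocated.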
14973
14974bool BlueStore::BigDeferredWriteContext::apply_defer()
14975{
14976 int r = blob_ref->get_blob().map(
14977 b_off, blob_aligned_len(),
14978 [&](const bluestore_pextent_t& pext,
14979 uint64_t offset,
14980 uint64_t length) {
 14981 // apply the deferred path only if the overwrite breaks blob continuity;
 14982 // if it totally overlaps some pextent, fall back to a regular write
14983 if (pext.offset < offset ||
14984 pext.end() > offset + length) {
14985 res_extents.emplace_back(bluestore_pextent_t(offset, length));
14986 return 0;
14987 }
14988 return -1;
14989 });
14990 return r >= 0;
14991}
14992
14993void BlueStore::_do_write_big_apply_deferred(
14994 TransContext* txc,
14995 CollectionRef& c,
14996 OnodeRef o,
14997 BlueStore::BigDeferredWriteContext& dctx,
14998 bufferlist::iterator& blp,
14999 WriteContext* wctx)
15000{
15001 bufferlist bl;
15002 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
15003 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
15004 if (dctx.head_read) {
15005 int r = _do_read(c.get(), o,
15006 dctx.off - dctx.head_read,
15007 dctx.head_read,
15008 bl,
15009 0);
15010 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
15011 size_t zlen = dctx.head_read - r;
15012 if (zlen) {
15013 bl.append_zero(zlen);
15014 logger->inc(l_bluestore_write_pad_bytes, zlen);
15015 }
15016 logger->inc(l_bluestore_write_penalty_read_ops);
15017 }
15018 blp.copy(dctx.used, bl);
15019
15020 if (dctx.tail_read) {
15021 bufferlist tail_bl;
15022 int r = _do_read(c.get(), o,
15023 dctx.off + dctx.used, dctx.tail_read,
15024 tail_bl, 0);
15025 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
15026 size_t zlen = dctx.tail_read - r;
15027 if (zlen) {
15028 tail_bl.append_zero(zlen);
15029 logger->inc(l_bluestore_write_pad_bytes, zlen);
15030 }
15031 bl.claim_append(tail_bl);
15032 logger->inc(l_bluestore_write_penalty_read_ops);
15033 }
15034 auto& b0 = dctx.blob_ref;
15035 _buffer_cache_write(txc, b0, dctx.b_off, bl,
15036 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15037
15038 b0->dirty_blob().calc_csum(dctx.b_off, bl);
15039
15040 Extent* le = o->extent_map.set_lextent(c, dctx.off,
15041 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
15042
 15043 // in fact this is a no-op for big writes, but it is left here to maintain
 15044 // uniformity and to avoid being missed after some future refactor.
15045 b0->dirty_blob().mark_used(le->blob_offset, le->length);
15046 txc->statfs_delta.stored() += le->length;
15047
15048 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 15049 bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
f67539c2
TL
15050 op->op = bluestore_deferred_op_t::OP_WRITE;
15051 op->extents.swap(dctx.res_extents);
15052 op->data = std::move(bl);
15053 }
15054}
15055
7c673cae
FG
15056void BlueStore::_do_write_big(
15057 TransContext *txc,
15058 CollectionRef &c,
15059 OnodeRef o,
15060 uint64_t offset, uint64_t length,
15061 bufferlist::iterator& blp,
15062 WriteContext *wctx)
15063{
15064 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
15065 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
15066 << " compress " << (int)wctx->compress
15067 << dendl;
15068 logger->inc(l_bluestore_write_big);
15069 logger->inc(l_bluestore_write_big_bytes, length);
11fdf7f2 15070 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
f67539c2 15071 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
7c673cae
FG
15072 while (length > 0) {
15073 bool new_blob = false;
7c673cae
FG
15074 BlobRef b;
15075 uint32_t b_off = 0;
522d829b 15076 uint32_t l = 0;
7c673cae
FG
15077
15078 //attempting to reuse existing blob
15079 if (!wctx->compress) {
522d829b
TL
15080 // enforce target blob alignment with max_bsize
15081 l = max_bsize - p2phase(offset, max_bsize);
15082 l = std::min(uint64_t(l), length);
15083
7c673cae 15084 auto end = o->extent_map.extent_map.end();
f67539c2 15085
522d829b
TL
15086 dout(20) << __func__ << " may be defer: 0x" << std::hex
15087 << offset << "~" << l
15088 << std::dec << dendl;
15089
f67539c2
TL
15090 if (prefer_deferred_size_snapshot &&
15091 l <= prefer_deferred_size_snapshot * 2) {
 15092 // A single write that spans two adjacent existing blobs can result
 15093 // in up to two deferred blocks of 'prefer_deferred_size'.
 15094 // So we try to minimize the number of resulting blobs
 15095 // and preserve the existing 2 blobs rather than inserting one more in between.
 15096 // E.g. a write of 0x10000~20000 over existing blobs
 15097 // (0x0~20000 and 0x20000~20000) does better (from a subsequent read
 15098 // performance point of view) to result in two deferred writes to the
 15099 // existing blobs than in 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
15100
15101 // look for an existing mutable blob we can write into
15102 auto ep = o->extent_map.seek_lextent(offset);
15103 auto ep_next = end;
15104 BigDeferredWriteContext head_info, tail_info;
15105
15106 bool will_defer = ep != end ?
15107 head_info.can_defer(ep,
15108 prefer_deferred_size_snapshot,
15109 block_size,
15110 offset,
15111 l) :
15112 false;
15113 auto offset_next = offset + head_info.used;
15114 auto remaining = l - head_info.used;
15115 if (will_defer && remaining) {
15116 will_defer = false;
15117 if (remaining <= prefer_deferred_size_snapshot) {
15118 ep_next = o->extent_map.seek_lextent(offset_next);
15119 // check if we can defer remaining totally
15120 will_defer = ep_next == end ?
15121 false :
15122 tail_info.can_defer(ep_next,
15123 prefer_deferred_size_snapshot,
15124 block_size,
15125 offset_next,
15126 remaining);
15127 will_defer = will_defer && remaining == tail_info.used;
15128 }
15129 }
15130 if (will_defer) {
15131 dout(20) << __func__ << " " << *(head_info.blob_ref)
15132 << " deferring big " << std::hex
15133 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
15134 << std::dec << " write via deferred"
15135 << dendl;
15136 if (remaining) {
15137 dout(20) << __func__ << " " << *(tail_info.blob_ref)
15138 << " deferring big " << std::hex
15139 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
15140 << std::dec << " write via deferred"
15141 << dendl;
15142 }
15143
15144 will_defer = head_info.apply_defer();
15145 if (!will_defer) {
15146 dout(20) << __func__
15147 << " deferring big fell back, head isn't continuous"
15148 << dendl;
15149 } else if (remaining) {
15150 will_defer = tail_info.apply_defer();
15151 if (!will_defer) {
15152 dout(20) << __func__
15153 << " deferring big fell back, tail isn't continuous"
15154 << dendl;
15155 }
15156 }
15157 }
15158 if (will_defer) {
15159 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
15160 if (remaining) {
15161 _do_write_big_apply_deferred(txc, c, o, tail_info,
15162 blp, wctx);
15163 }
522d829b
TL
15164 dout(20) << __func__ << " defer big: 0x" << std::hex
15165 << offset << "~" << l
15166 << std::dec << dendl;
f67539c2
TL
15167 offset += l;
15168 length -= l;
15169 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
15170 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
15171 continue;
15172 }
15173 }
522d829b 15174 dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
f67539c2
TL
15175
15176 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15177
15178 // seek again as punch_hole could invalidate ep
7c673cae 15179 auto ep = o->extent_map.seek_lextent(offset);
f67539c2
TL
15180 auto begin = o->extent_map.extent_map.begin();
15181 auto prev_ep = end;
15182 if (ep != begin) {
15183 prev_ep = ep;
7c673cae 15184 --prev_ep;
7c673cae 15185 }
f67539c2 15186
7c673cae
FG
15187 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
15188 // search suitable extent in both forward and reverse direction in
15189 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 15190 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
15191 bool any_change;
15192 do {
15193 any_change = false;
15194 if (ep != end && ep->logical_offset < offset + max_bsize) {
522d829b
TL
15195 dout(20) << __func__ << " considering " << *ep
15196 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
f67539c2
TL
15197
15198 if (offset >= ep->blob_start() &&
224ce89b 15199 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
15200 offset - ep->blob_start(),
15201 &l)) {
15202 b = ep->blob;
f67539c2 15203 b_off = offset - ep->blob_start();
7c673cae
FG
15204 prev_ep = end; // to avoid check below
15205 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 15206 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
15207 } else {
15208 ++ep;
15209 any_change = true;
15210 }
15211 }
15212
15213 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
522d829b
TL
15214 dout(20) << __func__ << " considering rev " << *prev_ep
15215 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
f67539c2 15216 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
15217 offset - prev_ep->blob_start(),
15218 &l)) {
15219 b = prev_ep->blob;
15220 b_off = offset - prev_ep->blob_start();
15221 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 15222 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
15223 } else if (prev_ep != begin) {
15224 --prev_ep;
15225 any_change = true;
15226 } else {
15227 prev_ep = end; // to avoid useless first extent re-check
15228 }
15229 }
15230 } while (b == nullptr && any_change);
f67539c2 15231 } else {
522d829b
TL
 15232 // try to use as long a chunk as permitted in the compression case.
15233 l = std::min(max_bsize, length);
f67539c2
TL
15234 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
15235 } // if (!wctx->compress)
15236
7c673cae
FG
15237 if (b == nullptr) {
15238 b = c->new_blob();
15239 b_off = 0;
15240 new_blob = true;
15241 }
7c673cae
FG
15242 bufferlist t;
15243 blp.copy(l, t);
20effc67
TL
15244
15245 // Zero detection -- big block
15246 if (!t.is_zero()) {
15247 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
15248
15249 dout(20) << __func__ << " schedule write big: 0x"
522d829b
TL
15250 << std::hex << offset << "~" << l << std::dec
15251 << (new_blob ? " new " : " reuse ")
15252 << *b << dendl;
20effc67
TL
15253
15254 logger->inc(l_bluestore_write_big_blobs);
15255 } else { // if (!t.is_zero())
15256 dout(20) << __func__ << " skip big zero block " << std::hex
15257 << " (0x" << b_off << "~" << t.length() << ")"
15258 << " (0x" << b_off << "~" << l << ")"
15259 << std::dec << dendl;
15260 logger->inc(l_bluestore_write_big_skipped_blobs);
15261 logger->inc(l_bluestore_write_big_skipped_bytes, l);
15262 }
15263
7c673cae
FG
15264 offset += l;
15265 length -= l;
7c673cae
FG
15266 }
15267}
15268
15269int BlueStore::_do_alloc_write(
15270 TransContext *txc,
15271 CollectionRef coll,
15272 OnodeRef o,
15273 WriteContext *wctx)
15274{
15275 dout(20) << __func__ << " txc " << txc
15276 << " " << wctx->writes.size() << " blobs"
15277 << dendl;
3efd9988
FG
15278 if (wctx->writes.empty()) {
15279 return 0;
7c673cae
FG
15280 }
15281
7c673cae
FG
15282 CompressorRef c;
15283 double crr = 0;
15284 if (wctx->compress) {
15285 c = select_option(
15286 "compression_algorithm",
15287 compressor,
15288 [&]() {
15289 string val;
15290 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
15291 CompressorRef cp = compressor;
15292 if (!cp || cp->get_type_name() != val) {
15293 cp = Compressor::create(cct, val);
11fdf7f2
TL
15294 if (!cp) {
15295 if (_set_compression_alert(false, val.c_str())) {
15296 derr << __func__ << " unable to initialize " << val.c_str()
15297 << " compressor" << dendl;
15298 }
15299 }
7c673cae
FG
15300 }
15301 return boost::optional<CompressorRef>(cp);
15302 }
15303 return boost::optional<CompressorRef>();
15304 }
15305 );
15306
15307 crr = select_option(
15308 "compression_required_ratio",
15309 cct->_conf->bluestore_compression_required_ratio,
15310 [&]() {
15311 double val;
3efd9988 15312 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
15313 return boost::optional<double>(val);
15314 }
15315 return boost::optional<double>();
15316 }
15317 );
15318 }
15319
15320 // checksum
11fdf7f2 15321 int64_t csum = csum_type.load();
7c673cae
FG
15322 csum = select_option(
15323 "csum_type",
15324 csum,
15325 [&]() {
11fdf7f2 15326 int64_t val;
3efd9988 15327 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 15328 return boost::optional<int64_t>(val);
7c673cae 15329 }
11fdf7f2 15330 return boost::optional<int64_t>();
7c673cae
FG
15331 }
15332 );
15333
3efd9988
FG
15334 // compress (as needed) and calc needed space
15335 uint64_t need = 0;
11fdf7f2 15336 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 15337 for (auto& wi : wctx->writes) {
3efd9988 15338 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 15339 auto start = mono_clock::now();
7c673cae
FG
15340
15341 // compress
11fdf7f2
TL
15342 ceph_assert(wi.b_off == 0);
15343 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 15344
7c673cae
FG
15345 // FIXME: memory alignment here is bad
15346 bufferlist t;
f67539c2
TL
15347 boost::optional<int32_t> compressor_message;
15348 int r = c->compress(wi.bl, t, compressor_message);
3efd9988 15349 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 15350 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
a8e16298
TL
15351 bool rejected = false;
15352 uint64_t compressed_len = t.length();
15353 // do an approximate (fast) estimate of the resulting blob size
15354 // that doesn't take header overhead into account
11fdf7f2 15355 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
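      // for illustration: with min_alloc_size = 0x1000 and crr = 0.875, a
      // 0x10000 blob must compress (header included) to <= 0xe000 after
      // rounding up to min_alloc_size; e.g. 0xd800 rounds to 0xe000 and
      // is accepted, while 0xe100 rounds to 0xf000 and is rejected.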
a8e16298
TL
15356 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
15357 bluestore_compression_header_t chdr;
15358 chdr.type = c->get_type();
15359 chdr.length = t.length();
f67539c2 15360 chdr.compressor_message = compressor_message;
a8e16298
TL
15361 encode(chdr, wi.compressed_bl);
15362 wi.compressed_bl.claim_append(t);
15363
15364 compressed_len = wi.compressed_bl.length();
11fdf7f2 15365 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
15366 if (result_len <= want_len && result_len < wi.blob_length) {
15367 // Cool. We compressed at least as much as we were hoping to.
15368 // pad out to min_alloc_size
15369 wi.compressed_bl.append_zero(result_len - compressed_len);
15370 wi.compressed_len = compressed_len;
15371 wi.compressed = true;
15372 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
15373 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
15374 << " -> 0x" << compressed_len << " => 0x" << result_len
15375 << " with " << c->get_type()
15376 << std::dec << dendl;
15377 txc->statfs_delta.compressed() += compressed_len;
15378 txc->statfs_delta.compressed_original() += wi.blob_length;
15379 txc->statfs_delta.compressed_allocated() += result_len;
15380 logger->inc(l_bluestore_compress_success_count);
15381 need += result_len;
15382 } else {
15383 rejected = true;
15384 }
15385 } else if (r != 0) {
15386 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
15387 << " bytes compressed using " << c->get_type_name()
15388 << std::dec
15389 << " failed with errcode = " << r
15390 << ", leaving uncompressed"
15391 << dendl;
15392 logger->inc(l_bluestore_compress_rejected_count);
15393 need += wi.blob_length;
7c673cae 15394 } else {
a8e16298
TL
15395 rejected = true;
15396 }
15397
15398 if (rejected) {
3efd9988 15399 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 15400 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
15401 << " with " << c->get_type()
15402 << ", which is more than required 0x" << want_len_raw
7c673cae 15403 << " -> 0x" << want_len
3efd9988
FG
15404 << ", leaving uncompressed"
15405 << std::dec << dendl;
15406 logger->inc(l_bluestore_compress_rejected_count);
15407 need += wi.blob_length;
7c673cae 15408 }
494da23a
TL
15409 log_latency("compress@_do_alloc_write",
15410 l_bluestore_compress_lat,
15411 mono_clock::now() - start,
15412 cct->_conf->bluestore_log_op_age);
3efd9988
FG
15413 } else {
15414 need += wi.blob_length;
7c673cae 15415 }
3efd9988 15416 }
a8e16298 15417 PExtentVector prealloc;
3efd9988 15418 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 15419 int64_t prealloc_left = 0;
20effc67 15420 prealloc_left = alloc->allocate(
3efd9988
FG
15421 need, min_alloc_size, need,
15422 0, &prealloc);
eafe8130 15423 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 15424 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 15425 << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2 15426 << " min_alloc_size 0x" << min_alloc_size
20effc67 15427 << " available 0x" << alloc->get_free()
11fdf7f2
TL
15428 << std::dec << dendl;
15429 if (prealloc.size()) {
20effc67 15430 alloc->release(prealloc);
11fdf7f2 15431 }
a8e16298
TL
15432 return -ENOSPC;
15433 }
20effc67 15434 _collect_allocation_stats(need, min_alloc_size, prealloc);
f67539c2 15435
3efd9988
FG
15436 dout(20) << __func__ << " prealloc " << prealloc << dendl;
15437 auto prealloc_pos = prealloc.begin();
522d829b
TL
15438 ceph_assert(prealloc_pos != prealloc.end());
15439 uint64_t prealloc_pos_length = prealloc_pos->length;
3efd9988
FG
15440
15441 for (auto& wi : wctx->writes) {
522d829b 15442 bluestore_blob_t& dblob = wi.b->dirty_blob();
3efd9988
FG
15443 uint64_t b_off = wi.b_off;
15444 bufferlist *l = &wi.bl;
15445 uint64_t final_length = wi.blob_length;
15446 uint64_t csum_length = wi.blob_length;
3efd9988
FG
15447 if (wi.compressed) {
15448 final_length = wi.compressed_bl.length();
15449 csum_length = final_length;
adb31ebb 15450 unsigned csum_order = ctz(csum_length);
3efd9988
FG
15451 l = &wi.compressed_bl;
15452 dblob.set_compressed(wi.blob_length, wi.compressed_len);
adb31ebb 15453 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
15454 dout(20) << __func__
15455 << " initialize csum setting for compressed blob " << *wi.b
adb31ebb
TL
15456 << " csum_type " << Checksummer::get_csum_type_string(csum)
15457 << " csum_order " << csum_order
15458 << " csum_length 0x" << std::hex << csum_length
15459 << " blob_length 0x" << wi.blob_length
15460 << " compressed_length 0x" << wi.compressed_len << std::dec
15461 << dendl;
15462 dblob.init_csum(csum, csum_order, csum_length);
15463 }
3efd9988 15464 } else if (wi.new_blob) {
adb31ebb 15465 unsigned csum_order;
7c673cae 15466 // initialize newly created blob only
11fdf7f2 15467 ceph_assert(dblob.is_mutable());
7c673cae
FG
15468 if (l->length() != wi.blob_length) {
15469 // hrm, maybe we could do better here, but let's not bother.
15470 dout(20) << __func__ << " forcing csum_order to block_size_order "
15471 << block_size_order << dendl;
31f18b77 15472 csum_order = block_size_order;
7c673cae
FG
15473 } else {
15474 csum_order = std::min(wctx->csum_order, ctz(l->length()));
15475 }
15476 // try to align blob with max_blob_size to improve
15477 // its reuse ratio, e.g. in case of reverse write
15478 uint32_t suggested_boff =
15479 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
15480 if ((suggested_boff % (1 << csum_order)) == 0 &&
15481 suggested_boff + final_length <= max_bsize &&
15482 suggested_boff > b_off) {
181888fb 15483 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 15484 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 15485 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
15486 csum_length += suggested_boff - b_off;
15487 b_off = suggested_boff;
15488 }
181888fb 15489 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
15490 dout(20) << __func__
15491 << " initialize csum setting for new blob " << *wi.b
181888fb
FG
15492 << " csum_type " << Checksummer::get_csum_type_string(csum)
15493 << " csum_order " << csum_order
15494 << " csum_length 0x" << std::hex << csum_length << std::dec
15495 << dendl;
15496 dblob.init_csum(csum, csum_order, csum_length);
15497 }
7c673cae
FG
15498 }
15499
a8e16298 15500 PExtentVector extents;
3efd9988 15501 int64_t left = final_length;
522d829b
TL
15502 bool has_chunk2defer = false;
15503 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
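      // carve this blob's extents out of the shared prealloc vector:
      // whole chunks are consumed as-is, while the final chunk is split
      // and its remainder is left for the next blob in the batch.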
3efd9988 15504 while (left > 0) {
11fdf7f2 15505 ceph_assert(prealloc_left > 0);
522d829b 15506 has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
3efd9988
FG
15507 if (prealloc_pos->length <= left) {
15508 prealloc_left -= prealloc_pos->length;
15509 left -= prealloc_pos->length;
15510 txc->statfs_delta.allocated() += prealloc_pos->length;
15511 extents.push_back(*prealloc_pos);
15512 ++prealloc_pos;
522d829b
TL
15513 if (prealloc_pos != prealloc.end()) {
15514 prealloc_pos_length = prealloc_pos->length;
15515 }
3efd9988
FG
15516 } else {
15517 extents.emplace_back(prealloc_pos->offset, left);
15518 prealloc_pos->offset += left;
15519 prealloc_pos->length -= left;
15520 prealloc_left -= left;
15521 txc->statfs_delta.allocated() += left;
15522 left = 0;
15523 break;
15524 }
15525 }
7c673cae 15526 for (auto& p : extents) {
3efd9988 15527 txc->allocated.insert(p.offset, p.length);
7c673cae 15528 }
11fdf7f2 15529 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 15530
522d829b 15531 dout(20) << __func__ << " blob " << *wi.b << dendl;
181888fb 15532 if (dblob.has_csum()) {
7c673cae
FG
15533 dblob.calc_csum(b_off, *l);
15534 }
181888fb 15535
7c673cae 15536 if (wi.mark_unused) {
1911f103 15537 ceph_assert(!dblob.is_compressed());
7c673cae
FG
15538 auto b_end = b_off + wi.bl.length();
15539 if (b_off) {
15540 dblob.add_unused(0, b_off);
15541 }
1911f103
TL
15542 uint64_t llen = dblob.get_logical_length();
15543 if (b_end < llen) {
15544 dblob.add_unused(b_end, llen - b_end);
7c673cae
FG
15545 }
15546 }
15547
15548 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
15549 b_off + (wi.b_off0 - wi.b_off),
15550 wi.length0,
15551 wi.b,
15552 nullptr);
15553 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
15554 txc->statfs_delta.stored() += le->length;
15555 dout(20) << __func__ << " lex " << *le << dendl;
15556 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
15557 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
15558
15559 // queue io
11fdf7f2 15560 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 15561 if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
f67539c2 15562 dout(20) << __func__ << " deferring 0x" << std::hex
7c673cae 15563 << l->length() << std::dec << " write via deferred" << dendl;
522d829b 15564 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
7c673cae 15565 op->op = bluestore_deferred_op_t::OP_WRITE;
522d829b 15566 int r = wi.b->get_blob().map(
7c673cae
FG
15567 b_off, l->length(),
15568 [&](uint64_t offset, uint64_t length) {
15569 op->extents.emplace_back(bluestore_pextent_t(offset, length));
15570 return 0;
15571 });
11fdf7f2 15572 ceph_assert(r == 0);
7c673cae
FG
15573 op->data = *l;
15574 } else {
522d829b 15575 wi.b->get_blob().map_bl(
7c673cae
FG
15576 b_off, *l,
15577 [&](uint64_t offset, bufferlist& t) {
15578 bdev->aio_write(offset, t, &txc->ioc, false);
15579 });
f67539c2 15580 logger->inc(l_bluestore_write_new);
7c673cae
FG
15581 }
15582 }
15583 }
11fdf7f2
TL
15584 ceph_assert(prealloc_pos == prealloc.end());
15585 ceph_assert(prealloc_left == 0);
7c673cae
FG
15586 return 0;
15587}
15588
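// _wctx_finish: drop the old extents displaced by a write or punch --
// update statfs counters, release any space that is no longer
// referenced, un-share shared blobs whose references have drained,
// prune empty spanning blobs, and (with HAVE_LIBZBD on SMR devices)
// retire per-zone offset refs once a zone holds no valid extents.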
15589void BlueStore::_wctx_finish(
15590 TransContext *txc,
15591 CollectionRef& c,
15592 OnodeRef o,
31f18b77
FG
15593 WriteContext *wctx,
15594 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae 15595{
20effc67
TL
15596#ifdef HAVE_LIBZBD
15597 if (bdev->is_smr()) {
15598 for (auto& w : wctx->writes) {
15599 for (auto& e : w.b->get_blob().get_extents()) {
15600 if (!e.is_valid()) {
15601 continue;
15602 }
15603 uint32_t zone = e.offset / zone_size;
15604 if (!o->onode.zone_offset_refs.count(zone)) {
15605 uint64_t zoff = e.offset % zone_size;
15606 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
15607 << " offset 0x" << zoff << std::dec << dendl;
15608 txc->note_write_zone_offset(o, zone, zoff);
15609 }
15610 }
15611 }
15612 }
15613 set<uint32_t> zones_with_releases;
15614#endif
15615
7c673cae
FG
15616 auto oep = wctx->old_extents.begin();
15617 while (oep != wctx->old_extents.end()) {
15618 auto &lo = *oep;
15619 oep = wctx->old_extents.erase(oep);
15620 dout(20) << __func__ << " lex_old " << lo.e << dendl;
15621 BlobRef b = lo.e.blob;
15622 const bluestore_blob_t& blob = b->get_blob();
15623 if (blob.is_compressed()) {
15624 if (lo.blob_empty) {
15625 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
15626 }
15627 txc->statfs_delta.compressed_original() -= lo.e.length;
15628 }
15629 auto& r = lo.r;
15630 txc->statfs_delta.stored() -= lo.e.length;
15631 if (!r.empty()) {
f67539c2 15632 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
7c673cae
FG
15633 if (blob.is_shared()) {
15634 PExtentVector final;
15635 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
15636 bool unshare = false;
15637 bool* unshare_ptr =
15638 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 15639 for (auto e : r) {
31f18b77
FG
15640 b->shared_blob->put_ref(
15641 e.offset, e.length, &final,
11fdf7f2 15642 unshare_ptr);
20effc67
TL
15643#ifdef HAVE_LIBZBD
15644 // we also drop zone ref for shared blob extents
15645 if (bdev->is_smr() && e.is_valid()) {
15646 zones_with_releases.insert(e.offset / zone_size);
15647 }
15648#endif
11fdf7f2
TL
15649 }
15650 if (unshare) {
15651 ceph_assert(maybe_unshared_blobs);
15652 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
15653 }
15654 dout(20) << __func__ << " shared_blob release " << final
15655 << " from " << *b->shared_blob << dendl;
15656 txc->write_shared_blob(b->shared_blob);
15657 r.clear();
15658 r.swap(final);
15659 }
15660 }
15661 // we can't invalidate our logical extents as we drop them because
15662 // other lextents (either in our onode or others) may still
15663 // reference them. but we can throw out anything that is no
15664 // longer allocated. Note that this will leave behind edge bits
15665 // that are no longer referenced but not deallocated (until they
15666 // age out of the cache naturally).
15667 b->discard_unallocated(c.get());
15668 for (auto e : r) {
15669 dout(20) << __func__ << " release " << e << dendl;
15670 txc->released.insert(e.offset, e.length);
15671 txc->statfs_delta.allocated() -= e.length;
15672 if (blob.is_compressed()) {
15673 txc->statfs_delta.compressed_allocated() -= e.length;
15674 }
20effc67
TL
15675#ifdef HAVE_LIBZBD
15676 if (bdev->is_smr() && e.is_valid()) {
15677 zones_with_releases.insert(e.offset / zone_size);
15678 }
15679#endif
7c673cae 15680 }
9f95a23c
TL
15681
15682 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
7c673cae
FG
15683 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
15684 << dendl;
15685 o->extent_map.spanning_blob_map.erase(b->id);
15686 }
9f95a23c 15687 delete &lo;
7c673cae 15688 }
20effc67
TL
15689
15690#ifdef HAVE_LIBZBD
15691 if (!zones_with_releases.empty()) {
15692 // we need to fault the entire extent range in here to determine if we've dropped
15693 // all refs to a zone.
15694 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
15695 for (auto& b : o->extent_map.extent_map) {
15696 for (auto& e : b.blob->get_blob().get_extents()) {
15697 if (e.is_valid()) {
15698 zones_with_releases.erase(e.offset / zone_size);
15699 }
15700 }
15701 }
15702 for (auto zone : zones_with_releases) {
15703 auto p = o->onode.zone_offset_refs.find(zone);
15704 if (p != o->onode.zone_offset_refs.end()) {
15705 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
15706 << " offset 0x" << p->second << std::dec << dendl;
15707 txc->note_release_zone_offset(o, zone, p->second);
15708 }
15709 }
15710 }
15711#endif
7c673cae
FG
15712}
15713
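// _do_write_data: a write contained within a single min_alloc_size
// block (unless it is exactly one whole block) goes straight to the
// small-write path; otherwise it is split into an unaligned head, an
// aligned middle (big-write path) and an unaligned tail. For
// illustration, with min_alloc_size = 0x1000 a write at offset 0x1800
// of length 0x3000 becomes head 0x1800~0x800, middle 0x2000~0x2000 and
// tail 0x4000~0x800.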
15714void BlueStore::_do_write_data(
15715 TransContext *txc,
15716 CollectionRef& c,
15717 OnodeRef o,
15718 uint64_t offset,
15719 uint64_t length,
15720 bufferlist& bl,
15721 WriteContext *wctx)
15722{
15723 uint64_t end = offset + length;
15724 bufferlist::iterator p = bl.begin();
15725
15726 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
15727 (length != min_alloc_size)) {
15728 // we fall within the same block
15729 _do_write_small(txc, c, o, offset, length, p, wctx);
15730 } else {
15731 uint64_t head_offset, head_length;
15732 uint64_t middle_offset, middle_length;
15733 uint64_t tail_offset, tail_length;
15734
15735 head_offset = offset;
11fdf7f2 15736 head_length = p2nphase(offset, min_alloc_size);
7c673cae 15737
11fdf7f2
TL
15738 tail_offset = p2align(end, min_alloc_size);
15739 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
15740
15741 middle_offset = head_offset + head_length;
15742 middle_length = length - head_length - tail_length;
15743
15744 if (head_length) {
15745 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
15746 }
15747
f67539c2 15748 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
7c673cae
FG
15749
15750 if (tail_length) {
15751 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
15752 }
15753 }
15754}
15755
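// _choose_write_options: decide buffered vs direct IO from the fadvise
// flags, derive the compression mode from pool options and allocation
// hints, and pick csum_order and target_blob_size accordingly.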
31f18b77
FG
15756void BlueStore::_choose_write_options(
15757 CollectionRef& c,
15758 OnodeRef o,
15759 uint32_t fadvise_flags,
15760 WriteContext *wctx)
7c673cae 15761{
7c673cae
FG
15762 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
15763 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 15764 wctx->buffered = true;
7c673cae
FG
15765 } else if (cct->_conf->bluestore_default_buffered_write &&
15766 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
15767 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
15768 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 15769 wctx->buffered = true;
7c673cae
FG
15770 }
15771
31f18b77
FG
15772 // apply basic csum block size
15773 wctx->csum_order = block_size_order;
7c673cae
FG
15774
15775 // compression parameters
15776 unsigned alloc_hints = o->onode.alloc_hint_flags;
15777 auto cm = select_option(
15778 "compression_mode",
31f18b77 15779 comp_mode.load(),
7c673cae
FG
15780 [&]() {
15781 string val;
11fdf7f2 15782 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
15783 return boost::optional<Compressor::CompressionMode>(
15784 Compressor::get_comp_mode_type(val));
7c673cae
FG
15785 }
15786 return boost::optional<Compressor::CompressionMode>();
15787 }
15788 );
31f18b77
FG
15789
15790 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
15791 ((cm == Compressor::COMP_FORCE) ||
15792 (cm == Compressor::COMP_AGGRESSIVE &&
15793 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
15794 (cm == Compressor::COMP_PASSIVE &&
15795 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
15796
15797 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
15798 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
15799 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
15800 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 15801 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 15802
7c673cae 15803 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 15804
7c673cae 15805 if (o->onode.expected_write_size) {
224ce89b 15806 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 15807 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 15808 } else {
224ce89b 15809 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
15810 }
15811
31f18b77
FG
15812 if (wctx->compress) {
15813 wctx->target_blob_size = select_option(
7c673cae 15814 "compression_max_blob_size",
31f18b77 15815 comp_max_blob_size.load(),
7c673cae 15816 [&]() {
11fdf7f2
TL
15817 int64_t val;
15818 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
15819 return boost::optional<uint64_t>((uint64_t)val);
15820 }
15821 return boost::optional<uint64_t>();
15822 }
15823 );
15824 }
15825 } else {
31f18b77
FG
15826 if (wctx->compress) {
15827 wctx->target_blob_size = select_option(
7c673cae 15828 "compression_min_blob_size",
31f18b77 15829 comp_min_blob_size.load(),
7c673cae 15830 [&]() {
11fdf7f2
TL
15831 int64_t val;
15832 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
15833 return boost::optional<uint64_t>((uint64_t)val);
15834 }
15835 return boost::optional<uint64_t>();
15836 }
15837 );
15838 }
15839 }
31f18b77 15840
7c673cae 15841 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
15842 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
15843 wctx->target_blob_size = max_bsize;
7c673cae 15844 }
31f18b77 15845
7c673cae
FG
15846 // set the min blob size floor at 2x the min_alloc_size, or else we
15847 // won't be able to allocate a smaller extent for the compressed
15848 // data.
31f18b77
FG
15849 if (wctx->compress &&
15850 wctx->target_blob_size < min_alloc_size * 2) {
15851 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 15852 }
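    // for illustration: with min_alloc_size = 0x1000, a 0x1000 blob
    // could never occupy fewer than one allocation unit after rounding,
    // so compression targets below 0x2000 are raised to 0x2000.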
31f18b77
FG
15853
15854 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
15855 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
15856 << " compress=" << (int)wctx->compress
15857 << " buffered=" << (int)wctx->buffered
31f18b77
FG
15858 << std::dec << dendl;
15859}
15860
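// _do_gc: read back the extent ranges flagged for garbage collection,
// rewrite them through a cloned WriteContext, and widen the caller's
// dirty range to cover whatever was re-written.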
15861int BlueStore::_do_gc(
15862 TransContext *txc,
15863 CollectionRef& c,
15864 OnodeRef o,
31f18b77
FG
15865 const WriteContext& wctx,
15866 uint64_t *dirty_start,
15867 uint64_t *dirty_end)
15868{
31f18b77 15869
1adf2230 15870 bool dirty_range_updated = false;
31f18b77 15871 WriteContext wctx_gc;
7c673cae 15872 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 15873
eafe8130 15874 auto & extents_to_collect = wctx.extents_to_gc;
31f18b77
FG
15875 for (auto it = extents_to_collect.begin();
15876 it != extents_to_collect.end();
15877 ++it) {
15878 bufferlist bl;
eafe8130
TL
15879 auto offset = (*it).first;
15880 auto length = (*it).second;
15881 dout(20) << __func__ << " processing " << std::hex
15882 << offset << "~" << length << std::dec
15883 << dendl;
15884 int r = _do_read(c.get(), o, offset, length, bl, 0);
15885 ceph_assert(r == (int)length);
31f18b77 15886
eafe8130
TL
15887 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
15888 logger->inc(l_bluestore_gc_merged, length);
31f18b77 15889
eafe8130
TL
15890 if (*dirty_start > offset) {
15891 *dirty_start = offset;
1adf2230 15892 dirty_range_updated = true;
31f18b77
FG
15893 }
15894
eafe8130
TL
15895 if (*dirty_end < offset + length) {
15896 *dirty_end = offset + length;
1adf2230 15897 dirty_range_updated = true;
31f18b77
FG
15898 }
15899 }
1adf2230
AA
15900 if (dirty_range_updated) {
15901 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
15902 }
31f18b77
FG
15903
15904 dout(30) << __func__ << " alloc write" << dendl;
15905 int r = _do_alloc_write(txc, c, o, &wctx_gc);
15906 if (r < 0) {
15907 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
15908 << dendl;
15909 return r;
15910 }
15911
15912 _wctx_finish(txc, c, o, &wctx_gc);
15913 return 0;
15914}
15915
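// _do_write: top-level write pipeline -- choose write options, fault in
// the affected extent range, stage the data (_do_write_data), allocate
// and queue IO (_do_alloc_write), estimate garbage-collection benefit
// before _wctx_finish() empties old_extents, run _do_gc if worthwhile,
// then compress and dirty the extent map over the dirty range.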
15916int BlueStore::_do_write(
15917 TransContext *txc,
15918 CollectionRef& c,
15919 OnodeRef o,
15920 uint64_t offset,
15921 uint64_t length,
15922 bufferlist& bl,
15923 uint32_t fadvise_flags)
15924{
15925 int r = 0;
15926
15927 dout(20) << __func__
15928 << " " << o->oid
15929 << " 0x" << std::hex << offset << "~" << length
15930 << " - have 0x" << o->onode.size
15931 << " (" << std::dec << o->onode.size << ")"
f67539c2
TL
15932 << " bytes" << std::hex
15933 << " fadvise_flags 0x" << fadvise_flags
15934 << " alloc_hint 0x" << o->onode.alloc_hint_flags
15935 << " expected_object_size " << o->onode.expected_object_size
15936 << " expected_write_size " << o->onode.expected_write_size
15937 << std::dec
31f18b77 15938 << dendl;
81eedcae 15939 _dump_onode<30>(cct, *o);
31f18b77
FG
15940
15941 if (length == 0) {
15942 return 0;
15943 }
15944
15945 uint64_t end = offset + length;
15946
15947 GarbageCollector gc(c->store->cct);
eafe8130 15948 int64_t benefit = 0;
31f18b77
FG
15949 auto dirty_start = offset;
15950 auto dirty_end = end;
15951
15952 WriteContext wctx;
15953 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
15954 o->extent_map.fault_range(db, offset, length);
15955 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
15956 r = _do_alloc_write(txc, c, o, &wctx);
15957 if (r < 0) {
15958 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
15959 << dendl;
15960 goto out;
15961 }
15962
eafe8130
TL
15963 if (wctx.extents_to_gc.empty() ||
15964 wctx.extents_to_gc.range_start() > offset ||
15965 wctx.extents_to_gc.range_end() < offset + length) {
15966 benefit = gc.estimate(offset,
15967 length,
15968 o->extent_map,
15969 wctx.old_extents,
15970 min_alloc_size);
15971 }
15972
31f18b77
FG
15973 // NB: _wctx_finish() will empty old_extents
15974 // so we must do gc estimation before that
7c673cae
FG
15975 _wctx_finish(txc, c, o, &wctx);
15976 if (end > o->onode.size) {
15977 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 15978 << std::dec << dendl;
7c673cae
FG
15979 o->onode.size = end;
15980 }
15981
11fdf7f2 15982 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
eafe8130
TL
15983 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
15984 dout(20) << __func__
15985 << " perform garbage collection for compressed extents, "
15986 << "expected benefit = " << benefit << " AUs" << dendl;
15987 }
15988 if (!wctx.extents_to_gc.empty()) {
15989 dout(20) << __func__ << " perform garbage collection" << dendl;
15990
15991 r = _do_gc(txc, c, o,
15992 wctx,
15993 &dirty_start, &dirty_end);
15994 if (r < 0) {
15995 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
15996 << dendl;
15997 goto out;
7c673cae 15998 }
eafe8130
TL
15999 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
16000 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 16001 }
7c673cae 16002 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
16003 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
16004
7c673cae
FG
16005 r = 0;
16006
16007 out:
16008 return r;
16009}
16010
16011int BlueStore::_write(TransContext *txc,
16012 CollectionRef& c,
16013 OnodeRef& o,
31f18b77
FG
16014 uint64_t offset, size_t length,
16015 bufferlist& bl,
16016 uint32_t fadvise_flags)
7c673cae
FG
16017{
16018 dout(15) << __func__ << " " << c->cid << " " << o->oid
16019 << " 0x" << std::hex << offset << "~" << length << std::dec
16020 << dendl;
35e4c445
FG
16021 int r = 0;
16022 if (offset + length >= OBJECT_MAX_SIZE) {
16023 r = -E2BIG;
16024 } else {
16025 _assign_nid(txc, o);
16026 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
16027 txc->write_onode(o);
16028 }
7c673cae
FG
16029 dout(10) << __func__ << " " << c->cid << " " << o->oid
16030 << " 0x" << std::hex << offset << "~" << length << std::dec
16031 << " = " << r << dendl;
16032 return r;
16033}
16034
16035int BlueStore::_zero(TransContext *txc,
16036 CollectionRef& c,
16037 OnodeRef& o,
16038 uint64_t offset, size_t length)
16039{
16040 dout(15) << __func__ << " " << c->cid << " " << o->oid
16041 << " 0x" << std::hex << offset << "~" << length << std::dec
16042 << dendl;
35e4c445
FG
16043 int r = 0;
16044 if (offset + length >= OBJECT_MAX_SIZE) {
16045 r = -E2BIG;
16046 } else {
16047 _assign_nid(txc, o);
16048 r = _do_zero(txc, c, o, offset, length);
16049 }
7c673cae
FG
16050 dout(10) << __func__ << " " << c->cid << " " << o->oid
16051 << " 0x" << std::hex << offset << "~" << length << std::dec
16052 << " = " << r << dendl;
16053 return r;
16054}
16055
16056int BlueStore::_do_zero(TransContext *txc,
16057 CollectionRef& c,
16058 OnodeRef& o,
16059 uint64_t offset, size_t length)
16060{
16061 dout(15) << __func__ << " " << c->cid << " " << o->oid
16062 << " 0x" << std::hex << offset << "~" << length << std::dec
16063 << dendl;
16064 int r = 0;
16065
81eedcae 16066 _dump_onode<30>(cct, *o);
7c673cae
FG
16067
16068 WriteContext wctx;
16069 o->extent_map.fault_range(db, offset, length);
16070 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 16071 o->extent_map.dirty_range(offset, length);
7c673cae
FG
16072 _wctx_finish(txc, c, o, &wctx);
16073
b32b8144 16074 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
16075 o->onode.size = offset + length;
16076 dout(20) << __func__ << " extending size to " << offset + length
16077 << dendl;
16078 }
16079 txc->write_onode(o);
16080
16081 dout(10) << __func__ << " " << c->cid << " " << o->oid
16082 << " 0x" << std::hex << offset << "~" << length << std::dec
16083 << " = " << r << dendl;
16084 return r;
16085}
16086
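// _do_truncate: punch a hole from 'offset' to the current EOF, release
// the displaced extents via _wctx_finish, request an extent-map reshard
// if shards now sit past EOF, and record the new object size.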
16087void BlueStore::_do_truncate(
31f18b77
FG
16088 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
16089 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
16090{
16091 dout(15) << __func__ << " " << c->cid << " " << o->oid
16092 << " 0x" << std::hex << offset << std::dec << dendl;
16093
81eedcae 16094 _dump_onode<30>(cct, *o);
7c673cae
FG
16095
16096 if (offset == o->onode.size)
31f18b77 16097 return;
7c673cae 16098
f67539c2 16099 WriteContext wctx;
7c673cae 16100 if (offset < o->onode.size) {
7c673cae
FG
16101 uint64_t length = o->onode.size - offset;
16102 o->extent_map.fault_range(db, offset, length);
16103 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 16104 o->extent_map.dirty_range(offset, length);
20effc67 16105
31f18b77 16106 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
16107
16108 // if we have shards past EOF, ask for a reshard
16109 if (!o->onode.extent_map_shards.empty() &&
16110 o->onode.extent_map_shards.back().offset >= offset) {
16111 dout(10) << __func__ << " request reshard past EOF" << dendl;
16112 if (offset) {
16113 o->extent_map.request_reshard(offset - 1, offset + length);
16114 } else {
16115 o->extent_map.request_reshard(0, length);
16116 }
16117 }
16118 }
16119
16120 o->onode.size = offset;
16121
16122 txc->write_onode(o);
16123}
16124
35e4c445 16125int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
16126 CollectionRef& c,
16127 OnodeRef& o,
16128 uint64_t offset)
16129{
16130 dout(15) << __func__ << " " << c->cid << " " << o->oid
16131 << " 0x" << std::hex << offset << std::dec
16132 << dendl;
20effc67
TL
16133
16134 auto start_time = mono_clock::now();
35e4c445
FG
16135 int r = 0;
16136 if (offset >= OBJECT_MAX_SIZE) {
16137 r = -E2BIG;
16138 } else {
16139 _do_truncate(txc, c, o, offset);
16140 }
20effc67
TL
16141 log_latency_fn(
16142 __func__,
16143 l_bluestore_truncate_lat,
16144 mono_clock::now() - start_time,
16145 cct->_conf->bluestore_log_op_age,
16146 [&](const ceph::timespan& lat) {
16147 ostringstream ostr;
16148 ostr << ", lat = " << timespan_str(lat)
16149 << " cid =" << c->cid
16150 << " oid =" << o->oid;
16151 return ostr.str();
16152 }
16153 );
35e4c445
FG
16154 dout(10) << __func__ << " " << c->cid << " " << o->oid
16155 << " 0x" << std::hex << offset << std::dec
16156 << " = " << r << dendl;
16157 return r;
7c673cae
FG
16158}
16159
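// _do_remove: truncate to zero, clear omap, delete extent-map shard
// keys and the onode key itself; for clone generations, additionally
// check whether blobs still shared with the head object can now be
// un-shared.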
16160int BlueStore::_do_remove(
16161 TransContext *txc,
16162 CollectionRef& c,
16163 OnodeRef o)
16164{
31f18b77 16165 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
16166 bool is_gen = !o->oid.is_no_gen();
16167 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
16168 if (o->onode.has_omap()) {
16169 o->flush();
9f95a23c 16170 _do_omap_clear(txc, o);
7c673cae
FG
16171 }
16172 o->exists = false;
16173 string key;
16174 for (auto &s : o->extent_map.shards) {
16175 dout(20) << __func__ << " removing shard 0x" << std::hex
16176 << s.shard_info->offset << std::dec << dendl;
16177 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
16178 [&](const string& final_key) {
16179 txc->t->rmkey(PREFIX_OBJ, final_key);
16180 }
16181 );
16182 }
16183 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 16184 txc->note_removed_object(o);
7c673cae
FG
16185 o->extent_map.clear();
16186 o->onode = bluestore_onode_t();
16187 _debug_obj_on_delete(o->oid);
31f18b77 16188
224ce89b
WB
16189 if (!is_gen || maybe_unshared_blobs.empty()) {
16190 return 0;
16191 }
31f18b77 16192
224ce89b
WB
16193 // see if we can unshare blobs still referenced by the head
16194 dout(10) << __func__ << " gen and maybe_unshared_blobs "
16195 << maybe_unshared_blobs << dendl;
16196 ghobject_t nogen = o->oid;
16197 nogen.generation = ghobject_t::NO_GEN;
f67539c2 16198 OnodeRef h = c->get_onode(nogen, false);
224ce89b
WB
16199
16200 if (!h || !h->exists) {
16201 return 0;
16202 }
16203
16204 dout(20) << __func__ << " checking for unshareable blobs on " << h
16205 << " " << h->oid << dendl;
16206 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
16207 for (auto& e : h->extent_map.extent_map) {
16208 const bluestore_blob_t& b = e.blob->get_blob();
16209 SharedBlob *sb = e.blob->shared_blob.get();
16210 if (b.is_shared() &&
16211 sb->loaded &&
16212 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
16213 if (b.is_compressed()) {
16214 expect[sb].get(0, b.get_ondisk_length());
16215 } else {
16216 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
16217 expect[sb].get(off, len);
16218 return 0;
16219 });
16220 }
224ce89b
WB
16221 }
16222 }
31f18b77 16223
224ce89b
WB
16224 vector<SharedBlob*> unshared_blobs;
16225 unshared_blobs.reserve(maybe_unshared_blobs.size());
16226 for (auto& p : expect) {
16227 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
16228 if (p.first->persistent->ref_map == p.second) {
16229 SharedBlob *sb = p.first;
16230 dout(20) << __func__ << " unsharing " << *sb << dendl;
16231 unshared_blobs.push_back(sb);
16232 txc->unshare_blob(sb);
16233 uint64_t sbid = c->make_blob_unshared(sb);
16234 string key;
16235 get_shared_blob_key(sbid, &key);
16236 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
16237 }
16238 }
16239
16240 if (unshared_blobs.empty()) {
16241 return 0;
16242 }
16243
224ce89b
WB
16244 for (auto& e : h->extent_map.extent_map) {
16245 const bluestore_blob_t& b = e.blob->get_blob();
16246 SharedBlob *sb = e.blob->shared_blob.get();
16247 if (b.is_shared() &&
16248 std::find(unshared_blobs.begin(), unshared_blobs.end(),
16249 sb) != unshared_blobs.end()) {
16250 dout(20) << __func__ << " unsharing " << e << dendl;
16251 bluestore_blob_t& blob = e.blob->dirty_blob();
16252 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 16253 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
16254 }
16255 }
224ce89b
WB
16256 txc->write_onode(h);
16257
7c673cae
FG
16258 return 0;
16259}
16260
16261int BlueStore::_remove(TransContext *txc,
16262 CollectionRef& c,
16263 OnodeRef &o)
16264{
11fdf7f2
TL
16265 dout(15) << __func__ << " " << c->cid << " " << o->oid
16266 << " onode " << o.get()
16267 << " txc "<< txc << dendl;
20effc67 16268 auto start_time = mono_clock::now();
7c673cae 16269 int r = _do_remove(txc, c, o);
20effc67 16270
adb31ebb
TL
16271 log_latency_fn(
16272 __func__,
16273 l_bluestore_remove_lat,
16274 mono_clock::now() - start_time,
16275 cct->_conf->bluestore_log_op_age,
16276 [&](const ceph::timespan& lat) {
16277 ostringstream ostr;
16278 ostr << ", lat = " << timespan_str(lat)
16279 << " cid =" << c->cid
16280 << " oid =" << o->oid;
16281 return ostr.str();
16282 }
16283 );
16284
7c673cae
FG
16285 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16286 return r;
16287}
16288
16289int BlueStore::_setattr(TransContext *txc,
16290 CollectionRef& c,
16291 OnodeRef& o,
16292 const string& name,
16293 bufferptr& val)
16294{
16295 dout(15) << __func__ << " " << c->cid << " " << o->oid
16296 << " " << name << " (" << val.length() << " bytes)"
16297 << dendl;
16298 int r = 0;
3efd9988
FG
16299 if (val.is_partial()) {
16300 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
16301 val.length());
f91f0fd5 16302 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
16303 } else {
16304 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 16305 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 16306 }
7c673cae
FG
16307 txc->write_onode(o);
16308 dout(10) << __func__ << " " << c->cid << " " << o->oid
16309 << " " << name << " (" << val.length() << " bytes)"
16310 << " = " << r << dendl;
16311 return r;
16312}
16313
16314int BlueStore::_setattrs(TransContext *txc,
16315 CollectionRef& c,
16316 OnodeRef& o,
16317 const map<string,bufferptr>& aset)
16318{
16319 dout(15) << __func__ << " " << c->cid << " " << o->oid
16320 << " " << aset.size() << " keys"
16321 << dendl;
16322 int r = 0;
16323 for (map<string,bufferptr>::const_iterator p = aset.begin();
16324 p != aset.end(); ++p) {
3efd9988
FG
16325 if (p->second.is_partial()) {
16326 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 16327 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 16328 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
16329 } else {
16330 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 16331 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 16332 }
7c673cae
FG
16333 }
16334 txc->write_onode(o);
16335 dout(10) << __func__ << " " << c->cid << " " << o->oid
16336 << " " << aset.size() << " keys"
16337 << " = " << r << dendl;
16338 return r;
16339}
16340
16341
16342int BlueStore::_rmattr(TransContext *txc,
16343 CollectionRef& c,
16344 OnodeRef& o,
16345 const string& name)
16346{
16347 dout(15) << __func__ << " " << c->cid << " " << o->oid
16348 << " " << name << dendl;
16349 int r = 0;
16350 auto it = o->onode.attrs.find(name.c_str());
16351 if (it == o->onode.attrs.end())
16352 goto out;
16353
16354 o->onode.attrs.erase(it);
16355 txc->write_onode(o);
16356
16357 out:
16358 dout(10) << __func__ << " " << c->cid << " " << o->oid
16359 << " " << name << " = " << r << dendl;
16360 return r;
16361}
16362
16363int BlueStore::_rmattrs(TransContext *txc,
16364 CollectionRef& c,
16365 OnodeRef& o)
16366{
16367 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16368 int r = 0;
16369
16370 if (o->onode.attrs.empty())
16371 goto out;
16372
16373 o->onode.attrs.clear();
16374 txc->write_onode(o);
16375
16376 out:
16377 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16378 return r;
16379}
16380
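// _do_omap_clear: remove every key from the omap header through the
// tail sentinel (inclusive) and clear the onode's omap flag; callers
// must flush the onode first.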
9f95a23c 16381void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 16382{
9f95a23c 16383 const string& omap_prefix = o->get_omap_prefix();
7c673cae 16384 string prefix, tail;
9f95a23c
TL
16385 o->get_omap_header(&prefix);
16386 o->get_omap_tail(&tail);
11fdf7f2 16387 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 16388 txc->t->rmkey(omap_prefix, tail);
20effc67 16389 o->onode.clear_omap_flag();
11fdf7f2
TL
16390 dout(20) << __func__ << " remove range start: "
16391 << pretty_binary_string(prefix) << " end: "
16392 << pretty_binary_string(tail) << dendl;
7c673cae
FG
16393}
16394
16395int BlueStore::_omap_clear(TransContext *txc,
16396 CollectionRef& c,
16397 OnodeRef& o)
16398{
16399 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
20effc67
TL
16400 auto t0 = mono_clock::now();
16401
7c673cae
FG
16402 int r = 0;
16403 if (o->onode.has_omap()) {
16404 o->flush();
9f95a23c 16405 _do_omap_clear(txc, o);
7c673cae
FG
16406 txc->write_onode(o);
16407 }
20effc67
TL
16408 logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - t0);
16409
7c673cae
FG
16410 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16411 return r;
16412}
16413
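// _omap_setkeys: on an onode's first omap use, choose the omap flavour
// (pgmeta for pgmeta objects, otherwise based on the store's per-pool
// omap mode) and write the tail sentinel; then store each decoded
// key/value pair under <onode omap prefix> + <user key>.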
16414int BlueStore::_omap_setkeys(TransContext *txc,
16415 CollectionRef& c,
16416 OnodeRef& o,
16417 bufferlist &bl)
16418{
16419 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16420 int r;
11fdf7f2 16421 auto p = bl.cbegin();
7c673cae
FG
16422 __u32 num;
16423 if (!o->onode.has_omap()) {
11fdf7f2 16424 if (o->oid.is_pgmeta()) {
9f95a23c
TL
16425 o->onode.set_omap_flags_pgmeta();
16426 } else {
522d829b 16427 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 16428 }
7c673cae 16429 txc->write_onode(o);
494da23a 16430
9f95a23c 16431 const string& prefix = o->get_omap_prefix();
494da23a
TL
16432 string key_tail;
16433 bufferlist tail;
9f95a23c 16434 o->get_omap_tail(&key_tail);
494da23a 16435 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
16436 } else {
16437 txc->note_modified_object(o);
16438 }
9f95a23c 16439 const string& prefix = o->get_omap_prefix();
7c673cae 16440 string final_key;
9f95a23c
TL
16441 o->get_omap_key(string(), &final_key);
16442 size_t base_key_len = final_key.size();
11fdf7f2 16443 decode(num, p);
7c673cae
FG
16444 while (num--) {
16445 string key;
16446 bufferlist value;
11fdf7f2
TL
16447 decode(key, p);
16448 decode(value, p);
9f95a23c 16449 final_key.resize(base_key_len); // keep prefix
7c673cae 16450 final_key += key;
11fdf7f2 16451 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 16452 << " <- " << key << dendl;
11fdf7f2 16453 txc->t->set(prefix, final_key, value);
7c673cae
FG
16454 }
16455 r = 0;
16456 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16457 return r;
16458}
16459
16460int BlueStore::_omap_setheader(TransContext *txc,
16461 CollectionRef& c,
16462 OnodeRef &o,
16463 bufferlist& bl)
16464{
16465 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16466 int r;
16467 string key;
16468 if (!o->onode.has_omap()) {
11fdf7f2 16469 if (o->oid.is_pgmeta()) {
9f95a23c
TL
16470 o->onode.set_omap_flags_pgmeta();
16471 } else {
522d829b 16472 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 16473 }
7c673cae 16474 txc->write_onode(o);
494da23a 16475
9f95a23c 16476 const string& prefix = o->get_omap_prefix();
494da23a
TL
16477 string key_tail;
16478 bufferlist tail;
9f95a23c 16479 o->get_omap_tail(&key_tail);
494da23a 16480 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
16481 } else {
16482 txc->note_modified_object(o);
16483 }
9f95a23c
TL
16484 const string& prefix = o->get_omap_prefix();
16485 o->get_omap_header(&key);
11fdf7f2 16486 txc->t->set(prefix, key, bl);
7c673cae
FG
16487 r = 0;
16488 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16489 return r;
16490}
16491
16492int BlueStore::_omap_rmkeys(TransContext *txc,
16493 CollectionRef& c,
16494 OnodeRef& o,
16495 bufferlist& bl)
16496{
16497 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
16498 int r = 0;
11fdf7f2 16499 auto p = bl.cbegin();
7c673cae
FG
16500 __u32 num;
16501 string final_key;
16502
16503 if (!o->onode.has_omap()) {
16504 goto out;
16505 }
11fdf7f2 16506 {
9f95a23c
TL
16507 const string& prefix = o->get_omap_prefix();
16508 o->get_omap_key(string(), &final_key);
16509 size_t base_key_len = final_key.size();
11fdf7f2
TL
16510 decode(num, p);
16511 while (num--) {
16512 string key;
16513 decode(key, p);
9f95a23c 16514 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
16515 final_key += key;
16516 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
16517 << " <- " << key << dendl;
16518 txc->t->rmkey(prefix, final_key);
16519 }
7c673cae
FG
16520 }
16521 txc->note_modified_object(o);
16522
16523 out:
16524 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16525 return r;
16526}
16527
16528int BlueStore::_omap_rmkey_range(TransContext *txc,
16529 CollectionRef& c,
16530 OnodeRef& o,
16531 const string& first, const string& last)
16532{
16533 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
16534 string key_first, key_last;
16535 int r = 0;
16536 if (!o->onode.has_omap()) {
16537 goto out;
16538 }
11fdf7f2 16539 {
9f95a23c 16540 const string& prefix = o->get_omap_prefix();
11fdf7f2 16541 o->flush();
9f95a23c
TL
16542 o->get_omap_key(first, &key_first);
16543 o->get_omap_key(last, &key_last);
11fdf7f2
TL
16544 txc->t->rm_range_keys(prefix, key_first, key_last);
16545 dout(20) << __func__ << " remove range start: "
16546 << pretty_binary_string(key_first) << " end: "
16547 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
16548 }
16549 txc->note_modified_object(o);
16550
16551 out:
16552 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
16553 return r;
16554}
16555
16556int BlueStore::_set_alloc_hint(
16557 TransContext *txc,
16558 CollectionRef& c,
16559 OnodeRef& o,
16560 uint64_t expected_object_size,
16561 uint64_t expected_write_size,
16562 uint32_t flags)
16563{
16564 dout(15) << __func__ << " " << c->cid << " " << o->oid
16565 << " object_size " << expected_object_size
16566 << " write_size " << expected_write_size
16567 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16568 << dendl;
16569 int r = 0;
16570 o->onode.expected_object_size = expected_object_size;
16571 o->onode.expected_write_size = expected_write_size;
16572 o->onode.alloc_hint_flags = flags;
16573 txc->write_onode(o);
16574 dout(10) << __func__ << " " << c->cid << " " << o->oid
16575 << " object_size " << expected_object_size
16576 << " write_size " << expected_write_size
16577 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
16578 << " = " << r << dendl;
16579 return r;
16580}
16581
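// _clone: verify the two oids hash alike, copy the object data (COW via
// _do_clone_range when bluestore_clone_cow is set, otherwise a full
// read/write), duplicate the attrs, and re-key the source's omap rows
// under the destination's omap prefix.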
16582int BlueStore::_clone(TransContext *txc,
16583 CollectionRef& c,
16584 OnodeRef& oldo,
16585 OnodeRef& newo)
16586{
16587 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16588 << newo->oid << dendl;
16589 int r = 0;
16590 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
16591 derr << __func__ << " mismatched hash on " << oldo->oid
16592 << " and " << newo->oid << dendl;
16593 return -EINVAL;
16594 }
16595
7c673cae
FG
16596 _assign_nid(txc, newo);
16597
16598 // clone data
16599 oldo->flush();
16600 _do_truncate(txc, c, newo, 0);
16601 if (cct->_conf->bluestore_clone_cow) {
16602 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
16603 } else {
16604 bufferlist bl;
16605 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
16606 if (r < 0)
16607 goto out;
16608 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
16609 if (r < 0)
16610 goto out;
16611 }
16612
16613 // clone attrs
16614 newo->onode.attrs = oldo->onode.attrs;
16615
16616 // clone omap
16617 if (newo->onode.has_omap()) {
16618 dout(20) << __func__ << " clearing old omap data" << dendl;
16619 newo->flush();
9f95a23c 16620 _do_omap_clear(txc, newo);
7c673cae
FG
16621 }
16622 if (oldo->onode.has_omap()) {
16623 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 16624 if (newo->oid.is_pgmeta()) {
9f95a23c
TL
16625 newo->onode.set_omap_flags_pgmeta();
16626 } else {
522d829b 16627 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
7c673cae 16628 }
20effc67
TL
16629 // check that the prefix for the omap key is exactly the same size for
16630 // both objects; otherwise rewrite_omap_key would corrupt data
16631 ceph_assert(oldo->onode.flags == newo->onode.flags);
9f95a23c 16632 const string& prefix = newo->get_omap_prefix();
11fdf7f2 16633 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 16634 string head, tail;
9f95a23c
TL
16635 oldo->get_omap_header(&head);
16636 oldo->get_omap_tail(&tail);
7c673cae
FG
16637 it->lower_bound(head);
16638 while (it->valid()) {
16639 if (it->key() >= tail) {
16640 dout(30) << __func__ << " reached tail" << dendl;
16641 break;
16642 } else {
16643 dout(30) << __func__ << " got header/data "
16644 << pretty_binary_string(it->key()) << dendl;
16645 string key;
9f95a23c 16646 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 16647 txc->t->set(prefix, key, it->value());
7c673cae
FG
16648 }
16649 it->next();
16650 }
494da23a
TL
16651 string new_tail;
16652 bufferlist new_tail_value;
9f95a23c 16653 newo->get_omap_tail(&new_tail);
494da23a 16654 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
16655 }
16656
16657 txc->write_onode(newo);
16658 r = 0;
16659
16660 out:
16661 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16662 << newo->oid << " = " << r << dendl;
16663 return r;
16664}
16665
16666int BlueStore::_do_clone_range(
16667 TransContext *txc,
16668 CollectionRef& c,
16669 OnodeRef& oldo,
16670 OnodeRef& newo,
224ce89b
WB
16671 uint64_t srcoff,
16672 uint64_t length,
16673 uint64_t dstoff)
7c673cae
FG
16674{
16675 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16676 << newo->oid
16677 << " 0x" << std::hex << srcoff << "~" << length << " -> "
16678 << " 0x" << dstoff << "~" << length << std::dec << dendl;
16679 oldo->extent_map.fault_range(db, srcoff, length);
16680 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
16681 _dump_onode<30>(cct, *oldo);
16682 _dump_onode<30>(cct, *newo);
7c673cae 16683
11fdf7f2 16684 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
7c673cae 16685
20effc67
TL
16686#ifdef HAVE_LIBZBD
16687 if (bdev->is_smr()) {
16688 // duplicate the refs for the shared region.
16689 Extent dummy(dstoff);
16690 for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
16691 e != newo->extent_map.extent_map.end();
16692 ++e) {
16693 if (e->logical_offset >= dstoff + length) {
16694 break;
16695 }
16696 for (auto& ex : e->blob->get_blob().get_extents()) {
16697 // note that we may introduce a new extent reference that is
16698 // earlier than the first zone ref. we allow this since it is
16699 // a lot of work to avoid and has marginal impact on cleaning
16700 // performance.
16701 if (!ex.is_valid()) {
16702 continue;
16703 }
16704 uint32_t zone = ex.offset / zone_size;
16705 if (!newo->onode.zone_offset_refs.count(zone)) {
16706 uint64_t zoff = ex.offset % zone_size;
16707 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16708 << " offset 0x" << zoff << std::dec
16709 << " -> " << newo->oid << dendl;
16710 txc->note_write_zone_offset(newo, zone, zoff);
16711 }
16712 }
16713 }
16714 }
16715#endif
16716
16717 _dump_onode<30>(cct, *oldo);
16718 _dump_onode<30>(cct, *newo);
16719 return 0;
16720}
16721
16722int BlueStore::_clone_range(TransContext *txc,
16723 CollectionRef& c,
16724 OnodeRef& oldo,
16725 OnodeRef& newo,
16726 uint64_t srcoff, uint64_t length, uint64_t dstoff)
16727{
16728 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
7c673cae
FG
16729 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16730 << " to offset 0x" << dstoff << std::dec << dendl;
16731 int r = 0;
16732
35e4c445
FG
16733 if (srcoff + length >= OBJECT_MAX_SIZE ||
16734 dstoff + length >= OBJECT_MAX_SIZE) {
16735 r = -E2BIG;
16736 goto out;
16737 }
7c673cae
FG
16738 if (srcoff + length > oldo->onode.size) {
16739 r = -EINVAL;
16740 goto out;
16741 }
16742
7c673cae
FG
16743 _assign_nid(txc, newo);
16744
16745 if (length > 0) {
16746 if (cct->_conf->bluestore_clone_cow) {
16747 _do_zero(txc, c, newo, dstoff, length);
16748 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
16749 } else {
16750 bufferlist bl;
16751 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
16752 if (r < 0)
16753 goto out;
16754 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
16755 if (r < 0)
16756 goto out;
16757 }
16758 }
16759
16760 txc->write_onode(newo);
16761 r = 0;
16762
16763 out:
16764 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16765 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
16766 << " to offset 0x" << dstoff << std::dec
16767 << " = " << r << dendl;
16768 return r;
16769}
16770
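// _rename: move the onode to its new key -- drop the old onode and
// shard keys, mark the shards dirty for rewrite, swing the cache entry
// via onode_map.rename, and pin the old-name slot until the txc
// commits.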
16771int BlueStore::_rename(TransContext *txc,
16772 CollectionRef& c,
16773 OnodeRef& oldo,
16774 OnodeRef& newo,
16775 const ghobject_t& new_oid)
16776{
16777 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
16778 << new_oid << dendl;
16779 int r;
16780 ghobject_t old_oid = oldo->oid;
f91f0fd5 16781 mempool::bluestore_cache_meta::string new_okey;
7c673cae
FG
16782
16783 if (newo) {
16784 if (newo->exists) {
16785 r = -EEXIST;
16786 goto out;
16787 }
11fdf7f2 16788 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
16789 }
16790
16791 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
16792
16793 // rewrite shards
16794 {
16795 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
16796 get_object_key(cct, new_oid, &new_okey);
16797 string key;
16798 for (auto &s : oldo->extent_map.shards) {
16799 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
16800 [&](const string& final_key) {
16801 txc->t->rmkey(PREFIX_OBJ, final_key);
16802 }
16803 );
16804 s.dirty = true;
16805 }
16806 }
16807
16808 newo = oldo;
16809 txc->write_onode(newo);
16810
16811 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
16812 // Onode in the old slot
16813 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
16814 r = 0;
16815
f64942e4
AA
16816 // hold a ref to the new Onode in the old name position, to ensure we don't drop
16817 // it from the cache before this txc commits (or else someone may come along
16818 // and read newo's metadata via the old name).
16819 txc->note_modified_object(oldo);
16820
20effc67
TL
16821#ifdef HAVE_LIBZBD
16822 if (bdev->is_smr()) {
16823 // adjust zone refs
16824 for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
16825 dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
16826 << " offset 0x" << offset << std::dec
16827 << " -> " << oldo->oid << dendl;
16828 string key;
16829 get_zone_offset_object_key(zone, offset, oldo->oid, &key);
16830 txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
16831
16832 dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
16833 << " offset 0x" << offset << std::dec
16834 << " -> " << newo->oid << dendl;
16835 get_zone_offset_object_key(zone, offset, newo->oid, &key);
16836 bufferlist v;
16837 txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
16838 }
16839 }
16840#endif
16841
7c673cae
FG
16842 out:
16843 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
16844 << new_oid << " = " << r << dendl;
16845 return r;
16846}
16847
16848// collections
16849
16850int BlueStore::_create_collection(
16851 TransContext *txc,
16852 const coll_t &cid,
16853 unsigned bits,
16854 CollectionRef *c)
16855{
16856 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
16857 int r;
16858 bufferlist bl;
16859
16860 {
9f95a23c 16861 std::unique_lock l(coll_lock);
7c673cae
FG
16862 if (*c) {
16863 r = -EEXIST;
16864 goto out;
16865 }
11fdf7f2
TL
16866 auto p = new_coll_map.find(cid);
16867 ceph_assert(p != new_coll_map.end());
16868 *c = p->second;
7c673cae
FG
16869 (*c)->cnode.bits = bits;
16870 coll_map[cid] = *c;
11fdf7f2 16871 new_coll_map.erase(p);
7c673cae 16872 }
11fdf7f2 16873 encode((*c)->cnode, bl);
7c673cae
FG
16874 txc->t->set(PREFIX_COLL, stringify(cid), bl);
16875 r = 0;
16876
16877 out:
16878 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
16879 return r;
16880}
16881
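// _remove_collection: refuse to remove a non-empty collection -- first
// scan the in-memory onode_map, then enumerate up to
// nonexistent_count + 1 onodes from the db and confirm every one is
// marked non-existent.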
16882int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
16883 CollectionRef *c)
16884{
16885 dout(15) << __func__ << " " << cid << dendl;
16886 int r;
16887
11fdf7f2 16888 (*c)->flush_all_but_last();
7c673cae 16889 {
9f95a23c 16890 std::unique_lock l(coll_lock);
7c673cae
FG
16891 if (!*c) {
16892 r = -ENOENT;
16893 goto out;
16894 }
16895 size_t nonexistent_count = 0;
11fdf7f2 16896 ceph_assert((*c)->exists);
adb31ebb 16897 if ((*c)->onode_map.map_any([&](Onode* o) {
f67539c2
TL
16898 if (o->exists) {
16899 dout(1) << __func__ << " " << o->oid << " " << o
16900 << " exists in onode_map" << dendl;
7c673cae 16901 return true;
f67539c2
TL
16902 }
16903 ++nonexistent_count;
16904 return false;
16905 })) {
7c673cae
FG
16906 r = -ENOTEMPTY;
16907 goto out;
16908 }
7c673cae
FG
16909 vector<ghobject_t> ls;
16910 ghobject_t next;
16911 // Enumerate onodes in db, up to nonexistent_count + 1
16912 // then check if all of them are marked as non-existent.
11fdf7f2 16913 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 16914 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 16915 nonexistent_count + 1, false, &ls, &next);
7c673cae 16916 if (r >= 0) {
11fdf7f2
TL
16917 // If true, the collection has more objects than nonexistent_count,
16918 // so bypass the check.
16919 bool exists = (!next.is_max());
7c673cae
FG
16920 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
16921 dout(10) << __func__ << " oid " << *it << dendl;
16922 auto onode = (*c)->onode_map.lookup(*it);
16923 exists = !onode || onode->exists;
16924 if (exists) {
494da23a 16925 dout(1) << __func__ << " " << *it
f67539c2
TL
16926 << " exists in db, "
16927 << (!onode ? "not present in ram" : "present in ram")
16928 << dendl;
7c673cae
FG
16929 }
16930 }
16931 if (!exists) {
f67539c2 16932 _do_remove_collection(txc, c);
7c673cae
FG
16933 r = 0;
16934 } else {
16935 dout(10) << __func__ << " " << cid
16936 << " is non-empty" << dendl;
f67539c2 16937 r = -ENOTEMPTY;
7c673cae
FG
16938 }
16939 }
16940 }
f67539c2 16941out:
7c673cae
FG
16942 dout(10) << __func__ << " " << cid << " = " << r << dendl;
16943 return r;
16944}

void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer. this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits. note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge, as the target
  // collection's sequencer may need to order new ops after those writes.

  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the target collection post-merge. leave everything else behind.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits. note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // split_cache() behavior depends on the target's (d) bits, so do this
  // only after they have been updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}

void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);


    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif

mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}

bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
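  // Note: the byte throttle below blocks until budget is available, while
  // the deferred-byte throttle is only try-acquired; if that fails, the
  // caller is expected to come back via finish_start_transaction() once it
  // can afford to block.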
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";
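// (pseudo-prefixes used only to bucket the histogram below: PREFIX_OBJ keys
// are split into onodes vs. extent shards by their key suffix, and "Z"
// collects everything else)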

// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  KeyValueHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPOOL_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPG_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}

void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For external callers.
// We use a best-effort policy here, e.g., we don't care if some pinned
// onodes/data remain in the cache after this command completes.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

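  // Two-pass encode: first compute an upper bound on the encoded size so
  // that one contiguous buffer can be reserved below, then encode into it.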
  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;


  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!spurious_read_errors_alert.empty() &&
      cct->_conf->bluestore_warn_on_spurious_read_errors) {
    alerts.emplace(
      "BLUESTORE_SPURIOUS_READ_ERRORS",
      spurious_read_errors_alert);
  }
  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pg_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_PG_OMAP",
      no_per_pg_omap_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}

void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          const PExtentVector& extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents.size();
  alloc_stats_size += need;

  for (auto& e : extents) {
    logger->hinc(l_bluestore_allocate_hist, e.length, need);
  }
}

void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;


  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
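  // e.g. the row labelled "-4" holds the stats captured roughly 4 probes
  // ago; the exact label printed is base + (probe_count % base), see below.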
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  ++probe_count;

  for (ssize_t i = alloc_stats_history.size() - 1; i > 0; --i) {
    if ((probe_count % (1 << i)) == 0) {
      alloc_stats_history[i] = alloc_stats_history[i - 1];
    }
  }
  alloc_stats_history[0].swap(t0);
}

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

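  // keep only the granularity-sized buckets whose bloom filters overlap the
  // given extents; everything else is dropped to shrink the tracker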
  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  std::lock_guard l(lock);
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
{
  std::lock_guard l(lock); // possibly redundant
  ceph_assert(fix_per_pool_omap_txn == nullptr);
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append(stringify(val));
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB::Transaction txn,
  uint64_t sbid,
  bluestore_extent_ref_map_t* ref_map,
  size_t repaired)
{
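  // ref_map == nullptr means the shared-blob record should be removed
  // entirely; otherwise it is rewritten with the corrected references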
  string key;
  get_shared_blob_key(sbid, &key);
  if (ref_map) {
    bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
    bufferlist bl;
    encode(persistent, bl);
    txn->set(PREFIX_SHARED_BLOB, key, bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  to_repair_cnt += repaired;
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  std::lock_guard l(lock);
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  ceph_assert(!fm->is_null_manager());

  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}
bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  ceph_assert(!fm->is_null_manager());

  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

bool BlueStoreRepairer::fix_spanning_blobs(
  KeyValueDB* db,
  std::function<void(KeyValueDB::Transaction)> f)
{
  std::lock_guard l(lock);
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  f(fix_onode_txn);
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  // NB: not for use in multithreading mode!!!
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  // NB: not for use in multithreading mode!!!
  if (fix_per_pool_omap_txn) {
    auto ok = db->submit_transaction_sync(fix_per_pool_omap_txn) == 0;
    ceph_assert(ok);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    auto ok = db->submit_transaction_sync(fix_fm_leaked_txn) == 0;
    ceph_assert(ok);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    auto ok = db->submit_transaction_sync(fix_fm_false_free_txn) == 0;
    ceph_assert(ok);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    auto ok = db->submit_transaction_sync(remove_key_txn) == 0;
    ceph_assert(ok);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    auto ok = db->submit_transaction_sync(fix_misreferences_txn) == 0;
    ceph_assert(ok);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    auto ok = db->submit_transaction_sync(fix_onode_txn) == 0;
    ceph_assert(ok);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    auto ok = db->submit_transaction_sync(fix_shared_blob_txn) == 0;
    ceph_assert(ok);
    fix_shared_blob_txn = nullptr;
  }
  if (fix_statfs_txn) {
    auto ok = db->submit_transaction_sync(fix_statfs_txn) == 0;
    ceph_assert(ok);
    fix_statfs_txn = nullptr;
  }
  if (need_compact) {
    db->compact();
    need_compact = false;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}

// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
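      // e.g. (hypothetical numbers): with db_total = 30 GiB and a historical
      // max_db_use of 20 GiB, at most min(db_avail4slow, 10 GiB) of SLOW
      // data may be redirected to the fast DB device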
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
  res.emplace_back(base, db_size);
  auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
  if (slow_size == 0) {
    slow_size = db_size;
  }
  res.emplace_back(base + ".slow", slow_size);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
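    // e.g. "db" maps to LEVEL_DB (the default), "db.slow" to LEVEL_SLOW,
    // and "db.wal" to LEVEL_WAL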
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
  RocksDBBlueFSVolumeSelector* ns =
    new RocksDBBlueFSVolumeSelector(0, 0, 0,
                                    0, 0, 0,
                                    0, 0, false);
  return ns;
}

bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
  RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
  ceph_assert(o);
  bool equal = true;
  for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
    for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) {
      equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
    }
  }
  for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
    equal &= (per_level_files[t] == o->per_level_files[t]);
  }
  return equal;
}

// =======================================================

//================================================================================================================
// BlueStore used to commit all allocation information (alloc/release) into RocksDB before the client write
// completed. This adds latency to the write path and significant load on CPU/memory/disk.
// The reason for the RocksDB updates is that they allow Ceph to survive any failure without losing the
// allocation state.
//
// We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the
// allocator object, with the entire OSD allocation state, in a single step during umount().
// This change yields a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes
// the system to losing allocation info in failure cases where umount is never called.
// We added code to perform a full allocation-map rebuild from the information stored inside the ONodes, which
// is used in those failure cases.
// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from
// the flat file where the allocation-map was stored during umount().
//================================================================================================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"

static const std::string allocator_dir  = "ALLOCATOR_NCB_DIR";
static const std::string allocator_file = "ALLOCATOR_NCB_FILE";
static uint32_t s_format_version = 0x01; // support future changes to allocator-map file
static uint32_t s_serial = 0x01;

#if 1
#define CEPHTOH_32 le32toh
#define CEPHTOH_64 le64toh
#define HTOCEPH_32 htole32
#define HTOCEPH_64 htole64
#else
// help debug the encode/decode by forcing alien format
#define CEPHTOH_32 be32toh
#define CEPHTOH_64 be64toh
#define HTOCEPH_32 htobe32
#define HTOCEPH_64 htobe64
#endif
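
// On-disk allocator image layout (see store_allocator()/__restore_allocator()):
//   allocator_image_header + crc32c
//   extent_t records in chunks of up to MAX_EXTENTS_IN_BUFFER, each chunk
//     followed by the running crc32c of all extent data written so far
//   allocator_image_trailer + crc32c
// Raw extents and chunk crcs are byte-swapped via the HTOCEPH/CEPHTOH macros
// above; the header and trailer are encoded/decoded through denc.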

// 48-byte header for the on-disk allocator image
const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
struct allocator_image_header {
  uint32_t format_version;  // 0x00
  uint32_t valid_signature; // 0x04
  utime_t  timestamp;       // 0x08
  uint32_t serial;          // 0x10
  uint32_t pad[0x7];        // 0x14

  allocator_image_header() {
    memset((char*)this, 0, sizeof(allocator_image_header));
  }

  // create header in CEPH format
  allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
    this->format_version = format_version;
    this->timestamp = timestamp;
    this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
    this->serial = serial;
    memset(this->pad, 0, sizeof(this->pad));
  }

  friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
    out << "format_version = " << header.format_version << std::endl;
    out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
    out << "timestamp = " << header.timestamp << std::endl;
    out << "serial = " << header.serial << std::endl;
    for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
      if (header.pad[i]) {
        out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
      }
    }
    return out;
  }

  DENC(allocator_image_header, v, p) {
    denc(v.format_version, p);
    denc(v.valid_signature, p);
    denc(v.timestamp.tv.tv_sec, p);
    denc(v.timestamp.tv.tv_nsec, p);
    denc(v.serial, p);
    for (auto& pad: v.pad) {
      denc(pad, p);
    }
  }

  int verify(CephContext* cct, const std::string &path) {
    if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
      for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
        if (this->pad[i]) {
          derr << "Illegal Header - pad[" << i << "]=" << pad[i] << dendl;
          return -1;
        }
      }
      return 0;
    }
    else {
      derr << "Illegal Header - signature=" << valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
      return -1;
    }
  }
};
WRITE_CLASS_DENC(allocator_image_header)

// 56-byte trailer for the on-disk allocator image
struct allocator_image_trailer {
  extent_t null_extent;     // 0x00

  uint32_t format_version;  // 0x10
  uint32_t valid_signature; // 0x14

  utime_t  timestamp;       // 0x18

  uint32_t serial;          // 0x20
  uint32_t pad;             // 0x24
  uint64_t entries_count;   // 0x28
  uint64_t allocation_size; // 0x30

  // trailer is created in CEPH format
  allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
    memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
    this->format_version = format_version;
    this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
    this->timestamp = timestamp;
    this->serial = serial;
    this->pad = 0;
    this->entries_count = entries_count;
    this->allocation_size = allocation_size;
  }

  allocator_image_trailer() {
    memset((char*)this, 0, sizeof(allocator_image_trailer));
  }

  friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
    if (trailer.null_extent.offset || trailer.null_extent.length) {
      out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
      out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
    }
    out << "format_version = " << trailer.format_version << std::endl;
    out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
    out << "timestamp = " << trailer.timestamp << std::endl;
    out << "serial = " << trailer.serial << std::endl;
    if (trailer.pad) {
      out << "trailer.pad = " << trailer.pad << std::endl;
    }
    out << "entries_count = " << trailer.entries_count << std::endl;
    out << "allocation_size = " << trailer.allocation_size << std::endl;
    return out;
  }

  int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
    if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {

      // the trailer must start with a null extent (both fields set to zero);
      // no endian conversion is needed for zero values
      if (null_extent.offset || null_extent.length) {
        derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]" << dendl;
        return -1;
      }

      if (serial != p_header->serial) {
        derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
        return -1;
      }

      if (format_version != p_header->format_version) {
        derr << "Illegal trailer: header->format_version(" << p_header->format_version
             << ") != trailer->format_version(" << format_version << ")" << dendl;
        return -1;
      }

      if (timestamp != p_header->timestamp) {
        derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
             << ") != trailer->timestamp(" << timestamp << ")" << dendl;
        return -1;
      }

      if (this->entries_count != entries_count) {
        derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
             << this->entries_count << ")" << dendl;
        return -1;
      }

      if (this->allocation_size != allocation_size) {
        derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
             << this->allocation_size << ")" << dendl;
        return -1;
      }

      if (pad) {
        derr << "Illegal Trailer - pad=" << pad << dendl;
        return -1;
      }

      // if we arrived here the trailer is valid
      return 0;
    } else {
      derr << "Illegal Trailer - signature=" << valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
      return -1;
    }
  }

  DENC(allocator_image_trailer, v, p) {
    denc(v.null_extent.offset, p);
    denc(v.null_extent.length, p);
    denc(v.format_version, p);
    denc(v.valid_signature, p);
    denc(v.timestamp.tv.tv_sec, p);
    denc(v.timestamp.tv.tv_nsec, p);
    denc(v.serial, p);
    denc(v.pad, p);
    denc(v.entries_count, p);
    denc(v.allocation_size, p);
  }
};
WRITE_CLASS_DENC(allocator_image_trailer)


//-------------------------------------------------------------------------------------
// invalidate the old allocation file if it exists, so we go directly to recovery after a failure;
// we can safely ignore a non-existing file
int BlueStore::invalidate_allocation_file_on_bluefs()
{
  // mark that the allocation-file was invalidated and we should destage a new copy when closing db
  need_to_destage_allocation_file = true;
  dout(10) << "need_to_destage_allocation_file was set" << dendl;

  BlueFS::FileWriter *p_handle = nullptr;
  if (!bluefs->dir_exists(allocator_dir)) {
    dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
    // nothing to do -> return
    return 0;
  }

  int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
  if (ret != 0) {
    dout(5) << "allocator_file(" << allocator_file << ") doesn't exist" << dendl;
    // nothing to do -> return
    return 0;
  }

  ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
  if (ret != 0) {
    derr << "Failed open_for_write with error-code " << ret << dendl;
    return -1;
  }

  dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
  ret = bluefs->truncate(p_handle, 0);
  if (ret != 0) {
    derr << "Failed truncate with error-code " << ret << dendl;
    bluefs->close_writer(p_handle);
    return -1;
  }

  bluefs->fsync(p_handle);
  bluefs->close_writer(p_handle);

  return 0;
}

//-----------------------------------------------------------------------------------
// load bluefs extents into bluefs_extents_vec
int load_bluefs_extents(BlueFS *bluefs,
                        bluefs_layout_t *bluefs_layout,
                        CephContext* cct,
                        const std::string &path,
                        std::vector<extent_t> &bluefs_extents_vec,
                        uint64_t min_alloc_size)
{
  if (!bluefs) {
    dout(5) << "No BlueFS device found!!" << dendl;
    return 0;
  }

  interval_set<uint64_t> bluefs_extents;
  int ret = bluefs->get_block_extents(bluefs_layout->shared_bdev, &bluefs_extents);
  if (ret < 0) {
    derr << "failed bluefs->get_block_extents()!!" << dendl;
    return ret;
  }

  for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); itr++) {
    extent_t e = { .offset = itr.get_start(), .length = itr.get_len() };
    bluefs_extents_vec.push_back(e);
  }

  dout(5) << "BlueFS extent_count=" << bluefs_extents_vec.size() << dendl;
  return 0;
}

//-----------------------------------------------------------------------------------
int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
{
  *p_num_entries = 0;
  auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
    (*p_num_entries)++;
  };
  src_alloc->dump(count_entries);

  dout(5) << "count num_entries=" << *p_num_entries << dendl;

  // add 16K extra entries in case new allocation happened
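  // (the allocator may keep changing between the counting pass above and
  //  the copy pass below, so leave generous headroom)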
  (*p_num_entries) += 16*1024;
  unique_ptr<extent_t[]> arr;
  try {
    arr = make_unique<extent_t[]>(*p_num_entries);
  } catch (std::bad_alloc&) {
    derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
    return -1;
  }

  uint64_t idx = 0;
  auto copy_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
    if (extent_length > 0) {
      if (idx < *p_num_entries) {
        arr[idx] = {extent_offset, extent_length};
      }
      idx++;
    }
    else {
      derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
    }
  };
  src_alloc->dump(copy_entries);

  dout(5) << "copy num_entries=" << idx << dendl;
  if (idx > *p_num_entries) {
    derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
    ceph_assert(idx <= *p_num_entries);
  }

  *p_num_entries = idx;

  for (idx = 0; idx < *p_num_entries; idx++) {
    const extent_t *p_extent = &arr[idx];
    dest_alloc->init_add_free(p_extent->offset, p_extent->length);
  }

  return 0;
}

//-----------------------------------------------------------------------------------
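// Append the buffered extents to the file, followed by the running crc32c
// (cumulative across all chunks written so far), stored little-endian.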
static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
{
  std::ptrdiff_t length = p_curr - buffer;
  p_handle->append(buffer, length);

  crc = ceph_crc32c(crc, (const uint8_t*)buffer, length);
  uint32_t encoded_crc = HTOCEPH_32(crc);
  p_handle->append((byte*)&encoded_crc, sizeof(encoded_crc));

  return crc;
}

const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
// write the allocator to a flat bluefs file - 4K extents at a time
//-----------------------------------------------------------------------------------
int BlueStore::store_allocator(Allocator* src_allocator)
{
  // when storing allocations to file we must be sure there are no background
  // compactions; the easiest way to achieve that is to make sure db is closed
  ceph_assert(db == nullptr);
  utime_t start_time = ceph_clock_now();
  int ret = 0;

  // create dir if it doesn't exist already
  if (!bluefs->dir_exists(allocator_dir)) {
    ret = bluefs->mkdir(allocator_dir);
    if (ret != 0) {
      derr << "Failed mkdir with error-code " << ret << dendl;
      return -1;
    }
  }

  // reuse the previous file allocation if it exists
  ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
  bool overwrite_file = (ret == 0);
  BlueFS::FileWriter *p_handle = nullptr;
  ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
  if (ret != 0) {
    derr << __func__ << "Failed open_for_write with error-code " << ret << dendl;
    return -1;
  }

  uint64_t file_size = p_handle->file->fnode.size;
  uint64_t allocated = p_handle->file->fnode.get_allocated();
  dout(5) << "file_size=" << file_size << ", allocated=" << allocated << dendl;

  unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
  if (!allocator) {
    bluefs->close_writer(p_handle);
    return -1;
  }

  // store all extents (except for the bluefs extents we removed) in a single flat file
  utime_t timestamp = ceph_clock_now();
  uint32_t crc = -1;
  {
    allocator_image_header header(timestamp, s_format_version, s_serial);
    bufferlist header_bl;
    encode(header, header_bl);
    crc = header_bl.crc32c(crc);
    encode(crc, header_bl);
    p_handle->append(header_bl);
  }

  crc = -1; // reset crc
  extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
  extent_t *p_curr = buffer;
  const extent_t *p_end = buffer + MAX_EXTENTS_IN_BUFFER;
  uint64_t extent_count = 0;
  uint64_t allocation_size = 0;
  auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
    if (extent_length == 0) {
      derr << __func__ << "::" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
      ret = -1;
      return;
    }
    p_curr->offset = HTOCEPH_64(extent_offset);
    p_curr->length = HTOCEPH_64(extent_length);
    extent_count++;
    allocation_size += extent_length;
    p_curr++;

    if (p_curr == p_end) {
      crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
      p_curr = buffer; // recycle the buffer
    }
  };
  allocator->dump(iterated_allocation);
  // if we got a null extent -> fail the operation
  if (ret != 0) {
    derr << "Illegal extent, fail store operation" << dendl;
    derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
    bluefs->truncate(p_handle, 0);
    bluefs->close_writer(p_handle);
    return -1;
  }

  // if we got any leftovers -> add crc and append to file
  if (p_curr > buffer) {
    crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
  }

  {
    allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
    bufferlist trailer_bl;
    encode(trailer, trailer_bl);
    uint32_t crc = -1;
    crc = trailer_bl.crc32c(crc);
    encode(crc, trailer_bl);
    p_handle->append(trailer_bl);
  }

  bluefs->fsync(p_handle);
  bluefs->truncate(p_handle, p_handle->pos);
  bluefs->fsync(p_handle);

  utime_t duration = ceph_clock_now() - start_time;
  dout(5) << "WRITE-extent_count=" << extent_count << ", file_size=" << p_handle->file->fnode.size << dendl;
  dout(5) << "p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;

  bluefs->close_writer(p_handle);
  need_to_destage_allocation_file = false;
  dout(10) << "need_to_destage_allocation_file was cleared" << dendl;
  return 0;
}

//-----------------------------------------------------------------------------------
Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
  // create allocator
  uint64_t alloc_size = min_alloc_size;
  Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
                                       zone_size, first_sequential_zone,
                                       "recovery");
  if (alloc) {
    return alloc;
  } else {
    derr << "Failed Allocator Creation" << dendl;
    return nullptr;
  }
}

//-----------------------------------------------------------------------------------
size_t calc_allocator_image_header_size()
{
  utime_t timestamp = ceph_clock_now();
  allocator_image_header header(timestamp, s_format_version, s_serial);
  bufferlist header_bl;
  encode(header, header_bl);
  uint32_t crc = -1;
  crc = header_bl.crc32c(crc);
  encode(crc, header_bl);

  return header_bl.length();
}

//-----------------------------------------------------------------------------------
int calc_allocator_image_trailer_size()
{
  utime_t timestamp = ceph_clock_now();
  uint64_t extent_count = -1;
  uint64_t allocation_size = -1;
  uint32_t crc = -1;
  bufferlist trailer_bl;
  allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);

  encode(trailer, trailer_bl);
  crc = trailer_bl.crc32c(crc);
  encode(crc, trailer_bl);
  return trailer_bl.length();
}
18487
18488//-----------------------------------------------------------------------------------
18489int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
18490{
18491 utime_t start_time = ceph_clock_now();
18492 BlueFS::FileReader *p_temp_handle = nullptr;
18493 int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
18494 if (ret != 0) {
18495 derr << "Failed open_for_read with error-code " << ret << dendl;
18496 return -1;
18497 }
18498 unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
18499 uint64_t read_alloc_size = 0;
18500 uint64_t file_size = p_handle->file->fnode.size;
18501 dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
18502
18503 // make sure we were able to store a valid copy
18504 if (file_size == 0) {
18505 derr << "No Valid allocation info on disk (empty file)" << dendl;
18506 return -1;
18507 }
18508
18509 // first read the header
18510 size_t offset = 0;
18511 allocator_image_header header;
18512 int header_size = calc_allocator_image_header_size();
18513 {
18514 bufferlist header_bl,temp_bl;
18515 int read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
18516 if (read_bytes != header_size) {
18517 derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
18518 return -1;
18519 }
18520
18521 offset += read_bytes;
18522
18523 header_bl.claim_append(temp_bl);
18524 auto p = header_bl.cbegin();
18525 decode(header, p);
18526 if (header.verify(cct, path) != 0 ) {
18527 derr << "header = \n" << header << dendl;
18528 return -1;
18529 }
18530
18531 uint32_t crc_calc = -1, crc;
18532 crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18533 decode(crc, p);
18534 if (crc != crc_calc) {
18535 derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18536 derr << "header = \n" << header << dendl;
18537 return -1;
18538 }
18539
18540 // increment serial for next store
18541 s_serial = header.serial + 1;
18542 }
18543
18544 // then read the payload (extents list) using a recycled buffer
18545 extent_t buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
18546 uint32_t crc = -1;
18547 int trailer_size = calc_allocator_image_trailer_size();
18548 uint64_t extent_count = 0;
18549 uint64_t extents_bytes_left = file_size - (header_size + trailer_size + sizeof(crc));
18550 while (extents_bytes_left) {
18551 int req_bytes = std::min(extents_bytes_left, sizeof(buffer));
18552 int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
18553 if (read_bytes != req_bytes) {
18554 derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
18555 return -1;
18556 }
18557
18558 offset += read_bytes;
18559 extents_bytes_left -= read_bytes;
18560
18561 const unsigned num_extent_in_buffer = read_bytes/sizeof(extent_t);
18562 const extent_t *p_end = buffer + num_extent_in_buffer;
18563 for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
18564 uint64_t offset = CEPHTOH_64(p_ext->offset);
18565 uint64_t length = CEPHTOH_64(p_ext->length);
18566 read_alloc_size += length;
18567
18568 if (length > 0) {
18569 allocator->init_add_free(offset, length);
18570 extent_count ++;
18571 } else {
18572 derr << "extent with zero length at idx=" << extent_count << dendl;
18573 return -1;
18574 }
18575 }
18576
18577 uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
18578 read_bytes = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
18579 if (read_bytes == sizeof(crc) ) {
18580 crc = CEPHTOH_32(crc);
18581 if (crc != calc_crc) {
18582 derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
18583 derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
18584 return -1;
18585 }
18586
18587 offset += read_bytes;
18588 if (extents_bytes_left) {
18589 extents_bytes_left -= read_bytes;
18590 }
18591 } else {
18592 derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
18593 return -1;
18594 }
18595
18596 }
18597
18598 // finally, read the trailer and verify it is in good shape and that we got all the extents
18599 {
18600 bufferlist trailer_bl,temp_bl;
18601 int read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
18602 if (read_bytes != trailer_size) {
18603 derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
18604 return -1;
18605 }
18606 offset += read_bytes;
18607
18608 trailer_bl.claim_append(temp_bl);
18609 uint32_t crc_calc = -1;
18610 uint32_t crc;
18611 allocator_image_trailer trailer;
18612 auto p = trailer_bl.cbegin();
18613 decode(trailer, p);
18614 if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
18615 derr << "trailer=\n" << trailer << dendl;
18616 return -1;
18617 }
18618
18619 crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
18620 decode(crc, p);
18621 if (crc != crc_calc) {
18622 derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
18623 derr << "trailer=\n" << trailer << dendl;
18624 return -1;
18625 }
18626 }
18627
18628 utime_t duration = ceph_clock_now() - start_time;
18629 dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size="
18630 << read_alloc_size << ", file_size=" << file_size << dendl;
18631 dout(5) << "READ duration=" << duration << " seconds, s_serial=" << s_serial << dendl;
18632 *num = extent_count;
18633 *bytes = read_alloc_size;
18634 return 0;
18635}
18636
18637//-----------------------------------------------------------------------------------
18638int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
18639{
18640 utime_t start = ceph_clock_now();
18641 auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
18642 int ret = __restore_allocator(temp_allocator.get(), num, bytes);
18643 if (ret != 0) {
18644 return ret;
18645 }
18646
18647 uint64_t num_entries = 0;
18648 dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
18649 copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
18650 utime_t duration = ceph_clock_now() - start;
18651 dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
18652 return ret;
18653}
18654
18655//-------------------------------------------------------------------------
18656void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
18657{
18658 [[maybe_unused]] auto cct = onode->c->store->cct;
18659 auto path = onode->c->store->path;
18660 if (shard_id < shards.size()) {
18661 auto p = &shards[shard_id];
18662 if (!p->loaded) {
18663 dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18664 p->extents = decode_some(v);
18665 p->loaded = true;
18666 dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
18667 ceph_assert(p->dirty == false);
18668 ceph_assert(v.length() == p->shard_info->bytes);
18669 }
18670 } else {
18671 derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
18672 ceph_assert(shard_id < shards.size());
18673 }
18674}
18675
18676//-----------------------------------------------------------------------------------
18677void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
18678{
18679 ceph_assert((offset & min_alloc_size_mask) == 0);
18680 ceph_assert((length & min_alloc_size_mask) == 0);
18681 sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
18682}
18683
18684//---------------------------------------------------------
18685// Process all physical extents from a given Onode (including all its shards)
18686void BlueStore::read_allocation_from_single_onode(
18687 SimpleBitmap* sbmap,
18688 BlueStore::OnodeRef& onode_ref,
18689 read_alloc_stats_t& stats)
18690{
18691 // create a map holding all physical-extents of this Onode so duplicates are not added more than once
18692 std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
18693 unsigned blobs_count = 0;
18694 uint64_t pos = 0;
18695
18696 stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
18697 // first iterate over all logical-extents
18698 for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
18699 ceph_assert(l_extent.logical_offset >= pos);
18700
18701 pos = l_extent.logical_offset + l_extent.length;
18702 ceph_assert(l_extent.blob);
18703 const bluestore_blob_t& blob = l_extent.blob->get_blob();
18704 const PExtentVector& p_extent_vec = blob.get_extents();
18705 blobs_count++;
18706 if (blob.is_compressed()) {
18707 stats.compressed_blob_count++;
18708 }
18709
18710 if (blob.is_shared()) {
18711 stats.shared_blobs_count++;
18712 }
18713
18714 // process all physical extent in this blob
18715 for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
18716 auto offset = p_extent->offset;
18717 auto length = p_extent->length;
18718
18719 // An offset of -1 means the extent was removed (it is only a placeholder) and can be safely skipped
18720 if (offset == (uint64_t)-1) {
18721 stats.skipped_illegal_extent++;
18722 continue;
18723 }
18724
18725 if (!blob.is_shared()) {
18726 // skip repeating extents
18727 auto lcl_itr = lcl_extnt_map.find(offset);
18728 // extents using shared blobs might have different lengths
18729 if (lcl_itr != lcl_extnt_map.end() ) {
18730 // repeated extents must have the same length!
18731 ceph_assert(lcl_extnt_map[offset] == length);
18732 stats.skipped_repeated_extent++;
18733 } else {
18734 lcl_extnt_map[offset] = length;
18735 set_allocation_in_simple_bmap(sbmap, offset, length);
18736 stats.extent_count++;
18737 }
18738 } else {
18739 // extents using shared blobs might have different lengths
18740 set_allocation_in_simple_bmap(sbmap, offset, length);
18741 stats.extent_count++;
18742 }
18743
18744 } // physical-extents loop
18745
18746 } // logical-extents loop
18747
18748 if (blobs_count < MAX_BLOBS_IN_ONODE) {
18749 stats.blobs_in_onode[blobs_count]++;
18750 } else {
18751 // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
18752 stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
18753 }
18754}
18755
18756//-------------------------------------------------------------------------
18757int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
18758{
18759 // finally add all space taken by user data
18760 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
18761 if (!it) {
18762 // TBD - find a better error code
18763 derr << "failed db->get_iterator(PREFIX_OBJ)" << dendl;
18764 return -1;
18765 }
18766
18767 CollectionRef collection_ref;
18768 spg_t pgid;
18769 BlueStore::OnodeRef onode_ref;
18770 bool has_open_onode = false;
18771 uint32_t shard_id = 0;
18772 uint64_t kv_count = 0;
18773 uint64_t count_interval = 1'000'000;
18774 // iterate over all ONodes stored in RocksDB
18775 for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
18776 // trace an event after every million processed objects (typically every 5-10 seconds)
18777 if (kv_count && (kv_count % count_interval == 0) ) {
18778 dout(5) << "processed objects count = " << kv_count << dendl;
18779 }
18780
18781 // Shards - Code
18782 // add the extents from the shards to the main Obj
18783 if (is_extent_shard_key(it->key())) {
18784 // shards must follow a valid main object
18785 if (has_open_onode) {
18786 // shards keys must start with the main object key
18787 if (it->key().find(onode_ref->key) == 0) {
18788 // shards count can't exceed declared shard-count in the main-object
18789 if (shard_id < onode_ref->extent_map.shards.size()) {
18790 onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id);
18791 stats.shard_count++;
18792 shard_id++;
18793 } else {
18794 derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18795 derr << "shard->key=" << pretty_binary_string(it->key()) << dendl;
18796 ceph_assert(shard_id < onode_ref->extent_map.shards.size());
18797 }
18798 } else {
18799 derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl;
18800 ceph_assert(it->key().find(onode_ref->key) == 0);
18801 }
18802 } else {
18803 derr << "error::shard without main objects for key=" << pretty_binary_string(it->key()) << dendl;
18804 ceph_assert(has_open_onode);
18805 }
18806
18807 } else {
18808 // Main Object Code
18809
18810 if (has_open_onode) {
18811 // make sure we got all shards of this object
18812 if (shard_id == onode_ref->extent_map.shards.size()) {
18813 // We completed an Onode Object -> pass it to be processed
18814 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18815 } else {
18816 derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18817 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18818 }
18819 } else {
18820 // We opened a new Object
18821 has_open_onode = true;
18822 }
18823
18824 // The main Obj is always first in RocksDB so we can start with shard_id set to zero
18825 shard_id = 0;
18826 stats.onode_count++;
18827 ghobject_t oid;
18828 int ret = get_key_object(it->key(), &oid);
18829 if (ret < 0) {
18830 derr << "bad object key " << pretty_binary_string(it->key()) << dendl;
18831 ceph_assert(ret == 0);
18832 continue;
18833 }
18834
18835 // fill collection_ref if it doesn't exist yet
18836 // We process all the objects in a given collection and then move to the next collection
18837 // This means we only search once for every given collection
18838 if (!collection_ref ||
18839 oid.shard_id != pgid.shard ||
18840 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
18841 !collection_ref->contains(oid)) {
18842 stats.collection_search++;
18843 collection_ref = nullptr;
18844
18845 for (auto& p : coll_map) {
18846 if (p.second->contains(oid)) {
18847 collection_ref = p.second;
18848 break;
18849 }
18850 }
18851
18852 if (!collection_ref) {
18853 derr << "stray object " << oid << " not owned by any collection" << dendl;
18854 ceph_assert(collection_ref);
18855 continue;
18856 }
18857
18858 collection_ref->cid.is_pg(&pgid);
18859 }
18860 onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
18861 }
18862 }
18863
18864 // process the last object
18865 if (has_open_onode) {
18866 // make sure we got all shards of this object
18867 if (shard_id == onode_ref->extent_map.shards.size()) {
18868 // We completed an Onode Object -> pass it to be processed
18869 read_allocation_from_single_onode(sbmap, onode_ref, stats);
18870 } else {
18871 derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
18872 ceph_assert(shard_id == onode_ref->extent_map.shards.size());
18873 }
18874 }
18875 dout(5) << "onode_count=" << stats.onode_count << " ,shard_count=" << stats.shard_count << dendl;
18876
18877 return 0;
18878}
18879
18880//---------------------------------------------------------
18881int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
18882{
18883 // first set space used by superblock
18884 auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
18885 set_allocation_in_simple_bmap(sbmap, 0, super_length);
18886 stats.extent_count++;
18887
18888 // then set all space taken by Objects
18889 int ret = read_allocation_from_onodes(sbmap, stats);
18890 if (ret < 0) {
18891 derr << "failed read_allocation_from_onodes()" << dendl;
18892 return ret;
18893 }
18894
18895 return 0;
18896}
18897
18898//-----------------------------------------------------------------------------------
18899static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
18900{
18901 int alloc_size_shift = ctz(alloc_size);
18902 uint64_t offset = 0;
18903 extent_t ext = sbmap->get_next_clr_extent(offset);
18904 while (ext.length != 0) {
18905 dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
18906 offset = ext.offset + ext.length;
18907 ext = sbmap->get_next_clr_extent(offset);
18908 }
18909}
18910
18911//---------------------------------------------------------
18912int BlueStore::read_allocation_from_drive_on_startup()
18913{
18914 int ret = 0;
18915
18916 ret = _open_collections();
18917 if (ret < 0) {
18918 return ret;
18919 }
18920 auto shutdown_cache = make_scope_guard([&] {
18921 _shutdown_cache();
18922 });
18923
18924 utime_t start = ceph_clock_now();
18925 read_alloc_stats_t stats = {};
18926 SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
18927 ret = reconstruct_allocations(&sbmap, stats);
18928 if (ret != 0) {
18929 return ret;
18930 }
18931
18932 copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
18933
18934 utime_t duration = ceph_clock_now() - start;
18935 dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
18936 return ret;
18937}
18938
18939
18940
18941
18942// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
18943// Not meant to be run by customers
18944#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
18945
18946#include <stdlib.h>
18947#include <algorithm>
18948//---------------------------------------------------------
18949int cmpfunc (const void * a, const void * b)
18950{
18951 if ( ((extent_t*)a)->offset > ((extent_t*)b)->offset ) {
18952 return 1;
18953 }
18954 else if( ((extent_t*)a)->offset < ((extent_t*)b)->offset ) {
18955 return -1;
18956 }
18957 else {
18958 return 0;
18959 }
18960}
18961
18962// compare the allocator built from Onodes with the system allocator (CF-B)
18963//---------------------------------------------------------
18964int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
18965{
18966 uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
18967 uint64_t extent_count = allocation_size/sizeof(extent_t);
18968 dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
18969
18970 unique_ptr<extent_t[]> arr1;
18971 unique_ptr<extent_t[]> arr2;
18972 try {
18973 arr1 = make_unique<extent_t[]>(extent_count);
18974 arr2 = make_unique<extent_t[]>(extent_count);
18975 } catch (std::bad_alloc&) {
18976 derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
18977 return -1;
18978 }
18979
18980 // copy the extents from the allocators into simple array and then compare them
18981 uint64_t size1 = 0, size2 = 0;
18982 uint64_t idx1 = 0, idx2 = 0;
18983 auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
18984 size1 += length;
18985 if (idx1 < extent_count) {
18986 arr1[idx1++] = {offset, length};
18987 }
18988 else if (idx1 == extent_count) {
18989 derr << "(1)compare_allocators:: spillover" << dendl;
18990 idx1 ++;
18991 }
18992
18993 };
18994
18995 auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
18996 size2 += length;
18997 if (idx2 < extent_count) {
18998 arr2[idx2++] = {offset, length};
18999 }
19000 else if (idx2 == extent_count) {
19001 derr << "(2)compare_allocators:: spillover" << dendl;
19002 idx2 ++;
19003 }
19004 };
19005
19006 alloc1->dump(iterated_mapper1);
19007 alloc2->dump(iterated_mapper2);
19008
19009 qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
19010 qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
19011
19012 if (idx1 == idx2) {
19013 idx1 = idx2 = std::min(idx1, extent_count);
19014 if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * idx2) == 0) {
19015 return 0;
19016 }
19017 derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)" << dendl;
19018 for (uint64_t i = 0; i < idx1; i++) {
19019 if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) != 0) {
19020 derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
19021 derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
19022 return -1;
19023 }
19024 }
19025 return 0;
19026 } else {
19027 derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
19028 std::cout << "===================================================================" << std::endl;
19029 for (uint64_t i = 0; i < idx1; i++) {
19030 std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl;
19031 }
19032
19033 std::cout << "===================================================================" << std::endl;
19034 for (uint64_t i = 0; i < idx2; i++) {
19035 std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl;
19036 }
19037 return -1;
19038 }
19039}
19040
19041//---------------------------------------------------------
19042int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
19043{
19044 // then add space used by bluefs to store rocksdb
19045 unsigned extent_count = 0;
19046 if (bluefs) {
19047 interval_set<uint64_t> bluefs_extents;
19048 int ret = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
19049 if (ret < 0) {
19050 return ret;
19051 }
19052 for (auto itr = bluefs_extents.begin(); itr != bluefs_extents.end(); extent_count++, itr++) {
19053 allocator->init_rm_free(itr.get_start(), itr.get_len());
19054 stats.extent_count++;
19055 }
19056 }
19057
19058 dout(5) << "bluefs extent_count=" << extent_count << dendl;
19059 return 0;
19060}
19061
19062//---------------------------------------------------------
19063int BlueStore::read_allocation_from_drive_for_bluestore_tool()
19064{
19065 dout(5) << __func__ << dendl;
19066 int ret = 0;
19067 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19068 ret = _open_db_and_around(true, false);
19069 if (ret < 0) {
19070 return ret;
19071 }
19072
19073 ret = _open_collections();
19074 if (ret < 0) {
19075 _close_db_and_around();
19076 return ret;
19077 }
19078
19079 utime_t duration;
19080 read_alloc_stats_t stats = {};
19081 utime_t start = ceph_clock_now();
19082
19083 auto shutdown_cache = make_scope_guard([&] {
19084 std::cout << "Allocation Recovery was completed in " << duration
19085 << " seconds; insert_count=" << stats.insert_count
19086 << "; extent_count=" << stats.extent_count << std::endl;
19087 _shutdown_cache();
19088 _close_db_and_around();
19089 });
19090
19091 {
19092 auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
19093 //reconstruct allocations into a temp simple-bitmap and copy into allocator
19094 {
19095 SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
19096 ret = reconstruct_allocations(&sbmap, stats);
19097 if (ret != 0) {
19098 return ret;
19099 }
19100 copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
19101 }
19102
19103 // add allocation space used by the bluefs itself
19104 ret = add_existing_bluefs_allocation(allocator.get(), stats);
19105 if (ret < 0) {
19106 return ret;
19107 }
19108
19109 duration = ceph_clock_now() - start;
19110 stats.insert_count = 0;
19111 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19112 stats.insert_count++;
19113 };
19114 allocator->dump(count_entries);
19115 ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
19116 if (ret == 0) {
19117 dout(5) << "Allocator drive - file integrity check OK" << dendl;
19118 } else {
19119 derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
19120 }
19121 }
19122
19123 std::cout << stats << std::endl;
19124 return ret;
19125}
19126
19127//---------------------------------------------------------
19128Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
19129{
19130 uint64_t bdev_size = bdev->get_size();
19131 Allocator* allocator = create_bitmap_allocator(bdev_size);
19132 if (allocator) {
19133 dout(5) << "bitmap-allocator=" << allocator << dendl;
19134 } else {
19135 derr << "****failed create_bitmap_allocator()" << dendl;
19136 return nullptr;
19137 }
19138
19139 uint64_t num_entries = 0;
19140 copy_allocator(src_allocator, allocator, &num_entries);
19141
19142 // BlueFS stores its internal allocation outside RocksDB (FM) so we should not destage them to the allocator-file
19143 // we are going to hide bluefs allocation during allocator-destage as they are stored elsewhere
19144 {
19145 std::vector<extent_t> bluefs_extents_vec;
19146 // load current bluefs internal allocation into a vector
19147 load_bluefs_extents(bluefs, &bluefs_layout, cct, path, bluefs_extents_vec, min_alloc_size);
19148 // then mark them as free in the cloned allocator so they are excluded from the destaged image (bluefs stores them internally)
19149 for (auto itr = bluefs_extents_vec.begin(); itr != bluefs_extents_vec.end(); ++itr) {
19150 allocator->init_add_free(itr->offset, itr->length);
19151 }
19152 }
19153
19154 return allocator;
19155}
19156
19157//---------------------------------------------------------
19158static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
19159{
19160 dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
19161 KeyValueDB::Transaction t = db->get_transaction();
19162 t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
19163 db->submit_transaction_sync(t);
19164}
19165
19166//---------------------------------------------------------
19167void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
19168{
19169 unsigned max_txn = 1024;
19170 dout(5) << "max_transaction_submit=" << max_txn << dendl;
19171 uint64_t size = 0, idx = 0;
19172 KeyValueDB::Transaction txn = db->get_transaction();
19173 auto iterated_insert = [&](uint64_t offset, uint64_t length) {
19174 size += length;
19175 real_fm->release(offset, length, txn);
19176 if ((++idx % max_txn) == 0) {
19177 db->submit_transaction_sync(txn);
19178 txn = db->get_transaction();
19179 }
19180 };
19181 allocator->dump(iterated_insert);
19182 if (idx % max_txn != 0) {
19183 db->submit_transaction_sync(txn);
19184 }
19185 dout(5) << "size=" << size << ", num extents=" << idx << dendl;
19186}
19187
19188//---------------------------------------------------------
19189Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
19190{
19191 dout(5) << "real_fm->enumerate_next" << dendl;
19192 Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
19193 if (allocator2) {
19194 dout(5) << "bitmap-allocator=" << allocator2 << dendl;
19195 } else {
19196 return nullptr;
19197 }
19198
19199 uint64_t size2 = 0, idx2 = 0;
19200 real_fm->enumerate_reset();
19201 uint64_t offset, length;
19202 while (real_fm->enumerate_next(db, &offset, &length)) {
19203 allocator2->init_add_free(offset, length);
19204 ++idx2;
19205 size2 += length;
19206 }
19207 real_fm->enumerate_reset();
19208
19209 dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
19210 return allocator2;
19211}
19212
19213//---------------------------------------------------------
19214// close the active fm and open it in a new mode like makefs()
19215// but make sure to mark the full device space as allocated
19216 // later we will mark all extents from the allocator as free
19217int BlueStore::reset_fm_for_restore()
19218{
19219 dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
19220 fm->shutdown();
19221 delete fm;
19222 fm = nullptr;
19223 freelist_type = "bitmap";
19224 KeyValueDB::Transaction t = db->get_transaction();
19225 // call _open_fm() with fm_restore set to TRUE
19226 // this will mark the full device space as allocated (and not just the reserved space)
19227 _open_fm(t, true, true);
19228 if (fm == nullptr) {
19229 derr << "Failed _open_fm()" << dendl;
19230 return -1;
19231 }
19232 db->submit_transaction_sync(t);
19233 ceph_assert(!fm->is_null_manager());
19234 dout(5) << "fm was reactivated in full mode" << dendl;
19235 return 0;
19236}
19237
19238
19239//---------------------------------------------------------
19240// create a temp allocator filled with allocation state from the fm
19241// and compare it to the base allocator passed in
19242int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
19243{
19244 dout(5) << "verify that alloc content is identical to FM" << dendl;
19245 // initialize from freelist
19246 Allocator* temp_allocator = initialize_allocator_from_freelist(fm);
19247 if (temp_allocator == nullptr) {
19248 return -1;
19249 }
19250
19251 uint64_t insert_count = 0;
19252 auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
19253 insert_count++;
19254 };
19255 temp_allocator->dump(count_entries);
19256 uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
19257 int ret = compare_allocators(allocator, temp_allocator, insert_count, memory_target);
19258
19259 delete temp_allocator;
19260
19261 if (ret == 0) {
19262 dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
19263 return 0;
19264 } else {
19265 derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
19266 return -1;
19267 }
19268}
19269
19270//---------------------------------------------------------
19271int BlueStore::db_cleanup(int ret)
19272{
19273 _shutdown_cache();
19274 _close_db_and_around();
19275 return ret;
19276}
19277
19278//---------------------------------------------------------
19279 // convert the system back from the null-allocator to storing allocation in rocksdb
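// Rough sequence: clone the shared allocator minus bluefs extents, wipe
// PREFIX_ALLOC_BITMAP, reopen the fm with the whole device marked allocated,
// release the clone's free space into the fm, verify fm contents against the
// clone, commit the "bitmap" freelist_type, then reopen read-only and verify
// once more.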
19280int BlueStore::push_allocation_to_rocksdb()
19281{
19282 if (cct->_conf->bluestore_allocation_from_file) {
19283 derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
19284 derr << "please change the default to false in the ceph.conf file" << dendl;
19285 return -1;
19286 }
19287
19288 dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
19289 int ret = _open_db_and_around(false);
19290 if (ret < 0) {
19291 return ret;
19292 }
19293
19294 if (!fm->is_null_manager()) {
19295 derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
19296 return db_cleanup(0);
19297 }
19298
19299 // start by creating a clone copy of the shared-allocator
19300 unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
19301 if (!allocator) {
19302 return db_cleanup(-1);
19303 }
19304
19305 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19306 clear_allocation_objects_from_rocksdb(db, cct, path);
19307
19308 // then open fm in new mode with the full device marked as allocated
19309 if (reset_fm_for_restore() != 0) {
19310 return db_cleanup(-1);
19311 }
19312
19313 // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
19314 copy_allocator_content_to_fm(allocator.get(), fm);
19315
19316 // compare the allocator info with the info stored in the fm/rocksdb
19317 if (verify_rocksdb_allocations(allocator.get()) == 0) {
19318 // all is good -> we can commit to rocksdb allocator
19319 commit_to_real_manager();
19320 } else {
19321 return db_cleanup(-1);
19322 }
19323
19324 // can't be too paranoid :-)
19325 dout(5) << "Running full scale verification..." << dendl;
19326 // close db/fm/allocator and start fresh
19327 db_cleanup(0);
19328 dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
19329 ret = _open_db_and_around(true);
19330 if (ret < 0) {
19331 return db_cleanup(ret);
19332 }
19333 ceph_assert(!fm->is_null_manager());
19334 ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
19335
19336 return db_cleanup(ret);
19337}
19338
19339#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
19340
19341//-------------------------------------------------------------------------------------
19342static int commit_freelist_type(KeyValueDB *db, const std::string& freelist_type, CephContext *cct, const std::string &path)
19343{
19344 // When freelist_type is set to "bitmap" we store allocation info in RocksDB
19345 // When allocation-info is stored in a single file we set freelist_type to "null"
19346 // This will direct the startup code to read allocation from file and not RocksDB
19347 KeyValueDB::Transaction t = db->get_transaction();
19348 if (t == nullptr) {
19349 derr << "db->get_transaction() failed!!!" << dendl;
19350 return -1;
19351 }
19352
19353 bufferlist bl;
19354 bl.append(freelist_type);
19355 t->set(PREFIX_SUPER, "freelist_type", bl);
19356
19357 int ret = db->submit_transaction_sync(t);
19358 if (ret != 0) {
19359 derr << "Failed db->submit_transaction_sync(t)" << dendl;
19360 }
19361 return ret;
19362}
19363
19364//-------------------------------------------------------------------------------------
19365int BlueStore::commit_to_null_manager()
19366{
19367 dout(5) << "Set FreelistManager to NULL FM..." << dendl;
19368 fm->set_null_manager();
19369 freelist_type = "null";
19370#if 1
19371 return commit_freelist_type(db, freelist_type, cct, path);
19372#else
19373 // should check how long this step takes on a big configuration as deletes are expensive
19374 if (commit_freelist_type(db, freelist_type, cct, path) == 0) {
19375 // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
19376 clear_allocation_objects_from_rocksdb(db, cct, path);
19377 }
19378#endif
19379}
19380
19381
19382//-------------------------------------------------------------------------------------
19383int BlueStore::commit_to_real_manager()
19384{
19385 dout(5) << "Set FreelistManager to Real FM..." << dendl;
19386 ceph_assert(!fm->is_null_manager());
19387 freelist_type = "bitmap";
19388 int ret = commit_freelist_type(db, freelist_type, cct, path);
19389 if (ret == 0) {
19390 //remove the allocation_file
19391 invalidate_allocation_file_on_bluefs();
19392 ret = bluefs->unlink(allocator_dir, allocator_file);
19393 bluefs->sync_metadata(false);
19394 if (ret == 0) {
19395 dout(5) << "Remove Allocation File successfully" << dendl;
19396 }
19397 else {
19398 derr << "Remove Allocation File ret_code=" << ret << dendl;
19399 }
19400 }
19401
19402 return ret;
19403}
19404
19405//================================================================================================================
19406//================================================================================================================