// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);

using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::list;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4

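// Illustrative decode (not from the original source): a raw blobid of 0x35
// unpacks as id = 0x35 >> BLOBID_SHIFT_BITS = 3 with flags
// CONTIGUOUS|SAMELENGTH (0x5) -- the low 4 bits carry the flags, the
// remaining bits the (spanning or local) blob id.
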
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

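// For orientation (illustrative, not authoritative): a headless object named
// "foo" with no namespace and no locator key would encode as, conceptually,
//   <shard+0x80><pool+2^63><reversed hash>!foo!=<snap><gen>o
// where the fixed-width prefix fields are binary-encoded, not ASCII.
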
/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'
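
// i.e. an extent shard key is the full onode key plus the shard's starting
// offset (u32) and a trailing 'x'; the distinct suffix lets code tell shard
// keys from onode keys (which end in 'o') by looking at the last byte only.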

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering. Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments. Instead we do additional sorting
 * where it is needed.
 */
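
// For example (illustrative): append_escaped("ab!c", &out) produces
// "ab#21c!": '!' (0x21) is <= '#' so it is escaped as "#21", ordinary
// characters pass through unchanged, and the final unescaped '!' terminates
// the string.  decode_escaped() below reverses the transformation.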
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {  // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {  // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end, bool legacy)
{
  spg_t pgid;
  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
  // use different nspaces because we use different schemes when encoding
  // keys for listing objects
  const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > MAX_HASH) {
      // make sure end hobj is even greater than the maximum possible hobj
      end->hobj.set_bitwise_key_u32(MAX_HASH);
      temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
      end->hobj.nspace = MAX_NSPACE;
    } else {
      end->hobj.set_bitwise_key_u32(end_hash);
      temp_end->hobj.set_bitwise_key_u32(end_hash);
    }
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(MAX_HASH);
    end->hobj.nspace = MAX_NSPACE;
    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
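
// Worked example (illustrative): for a PG with ps() == 0x1 and bits == 4,
// the reversed hash is 0x80000000 and the listing range covers reversed-hash
// values [0x80000000, 0x80000000 + 2^(32-4)) = [0x80000000, 0x90000000);
// only when end_hash overflows 32 bits do we clamp to MAX_HASH/MAX_NSPACE.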

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
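// (1 byte shard + 8 byte pool + 4 byte hash, matching _key_encode_prefix)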

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something is wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}


template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "  csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "  0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in key string encoding (see the comment for append_escaped)
 * the KeyValueDB iterator does not sort lexicographically the same way
 * that ghobject_t does: objects with the same hash may come back in the
 * wrong order.
 *
 * This is the iterator wrapper that fixes the key order.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace

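// Usage sketch (illustrative; the real call sites live in BlueStore's
// collection-listing code):
//   it->lower_bound(start);
//   while (it->valid() && it->is_lt(end)) { visit(it->oid()); it->next(); }
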
// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
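
// Worked example (illustrative): with min_alloc_size = 0x10000, a compressed
// blob whose ondisk length rounds up to 4 AUs and whose rewrite is expected
// to need 1 new AU has benefit 4 - 1 = 3; if
// bluestore_gc_enable_blob_threshold <= 3, its extents are queued in
// extents_to_collect and the estimate is positive.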

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};
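
// Note (summary of the logic above, not authoritative): pinned onodes are
// counted in `num` but kept off the lru list, so _trim_to() only ever evicts
// unpinned entries; put_cache()/pop_cache() report whether the onode
// actually sat on the list.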

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
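
  // Rough sketch of the 2Q flow implemented below (after Johnson & Shasha's
  // 2Q scheme): new buffers enter warm_in (A1in); when trimmed out of
  // warm_in they drop their data and linger as empty ghosts in warm_out
  // (A1out); a re-add that hits a warm_out ghost promotes it to hot (Am).
  // _touch() deliberately does nothing for warm_in entries.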

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
                                        b->flags),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
                                    b->flags),
                  0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
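
// Illustrative example: with a cached buffer at 0x0~0x1000, _discard(cache,
// 0x400, 0x400) takes the "drop middle" path above: the tail is re-added as
// a new 0x800~0x800 Buffer and the original is truncated to 0x0~0x400.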

void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, bl, p->second->flags),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, right, p->second->flags),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data, p->second->flags),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length, p->second->flags),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}
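
// split() walks buffer_map from the highest offset downward: buffers wholly
// at or above 'pos' move into 'r' (re-based so their offsets are relative to
// pos), the single buffer straddling 'pos', if any, is cut in two, and
// buffers entirely below 'pos' stay put. Sketch with hypothetical numbers:
//
//   before: this = {0x0000~0x3000},  pos = 0x1000
//   after:  this = {0x0000~0x1000},  r = {0x0000~0x2000}  // r is pos-relative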

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  cache->_trim();
  return o;
}

void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << " " << p->second->nref
                            << " " << p->second->cached
                            << " " << p->second->pinned
                            << dendl;
      // This will pin the onode and implicitly touch the cache when the
      // Onode eventually becomes unpinned.
      o = p->second;
      ceph_assert(!o->cached || o->pinned);

      hit = true;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}
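
// Usage sketch (the real caller is Collection::get_onode further down this
// file): lookup() is the cache fast path; on a miss the caller loads the
// onode from the PREFIX_OBJ keyspace and installs it via add(), which also
// resolves races between concurrent loaders:
//
//   OnodeRef o = onode_map.lookup(oid);
//   if (!o) {
//     o = Onode::decode(c, oid, key, v);  // v fetched from the kv store
//     o = onode_map.add(oid, o);          // may return the racing instance
//   }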

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at the old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // Add at the new position and fix oid, key. This will pin 'o' and
  // implicitly touch the cache when it eventually becomes unpinned.
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
  cache->_trim();
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second.get())) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}

// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}
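
// The snapshot/retry pattern above guards against split_cache() moving this
// SharedBlob to a different collection (and hence a different cache shard)
// while we wait for the lock: if 'coll' changed in the meantime we resample
// and retry, and if remove() reports a concurrent lookup we must not delete
// the blob, since that lookup has just revived it.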

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}

// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}

// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}

void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (get_blob().is_shared()) {
    return;
  }
  if (get_blob().is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        discard = true;
      } else {
        all_invalid = false;
      }
    }
    // for a compressed blob, either all or none of the pextents are invalid
    ceph_assert(discard == all_invalid);
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0,
                              get_blob().get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        dout(20) << __func__ << " 0x" << std::hex << pos
                 << "~" << e.length
                 << std::dec << dendl;
        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (get_blob().can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(get_blob().get_ondisk_length());
      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
    }
  }
}

void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Callers have to initialize the Blob's logical length before incrementing
  // references. Otherwise we can determine neither the required number of
  // counters for per-au tracking nor the min_release_size for single-counter
  // mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}

bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently, for the sake of simplicity, we omit blob reuse if data is
  // unaligned with the csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than the current blob length
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data lies entirely beyond the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }

    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
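
// Worked example (hypothetical numbers, for illustration only): with
// blen == 0x8000, target_blob_size == 0x10000, b_offset == 0x8000 and
// *length0 == 0xc000, we get new_blen == 0x14000 and overflow == 0x4000,
// so the write is trimmed to 0x8000 bytes (*length0 is updated), the blob
// grows a tail to 0x10000, and the caller must place the remaining 0x4000
// bytes in another blob.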

void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif

// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}

// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}

void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of the new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
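
// dup() is the clone fast path: instead of copying data, every source blob
// in [srcoff, srcoff + length) is converted to a shared (refcounted) blob
// and the destination onode gets lextents pointing at the same pextents.
// Actual data copying is deferred until either side overwrites the shared
// range, via the usual copy-on-write handling for shared blobs.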

void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; // used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << " encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}

bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // find the next unused bid
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}
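
// Allocation strategy above: spanning-blob ids are normally dense (largest
// existing key plus one); only once that increment wraps negative do we fall
// back to a randomized linear probe over the whole non-negative range,
// aborting only if every id is in use. E.g. (hypothetical): keys {0, 1, 2}
// yield 3 on the fast path.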

void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // We may need to fault in a larger interval later: we must have all
  // referring extents for spanning blobs loaded in order to have accurate
  // use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning. We prefer to cut the blob if we can. Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
       (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}

bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1; // so it is always non-zero
        include_blob = true;
        blobid = 0; // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
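
// Record layout produced above (reconstructed from the flag handling in
// encode_some/decode_some): struct_v and the extent count, then per extent
//
//   blobid (varint; low bits carry SPANNING/CONTIGUOUS/ZEROOFFSET/SAMELENGTH)
//   [gap from previous logical_end]  only if !CONTIGUOUS
//   [blob_offset]                    only if !ZEROOFFSET
//   [length]                         only if !SAMELENGTH
//   [inline blob encoding]           only on a blob's first reference
//
// so runs of adjacent, same-sized extents compress to roughly one varint each.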

unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << " getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}

void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}

void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}

void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}

void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}

int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
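
// Merge rule example (hypothetical extents): two lextents
//   0x0000~0x1000 -> blob B @ 0x0000   and   0x1000~0x1000 -> blob B @ 0x1000
// are logically and physically contiguous, so they collapse into a single
// 0x0000~0x2000 extent. Merging deliberately stops at shard_end because an
// extent must not straddle a shard boundary.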

void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
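
// punch_hole() handles four overlap cases per lextent touching
// [offset, offset + length): split-and-deref-middle, deref-tail, deref-whole
// and deref-head. Dereferenced pieces are collected in old_extents rather
// than being released immediately; the write path later converts them into
// actual space releases once the transaction is durable.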

BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need a completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent putting a reused blob into
  // the old_extents list if we overwrite the blob totally.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}

BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}

// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about Onode's ref counter is that we do an additional
// increment when a newly pinned instance is detected, and a matching
// decrement on unpin. This prevents a conflict with a delete call (when
// nref == 0): the latter might otherwise fire while a thread is still
// inside unpin() (e.g. waiting for lock acquisition) after it has already
// decremented nref, with another 'putting' thread releasing the instance.
//
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that while we waited, split_cache moved us to a
    // different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    if (cached && r) {
      ocs->_pin(this);
    }
    ocs->lock.unlock();
  }
}
void BlueStore::Onode::put() {
  int n = --nref;
  if (n == 2) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that while we waited, split_cache moved us to a
    // different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool need_unpin = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref and delete the Onode
        c->onode_map._remove(oid);
      }
    }
    // additional decrement for the newly unpinned instance;
    // this should be the last action since the Onode can be released
    // at any point after this decrement
    if (need_unpin) {
      n = --nref;
    }
    ocs->lock.unlock();
  }
  if (n == 0) {
    delete this;
  }
}
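
// Reference-count sketch (reconstructed from get()/put() above): the cache
// map holds one reference and the pinned state holds one more, so for a
// cached onode the observable values are
//
//   map only, unpinned           : nref == 1
//   pinned with k external users : nref == 2 + k   (map + pin + users)
//
// get() promotes on the transition to the first external user; put()
// considers demotion when nref drops back to 2 (map + pin, no users) and
// only then removes the pin's extra reference.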
3574
eafe8130
TL
3575BlueStore::Onode* BlueStore::Onode::decode(
3576 CollectionRef c,
3577 const ghobject_t& oid,
3578 const string& key,
3579 const bufferlist& v)
3580{
3581 Onode* on = new Onode(c.get(), oid, key);
3582 on->exists = true;
3583 auto p = v.front().begin_deep();
3584 on->onode.decode(p);
3585 for (auto& i : on->onode.attrs) {
f91f0fd5 3586 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
eafe8130
TL
3587 }
3588
3589 // initialize extent_map
3590 on->extent_map.decode_spanning_blobs(p);
3591 if (on->onode.extent_map_shards.empty()) {
3592 denc(on->extent_map.inline_bl, p);
3593 on->extent_map.decode_some(on->extent_map.inline_bl);
3594 on->extent_map.inline_bl.reassign_to_mempool(
f91f0fd5 3595 mempool::mempool_bluestore_cache_data);
eafe8130
TL
3596 }
3597 else {
3598 on->extent_map.init_shards(false, false);
3599 }
3600 return on;
3601}
3602
7c673cae
FG
3603void BlueStore::Onode::flush()
3604{
3605 if (flushing_count.load()) {
3606 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3607 waiting_count++;
11fdf7f2 3608 std::unique_lock l(flush_lock);
7c673cae
FG
3609 while (flushing_count.load()) {
3610 flush_cond.wait(l);
3611 }
9f95a23c 3612 waiting_count--;
7c673cae
FG
3613 }
3614 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3615}
3616
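// Editor's note: flush() above is the waiting half of a counted-waiter
// pattern. The writer side is not shown in this section; under the same
// flush_lock/flush_cond members it would look roughly like this sketch
// (hypothetical helper names):
//
//   void writer_begin(BlueStore::Onode& o) { ++o.flushing_count; }
//   void writer_done(BlueStore::Onode& o) {
//     if (--o.flushing_count == 0 && o.waiting_count.load()) {
//       std::lock_guard l(o.flush_lock);   // pairs with the waiter's lock
//       o.flush_cond.notify_all();
//     }
//   }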
9f95a23c
TL
3617void BlueStore::Onode::dump(Formatter* f) const
3618{
3619 onode.dump(f);
3620 extent_map.dump(f);
3621}
3622
522d829b 3623const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
9f95a23c 3624{
522d829b 3625 if (bluestore_onode_t::is_pgmeta_omap(flags)) {
9f95a23c
TL
3626 return PREFIX_PGMETA_OMAP;
3627 }
522d829b 3628 if (bluestore_onode_t::is_perpg_omap(flags)) {
f67539c2
TL
3629 return PREFIX_PERPG_OMAP;
3630 }
522d829b 3631 if (bluestore_onode_t::is_perpool_omap(flags)) {
9f95a23c
TL
3632 return PREFIX_PERPOOL_OMAP;
3633 }
3634 return PREFIX_OMAP;
3635}
3636
// separator ordering: '-' (omap header) < '.' (user keys) < '~' (tail)
522d829b
TL
3638void BlueStore::Onode::calc_omap_header(
3639 uint8_t flags,
3640 const Onode* o,
3641 std::string* out)
9f95a23c 3642{
522d829b
TL
3643 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3644 if (bluestore_onode_t::is_perpg_omap(flags)) {
3645 _key_encode_u64(o->c->pool(), out);
3646 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3647 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3648 _key_encode_u64(o->c->pool(), out);
f67539c2 3649 }
9f95a23c 3650 }
522d829b 3651 _key_encode_u64(o->onode.nid, out);
9f95a23c
TL
3652 out->push_back('-');
3653}
3654
522d829b
TL
3655void BlueStore::Onode::calc_omap_key(uint8_t flags,
3656 const Onode* o,
3657 const std::string& key,
3658 std::string* out)
9f95a23c 3659{
522d829b
TL
3660 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3661 if (bluestore_onode_t::is_perpg_omap(flags)) {
3662 _key_encode_u64(o->c->pool(), out);
3663 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3664 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3665 _key_encode_u64(o->c->pool(), out);
f67539c2 3666 }
9f95a23c 3667 }
522d829b 3668 _key_encode_u64(o->onode.nid, out);
9f95a23c
TL
3669 out->push_back('.');
3670 out->append(key);
3671}
3672
3673void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3674{
f67539c2
TL
3675 if (!onode.is_pgmeta_omap()) {
3676 if (onode.is_perpg_omap()) {
3677 _key_encode_u64(c->pool(), out);
3678 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3679 } else if (onode.is_perpool_omap()) {
3680 _key_encode_u64(c->pool(), out);
3681 }
9f95a23c
TL
3682 }
3683 _key_encode_u64(onode.nid, out);
3684 out->append(old.c_str() + out->length(), old.size() - out->length());
3685}
3686
522d829b
TL
3687void BlueStore::Onode::calc_omap_tail(
3688 uint8_t flags,
3689 const Onode* o,
3690 std::string* out)
9f95a23c 3691{
522d829b
TL
3692 if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
3693 if (bluestore_onode_t::is_perpg_omap(flags)) {
3694 _key_encode_u64(o->c->pool(), out);
3695 _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
3696 } else if (bluestore_onode_t::is_perpool_omap(flags)) {
3697 _key_encode_u64(o->c->pool(), out);
f67539c2 3698 }
9f95a23c 3699 }
522d829b 3700 _key_encode_u64(o->onode.nid, out);
9f95a23c
TL
3701 out->push_back('~');
3702}
3703
3704void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3705{
f67539c2
TL
3706 size_t pos = sizeof(uint64_t) + 1;
3707 if (!onode.is_pgmeta_omap()) {
3708 if (onode.is_perpg_omap()) {
3709 pos += sizeof(uint64_t) + sizeof(uint32_t);
3710 } else if (onode.is_perpool_omap()) {
3711 pos += sizeof(uint64_t);
3712 }
9f95a23c 3713 }
f67539c2 3714 *user_key = key.substr(pos);
9f95a23c
TL
3715}
3716
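// Editor's note: a sketch of the omap key layouts produced by the helpers
// above (field widths follow the _key_encode_u64/_key_encode_u32 calls; the
// applicable prefix and fields depend on the onode flags):
//
//   per-pg:   [pool u64][hash u32][nid u64] sep
//   per-pool: [pool u64]          [nid u64] sep
//   legacy:                       [nid u64] sep
//
// where sep is '-' for the header key, '.' followed by the user key for
// data keys, and '~' for the tail, so all omap rows of one onode sort
// between its header and tail (see the separator ordering note above).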
7c673cae
FG
3717// =======================================================
3718// WriteContext
3719
3720/// Checks for writes to the same pextent within a blob
3721bool BlueStore::WriteContext::has_conflict(
3722 BlobRef b,
3723 uint64_t loffs,
3724 uint64_t loffs_end,
3725 uint64_t min_alloc_size)
3726{
11fdf7f2
TL
3727 ceph_assert((loffs % min_alloc_size) == 0);
3728 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3729 for (auto w : writes) {
3730 if (b == w.b) {
11fdf7f2
TL
3731 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3732 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3733 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3734 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3735 return true;
3736 }
3737 }
3738 }
3739 return false;
3740}
3741
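// Editor's note: a worked example of the overlap test above, with an assumed
// min_alloc_size of 0x10000: a queued write at logical_offset 0x12000 with
// length0 0x1000 rounds out to [0x10000, 0x20000), so a later write to the
// same blob whose aligned range is [0x10000, 0x20000) (or any range touching
// it) reports a conflict, while [0x20000, 0x30000) does not.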
3742// =======================================================
3743
3744// DeferredBatch
3745#undef dout_prefix
3746#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
9f95a23c
TL
3747#undef dout_context
3748#define dout_context cct
7c673cae
FG
3749
3750void BlueStore::DeferredBatch::prepare_write(
3751 CephContext *cct,
3752 uint64_t seq, uint64_t offset, uint64_t length,
3753 bufferlist::const_iterator& blp)
3754{
3755 _discard(cct, offset, length);
3756 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3757 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3758 i.first->second.seq = seq;
3759 blp.copy(length, i.first->second.bl);
31f18b77
FG
3760 i.first->second.bl.reassign_to_mempool(
3761 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3762 dout(20) << __func__ << " seq " << seq
3763 << " 0x" << std::hex << offset << "~" << length
3764 << " crc " << i.first->second.bl.crc32c(-1)
3765 << std::dec << dendl;
3766 seq_bytes[seq] += length;
3767#ifdef DEBUG_DEFERRED
3768 _audit(cct);
3769#endif
3770}
3771
3772void BlueStore::DeferredBatch::_discard(
3773 CephContext *cct, uint64_t offset, uint64_t length)
3774{
3775 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3776 << std::dec << dendl;
3777 auto p = iomap.lower_bound(offset);
3778 if (p != iomap.begin()) {
3779 --p;
3780 auto end = p->first + p->second.bl.length();
3781 if (end > offset) {
3782 bufferlist head;
3783 head.substr_of(p->second.bl, 0, offset - p->first);
3784 dout(20) << __func__ << " keep head " << p->second.seq
3785 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3786 << " -> 0x" << head.length() << std::dec << dendl;
3787 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3788 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3789 if (end > offset + length) {
3790 bufferlist tail;
3791 tail.substr_of(p->second.bl, offset + length - p->first,
3792 end - (offset + length));
3793 dout(20) << __func__ << " keep tail " << p->second.seq
3794 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3795 << " -> 0x" << tail.length() << std::dec << dendl;
3796 auto &n = iomap[offset + length];
3797 n.bl.swap(tail);
3798 n.seq = p->second.seq;
3799 i->second -= length;
3800 } else {
3801 i->second -= end - offset;
3802 }
11fdf7f2 3803 ceph_assert(i->second >= 0);
7c673cae
FG
3804 p->second.bl.swap(head);
3805 }
3806 ++p;
3807 }
3808 while (p != iomap.end()) {
3809 if (p->first >= offset + length) {
3810 break;
3811 }
3812 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3813 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3814 auto end = p->first + p->second.bl.length();
3815 if (end > offset + length) {
3816 unsigned drop_front = offset + length - p->first;
3817 unsigned keep_tail = end - (offset + length);
3818 dout(20) << __func__ << " truncate front " << p->second.seq
3819 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3820 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3821 << " to 0x" << (offset + length) << "~" << keep_tail
3822 << std::dec << dendl;
3823 auto &s = iomap[offset + length];
3824 s.seq = p->second.seq;
3825 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3826 i->second -= drop_front;
3827 } else {
3828 dout(20) << __func__ << " drop " << p->second.seq
3829 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3830 << std::dec << dendl;
3831 i->second -= p->second.bl.length();
3832 }
11fdf7f2 3833 ceph_assert(i->second >= 0);
7c673cae
FG
3834 p = iomap.erase(p);
3835 }
3836}
3837
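// Editor's note: a worked example of _discard() above. Suppose iomap holds
// one entry at offset 0x0 with an 0x4000-byte buffer, and we discard
// 0x1000~0x2000: the first branch keeps a 0x1000-byte head at 0x0, and since
// the entry extends past the discarded range it also re-inserts a
// 0x1000-byte tail at 0x3000 under the same seq, deducting the discarded
// 0x2000 bytes from seq_bytes.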
3838void BlueStore::DeferredBatch::_audit(CephContext *cct)
3839{
3840 map<uint64_t,int> sb;
3841 for (auto p : seq_bytes) {
3842 sb[p.first] = 0; // make sure we have the same set of keys
3843 }
3844 uint64_t pos = 0;
3845 for (auto& p : iomap) {
11fdf7f2 3846 ceph_assert(p.first >= pos);
7c673cae
FG
3847 sb[p.second.seq] += p.second.bl.length();
3848 pos = p.first + p.second.bl.length();
3849 }
11fdf7f2 3850 ceph_assert(sb == seq_bytes);
7c673cae
FG
3851}
3852
3853
3854// Collection
3855
3856#undef dout_prefix
3857#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3858
9f95a23c
TL
3859BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3860 : CollectionImpl(store_->cct, cid),
11fdf7f2 3861 store(store_),
9f95a23c 3862 cache(bc),
7c673cae 3863 exists(true),
9f95a23c 3864 onode_map(oc),
11fdf7f2
TL
3865 commit_queue(nullptr)
3866{
3867}
3868
3869bool BlueStore::Collection::flush_commit(Context *c)
3870{
3871 return osr->flush_commit(c);
3872}
3873
3874void BlueStore::Collection::flush()
3875{
3876 osr->flush();
3877}
3878
3879void BlueStore::Collection::flush_all_but_last()
7c673cae 3880{
11fdf7f2 3881 osr->flush_all_but_last();
7c673cae
FG
3882}
3883
3884void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3885{
11fdf7f2 3886 ceph_assert(!b->shared_blob);
7c673cae
FG
3887 const bluestore_blob_t& blob = b->get_blob();
3888 if (!blob.is_shared()) {
3889 b->shared_blob = new SharedBlob(this);
3890 return;
3891 }
3892
3893 b->shared_blob = shared_blob_set.lookup(sbid);
3894 if (b->shared_blob) {
3895 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3896 << std::dec << " had " << *b->shared_blob << dendl;
3897 } else {
3898 b->shared_blob = new SharedBlob(sbid, this);
3899 shared_blob_set.add(this, b->shared_blob.get());
3900 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3901 << std::dec << " opened " << *b->shared_blob
3902 << dendl;
3903 }
3904}
3905
3906void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3907{
3908 if (!sb->is_loaded()) {
3909
3910 bufferlist v;
3911 string key;
3912 auto sbid = sb->get_sbid();
3913 get_shared_blob_key(sbid, &key);
3914 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3915 if (r < 0) {
3916 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3917 << std::dec << " not found at key "
3918 << pretty_binary_string(key) << dendl;
11fdf7f2 3919 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3920 }
3921
3922 sb->loaded = true;
3923 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3924 auto p = v.cbegin();
3925 decode(*(sb->persistent), p);
7c673cae
FG
3926 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3927 << std::dec << " loaded shared_blob " << *sb << dendl;
3928 }
3929}
3930
3931void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3932{
7c673cae 3933 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 3934 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
3935
3936 // update blob
31f18b77 3937 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3938 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3939
3940 // update shared blob
3941 b->shared_blob->loaded = true;
3942 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3943 shared_blob_set.add(this, b->shared_blob.get());
3944 for (auto p : blob.get_extents()) {
3945 if (p.is_valid()) {
3946 b->shared_blob->get_ref(
3947 p.offset,
3948 p.length);
3949 }
3950 }
3951 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3952}
3953
31f18b77
FG
3954uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3955{
3956 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 3957 ceph_assert(sb->is_loaded());
31f18b77
FG
3958
3959 uint64_t sbid = sb->get_sbid();
3960 shared_blob_set.remove(sb);
3961 sb->loaded = false;
3962 delete sb->persistent;
3963 sb->sbid_unloaded = 0;
3964 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3965 return sbid;
3966}
3967
7c673cae
FG
3968BlueStore::OnodeRef BlueStore::Collection::get_onode(
3969 const ghobject_t& oid,
9f95a23c
TL
3970 bool create,
3971 bool is_createop)
7c673cae 3972{
9f95a23c 3973 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
7c673cae
FG
3974
3975 spg_t pgid;
3976 if (cid.is_pg(&pgid)) {
3977 if (!oid.match(cnode.bits, pgid.ps())) {
3978 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3979 << pgid << " bits " << cnode.bits << dendl;
3980 ceph_abort();
3981 }
3982 }
3983
3984 OnodeRef o = onode_map.lookup(oid);
3985 if (o)
3986 return o;
3987
eafe8130 3988 string key;
7c673cae
FG
3989 get_object_key(store->cct, oid, &key);
3990
3991 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3992 << pretty_binary_string(key) << dendl;
3993
3994 bufferlist v;
9f95a23c 3995 int r = -ENOENT;
7c673cae 3996 Onode *on;
9f95a23c
TL
3997 if (!is_createop) {
3998 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3999 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
4000 }
7c673cae 4001 if (v.length() == 0) {
11fdf7f2 4002 ceph_assert(r == -ENOENT);
f67539c2 4003 if (!create)
7c673cae
FG
4004 return OnodeRef();
4005
4006 // new object, new onode
4007 on = new Onode(this, oid, key);
4008 } else {
4009 // loaded
11fdf7f2 4010 ceph_assert(r >= 0);
eafe8130 4011 on = Onode::decode(this, oid, key, v);
7c673cae
FG
4012 }
4013 o.reset(on);
4014 return onode_map.add(oid, o);
4015}
4016
4017void BlueStore::Collection::split_cache(
4018 Collection *dest)
4019{
4020 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
4021
f67539c2
TL
4022 auto *ocache = get_onode_cache();
4023 auto *ocache_dest = dest->get_onode_cache();
4024
4025 // lock cache shards
4026 std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
4027 std::lock_guard l(ocache->lock, std::adopt_lock);
4028 std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
4029 std::lock_guard l3(cache->lock, std::adopt_lock);
4030 std::lock_guard l4(dest->cache->lock, std::adopt_lock);
7c673cae
FG
4031
4032 int destbits = dest->cnode.bits;
4033 spg_t destpg;
4034 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 4035 ceph_assert(is_pg);
7c673cae
FG
4036
4037 auto p = onode_map.onode_map.begin();
4038 while (p != onode_map.onode_map.end()) {
11fdf7f2 4039 OnodeRef o = p->second;
7c673cae
FG
4040 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
4041 // onode does not belong to this child
11fdf7f2
TL
4042 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
4043 << dendl;
7c673cae
FG
4044 ++p;
4045 } else {
7c673cae
FG
4046 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
4047 << dendl;
4048
f6b5b4d7
TL
      // ensure that nref stays >= 2, and hence the onode remains pinned
      // and physically out of the cache lists during the transition
4051 OnodeRef o_pin = o;
4052 ceph_assert(o->pinned);
4053
7c673cae 4054 p = onode_map.onode_map.erase(p);
7c673cae 4055 dest->onode_map.onode_map[o->oid] = o;
adb31ebb 4056 if (o->cached) {
f6b5b4d7 4057 get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
9f95a23c 4058 }
f6b5b4d7 4059 o->c = dest;
7c673cae
FG
4060
4061 // move over shared blobs and buffers. cover shared blobs from
4062 // both extent map and spanning blob map (the full extent map
4063 // may not be faulted in)
4064 vector<SharedBlob*> sbvec;
4065 for (auto& e : o->extent_map.extent_map) {
4066 sbvec.push_back(e.blob->shared_blob.get());
4067 }
4068 for (auto& b : o->extent_map.spanning_blob_map) {
4069 sbvec.push_back(b.second->shared_blob.get());
4070 }
4071 for (auto sb : sbvec) {
4072 if (sb->coll == dest) {
4073 ldout(store->cct, 20) << __func__ << " already moved " << *sb
4074 << dendl;
4075 continue;
4076 }
4077 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
4078 if (sb->get_sbid()) {
4079 ldout(store->cct, 20) << __func__
4080 << " moving registration " << *sb << dendl;
4081 shared_blob_set.remove(sb);
4082 dest->shared_blob_set.add(dest, sb);
4083 }
3efd9988 4084 sb->coll = dest;
7c673cae 4085 if (dest->cache != cache) {
7c673cae
FG
4086 for (auto& i : sb->bc.buffer_map) {
4087 if (!i.second->is_writing()) {
4088 ldout(store->cct, 20) << __func__ << " moving " << *i.second
4089 << dendl;
9f95a23c 4090 dest->cache->_move(cache, i.second.get());
7c673cae
FG
4091 }
4092 }
4093 }
4094 }
7c673cae
FG
4095 }
4096 }
9f95a23c 4097 dest->cache->_trim();
7c673cae
FG
4098}
4099
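// Editor's note: split_cache() above uses the standard C++ idiom for taking
// several mutexes at once without deadlock; a minimal standalone sketch
// (hypothetical mutexes, not BlueStore types):
//
//   std::mutex a, b;
//   std::lock(a, b);                               // all-or-nothing acquire
//   std::lock_guard<std::mutex> ga(a, std::adopt_lock);
//   std::lock_guard<std::mutex> gb(b, std::adopt_lock);
//
// The adopt_lock guards take ownership of the already-held mutexes so both
// are released on every exit path.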
7c673cae
FG
4100// =======================================================
4101
91327a77
AA
4102// MempoolThread
4103
4104#undef dout_prefix
4105#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
9f95a23c
TL
4106#undef dout_context
4107#define dout_context store->cct
91327a77 4108
7c673cae
FG
4109void *BlueStore::MempoolThread::entry()
4110{
9f95a23c 4111 std::unique_lock l{lock};
11fdf7f2 4112
92f5a8d4 4113 uint32_t prev_config_change = store->config_changed.load();
eafe8130
TL
4114 uint64_t base = store->osd_memory_base;
4115 double fragmentation = store->osd_memory_expected_fragmentation;
4116 uint64_t target = store->osd_memory_target;
4117 uint64_t min = store->osd_memory_cache_min;
4118 uint64_t max = min;
4119
4120 // When setting the maximum amount of memory to use for cache, first
4121 // assume some base amount of memory for the OSD and then fudge in
4122 // some overhead for fragmentation that scales with cache usage.
4123 uint64_t ltarget = (1.0 - fragmentation) * target;
4124 if (ltarget > base + min) {
4125 max = ltarget - base;
11fdf7f2 4126 }
31f18b77 4127
eafe8130 4128 binned_kv_cache = store->db->get_priority_cache();
f67539c2 4129 binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
eafe8130
TL
4130 if (store->cache_autotune && binned_kv_cache != nullptr) {
4131 pcm = std::make_shared<PriorityCache::Manager>(
f67539c2 4132 store->cct, min, max, target, true, "bluestore-pricache");
eafe8130
TL
4133 pcm->insert("kv", binned_kv_cache, true);
4134 pcm->insert("meta", meta_cache, true);
4135 pcm->insert("data", data_cache, true);
f67539c2
TL
4136 if (binned_kv_onode_cache != nullptr) {
4137 pcm->insert("kv_onode", binned_kv_onode_cache, true);
4138 }
eafe8130 4139 }
91327a77
AA
4140
4141 utime_t next_balance = ceph_clock_now();
4142 utime_t next_resize = ceph_clock_now();
9f95a23c
TL
4143 utime_t next_deferred_force_submit = ceph_clock_now();
4144 utime_t alloc_stats_dump_clock = ceph_clock_now();
31f18b77 4145
91327a77 4146 bool interval_stats_trim = false;
91327a77 4147 while (!stop) {
92f5a8d4
TL
4148 // Update pcm cache settings if related configuration was changed
4149 uint32_t cur_config_change = store->config_changed.load();
4150 if (cur_config_change != prev_config_change) {
4151 _update_cache_settings();
4152 prev_config_change = cur_config_change;
4153 }
4154
91327a77
AA
4155 // Before we trim, check and see if it's time to rebalance/resize.
4156 double autotune_interval = store->cache_autotune_interval;
4157 double resize_interval = store->osd_memory_cache_resize_interval;
9f95a23c
TL
4158 double max_defer_interval = store->max_defer_interval;
4159
4160 double alloc_stats_dump_interval =
4161 store->cct->_conf->bluestore_alloc_stats_dump_interval;
91327a77 4162
9f95a23c
TL
4163 if (alloc_stats_dump_interval > 0 &&
4164 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
4165 store->_record_allocation_stats();
4166 alloc_stats_dump_clock = ceph_clock_now();
4167 }
91327a77 4168 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
11fdf7f2
TL
4169 _adjust_cache_settings();
4170
91327a77 4171 // Log events at 5 instead of 20 when balance happens.
91327a77 4172 interval_stats_trim = true;
eafe8130
TL
4173
4174 if (pcm != nullptr) {
4175 pcm->balance();
91327a77 4176 }
31f18b77 4177
91327a77
AA
4178 next_balance = ceph_clock_now();
4179 next_balance += autotune_interval;
4180 }
4181 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
eafe8130
TL
4182 if (ceph_using_tcmalloc() && pcm != nullptr) {
4183 pcm->tune_memory();
91327a77
AA
4184 }
4185 next_resize = ceph_clock_now();
4186 next_resize += resize_interval;
31f18b77
FG
4187 }
4188
9f95a23c
TL
4189 if (max_defer_interval > 0 &&
4190 next_deferred_force_submit < ceph_clock_now()) {
4191 if (store->get_deferred_last_submitted() + max_defer_interval <
4192 ceph_clock_now()) {
4193 store->deferred_try_submit();
4194 }
4195 next_deferred_force_submit = ceph_clock_now();
4196 next_deferred_force_submit += max_defer_interval/3;
4197 }
4198
    // Now resize the shards
4200 _resize_shards(interval_stats_trim);
91327a77 4201 interval_stats_trim = false;
31f18b77 4202
91327a77 4203 store->_update_cache_logger();
11fdf7f2
TL
4204 auto wait = ceph::make_timespan(
4205 store->cct->_conf->bluestore_cache_trim_interval);
4206 cond.wait_for(l, wait);
7c673cae 4207 }
9f95a23c
TL
4208 // do final dump
4209 store->_record_allocation_stats();
7c673cae 4210 stop = false;
f67539c2 4211 pcm = nullptr;
7c673cae
FG
4212 return NULL;
4213}
4214
91327a77
AA
4215void BlueStore::MempoolThread::_adjust_cache_settings()
4216{
11fdf7f2
TL
4217 if (binned_kv_cache != nullptr) {
4218 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
4219 }
f67539c2
TL
4220 if (binned_kv_onode_cache != nullptr) {
4221 binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
4222 }
11fdf7f2
TL
4223 meta_cache->set_cache_ratio(store->cache_meta_ratio);
4224 data_cache->set_cache_ratio(store->cache_data_ratio);
91327a77
AA
4225}
4226
9f95a23c 4227void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
91327a77 4228{
9f95a23c
TL
4229 size_t onode_shards = store->onode_cache_shards.size();
4230 size_t buffer_shards = store->buffer_cache_shards.size();
91327a77 4231 int64_t kv_used = store->db->get_cache_usage();
f67539c2 4232 int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
11fdf7f2
TL
4233 int64_t meta_used = meta_cache->_get_used_bytes();
4234 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
4235
4236 uint64_t cache_size = store->cache_size;
4237 int64_t kv_alloc =
11fdf7f2 4238 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
f67539c2
TL
4239 int64_t kv_onode_alloc =
4240 static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
91327a77 4241 int64_t meta_alloc =
11fdf7f2 4242 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 4243 int64_t data_alloc =
11fdf7f2 4244 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 4245
eafe8130
TL
4246 if (pcm != nullptr && binned_kv_cache != nullptr) {
4247 cache_size = pcm->get_tuned_mem();
11fdf7f2
TL
4248 kv_alloc = binned_kv_cache->get_committed_size();
4249 meta_alloc = meta_cache->get_committed_size();
4250 data_alloc = data_cache->get_committed_size();
f67539c2
TL
4251 if (binned_kv_onode_cache != nullptr) {
4252 kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
4253 }
91327a77
AA
4254 }
4255
4256 if (interval_stats) {
9f95a23c 4257 dout(5) << __func__ << " cache_size: " << cache_size
91327a77
AA
4258 << " kv_alloc: " << kv_alloc
4259 << " kv_used: " << kv_used
f67539c2
TL
4260 << " kv_onode_alloc: " << kv_onode_alloc
4261 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4262 << " meta_alloc: " << meta_alloc
4263 << " meta_used: " << meta_used
4264 << " data_alloc: " << data_alloc
4265 << " data_used: " << data_used << dendl;
4266 } else {
9f95a23c 4267 dout(20) << __func__ << " cache_size: " << cache_size
91327a77
AA
4268 << " kv_alloc: " << kv_alloc
4269 << " kv_used: " << kv_used
f67539c2
TL
4270 << " kv_onode_alloc: " << kv_onode_alloc
4271 << " kv_onode_used: " << kv_onode_used
91327a77
AA
4272 << " meta_alloc: " << meta_alloc
4273 << " meta_used: " << meta_used
4274 << " data_alloc: " << data_alloc
4275 << " data_used: " << data_used << dendl;
4276 }
4277
4278 uint64_t max_shard_onodes = static_cast<uint64_t>(
9f95a23c
TL
4279 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
4280 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
91327a77 4281
9f95a23c 4282 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
91327a77
AA
4283 << " max_shard_buffer: " << max_shard_buffer << dendl;
4284
9f95a23c
TL
4285 for (auto i : store->onode_cache_shards) {
4286 i->set_max(max_shard_onodes);
4287 }
4288 for (auto i : store->buffer_cache_shards) {
4289 i->set_max(max_shard_buffer);
91327a77
AA
4290 }
4291}
4292
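// Editor's note: a worked example of the shard sizing above, with assumed
// values: meta_alloc = 1 GiB split across 8 onode shards, and roughly 6 KiB
// per onode reported by meta_cache->get_bytes_per_onode(), gives
// max_shard_onodes = (1 GiB / 8) / 6 KiB, i.e. about 21845 onodes per shard.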
92f5a8d4
TL
4293void BlueStore::MempoolThread::_update_cache_settings()
4294{
4295 // Nothing to do if pcm is not used.
4296 if (pcm == nullptr) {
4297 return;
4298 }
4299
92f5a8d4
TL
4300 uint64_t target = store->osd_memory_target;
4301 uint64_t base = store->osd_memory_base;
4302 uint64_t min = store->osd_memory_cache_min;
4303 uint64_t max = min;
4304 double fragmentation = store->osd_memory_expected_fragmentation;
4305
4306 uint64_t ltarget = (1.0 - fragmentation) * target;
4307 if (ltarget > base + min) {
4308 max = ltarget - base;
4309 }
4310
4311 // set pcm cache levels
4312 pcm->set_target_memory(target);
4313 pcm->set_min_memory(min);
4314 pcm->set_max_memory(max);
4315
9f95a23c 4316 dout(5) << __func__ << " updated pcm target: " << target
92f5a8d4
TL
4317 << " pcm min: " << min
4318 << " pcm max: " << max
4319 << dendl;
4320}
4321
7c673cae
FG
4322// =======================================================
4323
31f18b77
FG
4324// OmapIteratorImpl
4325
4326#undef dout_prefix
4327#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4328
4329BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4330 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
4331 : c(c), o(o), it(it)
4332{
9f95a23c 4333 std::shared_lock l(c->lock);
31f18b77 4334 if (o->onode.has_omap()) {
9f95a23c
TL
4335 o->get_omap_key(string(), &head);
4336 o->get_omap_tail(&tail);
31f18b77
FG
4337 it->lower_bound(head);
4338 }
4339}
4340
11fdf7f2
TL
4341string BlueStore::OmapIteratorImpl::_stringify() const
4342{
4343 stringstream s;
4344 s << " omap_iterator(cid = " << c->cid
4345 <<", oid = " << o->oid << ")";
4346 return s.str();
4347}
4348
31f18b77
FG
4349int BlueStore::OmapIteratorImpl::seek_to_first()
4350{
9f95a23c 4351 std::shared_lock l(c->lock);
11fdf7f2 4352 auto start1 = mono_clock::now();
31f18b77
FG
4353 if (o->onode.has_omap()) {
4354 it->lower_bound(head);
4355 } else {
4356 it = KeyValueDB::Iterator();
4357 }
494da23a
TL
4358 c->store->log_latency(
4359 __func__,
11fdf7f2
TL
4360 l_bluestore_omap_seek_to_first_lat,
4361 mono_clock::now() - start1,
494da23a 4362 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2 4363
31f18b77
FG
4364 return 0;
4365}
4366
4367int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4368{
9f95a23c 4369 std::shared_lock l(c->lock);
11fdf7f2 4370 auto start1 = mono_clock::now();
31f18b77
FG
4371 if (o->onode.has_omap()) {
4372 string key;
9f95a23c 4373 o->get_omap_key(after, &key);
31f18b77
FG
4374 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4375 << pretty_binary_string(key) << dendl;
4376 it->upper_bound(key);
4377 } else {
4378 it = KeyValueDB::Iterator();
4379 }
11fdf7f2 4380 c->store->log_latency_fn(
494da23a 4381 __func__,
11fdf7f2
TL
4382 l_bluestore_omap_upper_bound_lat,
4383 mono_clock::now() - start1,
494da23a 4384 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4385 [&] (const ceph::timespan& lat) {
494da23a 4386 return ", after = " + after +
11fdf7f2
TL
4387 _stringify();
4388 }
4389 );
31f18b77
FG
4390 return 0;
4391}
4392
4393int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4394{
9f95a23c 4395 std::shared_lock l(c->lock);
11fdf7f2 4396 auto start1 = mono_clock::now();
31f18b77
FG
4397 if (o->onode.has_omap()) {
4398 string key;
9f95a23c 4399 o->get_omap_key(to, &key);
31f18b77
FG
4400 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4401 << pretty_binary_string(key) << dendl;
4402 it->lower_bound(key);
4403 } else {
4404 it = KeyValueDB::Iterator();
4405 }
11fdf7f2 4406 c->store->log_latency_fn(
494da23a 4407 __func__,
11fdf7f2
TL
4408 l_bluestore_omap_lower_bound_lat,
4409 mono_clock::now() - start1,
494da23a 4410 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4411 [&] (const ceph::timespan& lat) {
494da23a 4412 return ", to = " + to +
11fdf7f2
TL
4413 _stringify();
4414 }
4415 );
31f18b77
FG
4416 return 0;
4417}
4418
4419bool BlueStore::OmapIteratorImpl::valid()
4420{
9f95a23c 4421 std::shared_lock l(c->lock);
31f18b77 4422 bool r = o->onode.has_omap() && it && it->valid() &&
494da23a 4423 it->raw_key().second < tail;
31f18b77
FG
4424 if (it && it->valid()) {
4425 ldout(c->store->cct,20) << __func__ << " is at "
4426 << pretty_binary_string(it->raw_key().second)
4427 << dendl;
4428 }
4429 return r;
4430}
4431
11fdf7f2 4432int BlueStore::OmapIteratorImpl::next()
31f18b77 4433{
11fdf7f2 4434 int r = -1;
9f95a23c 4435 std::shared_lock l(c->lock);
11fdf7f2 4436 auto start1 = mono_clock::now();
31f18b77
FG
4437 if (o->onode.has_omap()) {
4438 it->next();
11fdf7f2 4439 r = 0;
31f18b77 4440 }
494da23a
TL
4441 c->store->log_latency(
4442 __func__,
11fdf7f2
TL
4443 l_bluestore_omap_next_lat,
4444 mono_clock::now() - start1,
494da23a 4445 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2
TL
4446
4447 return r;
31f18b77
FG
4448}
4449
4450string BlueStore::OmapIteratorImpl::key()
4451{
9f95a23c 4452 std::shared_lock l(c->lock);
11fdf7f2 4453 ceph_assert(it->valid());
31f18b77
FG
4454 string db_key = it->raw_key().second;
4455 string user_key;
9f95a23c 4456 o->decode_omap_key(db_key, &user_key);
494da23a 4457
31f18b77
FG
4458 return user_key;
4459}
4460
4461bufferlist BlueStore::OmapIteratorImpl::value()
4462{
9f95a23c 4463 std::shared_lock l(c->lock);
11fdf7f2 4464 ceph_assert(it->valid());
31f18b77
FG
4465 return it->value();
4466}
4467
4468
4469// =====================================
4470
7c673cae
FG
4471#undef dout_prefix
4472#define dout_prefix *_dout << "bluestore(" << path << ") "
9f95a23c
TL
4473#undef dout_context
4474#define dout_context cct
7c673cae
FG
4475
4476
4477static void aio_cb(void *priv, void *priv2)
4478{
4479 BlueStore *store = static_cast<BlueStore*>(priv);
4480 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4481 c->aio_finish(store);
4482}
4483
11fdf7f2
TL
4484static void discard_cb(void *priv, void *priv2)
4485{
4486 BlueStore *store = static_cast<BlueStore*>(priv);
4487 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4488 store->handle_discard(*tmp);
4489}
4490
4491void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4492{
4493 dout(10) << __func__ << dendl;
f67539c2
TL
4494 ceph_assert(shared_alloc.a);
4495 shared_alloc.a->release(to_release);
11fdf7f2
TL
4496}
4497
7c673cae 4498BlueStore::BlueStore(CephContext *cct, const string& path)
9f95a23c 4499 : BlueStore(cct, path, 0) {}
7c673cae
FG
4500
4501BlueStore::BlueStore(CephContext *cct,
4502 const string& path,
4503 uint64_t _min_alloc_size)
4504 : ObjectStore(cct, path),
9f95a23c 4505 throttle(cct),
11fdf7f2 4506 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4507 kv_sync_thread(this),
31f18b77 4508 kv_finalize_thread(this),
f67539c2 4509 zoned_cleaner_thread(this),
7c673cae
FG
4510 min_alloc_size(_min_alloc_size),
4511 min_alloc_size_order(ctz(_min_alloc_size)),
4512 mempool_thread(this)
4513{
4514 _init_logger();
11fdf7f2 4515 cct->_conf.add_observer(this);
7c673cae 4516 set_cache_shards(1);
7c673cae
FG
4517}
4518
4519BlueStore::~BlueStore()
4520{
11fdf7f2 4521 cct->_conf.remove_observer(this);
7c673cae 4522 _shutdown_logger();
11fdf7f2
TL
4523 ceph_assert(!mounted);
4524 ceph_assert(db == NULL);
4525 ceph_assert(bluefs == NULL);
4526 ceph_assert(fsid_fd < 0);
4527 ceph_assert(path_fd < 0);
9f95a23c
TL
4528 for (auto i : onode_cache_shards) {
4529 delete i;
4530 }
4531 for (auto i : buffer_cache_shards) {
7c673cae
FG
4532 delete i;
4533 }
9f95a23c
TL
4534 onode_cache_shards.clear();
4535 buffer_cache_shards.clear();
7c673cae
FG
4536}
4537
4538const char **BlueStore::get_tracked_conf_keys() const
4539{
4540 static const char* KEYS[] = {
4541 "bluestore_csum_type",
4542 "bluestore_compression_mode",
4543 "bluestore_compression_algorithm",
4544 "bluestore_compression_min_blob_size",
4545 "bluestore_compression_min_blob_size_ssd",
4546 "bluestore_compression_min_blob_size_hdd",
4547 "bluestore_compression_max_blob_size",
4548 "bluestore_compression_max_blob_size_ssd",
4549 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4550 "bluestore_compression_required_ratio",
7c673cae
FG
4551 "bluestore_max_alloc_size",
4552 "bluestore_prefer_deferred_size",
181888fb
FG
4553 "bluestore_prefer_deferred_size_hdd",
4554 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4555 "bluestore_deferred_batch_ops",
4556 "bluestore_deferred_batch_ops_hdd",
4557 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4558 "bluestore_throttle_bytes",
4559 "bluestore_throttle_deferred_bytes",
4560 "bluestore_throttle_cost_per_io_hdd",
4561 "bluestore_throttle_cost_per_io_ssd",
4562 "bluestore_throttle_cost_per_io",
4563 "bluestore_max_blob_size",
4564 "bluestore_max_blob_size_ssd",
4565 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4566 "osd_memory_target",
4567 "osd_memory_target_cgroup_limit_ratio",
4568 "osd_memory_base",
4569 "osd_memory_cache_min",
92f5a8d4 4570 "osd_memory_expected_fragmentation",
11fdf7f2
TL
4571 "bluestore_cache_autotune",
4572 "bluestore_cache_autotune_interval",
81eedcae 4573 "bluestore_warn_on_legacy_statfs",
9f95a23c
TL
4574 "bluestore_warn_on_no_per_pool_omap",
4575 "bluestore_max_defer_interval",
7c673cae
FG
4576 NULL
4577 };
4578 return KEYS;
4579}
4580
11fdf7f2 4581void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4582 const std::set<std::string> &changed)
4583{
eafe8130 4584 if (changed.count("bluestore_warn_on_legacy_statfs")) {
81eedcae
TL
4585 _check_legacy_statfs_alert();
4586 }
f67539c2
TL
4587 if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
4588 changed.count("bluestore_warn_on_no_per_pg_omap")) {
4589 _check_no_per_pg_or_pool_omap_alert();
9f95a23c 4590 }
81eedcae 4591
7c673cae
FG
4592 if (changed.count("bluestore_csum_type")) {
4593 _set_csum();
4594 }
4595 if (changed.count("bluestore_compression_mode") ||
4596 changed.count("bluestore_compression_algorithm") ||
4597 changed.count("bluestore_compression_min_blob_size") ||
4598 changed.count("bluestore_compression_max_blob_size")) {
4599 if (bdev) {
4600 _set_compression();
4601 }
4602 }
4603 if (changed.count("bluestore_max_blob_size") ||
4604 changed.count("bluestore_max_blob_size_ssd") ||
4605 changed.count("bluestore_max_blob_size_hdd")) {
4606 if (bdev) {
4607 // only after startup
4608 _set_blob_size();
4609 }
4610 }
4611 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4612 changed.count("bluestore_prefer_deferred_size_hdd") ||
4613 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4614 changed.count("bluestore_max_alloc_size") ||
4615 changed.count("bluestore_deferred_batch_ops") ||
4616 changed.count("bluestore_deferred_batch_ops_hdd") ||
4617 changed.count("bluestore_deferred_batch_ops_ssd")) {
4618 if (bdev) {
4619 // only after startup
4620 _set_alloc_sizes();
4621 }
4622 }
4623 if (changed.count("bluestore_throttle_cost_per_io") ||
4624 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4625 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4626 if (bdev) {
4627 _set_throttle_params();
4628 }
4629 }
9f95a23c
TL
4630 if (changed.count("bluestore_throttle_bytes") ||
4631 changed.count("bluestore_throttle_deferred_bytes") ||
4632 changed.count("bluestore_throttle_trace_rate")) {
4633 throttle.reset_throttle(conf);
7c673cae 4634 }
9f95a23c
TL
4635 if (changed.count("bluestore_max_defer_interval")) {
4636 if (bdev) {
4637 _set_max_defer_interval();
4638 }
7c673cae 4639 }
92f5a8d4
TL
4640 if (changed.count("osd_memory_target") ||
4641 changed.count("osd_memory_base") ||
4642 changed.count("osd_memory_cache_min") ||
4643 changed.count("osd_memory_expected_fragmentation")) {
4644 _update_osd_memory_options();
4645 }
7c673cae
FG
4646}
4647
4648void BlueStore::_set_compression()
4649{
224ce89b
WB
4650 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4651 if (m) {
11fdf7f2 4652 _clear_compression_alert();
224ce89b
WB
4653 comp_mode = *m;
4654 } else {
4655 derr << __func__ << " unrecognized value '"
4656 << cct->_conf->bluestore_compression_mode
4657 << "' for bluestore_compression_mode, reverting to 'none'"
4658 << dendl;
4659 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4660 string s("unknown mode: ");
4661 s += cct->_conf->bluestore_compression_mode;
4662 _set_compression_alert(true, s.c_str());
224ce89b
WB
4663 }
4664
4665 compressor = nullptr;
4666
3efd9988
FG
4667 if (cct->_conf->bluestore_compression_min_blob_size) {
4668 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4669 } else {
11fdf7f2 4670 ceph_assert(bdev);
9f95a23c 4671 if (_use_rotational_settings()) {
7c673cae
FG
4672 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4673 } else {
4674 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4675 }
4676 }
4677
4678 if (cct->_conf->bluestore_compression_max_blob_size) {
4679 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4680 } else {
11fdf7f2 4681 ceph_assert(bdev);
9f95a23c 4682 if (_use_rotational_settings()) {
7c673cae
FG
4683 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4684 } else {
4685 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4686 }
4687 }
4688
7c673cae
FG
4689 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4690 if (!alg_name.empty()) {
4691 compressor = Compressor::create(cct, alg_name);
4692 if (!compressor) {
4693 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4694 << dendl;
11fdf7f2 4695 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4696 }
4697 }
4698
4699 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4700 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4701 << " min_blob " << comp_min_blob_size
4702 << " max_blob " << comp_max_blob_size
7c673cae
FG
4703 << dendl;
4704}
4705
4706void BlueStore::_set_csum()
4707{
4708 csum_type = Checksummer::CSUM_NONE;
4709 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4710 if (t > Checksummer::CSUM_NONE)
4711 csum_type = t;
4712
4713 dout(10) << __func__ << " csum_type "
4714 << Checksummer::get_csum_type_string(csum_type)
4715 << dendl;
4716}
4717
4718void BlueStore::_set_throttle_params()
4719{
4720 if (cct->_conf->bluestore_throttle_cost_per_io) {
4721 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4722 } else {
11fdf7f2 4723 ceph_assert(bdev);
9f95a23c 4724 if (_use_rotational_settings()) {
7c673cae
FG
4725 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4726 } else {
4727 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4728 }
4729 }
4730
4731 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4732 << dendl;
4733}
4734void BlueStore::_set_blob_size()
4735{
4736 if (cct->_conf->bluestore_max_blob_size) {
4737 max_blob_size = cct->_conf->bluestore_max_blob_size;
4738 } else {
11fdf7f2 4739 ceph_assert(bdev);
9f95a23c 4740 if (_use_rotational_settings()) {
7c673cae
FG
4741 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4742 } else {
4743 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4744 }
4745 }
4746 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4747 << std::dec << dendl;
4748}
4749
92f5a8d4
TL
4750void BlueStore::_update_osd_memory_options()
4751{
4752 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4753 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4754 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4755 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4756 config_changed++;
4757 dout(10) << __func__
4758 << " osd_memory_target " << osd_memory_target
4759 << " osd_memory_base " << osd_memory_base
4760 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4761 << " osd_memory_cache_min " << osd_memory_cache_min
4762 << dendl;
4763}
4764
11fdf7f2 4765int BlueStore::_set_cache_sizes()
1adf2230 4766{
11fdf7f2
TL
4767 ceph_assert(bdev);
4768 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4769 cache_autotune_interval =
11fdf7f2
TL
4770 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4771 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4772 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4773 osd_memory_expected_fragmentation =
11fdf7f2
TL
4774 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4775 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4776 osd_memory_cache_resize_interval =
11fdf7f2 4777 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4778
224ce89b
WB
4779 if (cct->_conf->bluestore_cache_size) {
4780 cache_size = cct->_conf->bluestore_cache_size;
4781 } else {
4782 // choose global cache size based on backend type
9f95a23c 4783 if (_use_rotational_settings()) {
224ce89b
WB
4784 cache_size = cct->_conf->bluestore_cache_size_hdd;
4785 } else {
4786 cache_size = cct->_conf->bluestore_cache_size_ssd;
4787 }
4788 }
31f18b77 4789
f67539c2 4790 cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
224ce89b 4791 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4792 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4793 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4794 return -EINVAL;
4795 }
91327a77 4796
f67539c2 4797 cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
224ce89b 4798 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4799 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4800 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4801 return -EINVAL;
4802 }
91327a77 4803
f67539c2
TL
4804 cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
4805 if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
4806 derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4807 << ") must be in range [0,1.0]" << dendl;
4808 return -EINVAL;
4809 }
4810
31f18b77 4811 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4812 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4813 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4814 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4815 << dendl;
31f18b77
FG
4816 return -EINVAL;
4817 }
91327a77 4818
f67539c2
TL
4819 cache_data_ratio = (double)1.0 -
4820 (double)cache_meta_ratio -
4821 (double)cache_kv_ratio -
4822 (double)cache_kv_onode_ratio;
31f18b77
FG
4823 if (cache_data_ratio < 0) {
4824 // deal with floating point imprecision
4825 cache_data_ratio = 0;
4826 }
91327a77 4827
224ce89b
WB
4828 dout(1) << __func__ << " cache_size " << cache_size
4829 << " meta " << cache_meta_ratio
31f18b77
FG
4830 << " kv " << cache_kv_ratio
4831 << " data " << cache_data_ratio
4832 << dendl;
4833 return 0;
4834}
4835
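// Editor's note: a worked example of the ratio arithmetic above, with
// assumed values: cache_meta_ratio = 0.45, cache_kv_ratio = 0.45 and
// cache_kv_onode_ratio = 0.04 all pass the range checks and leave
// cache_data_ratio = 1.0 - 0.45 - 0.45 - 0.04 = 0.06 for data buffers.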
3efd9988
FG
4836int BlueStore::write_meta(const std::string& key, const std::string& value)
4837{
4838 bluestore_bdev_label_t label;
4839 string p = path + "/block";
4840 int r = _read_bdev_label(cct, p, &label);
4841 if (r < 0) {
4842 return ObjectStore::write_meta(key, value);
4843 }
4844 label.meta[key] = value;
4845 r = _write_bdev_label(cct, p, label);
11fdf7f2 4846 ceph_assert(r == 0);
3efd9988
FG
4847 return ObjectStore::write_meta(key, value);
4848}
4849
4850int BlueStore::read_meta(const std::string& key, std::string *value)
4851{
4852 bluestore_bdev_label_t label;
4853 string p = path + "/block";
4854 int r = _read_bdev_label(cct, p, &label);
4855 if (r < 0) {
4856 return ObjectStore::read_meta(key, value);
4857 }
4858 auto i = label.meta.find(key);
4859 if (i == label.meta.end()) {
4860 return ObjectStore::read_meta(key, value);
4861 }
4862 *value = i->second;
4863 return 0;
4864}
4865
7c673cae
FG
4866void BlueStore::_init_logger()
4867{
4868 PerfCountersBuilder b(cct, "bluestore",
4869 l_bluestore_first, l_bluestore_last);
4870 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4871 "Average kv_thread flush latency",
4872 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4873 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4874 "Average kv_thread commit latency");
11fdf7f2
TL
4875 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4876 "Average kv_sync thread latency",
4877 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4878 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4879 "Average kv_finalize thread latency",
4880 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
4881 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4882 "Average prepare state latency");
4883 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4884 "Average aio_wait state latency",
4885 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4886 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4887 "Average io_done state latency");
4888 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4889 "Average kv_queued state latency");
4890 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4891 "Average kv_commiting state latency");
4892 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4893 "Average kv_done state latency");
4894 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4895 "Average deferred_queued state latency");
4896 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4897 "Average aio_wait state latency");
4898 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4899 "Average cleanup state latency");
4900 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4901 "Average finishing state latency");
4902 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4903 "Average done state latency");
4904 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4905 "Average submit throttle latency",
4906 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4907 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4908 "Average submit latency",
4909 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4910 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4911 "Average commit latency",
4912 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4913 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4914 "Average read latency",
4915 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4916 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4917 "Average read onode metadata latency");
4918 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4919 "Average read latency");
4920 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4921 "Average compress latency");
4922 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4923 "Average decompress latency");
4924 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4925 "Average checksum latency");
4926 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4927 "Sum for beneficial compress ops");
4928 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4929 "Sum for compress ops rejected due to low net gain of space");
4930 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
11fdf7f2 4931 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4932 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4933 "Sum for deferred write op");
4934 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
11fdf7f2 4935 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
7c673cae
FG
4936 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4937 "Sum for write penalty read ops");
4938 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4939 "Sum for allocated bytes");
4940 b.add_u64(l_bluestore_stored, "bluestore_stored",
4941 "Sum for stored bytes");
4942 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
92f5a8d4
TL
4943 "Sum for stored compressed bytes",
4944 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae 4945 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
92f5a8d4
TL
4946 "Sum for bytes allocated for compressed data",
4947 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae 4948 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
92f5a8d4
TL
4949 "Sum for original bytes that were compressed",
4950 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
4951 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4952 "Number of onodes in cache");
9f95a23c
TL
4953 b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
4954 "Number of pinned onodes in cache");
7c673cae
FG
4955 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4956 "Sum for onode-lookups hit in the cache");
4957 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4958 "Sum for onode-lookups missed in the cache");
4959 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4960 "Sum for onode-shard lookups hit in the cache");
4961 b.add_u64_counter(l_bluestore_onode_shard_misses,
4962 "bluestore_onode_shard_misses",
4963 "Sum for onode-shard lookups missed in the cache");
4964 b.add_u64(l_bluestore_extents, "bluestore_extents",
4965 "Number of extents in cache");
4966 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4967 "Number of blobs in cache");
4968 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4969 "Number of buffers in cache");
4970 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
11fdf7f2 4971 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4972 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
11fdf7f2 4973 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4974 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
11fdf7f2 4975 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4976
4977 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4978 "Large aligned writes into fresh blobs");
4979 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
11fdf7f2 4980 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4981 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4982 "Large aligned writes into fresh blobs (blobs)");
f67539c2
TL
4983 b.add_u64_counter(l_bluestore_write_big_deferred,
4984 "bluestore_write_big_deferred",
4985 "Big overwrites using deferred");
7c673cae
FG
4986 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4987 "Small writes into existing or sparse small blobs");
4988 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
11fdf7f2 4989 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4990 b.add_u64_counter(l_bluestore_write_small_unused,
4991 "bluestore_write_small_unused",
4992 "Small writes into unused portion of existing blob");
f67539c2
TL
4993 b.add_u64_counter(l_bluestore_write_deferred,
4994 "bluestore_write_deferred",
522d829b
TL
4995 "Total deferred writes submitted");
4996 b.add_u64_counter(l_bluestore_write_deferred_bytes,
4997 "bluestore_write_deferred_bytes",
4998 "Total bytes submitted as deferred writes");
7c673cae
FG
4999 b.add_u64_counter(l_bluestore_write_small_pre_read,
5000 "bluestore_write_small_pre_read",
5001 "Small writes that required we read some data (possibly "
5002 "cached) to fill out the block");
f67539c2
TL
5003 b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
5004 "Write into new blob");
7c673cae
FG
5005
5006 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
5007 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
5008 "Onode extent map reshard events");
5009 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
5010 "Sum for blob splitting due to resharding");
5011 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
5012 "Sum for extents that have been removed due to compression");
5013 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
5014 "Sum for extents that have been merged due to garbage "
5015 "collection");
b32b8144
FG
5016 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
5017 "Read EIO errors propagated to high level callers");
f64942e4
AA
5018 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
5019 "Read operations that required at least one retry due to failed checksum validation");
a8e16298
TL
5020 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
5021 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
11fdf7f2
TL
5022 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
5023 "Average omap iterator seek_to_first call latency");
5024 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
5025 "Average omap iterator upper_bound call latency");
5026 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
5027 "Average omap iterator lower_bound call latency");
5028 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
5029 "Average omap iterator next call latency");
adb31ebb
TL
5030 b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
5031 "Average omap get_keys call latency");
5032 b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
5033 "Average omap get_values call latency");
494da23a
TL
5034 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
5035 "Average collection listing latency");
adb31ebb
TL
5036 b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
5037 "Average removal latency");
5038
7c673cae
FG
5039 logger = b.create_perf_counters();
5040 cct->get_perfcounters_collection()->add(logger);
5041}
5042
5043int BlueStore::_reload_logger()
5044{
5045 struct store_statfs_t store_statfs;
7c673cae 5046 int r = statfs(&store_statfs);
11fdf7f2 5047 if (r >= 0) {
7c673cae 5048 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
5049 logger->set(l_bluestore_stored, store_statfs.data_stored);
5050 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
5051 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
5052 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
5053 }
5054 return r;
5055}
5056
5057void BlueStore::_shutdown_logger()
5058{
5059 cct->get_perfcounters_collection()->remove(logger);
5060 delete logger;
5061}
5062
5063int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
5064 uuid_d *fsid)
5065{
5066 bluestore_bdev_label_t label;
5067 int r = _read_bdev_label(cct, path, &label);
5068 if (r < 0)
5069 return r;
5070 *fsid = label.osd_uuid;
5071 return 0;
5072}
5073
5074int BlueStore::_open_path()
5075{
b32b8144 5076 // sanity check(s)
11fdf7f2 5077 ceph_assert(path_fd < 0);
91327a77 5078 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
5079 if (path_fd < 0) {
5080 int r = -errno;
5081 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
5082 << dendl;
5083 return r;
5084 }
5085 return 0;
5086}
5087
5088void BlueStore::_close_path()
5089{
5090 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
5091 path_fd = -1;
5092}
5093
3efd9988
FG
5094int BlueStore::_write_bdev_label(CephContext *cct,
5095 string path, bluestore_bdev_label_t label)
7c673cae
FG
5096{
5097 dout(10) << __func__ << " path " << path << " label " << label << dendl;
5098 bufferlist bl;
11fdf7f2 5099 encode(label, bl);
7c673cae 5100 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
5101 encode(crc, bl);
5102 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
5103 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
5104 z.zero();
5105 bl.append(std::move(z));
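// resulting label block layout: the encoded bluestore_bdev_label_t,
// its crc32c, then zero padding out to BDEV_LABEL_BLOCK_SIZE bytes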
5106
5107 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
5108 if (fd < 0) {
5109 fd = -errno;
5110 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5111 << dendl;
5112 return fd;
5113 }
5114 int r = bl.write_fd(fd);
5115 if (r < 0) {
5116 derr << __func__ << " failed to write to " << path
5117 << ": " << cpp_strerror(r) << dendl;
5118 goto out;
5119 }
5120 r = ::fsync(fd);
5121 if (r < 0) {
5122 derr << __func__ << " failed to fsync " << path
5123 << ": " << cpp_strerror(r) << dendl;
5124 }
5125 out:
5126 VOID_TEMP_FAILURE_RETRY(::close(fd));
5127 return r;
5128 }
5129
5130 int BlueStore::_read_bdev_label(CephContext* cct, string path,
5131 bluestore_bdev_label_t *label)
5132 {
5133 dout(10) << __func__ << dendl;
5134 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
5135 if (fd < 0) {
5136 fd = -errno;
5137 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
5138 << dendl;
5139 return fd;
5140 }
5141 bufferlist bl;
5142 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
5143 VOID_TEMP_FAILURE_RETRY(::close(fd));
5144 if (r < 0) {
5145 derr << __func__ << " failed to read from " << path
5146 << ": " << cpp_strerror(r) << dendl;
5147 return r;
5148 }
5149
5150 uint32_t crc, expected_crc;
5151 auto p = bl.cbegin();
5152 try {
5153 decode(*label, p);
5154 bufferlist t;
5155 t.substr_of(bl, 0, p.get_off());
5156 crc = t.crc32c(-1);
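// the stored crc covers exactly the encoded label bytes that precede
// it, so recompute over [0, p.get_off()) before comparing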
5157 decode(expected_crc, p);
5158 }
5159 catch (ceph::buffer::error& e) {
5160 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
5161 << ": " << e.what()
5162 << dendl;
5163 return -ENOENT;
5164 }
5165 if (crc != expected_crc) {
5166 derr << __func__ << " bad crc on label, expected " << expected_crc
5167 << " != actual " << crc << dendl;
5168 return -EIO;
5169 }
5170 dout(10) << __func__ << " got " << *label << dendl;
5171 return 0;
5172 }
5173
5174 int BlueStore::_check_or_set_bdev_label(
5175 string path, uint64_t size, string desc, bool create)
5176 {
5177 bluestore_bdev_label_t label;
5178 if (create) {
5179 label.osd_uuid = fsid;
5180 label.size = size;
5181 label.btime = ceph_clock_now();
5182 label.description = desc;
5183 int r = _write_bdev_label(cct, path, label);
5184 if (r < 0)
5185 return r;
5186 } else {
5187 int r = _read_bdev_label(cct, path, &label);
5188 if (r < 0)
5189 return r;
5190 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5191 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5192 << " and fsid " << fsid << " check bypassed" << dendl;
5193 } else if (label.osd_uuid != fsid) {
5194 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5195 << " does not match our fsid " << fsid << dendl;
5196 return -EIO;
5197 }
5198 }
5199 return 0;
5200 }
5201
5202 void BlueStore::_set_alloc_sizes(void)
5203 {
5204 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5205
5206 if (cct->_conf->bluestore_prefer_deferred_size) {
5207 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5208 } else {
5209 ceph_assert(bdev);
5210 if (_use_rotational_settings()) {
5211 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5212 } else {
5213 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5214 }
5215 }
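// note: writes at or below prefer_deferred_size take the deferred
// path (journaled in the key-value WAL first, written to the final
// location later), which favors rotational media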
5216
5217 if (cct->_conf->bluestore_deferred_batch_ops) {
5218 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5219 } else {
5220 ceph_assert(bdev);
5221 if (_use_rotational_settings()) {
5222 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5223 } else {
5224 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5225 }
5226 }
5227
5228 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
5229 << std::dec << " order " << (int)min_alloc_size_order
5230 << " max_alloc_size 0x" << std::hex << max_alloc_size
5231 << " prefer_deferred_size 0x" << prefer_deferred_size
5232 << std::dec
5233 << " deferred_batch_ops " << deferred_batch_ops
5234 << dendl;
5235 }
5236
5237 int BlueStore::_open_bdev(bool create)
5238 {
5239 ceph_assert(bdev == NULL);
5240 string p = path + "/block";
5241 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
5242 int r = bdev->open(p);
5243 if (r < 0)
5244 goto fail;
5245
5246 if (create && cct->_conf->bdev_enable_discard) {
5247 bdev->discard(0, bdev->get_size());
5248 }
5249
5250 if (bdev->supported_bdev_label()) {
5251 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5252 if (r < 0)
5253 goto fail_close;
5254 }
5255
5256 // initialize global block parameters
5257 block_size = bdev->get_block_size();
5258 block_mask = ~(block_size - 1);
5259 block_size_order = ctz(block_size);
5260 ceph_assert(block_size == 1u << block_size_order);
5261 _set_max_defer_interval();
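// e.g. a 4 KiB device block gives block_size_order = 12 and
// block_mask = ~0xfff, so (x & block_mask) rounds x down to a
// block boundary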
5262 // and set cache_size based on device type
5263 r = _set_cache_sizes();
5264 if (r < 0) {
5265 goto fail_close;
5266 }
5267
5268 if (bdev->is_smr()) {
5269 freelist_type = "zoned";
5270 }
5271 return 0;
5272
5273 fail_close:
5274 bdev->close();
5275 fail:
5276 delete bdev;
5277 bdev = NULL;
5278 return r;
5279 }
5280
5281 void BlueStore::_validate_bdev()
5282 {
5283 ceph_assert(bdev);
5284 uint64_t dev_size = bdev->get_size();
5285 ceph_assert(dev_size > _get_ondisk_reserved());
5286 }
5287
5288 void BlueStore::_close_bdev()
5289 {
5290 ceph_assert(bdev);
5291 bdev->close();
5292 delete bdev;
5293 bdev = NULL;
5294 }
5295
5296 int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
5297 {
5298 int r;
5299
5300 ceph_assert(fm == NULL);
5301 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5302 ceph_assert(fm);
5303 if (t) {
5304 // create mode. initialize freespace
5305 dout(20) << __func__ << " initializing freespace" << dendl;
5306 {
5307 bufferlist bl;
5308 bl.append(freelist_type);
5309 t->set(PREFIX_SUPER, "freelist_type", bl);
5310 }
5311 // being able to allocate in units less than bdev block size
5312 // seems to be a bad idea.
5313 ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
5314
5315 uint64_t alloc_size = min_alloc_size;
5316 if (bdev->is_smr()) {
5317 alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
5318 }
5319
5320 fm->create(bdev->get_size(), alloc_size, t);
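// the freelist tracks space at alloc_size granularity from the start;
// on SMR devices the zone parameters were piggybacked onto alloc_size
// above so the zoned freelist can recover them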
5321
5322 // allocate superblock reserved space. note that we do not mark
5323 // bluefs space as allocated in the freelist; we instead rely on
5324 // bluefs doing that itself.
5325 auto reserved = _get_ondisk_reserved();
5326 fm->allocate(0, reserved, t);
5327
5328 if (cct->_conf->bluestore_debug_prefill > 0) {
5329 uint64_t end = bdev->get_size() - reserved;
5330 dout(1) << __func__ << " pre-fragmenting freespace, using "
5331 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5332 << cct->_conf->bluestore_debug_prefragment_max << dendl;
5333 uint64_t start = p2roundup(reserved, min_alloc_size);
5334 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5335 float r = cct->_conf->bluestore_debug_prefill;
5336 r /= 1.0 - r;
5337 bool stop = false;
5338
5339 while (!stop && start < end) {
5340 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5341 if (start + l > end) {
5342 l = end - start;
5343 l = p2align(l, min_alloc_size);
5344 }
5345 ceph_assert(start + l <= end);
5346
5347 uint64_t u = 1 + (uint64_t)(r * (double)l);
5348 u = p2roundup(u, min_alloc_size);
5349 if (start + l + u > end) {
5350 u = end - (start + l);
5351 // trim to align so we don't overflow again
5352 u = p2align(u, min_alloc_size);
5353 stop = true;
5354 }
5355 ceph_assert(start + l + u <= end);
5356
5357 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
5358 << " use 0x" << u << std::dec << dendl;
5359
5360 if (u == 0) {
5361 // break if u has been trimmed to nothing
5362 break;
5363 }
5364
5365 fm->allocate(start + l, u, t);
5366 start += l + u;
5367 }
5368 }
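// the math above: with r = p / (1 - p), alternating a free extent of
// length l with a used extent u ~= r * l gives a used fraction of
// u / (l + u) = p; e.g. p = 0.2 yields u ~= 0.25 * l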
5369 r = _write_out_fm_meta(0);
5370 ceph_assert(r == 0);
5371 } else {
5372 r = fm->init(db, read_only,
5373 [&](const std::string& key, std::string* result) {
5374 return read_meta(key, result);
5375 });
5376 if (r < 0) {
5377 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5378 delete fm;
5379 fm = NULL;
5380 return r;
5381 }
5382 }
5383 // If the space size tracked by the freelist manager is higher than the
5384 // actual device size, we can hit an out-of-space allocation, which will
5385 // result in data loss and/or assertions.
5386 // Probably the user altered the device size somehow.
5387 // The only fix for now is to redeploy the OSD.
5388 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5389 ostringstream ss;
5390 ss << "slow device size mismatch detected, "
5391 << "fm size(" << fm->get_size()
5392 << ") > slow device size(" << bdev->get_size()
5393 << "), Please stop using this OSD as it might cause data loss.";
5394 _set_disk_size_mismatch_alert(ss.str());
5395 }
5396 return 0;
5397 }
5398
5399 void BlueStore::_close_fm()
5400 {
5401 dout(10) << __func__ << dendl;
5402 ceph_assert(fm);
5403 fm->shutdown();
5404 delete fm;
5405 fm = NULL;
5406 }
5407
5408 int BlueStore::_write_out_fm_meta(uint64_t target_size)
5409 {
5410 int r = 0;
5411 string p = path + "/block";
5412
5413 std::vector<std::pair<string, string>> fm_meta;
5414 fm->get_meta(target_size, &fm_meta);
5415
5416 for (auto& m : fm_meta) {
5417 r = write_meta(m.first, m.second);
5418 ceph_assert(r == 0);
5419 }
5420 return r;
5421 }
5422
5423 int BlueStore::_create_alloc()
5424 {
5425 ceph_assert(shared_alloc.a == NULL);
5426 ceph_assert(bdev->get_size());
5427
5428 uint64_t alloc_size = min_alloc_size;
5429 if (bdev->is_smr()) {
5430 int r = _zoned_check_config_settings();
5431 if (r < 0)
5432 return r;
5433 alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
5434 }
5435
5436 shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
5437 bdev->get_size(),
5438 alloc_size, "block"));
5439
5440 if (!shared_alloc.a) {
5441 lderr(cct) << __func__ << " failed to create allocator: "
5442 << cct->_conf->bluestore_allocator
5443 << dendl;
5444 return -EINVAL;
5445 }
5446 return 0;
5447 }
5448
5449 int BlueStore::_init_alloc()
5450 {
5451 int r = _create_alloc();
5452 if (r < 0) {
5453 return r;
5454 }
5455 ceph_assert(shared_alloc.a != NULL);
5456
5457 if (bdev->is_smr()) {
5458 shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
5459 }
5460
5461 uint64_t num = 0, bytes = 0;
5462
5463 dout(1) << __func__ << " opening allocation metadata" << dendl;
5464 // initialize from freelist
5465 fm->enumerate_reset();
5466 uint64_t offset, length;
5467 while (fm->enumerate_next(db, &offset, &length)) {
5468 shared_alloc.a->init_add_free(offset, length);
5469 ++num;
5470 bytes += length;
5471 }
5472 fm->enumerate_reset();
5473
5474 dout(1) << __func__
5475 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
5476 << std::hex
5477 << ", allocator type " << shared_alloc.a->get_type()
5478 << ", capacity 0x" << shared_alloc.a->get_capacity()
5479 << ", block size 0x" << shared_alloc.a->get_block_size()
5480 << ", free 0x" << shared_alloc.a->get_free()
5481 << ", fragmentation " << shared_alloc.a->get_fragmentation()
5482 << std::dec << dendl;
5483
5484 return 0;
5485 }
5486
5487 void BlueStore::_close_alloc()
5488 {
5489 ceph_assert(bdev);
5490 bdev->discard_drain();
5491
5492 ceph_assert(shared_alloc.a);
5493 shared_alloc.a->shutdown();
5494 delete shared_alloc.a;
5495 shared_alloc.reset();
5496 }
5497
5498 int BlueStore::_open_fsid(bool create)
5499 {
5500 ceph_assert(fsid_fd < 0);
5501 int flags = O_RDWR|O_CLOEXEC;
5502 if (create)
5503 flags |= O_CREAT;
5504 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5505 if (fsid_fd < 0) {
5506 int err = -errno;
5507 derr << __func__ << " " << cpp_strerror(err) << dendl;
5508 return err;
5509 }
5510 return 0;
5511 }
5512
5513 int BlueStore::_read_fsid(uuid_d *uuid)
5514 {
5515 char fsid_str[40];
5516 memset(fsid_str, 0, sizeof(fsid_str));
5517 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5518 if (ret < 0) {
5519 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5520 return ret;
5521 }
5522 if (ret > 36)
5523 fsid_str[36] = 0;
5524 else
5525 fsid_str[ret] = 0;
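// a canonical uuid string is 36 characters; anything beyond that,
// e.g. the trailing newline written by _write_fsid, is cut off
// before parsing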
5526 if (!uuid->parse(fsid_str)) {
5527 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5528 return -EINVAL;
5529 }
5530 return 0;
5531 }
5532
5533 int BlueStore::_write_fsid()
5534 {
5535 int r = ::ftruncate(fsid_fd, 0);
5536 if (r < 0) {
5537 r = -errno;
5538 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5539 return r;
5540 }
5541 string str = stringify(fsid) + "\n";
5542 r = safe_write(fsid_fd, str.c_str(), str.length());
5543 if (r < 0) {
5544 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5545 return r;
5546 }
5547 r = ::fsync(fsid_fd);
5548 if (r < 0) {
5549 r = -errno;
5550 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5551 return r;
5552 }
5553 return 0;
5554 }
5555
5556 void BlueStore::_close_fsid()
5557 {
5558 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5559 fsid_fd = -1;
5560 }
5561
5562 int BlueStore::_lock_fsid()
5563 {
5564 struct flock l;
5565 memset(&l, 0, sizeof(l));
5566 l.l_type = F_WRLCK;
5567 l.l_whence = SEEK_SET;
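// l_start and l_len stay zero from the memset above, requesting a
// write lock over the whole file; F_SETLK fails immediately rather
// than blocking if another process already holds the lock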
5568 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5569 if (r < 0) {
5570 int err = errno;
5571 derr << __func__ << " failed to lock " << path << "/fsid"
5572 << " (is another ceph-osd still running?)"
5573 << cpp_strerror(err) << dendl;
5574 return -err;
5575 }
5576 return 0;
5577 }
5578
5579 bool BlueStore::is_rotational()
5580 {
5581 if (bdev) {
5582 return bdev->is_rotational();
5583 }
5584
5585 bool rotational = true;
5586 int r = _open_path();
5587 if (r < 0)
5588 goto out;
5589 r = _open_fsid(false);
5590 if (r < 0)
5591 goto out_path;
5592 r = _read_fsid(&fsid);
5593 if (r < 0)
5594 goto out_fsid;
5595 r = _lock_fsid();
5596 if (r < 0)
5597 goto out_fsid;
5598 r = _open_bdev(false);
5599 if (r < 0)
5600 goto out_fsid;
5601 rotational = bdev->is_rotational();
5602 _close_bdev();
5603 out_fsid:
5604 _close_fsid();
5605 out_path:
5606 _close_path();
5607 out:
5608 return rotational;
5609 }
5610
5611 bool BlueStore::is_journal_rotational()
5612 {
5613 if (!bluefs) {
5614 dout(5) << __func__ << " bluefs disabled, default to store media type"
5615 << dendl;
5616 return is_rotational();
5617 }
5618 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5619 return bluefs->wal_is_rotational();
5620 }
5621
5622 bool BlueStore::_use_rotational_settings()
5623 {
5624 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5625 return true;
5626 }
5627 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5628 return false;
5629 }
5630 return bdev->is_rotational();
5631 }
5632
5633 bool BlueStore::test_mount_in_use()
5634 {
5635 // most error conditions mean the mount is not in use (e.g., because
5636 // it doesn't exist). only if we fail to lock do we conclude it is
5637 // in use.
5638 bool ret = false;
5639 int r = _open_path();
5640 if (r < 0)
5641 return false;
5642 r = _open_fsid(false);
5643 if (r < 0)
5644 goto out_path;
5645 r = _lock_fsid();
5646 if (r < 0)
5647 ret = true; // if we can't lock, it is in use
5648 _close_fsid();
5649 out_path:
5650 _close_path();
5651 return ret;
5652 }
5653
5654 int BlueStore::_minimal_open_bluefs(bool create)
5655 {
5656 int r;
5657 bluefs = new BlueFS(cct);
5658
5659 string bfn;
5660 struct stat st;
5661
5662 bfn = path + "/block.db";
5663 if (::stat(bfn.c_str(), &st) == 0) {
5664 r = bluefs->add_block_device(
5665 BlueFS::BDEV_DB, bfn,
5666 create && cct->_conf->bdev_enable_discard,
5667 SUPER_RESERVED);
5668 if (r < 0) {
5669 derr << __func__ << " add block device(" << bfn << ") returned: "
5670 << cpp_strerror(r) << dendl;
5671 goto free_bluefs;
5672 }
5673
5674 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5675 r = _check_or_set_bdev_label(
5676 bfn,
5677 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5678 "bluefs db", create);
5679 if (r < 0) {
5680 derr << __func__
5681 << " check block device(" << bfn << ") label returned: "
5682 << cpp_strerror(r) << dendl;
5683 goto free_bluefs;
5684 }
5685 }
5686 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5687 bluefs_layout.dedicated_db = true;
5688 } else {
5689 r = -errno;
5690 if (::lstat(bfn.c_str(), &st) == -1) {
5691 r = 0;
5692 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
5693 } else {
5694 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5695 << cpp_strerror(r) << dendl;
5696 goto free_bluefs;
5697 }
5698 }
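// net effect: with a block.db symlink present the DB gets a dedicated
// device and the slow device is the shared one; without it the DB
// role falls onto the shared device itself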
5699
5700 // shared device
5701 bfn = path + "/block";
5702 // never trim here
5703 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
5704 0, // no need to provide valid 'reserved' for shared dev
5705 &shared_alloc);
5706 if (r < 0) {
5707 derr << __func__ << " add block device(" << bfn << ") returned: "
5708 << cpp_strerror(r) << dendl;
5709 goto free_bluefs;
5710 }
5711
5712 bfn = path + "/block.wal";
5713 if (::stat(bfn.c_str(), &st) == 0) {
5714 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
5715 create && cct->_conf->bdev_enable_discard,
5716 BDEV_LABEL_BLOCK_SIZE);
5717 if (r < 0) {
5718 derr << __func__ << " add block device(" << bfn << ") returned: "
5719 << cpp_strerror(r) << dendl;
5720 goto free_bluefs;
5721 }
5722
5723 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5724 r = _check_or_set_bdev_label(
5725 bfn,
5726 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5727 "bluefs wal", create);
5728 if (r < 0) {
5729 derr << __func__ << " check block device(" << bfn
5730 << ") label returned: " << cpp_strerror(r) << dendl;
5731 goto free_bluefs;
5732 }
5733 }
5734
5735 bluefs_layout.dedicated_wal = true;
5736 } else {
5737 r = 0;
5738 if (::lstat(bfn.c_str(), &st) != -1) {
5739 r = -errno;
5740 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5741 << cpp_strerror(r) << dendl;
5742 goto free_bluefs;
5743 }
5744 }
5745 return 0;
5746
5747 free_bluefs:
5748 ceph_assert(bluefs);
5749 delete bluefs;
5750 bluefs = NULL;
5751 return r;
5752 }
5753
5754 int BlueStore::_open_bluefs(bool create, bool read_only)
5755 {
5756 int r = _minimal_open_bluefs(create);
5757 if (r < 0) {
5758 return r;
5759 }
5760 BlueFSVolumeSelector* vselector = nullptr;
5761 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5762
5763 string options = cct->_conf->bluestore_rocksdb_options;
5764 string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
5765 if (!options_annex.empty()) {
5766 if (!options.empty() &&
5767 *options.rbegin() != ',') {
5768 options += ',';
5769 }
5770 options += options_annex;
5771 }
5772
5773 rocksdb::Options rocks_opts;
5774 r = RocksDBStore::ParseOptionsFromStringStatic(
5775 cct,
5776 options,
5777 rocks_opts,
5778 nullptr);
5779 if (r < 0) {
5780 return r;
5781 }
5782 if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
5783 vselector = new FitToFastVolumeSelector(
5784 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5785 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5786 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
5787 } else {
5788 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5789 vselector =
5790 new RocksDBBlueFSVolumeSelector(
5791 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5792 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5793 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5794 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5795 rocks_opts.max_bytes_for_level_base,
5796 rocks_opts.max_bytes_for_level_multiplier,
5797 reserved_factor,
5798 cct->_conf->bluestore_volume_selection_reserved,
5799 cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
5800 }
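// the 95/100 factors above leave ~5% headroom on each BlueFS device
// when sizing the volume selector, so it never plans right up to raw
// device capacity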
5801 }
5802 if (create) {
5803 bluefs->mkfs(fsid, bluefs_layout);
5804 }
5805 bluefs->set_volume_selector(vselector);
5806 r = bluefs->mount();
5807 if (r < 0) {
5808 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5809 }
5810 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
5811 return r;
5812}
5813
5814 void BlueStore::_close_bluefs(bool cold_close)
5815 {
5816 bluefs->umount(cold_close);
5817 _minimal_close_bluefs();
5818}
5819
5820void BlueStore::_minimal_close_bluefs()
5821{
5822 delete bluefs;
5823 bluefs = NULL;
5824}
5825
5826int BlueStore::_is_bluefs(bool create, bool* ret)
5827{
5828 if (create) {
5829 *ret = cct->_conf->bluestore_bluefs;
5830 } else {
5831 string s;
5832 int r = read_meta("bluefs", &s);
5833 if (r < 0) {
5834 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5835 return -EIO;
5836 }
5837 if (s == "1") {
5838 *ret = true;
5839 } else if (s == "0") {
5840 *ret = false;
5841 } else {
5842 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5843 << dendl;
5844 return -EIO;
5845 }
5846 }
5847 return 0;
5848 }
5849
5850 /*
5851 * opens both DB and dependent super_meta, FreelistManager and allocator
5852 * in the proper order
5853 */
5854 int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
5855 {
5856 dout(0) << __func__ << " read-only:" << read_only
5857 << " repair:" << to_repair << dendl;
5858 {
5859 string type;
5860 int r = read_meta("type", &type);
5861 if (r < 0) {
5862 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
5863 << dendl;
5864 return r;
5865 }
5866
5867 if (type != "bluestore") {
5868 derr << __func__ << " expected bluestore, but type is " << type << dendl;
5869 return -EIO;
5870 }
5871 }
5872
5873 int r = _open_path();
5874 if (r < 0)
5875 return r;
5876 r = _open_fsid(false);
5877 if (r < 0)
5878 goto out_path;
5879
5880 r = _read_fsid(&fsid);
5881 if (r < 0)
5882 goto out_fsid;
5883
5884 r = _lock_fsid();
5885 if (r < 0)
5886 goto out_fsid;
5887
5888 r = _open_bdev(false);
5889 if (r < 0)
5890 goto out_fsid;
5891
5892 // open in read-only first to read FM list and init allocator
5893 // as they might be needed for some BlueFS procedures
5894 r = _open_db(false, false, true);
5895 if (r < 0)
5896 goto out_bdev;
5897
5898 r = _open_super_meta();
5899 if (r < 0) {
5900 goto out_db;
5901 }
5902
5903 r = _open_fm(nullptr, true);
5904 if (r < 0)
5905 goto out_db;
5906
5907 r = _init_alloc();
5908 if (r < 0)
5909 goto out_fm;
5910
5911 // Re-open in the proper mode(s).
5912
5913 // Can't simply bypass second open for read-only mode as we need to
5914 // load allocated extents from bluefs into allocator.
5915 // And now it's time to do that
5916 //
5917 _close_db(true);
5918
5919 r = _open_db(false, to_repair, read_only);
5920 if (r < 0) {
5921 goto out_alloc;
5922 }
5923 return 0;
5924
5925out_alloc:
5926 _close_alloc();
5927out_fm:
5928 _close_fm();
5929 out_db:
5930 _close_db(read_only);
5931 out_bdev:
5932 _close_bdev();
5933 out_fsid:
5934 _close_fsid();
5935 out_path:
5936 _close_path();
5937 return r;
5938 }
5939
5940 void BlueStore::_close_db_and_around(bool read_only)
5941 {
5942 _close_db(read_only);
5943 _close_fm();
5944 _close_alloc();
5945 _close_bdev();
5946 _close_fsid();
5947 _close_path();
5948 }
5949
5950int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
5951{
5952 _kv_only = true;
5953 int r = _open_db_and_around(false, to_repair);
5954 if (r == 0) {
5955 *pdb = db;
5956 } else {
5957 *pdb = nullptr;
5958 }
5959 return r;
5960 }
5961
5962 int BlueStore::close_db_environment()
5963 {
5964 _close_db_and_around(false);
5965 return 0;
5966 }
5967
5968int BlueStore::_prepare_db_environment(bool create, bool read_only,
5969 std::string* _fn, std::string* _kv_backend)
5970{
5971 int r;
5972 ceph_assert(!db);
5973 std::string& fn=*_fn;
5974 std::string& kv_backend=*_kv_backend;
5975 fn = path + "/db";
5976 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5977
5978 if (create) {
5979 kv_backend = cct->_conf->bluestore_kvbackend;
5980 } else {
5981 r = read_meta("kv_backend", &kv_backend);
5982 if (r < 0) {
5983 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5984 return -EIO;
5985 }
5986 }
5987 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5988
5989 bool do_bluefs;
5990 r = _is_bluefs(create, &do_bluefs);
5991 if (r < 0) {
5992 return r;
5993 }
5994 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5995
5996 map<string,string> kv_options;
5997 // force separate wal dir for all new deployments.
5998 kv_options["separate_wal_dir"] = 1;
5999 rocksdb::Env *env = NULL;
6000 if (do_bluefs) {
6001 dout(10) << __func__ << " initializing bluefs" << dendl;
6002 if (kv_backend != "rocksdb") {
6003 derr << " backend must be rocksdb to use bluefs" << dendl;
6004 return -EINVAL;
6005 }
6006
6007 r = _open_bluefs(create, read_only);
6008 if (r < 0) {
6009 return r;
6010 }
6011
6012 if (cct->_conf->bluestore_bluefs_env_mirror) {
6013 rocksdb::Env* a = new BlueRocksEnv(bluefs);
6014 rocksdb::Env* b = rocksdb::Env::Default();
6015 if (create) {
6016 string cmd = "rm -rf " + path + "/db " +
6017 path + "/db.slow " +
6018 path + "/db.wal";
6019 int r = system(cmd.c_str());
6020 (void)r;
6021 }
6022 env = new rocksdb::EnvMirror(b, a, false, true);
6023 } else {
6024 env = new BlueRocksEnv(bluefs);
6025
6026 // simplify the dir names, too, as "seen" by rocksdb
6027 fn = "db";
6028 }
6029 BlueFSVolumeSelector::paths paths;
6030 bluefs->get_vselector_paths(fn, paths);
6031
6032 {
6033 ostringstream db_paths;
6034 bool first = true;
6035 for (auto& p : paths) {
6036 if (!first) {
6037 db_paths << " ";
6038 }
6039 first = false;
6040 db_paths << p.first << "," << p.second;
6041
6042 }
6043 kv_options["db_paths"] = db_paths.str();
6044 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
6045 }
6046
6047 if (create) {
6048 for (auto& p : paths) {
6049 env->CreateDir(p.first);
6050 }
6051 // Selectors don't provide a wal path so far, hence create it explicitly
6052 env->CreateDir(fn + ".wal");
6053 } else {
6054 std::vector<std::string> res;
6055 // check for dir presence
6056 auto r = env->GetChildren(fn+".wal", &res);
6057 if (r.IsNotFound()) {
6058 kv_options.erase("separate_wal_dir");
6059 }
6060 }
6061 } else {
6062 string walfn = path + "/db.wal";
6063
6064 if (create) {
6065 int r = ::mkdir(fn.c_str(), 0755);
6066 if (r < 0)
6067 r = -errno;
6068 if (r < 0 && r != -EEXIST) {
6069 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6070 << dendl;
6071 return r;
6072 }
6073
6074 // wal_dir, too!
6075 r = ::mkdir(walfn.c_str(), 0755);
6076 if (r < 0)
6077 r = -errno;
6078 if (r < 0 && r != -EEXIST) {
6079 derr << __func__ << " failed to create " << walfn
6080 << ": " << cpp_strerror(r)
6081 << dendl;
6082 return r;
6083 }
6084 } else {
6085 struct stat st;
6086 r = ::stat(walfn.c_str(), &st);
6087 if (r < 0 && errno == ENOENT) {
6088 kv_options.erase("separate_wal_dir");
6089 }
6090 }
6091 }
6092
6093
6094 db = KeyValueDB::create(cct,
6095 kv_backend,
6096 fn,
6097 kv_options,
6098 static_cast<void*>(env));
6099 if (!db) {
6100 derr << __func__ << " error creating db" << dendl;
6101 if (bluefs) {
6102 _close_bluefs(read_only);
6103 }
6104 // delete env manually here since we can't depend on db to do this
6105 // under this case
6106 delete env;
6107 env = NULL;
6108 return -EIO;
6109 }
6110
6111 FreelistManager::setup_merge_operators(db, freelist_type);
6112 db->set_merge_operator(PREFIX_STAT, merge_op);
6113 db->set_cache_size(cache_kv_ratio * cache_size);
6114 return 0;
6115 }
6116
6117int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
6118{
6119 int r;
6120 ceph_assert(!(create && read_only));
6121 string options;
6122 string options_annex;
6123 stringstream err;
6124 string kv_dir_fn;
6125 string kv_backend;
6126 std::string sharding_def;
6127 r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
6128 if (r < 0) {
6129 derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
6130 return -EIO;
6131 }
6132 if (kv_backend == "rocksdb") {
6133 options = cct->_conf->bluestore_rocksdb_options;
6134 options_annex = cct->_conf->bluestore_rocksdb_options_annex;
6135 if (!options_annex.empty()) {
6136 if (!options.empty() &&
6137 *options.rbegin() != ',') {
6138 options += ',';
6139 }
6140 options += options_annex;
6141 }
6142
6143 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6144 sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
6145 }
6146 }
6147
6148 db->init(options);
6149 if (to_repair_db)
6150 return 0;
6151 if (create) {
6152 r = db->create_and_open(err, sharding_def);
6153 } else {
6154 // we pass in cf list here, but it is only used if the db already has
6155 // column families created.
6156 r = read_only ?
6157 db->open_read_only(err, sharding_def) :
6158 db->open(err, sharding_def);
6159 }
6160 if (r) {
6161 derr << __func__ << " error opening db: " << err.str() << dendl;
6162 _close_db(read_only);
6163 return -EIO;
6164 }
6165 dout(1) << __func__ << " opened " << kv_backend
6166 << " path " << kv_dir_fn << " options " << options << dendl;
6167 return 0;
6168 }
6169
6170 void BlueStore::_close_db(bool cold_close)
6171 {
6172 ceph_assert(db);
6173 delete db;
6174 db = NULL;
6175 if (bluefs) {
6176 _close_bluefs(cold_close);
6177 }
6178}
6179
6180 void BlueStore::_dump_alloc_on_failure()
6181 {
6182 auto dump_interval =
6183 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6184 if (dump_interval > 0 &&
6185 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6186 shared_alloc.a->dump();
6187 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6188 next_dump_on_bluefs_alloc_failure += dump_interval;
6189 }
6190 }
6191
6192 int BlueStore::_open_collections()
6193 {
6194 dout(10) << __func__ << dendl;
6195 collections_had_errors = false;
6196 ceph_assert(coll_map.empty());
6197 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6198 for (it->upper_bound(string());
6199 it->valid();
6200 it->next()) {
6201 coll_t cid;
6202 if (cid.parse(it->key())) {
6203 auto c = ceph::make_ref<Collection>(
6204 this,
6205 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6206 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6207 cid);
6208 bufferlist bl = it->value();
6209 auto p = bl.cbegin();
6210 try {
6211 decode(c->cnode, p);
6212 } catch (ceph::buffer::error& e) {
6213 derr << __func__ << " failed to decode cnode, key:"
6214 << pretty_binary_string(it->key()) << dendl;
6215 return -EIO;
6216 }
6217 dout(20) << __func__ << " opened " << cid << " " << c
6218 << " " << c->cnode << dendl;
6219 _osr_attach(c.get());
6220 coll_map[cid] = c;
6221
6222 } else {
6223 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6224 collections_had_errors = true;
6225 }
6226 }
6227 return 0;
6228 }
6229
6230 void BlueStore::_fsck_collections(int64_t* errors)
6231 {
6232 if (collections_had_errors) {
6233 dout(10) << __func__ << dendl;
6234 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
6235 for (it->upper_bound(string());
6236 it->valid();
6237 it->next()) {
6238 coll_t cid;
6239 if (!cid.parse(it->key())) {
6240 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6241 if (errors) {
6242 (*errors)++;
6243 }
6244 }
6245 }
6246 }
6247 }
6248
6249 void BlueStore::_set_per_pool_omap()
6250 {
6251 per_pool_omap = OMAP_BULK;
6252 bufferlist bl;
6253 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6254 if (bl.length()) {
6255 auto s = bl.to_str();
6256 if (s == stringify(OMAP_PER_POOL)) {
6257 per_pool_omap = OMAP_PER_POOL;
6258 } else if (s == stringify(OMAP_PER_PG)) {
6259 per_pool_omap = OMAP_PER_PG;
6260 } else {
6261 ceph_assert(s == stringify(OMAP_BULK));
6262 }
6263 dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
6264 } else {
6265 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6266 }
6267 _check_no_per_pg_or_pool_omap_alert();
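// three omap layouts exist: OMAP_BULK (all objects in one namespace),
// OMAP_PER_POOL, and OMAP_PER_PG (what mkfs writes for new stores
// unless bluestore_debug_legacy_omap is set)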
6268 }
6269
6270 void BlueStore::_open_statfs()
6271 {
6272 osd_pools.clear();
6273 vstatfs.reset();
6274
6275 bufferlist bl;
6276 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
6277 if (r >= 0) {
6278 per_pool_stat_collection = false;
6279 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
6280 auto it = bl.cbegin();
6281 vstatfs.decode(it);
6282 dout(10) << __func__ << " store_statfs is found" << dendl;
6283 } else {
31f18b77
FG
6284 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6285 }
6286 _check_legacy_statfs_alert();
6287 } else {
6288 per_pool_stat_collection = true;
6289 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6290 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
6291 for (it->upper_bound(string());
6292 it->valid();
6293 it->next()) {
6294
6295 uint64_t pool_id;
6296 int r = get_key_pool_stat(it->key(), &pool_id);
6297 ceph_assert(r == 0);
6298
6299 bufferlist bl;
6300 bl = it->value();
6301 auto p = bl.cbegin();
6302 auto& st = osd_pools[pool_id];
6303 try {
6304 st.decode(p);
6305 vstatfs += st;
6306
6307 dout(30) << __func__ << " pool " << pool_id
6308 << " statfs " << st << dendl;
6309 } catch (ceph::buffer::error& e) {
6310 derr << __func__ << " failed to decode pool stats, key:"
6311 << pretty_binary_string(it->key()) << dendl;
6312 }
6313 }
6314 }
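// legacy stores keep a single global statfs key; per-pool mode keeps
// one PREFIX_STAT entry per pool id, and vstatfs accumulates them so
// the totals stay available either way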
6315 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6316
6317 }
6318
6319 int BlueStore::_setup_block_symlink_or_file(
6320 string name,
6321 string epath,
6322 uint64_t size,
6323 bool create)
6324{
6325 dout(20) << __func__ << " name " << name << " path " << epath
6326 << " size " << size << " create=" << (int)create << dendl;
6327 int r = 0;
6328 int flags = O_RDWR|O_CLOEXEC;
6329 if (create)
6330 flags |= O_CREAT;
6331 if (epath.length()) {
6332 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6333 if (r < 0) {
6334 r = -errno;
6335 derr << __func__ << " failed to create " << name << " symlink to "
6336 << epath << ": " << cpp_strerror(r) << dendl;
6337 return r;
6338 }
6339
6340 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6341 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6342 if (fd < 0) {
6343 r = -errno;
6344 derr << __func__ << " failed to open " << epath << " file: "
6345 << cpp_strerror(r) << dendl;
6346 return r;
6347 }
6348 // write the Transport ID of the NVMe device
6349 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6350 // where "0000:02:00.0" is the selector of a PCI device, see
6351 // the first column of "lspci -mm -n -D"
6352 string trid{"trtype:PCIe "};
6353 trid += "traddr:";
6354 trid += epath.substr(strlen(SPDK_PREFIX));
6355 r = ::write(fd, trid.c_str(), trid.size());
6356 ceph_assert(r == static_cast<int>(trid.size()));
6357 dout(1) << __func__ << " created " << name << " symlink to "
6358 << epath << dendl;
6359 VOID_TEMP_FAILURE_RETRY(::close(fd));
6360 }
6361 }
6362 if (size) {
6363 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6364 if (fd >= 0) {
6365 // block file is present
6366 struct stat st;
6367 int r = ::fstat(fd, &st);
6368 if (r == 0 &&
6369 S_ISREG(st.st_mode) && // if it is a regular file
6370 st.st_size == 0) { // and is 0 bytes
6371 r = ::ftruncate(fd, size);
6372 if (r < 0) {
6373 r = -errno;
6374 derr << __func__ << " failed to resize " << name << " file to "
6375 << size << ": " << cpp_strerror(r) << dendl;
6376 VOID_TEMP_FAILURE_RETRY(::close(fd));
6377 return r;
6378 }
6379
6380 if (cct->_conf->bluestore_block_preallocate_file) {
6381 r = ::ceph_posix_fallocate(fd, 0, size);
6382 if (r > 0) {
6383 derr << __func__ << " failed to prefallocate " << name << " file to "
6384 << size << ": " << cpp_strerror(r) << dendl;
6385 VOID_TEMP_FAILURE_RETRY(::close(fd));
6386 return -r;
6387 }
6388 }
6389 dout(1) << __func__ << " resized " << name << " file to "
6390 << byte_u_t(size) << dendl;
6391 }
6392 VOID_TEMP_FAILURE_RETRY(::close(fd));
6393 } else {
6394 int r = -errno;
6395 if (r != -ENOENT) {
6396 derr << __func__ << " failed to open " << name << " file: "
6397 << cpp_strerror(r) << dendl;
6398 return r;
6399 }
6400 }
6401 }
6402 return 0;
6403 }
6404
6405 int BlueStore::mkfs()
6406 {
6407 dout(1) << __func__ << " path " << path << dendl;
6408 int r;
6409 uuid_d old_fsid;
6410 uint64_t reserved;
6411 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6412 derr << __func__ << " osd_max_object_size "
6413 << cct->_conf->osd_max_object_size << " > bluestore max "
6414 << OBJECT_MAX_SIZE << dendl;
6415 return -EINVAL;
6416 }
6417
6418 {
6419 string done;
6420 r = read_meta("mkfs_done", &done);
6421 if (r == 0) {
6422 dout(1) << __func__ << " already created" << dendl;
6423 if (cct->_conf->bluestore_fsck_on_mkfs) {
6424 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6425 if (r < 0) {
6426 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6427 << dendl;
6428 return r;
6429 }
6430 if (r > 0) {
6431 derr << __func__ << " fsck found " << r << " errors" << dendl;
6432 r = -EIO;
6433 }
6434 }
6435 return r; // idempotent
6436 }
6437 }
6438
6439 {
6440 string type;
6441 r = read_meta("type", &type);
6442 if (r == 0) {
6443 if (type != "bluestore") {
6444 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6445 return -EIO;
6446 }
6447 } else {
6448 r = write_meta("type", "bluestore");
6449 if (r < 0)
6450 return r;
6451 }
6452 }
6453
6454 freelist_type = "bitmap";
6455
6456 r = _open_path();
6457 if (r < 0)
6458 return r;
6459
6460 r = _open_fsid(true);
6461 if (r < 0)
6462 goto out_path_fd;
6463
6464 r = _lock_fsid();
6465 if (r < 0)
6466 goto out_close_fsid;
6467
6468 r = _read_fsid(&old_fsid);
6469 if (r < 0 || old_fsid.is_zero()) {
6470 if (fsid.is_zero()) {
6471 fsid.generate_random();
6472 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6473 } else {
6474 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6475 }
6476 // we'll write it later.
6477 } else {
6478 if (!fsid.is_zero() && fsid != old_fsid) {
6479 derr << __func__ << " on-disk fsid " << old_fsid
6480 << " != provided " << fsid << dendl;
6481 r = -EINVAL;
6482 goto out_close_fsid;
6483 }
6484 fsid = old_fsid;
6485 }
6486
6487 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6488 cct->_conf->bluestore_block_size,
6489 cct->_conf->bluestore_block_create);
6490 if (r < 0)
6491 goto out_close_fsid;
6492 if (cct->_conf->bluestore_bluefs) {
6493 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6494 cct->_conf->bluestore_block_wal_size,
6495 cct->_conf->bluestore_block_wal_create);
6496 if (r < 0)
6497 goto out_close_fsid;
6498 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6499 cct->_conf->bluestore_block_db_size,
6500 cct->_conf->bluestore_block_db_create);
6501 if (r < 0)
6502 goto out_close_fsid;
6503 }
6504
6505 r = _open_bdev(true);
6506 if (r < 0)
6507 goto out_close_fsid;
6508
6509 // choose min_alloc_size
6510 if (cct->_conf->bluestore_min_alloc_size) {
6511 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6512 } else {
6513 ceph_assert(bdev);
6514 if (_use_rotational_settings()) {
6515 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6516 } else {
6517 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6518 }
6519 }
6520 _validate_bdev();
6521
6522 // make sure min_alloc_size is power of 2 aligned.
6523 if (!isp2(min_alloc_size)) {
6524 derr << __func__ << " min_alloc_size 0x"
6525 << std::hex << min_alloc_size << std::dec
6526 << " is not power of 2 aligned!"
6527 << dendl;
6528 r = -EINVAL;
6529 goto out_close_bdev;
6530 }
6531
6532 r = _create_alloc();
6533 if (r < 0) {
6534 goto out_close_bdev;
6535 }
6536
6537 reserved = _get_ondisk_reserved();
6538 shared_alloc.a->init_add_free(reserved,
6539 p2align(bdev->get_size(), min_alloc_size) - reserved);
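// everything from the end of the reserved region up to the last
// min_alloc_size-aligned byte of the device becomes allocatable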
6540
6541 r = _open_db(true);
6542 if (r < 0)
6543 goto out_close_alloc;
6544
6545 {
6546 KeyValueDB::Transaction t = db->get_transaction();
6547 r = _open_fm(t, true);
6548 if (r < 0)
6549 goto out_close_db;
6550 {
6551 bufferlist bl;
6552 encode((uint64_t)0, bl);
6553 t->set(PREFIX_SUPER, "nid_max", bl);
6554 t->set(PREFIX_SUPER, "blobid_max", bl);
6555 }
6556
6557 {
6558 bufferlist bl;
6559 encode((uint64_t)min_alloc_size, bl);
6560 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6561 }
6562 {
6563 bufferlist bl;
6564 if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
6565 bl.append(stringify(OMAP_BULK));
6566 } else {
6567 bl.append(stringify(OMAP_PER_PG));
6568 }
6569 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6570 }
6571 ondisk_format = latest_ondisk_format;
6572 _prepare_ondisk_format_super(t);
6573 db->submit_transaction_sync(t);
6574 }
6575
6576 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6577 if (r < 0)
6578 goto out_close_fm;
6579
6580 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
6581 if (r < 0)
6582 goto out_close_fm;
6583
6584 if (fsid != old_fsid) {
6585 r = _write_fsid();
6586 if (r < 0) {
6587 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
6588 goto out_close_fm;
6589 }
6590 }
6591
6592 out_close_fm:
6593 _close_fm();
6594 out_close_db:
6595 _close_db(false);
6596 out_close_alloc:
6597 _close_alloc();
6598 out_close_bdev:
6599 _close_bdev();
6600 out_close_fsid:
6601 _close_fsid();
6602 out_path_fd:
6603 _close_path();
6604
6605 if (r == 0 &&
6606 cct->_conf->bluestore_fsck_on_mkfs) {
6607 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6608 if (rc < 0)
6609 return rc;
6610 if (rc > 0) {
6611 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6612 r = -EIO;
6613 }
6614 }
6615
6616 if (r == 0) {
6617 // indicate success by writing the 'mkfs_done' file
6618 r = write_meta("mkfs_done", "yes");
6619 }
6620
6621 if (r < 0) {
6622 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6623 } else {
6624 dout(0) << __func__ << " success" << dendl;
6625 }
6626 return r;
6627 }
6628
6629 int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6630 {
6631 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6632 int r;
6633 ceph_assert(path_fd < 0);
6634
6635 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6636
6637 if (!cct->_conf->bluestore_bluefs) {
6638 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6639 return -EIO;
6640 }
6641
6642 r = _open_db_and_around(true);
6643
6644 if (id == BlueFS::BDEV_NEWWAL) {
6645 string p = path + "/block.wal";
6646 r = _setup_block_symlink_or_file("block.wal", dev_path,
6647 cct->_conf->bluestore_block_wal_size,
6648 true);
6649 ceph_assert(r == 0);
6650
6651 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
6652 cct->_conf->bdev_enable_discard,
6653 BDEV_LABEL_BLOCK_SIZE);
6654 ceph_assert(r == 0);
6655
6656 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6657 r = _check_or_set_bdev_label(
6658 p,
6659 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6660 "bluefs wal",
6661 true);
6662 ceph_assert(r == 0);
6663 }
6664
6665 bluefs_layout.dedicated_wal = true;
6666 } else if (id == BlueFS::BDEV_NEWDB) {
6667 string p = path + "/block.db";
6668 r = _setup_block_symlink_or_file("block.db", dev_path,
6669 cct->_conf->bluestore_block_db_size,
6670 true);
6671 ceph_assert(r == 0);
6672
6673 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
6674 cct->_conf->bdev_enable_discard,
6675 SUPER_RESERVED);
6676 ceph_assert(r == 0);
6677
6678 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6679 r = _check_or_set_bdev_label(
6680 p,
6681 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6682 "bluefs db",
6683 true);
6684 ceph_assert(r == 0);
6685 }
6686 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6687 bluefs_layout.dedicated_db = true;
6688 }
6689
6690 bluefs->umount();
6691 bluefs->mount();
6692
6693 r = bluefs->prepare_new_device(id, bluefs_layout);
6694 ceph_assert(r == 0);
6695
6696 if (r < 0) {
6697 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6698 } else {
6699 dout(0) << __func__ << " success" << dendl;
6700 }
6701
6702 _close_db_and_around(true);
6703 return r;
6704 }
6705
6706 int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6707 int id)
6708 {
6709 dout(10) << __func__ << " id:" << id << dendl;
6710 ceph_assert(path_fd < 0);
6711
6712 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6713
6714 if (!cct->_conf->bluestore_bluefs) {
6715 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6716 return -EIO;
6717 }
6718
6719 int r = _open_db_and_around(true);
6720
6721 uint64_t used_space = 0;
6722 for (auto src_id : devs_source) {
6723 used_space += bluefs->get_used(src_id);
6724 }
6725 uint64_t target_free = bluefs->get_free(id);
6726 if (target_free < used_space) {
6727 derr << __func__
6728 << " can't migrate, free space at target: " << target_free
6729 << " is less than required space: " << used_space
6730 << dendl;
6731 r = -ENOSPC;
6732 goto shutdown;
6733 }
6734 if (devs_source.count(BlueFS::BDEV_DB)) {
6735 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6736 bluefs_layout.dedicated_db = false;
6737 }
6738 if (devs_source.count(BlueFS::BDEV_WAL)) {
6739 bluefs_layout.dedicated_wal = false;
6740 }
6741 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
6742 if (r < 0) {
6743 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6744 goto shutdown;
6745 }
6746
6747 if (devs_source.count(BlueFS::BDEV_DB)) {
6748 r = unlink(string(path + "/block.db").c_str());
6749 ceph_assert(r == 0);
6750 }
6751 if (devs_source.count(BlueFS::BDEV_WAL)) {
6752 r = unlink(string(path + "/block.wal").c_str());
6753 ceph_assert(r == 0);
6754 }
6755
6756 shutdown:
6757 _close_db_and_around(true);
6758 return r;
6759 }
6760
6761 int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6762 int id,
6763 const string& dev_path)
6764 {
6765 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6766 int r;
6767 ceph_assert(path_fd < 0);
6768
6769 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6770
6771 if (!cct->_conf->bluestore_bluefs) {
6772 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6773 return -EIO;
6774 }
6775
6776 r = _open_db_and_around(true);
6777
6778 string link_db;
6779 string link_wal;
6780 if (devs_source.count(BlueFS::BDEV_DB) &&
6781 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
6782 link_db = path + "/block.db";
6783 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6784 bluefs_layout.dedicated_db = false;
6785 }
6786 if (devs_source.count(BlueFS::BDEV_WAL)) {
6787 link_wal = path + "/block.wal";
6788 bluefs_layout.dedicated_wal = false;
6789 }
6790
6791 size_t target_size;
6792 string target_name;
6793 if (id == BlueFS::BDEV_NEWWAL) {
6794 target_name = "block.wal";
6795 target_size = cct->_conf->bluestore_block_wal_size;
6796 bluefs_layout.dedicated_wal = true;
6797
6798 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
6799 cct->_conf->bdev_enable_discard,
6800 BDEV_LABEL_BLOCK_SIZE);
6801 ceph_assert(r == 0);
6802
6803 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6804 r = _check_or_set_bdev_label(
6805 dev_path,
6806 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6807 "bluefs wal",
6808 true);
6809 ceph_assert(r == 0);
6810 }
6811 } else if (id == BlueFS::BDEV_NEWDB) {
6812 target_name = "block.db";
6813 target_size = cct->_conf->bluestore_block_db_size;
6814 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6815 bluefs_layout.dedicated_db = true;
6816
6817 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
6818 cct->_conf->bdev_enable_discard,
6819 SUPER_RESERVED);
6820 ceph_assert(r == 0);
6821
6822 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6823 r = _check_or_set_bdev_label(
6824 dev_path,
6825 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6826 "bluefs db",
6827 true);
6828 ceph_assert(r == 0);
6829 }
6830 }
6831
6832 bluefs->umount();
6833 bluefs->mount();
6834
6835 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
6836
6837 if (r < 0) {
6838 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6839 goto shutdown;
6840 }
6841
6842 if (!link_db.empty()) {
6843 r = unlink(link_db.c_str());
6844 ceph_assert(r == 0);
6845 }
6846 if (!link_wal.empty()) {
6847 r = unlink(link_wal.c_str());
6848 ceph_assert(r == 0);
6849 }
6850 r = _setup_block_symlink_or_file(
6851 target_name,
6852 dev_path,
6853 target_size,
6854 true);
6855 ceph_assert(r == 0);
6856 dout(0) << __func__ << " success" << dendl;
6857
6858 shutdown:
6859 _close_db_and_around(true);
6860
6861 return r;
6862 }
6863
6864 string BlueStore::get_device_path(unsigned id)
6865 {
6866 string res;
6867 if (id < BlueFS::MAX_BDEV) {
6868 switch (id) {
6869 case BlueFS::BDEV_WAL:
6870 res = path + "/block.wal";
6871 break;
6872 case BlueFS::BDEV_DB:
6873 if (id == bluefs_layout.shared_bdev) {
6874 res = path + "/block";
6875 } else {
6876 res = path + "/block.db";
6877 }
6878 break;
6879 case BlueFS::BDEV_SLOW:
6880 res = path + "/block";
6881 break;
6882 }
6883 }
6884 return res;
6885 }
6886
6887 int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
6888 {
6889 bluestore_bdev_label_t label;
6890 int r = _read_bdev_label(cct, path, &label);
6891 if (r < 0) {
6892 derr << "unable to read label for " << path << ": "
6893 << cpp_strerror(r) << dendl;
6894 } else {
6895 label.size = size;
6896 r = _write_bdev_label(cct, path, label);
6897 if (r < 0) {
6898 derr << "unable to write label for " << path << ": "
6899 << cpp_strerror(r) << dendl;
6900 }
6901 }
6902 return r;
6903 }
6904
6905 int BlueStore::expand_devices(ostream& out)
6906 {
6907 int r = _open_db_and_around(true);
6908 ceph_assert(r == 0);
6909 bluefs->dump_block_extents(out);
6910 out << "Expanding DB/WAL..." << std::endl;
6911 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
6912 if (devid == bluefs_layout.shared_bdev) {
6913 continue;
6914 }
6915 uint64_t size = bluefs->get_block_device_size(devid);
6916 if (size == 0) {
6917 // no bdev
6918 continue;
6919 }
6920
6921 out << devid
6922 << " : expanding to 0x" << std::hex << size << std::dec << std::endl;
6923 string p = get_device_path(devid);
6924 const char* path = p.c_str();
6925 if (path == nullptr) {
6926 derr << devid
6927 <<": can't find device path " << dendl;
6928 continue;
6929 }
6930 if (bluefs->bdev_support_label(devid)) {
6931 if (_set_bdev_label_size(p, size) >= 0) {
6932 out << devid
6933 << " : size label updated to " << size
6934 << std::endl;
6935 }
6936 }
6937 }
6938 uint64_t size0 = fm->get_size();
6939 uint64_t size = bdev->get_size();
6940 if (size0 < size) {
6941 out << bluefs_layout.shared_bdev
6942 << " : expanding " << " from 0x" << std::hex
6943 << size0 << " to 0x" << size << std::dec << std::endl;
6944 _write_out_fm_meta(size);
6945 if (bdev->supported_bdev_label()) {
6946 if (_set_bdev_label_size(path, size) >= 0) {
6947 out << bluefs_layout.shared_bdev
6948 << " : size label updated to " << size
6949 << std::endl;
6950 }
6951 }
6952 _close_db_and_around(true);
6953
6954 // mount in read/write to sync expansion changes
6955 r = _mount();
6956 ceph_assert(r == 0);
6957 umount();
6958 } else {
6959 _close_db_and_around(true);
6960 }
6961 return r;
6962 }
6963
6964 int BlueStore::dump_bluefs_sizes(ostream& out)
6965 {
6966 int r = _open_db_and_around(true);
6967 ceph_assert(r == 0);
6968 bluefs->dump_block_extents(out);
6969 _close_db_and_around(true);
6970 return r;
6971}
6972
6973void BlueStore::set_cache_shards(unsigned num)
6974{
6975 dout(10) << __func__ << " " << num << dendl;
9f95a23c
TL
6976 size_t oold = onode_cache_shards.size();
6977 size_t bold = buffer_cache_shards.size();
6978 ceph_assert(num >= oold && num >= bold);
6979 onode_cache_shards.resize(num);
6980 buffer_cache_shards.resize(num);
6981 for (unsigned i = oold; i < num; ++i) {
6982 onode_cache_shards[i] =
6983 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6984 logger);
6985 }
6986 for (unsigned i = bold; i < num; ++i) {
6987 buffer_cache_shards[i] =
6988 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6989 logger);
7c673cae
FG
6990 }
6991}
6992
f67539c2 6993int BlueStore::_mount()
7c673cae
FG
6994{
6995 dout(1) << __func__ << " path " << path << dendl;
6996
f67539c2 6997 _kv_only = false;
7c673cae
FG
6998 if (cct->_conf->bluestore_fsck_on_mount) {
6999 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7000 if (rc < 0)
7001 return rc;
7002 if (rc > 0) {
7003 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7004 return -EIO;
7005 }
7006 }
7007
eafe8130
TL
7008 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7009 derr << __func__ << " osd_max_object_size "
7010 << cct->_conf->osd_max_object_size << " > bluestore max "
7011 << OBJECT_MAX_SIZE << dendl;
7012 return -EINVAL;
7013 }
7014
f67539c2 7015 int r = _open_db_and_around(false);
9f95a23c 7016 if (r < 0) {
f67539c2 7017 return r;
11fdf7f2 7018 }
7c673cae 7019
11fdf7f2
TL
7020 r = _upgrade_super();
7021 if (r < 0) {
7c673cae 7022 goto out_db;
11fdf7f2 7023 }
7c673cae
FG
7024
7025 r = _open_collections();
7026 if (r < 0)
11fdf7f2 7027 goto out_db;
7c673cae
FG
7028
7029 r = _reload_logger();
7030 if (r < 0)
7031 goto out_coll;
7032
31f18b77 7033 _kv_start();
7c673cae 7034
f67539c2
TL
7035 if (bdev->is_smr()) {
7036 _zoned_cleaner_start();
7037 }
7038
7c673cae
FG
7039 r = _deferred_replay();
7040 if (r < 0)
7041 goto out_stop;
7042
7043 mempool_thread.init();
7044
f67539c2 7045 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
eafe8130 7046 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c 7047
f67539c2 7048 auto was_per_pool_omap = per_pool_omap;
9f95a23c 7049
eafe8130
TL
7050 dout(1) << __func__ << " quick-fix on mount" << dendl;
7051 _fsck_on_open(FSCK_SHALLOW, true);
7052
7053 //reread statfs
7054 //FIXME minor: replace with actual open/close?
7055 _open_statfs();
eafe8130 7056 _check_legacy_statfs_alert();
9f95a23c
TL
7057
7058 //set again as hopefully it has been fixed
f67539c2 7059 if (was_per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
7060 _set_per_pool_omap();
7061 }
eafe8130
TL
7062 }
7063
7c673cae
FG
7064 mounted = true;
7065 return 0;
7066
7067 out_stop:
f67539c2
TL
7068 if (bdev->is_smr()) {
7069 _zoned_cleaner_stop();
7070 }
7c673cae 7071 _kv_stop();
7c673cae 7072 out_coll:
f6b5b4d7 7073 _shutdown_cache();
7c673cae 7074 out_db:
1911f103 7075 _close_db_and_around(false);
7c673cae
FG
7076 return r;
7077}
7078
7079int BlueStore::umount()
7080{
11fdf7f2 7081 ceph_assert(_kv_only || mounted);
7c673cae
FG
7082 dout(1) << __func__ << dendl;
7083
7084 _osr_drain_all();
7c673cae 7085
7c673cae 7086 mounted = false;
3efd9988
FG
7087 if (!_kv_only) {
7088 mempool_thread.shutdown();
f67539c2
TL
7089 if (bdev->is_smr()) {
7090 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7091 _zoned_cleaner_stop();
7092 }
3efd9988
FG
7093 dout(20) << __func__ << " stopping kv thread" << dendl;
7094 _kv_stop();
f6b5b4d7 7095 _shutdown_cache();
3efd9988
FG
7096 dout(20) << __func__ << " closing" << dendl;
7097
3efd9988 7098 }
1911f103 7099 _close_db_and_around(false);
7c673cae
FG
7100
7101 if (cct->_conf->bluestore_fsck_on_umount) {
7102 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7103 if (rc < 0)
7104 return rc;
7105 if (rc > 0) {
7106 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7107 return -EIO;
7108 }
7109 }
7110 return 0;
7111}
7112
eafe8130
TL
7113int BlueStore::cold_open()
7114{
f67539c2 7115 return _open_db_and_around(true);
eafe8130 7116}
f67539c2 7117
eafe8130
TL
7118int BlueStore::cold_close()
7119{
1911f103 7120 _close_db_and_around(true);
eafe8130
TL
7121 return 0;
7122}
7123
9f95a23c
TL
7124// derr wrapper to limit enormous output and avoid log flooding.
7125// For now it is used only in places where such voluminous output is expected.
7126#define fsck_derr(err_cnt, threshold) \
7127 if (err_cnt <= threshold) { \
7128 bool need_skip_print = err_cnt == threshold; \
7129 derr
7130
7131#define fsck_dendl \
7132 dendl; \
7133 if (need_skip_print) \
7134 derr << "more error lines skipped..." << dendl; \
7c673cae 7135 }
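// Usage sketch (mirrors the call sites below); the pair must bracket a
// single message so the "skipped" marker prints exactly once:
//   fsck_derr(err_cnt, MAX_FSCK_ERROR_LINES)
//     << "fsck error: ..." << fsck_dendl;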
7c673cae 7136
eafe8130
TL
7137int _fsck_sum_extents(
7138 const PExtentVector& extents,
7139 bool compressed,
7140 store_statfs_t& expected_statfs)
7141{
7142 for (auto e : extents) {
7143 if (!e.is_valid())
7144 continue;
7145 expected_statfs.allocated += e.length;
7146 if (compressed) {
7147 expected_statfs.data_compressed_allocated += e.length;
7148 }
7149 }
7150 return 0;
7151}
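// Note: in shallow mode only statfs expectations are accumulated; the
// per-block used_blocks cross-check is left to _fsck_check_extents below,
// which the regular/deep passes use instead.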
7152
7c673cae 7153int BlueStore::_fsck_check_extents(
11fdf7f2 7154 const coll_t& cid,
7c673cae
FG
7155 const ghobject_t& oid,
7156 const PExtentVector& extents,
7157 bool compressed,
7158 mempool_dynamic_bitset &used_blocks,
b32b8144 7159 uint64_t granularity,
11fdf7f2 7160 BlueStoreRepairer* repairer,
eafe8130
TL
7161 store_statfs_t& expected_statfs,
7162 FSCKDepth depth)
7c673cae
FG
7163{
7164 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7165 int errors = 0;
7166 for (auto e : extents) {
7167 if (!e.is_valid())
7168 continue;
7169 expected_statfs.allocated += e.length;
7170 if (compressed) {
11fdf7f2 7171 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7172 }
eafe8130
TL
7173 if (depth != FSCK_SHALLOW) {
7174 bool already = false;
9f95a23c 7175 apply_for_bitset_range(
eafe8130
TL
7176 e.offset, e.length, granularity, used_blocks,
7177 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7178 if (bs.test(pos)) {
7179 if (repairer) {
7180 repairer->note_misreference(
7181 pos * min_alloc_size, min_alloc_size, !already);
7182 }
7183 if (!already) {
7184 derr << "fsck error: " << oid << " extent " << e
7185 << " or a subset is already allocated (misreferenced)" << dendl;
7186 ++errors;
7187 already = true;
7188 }
11fdf7f2 7189 }
eafe8130
TL
7190 else
7191 bs.set(pos);
7192 });
7193 if (repairer) {
b3b6e05e 7194 repairer->set_space_used(e.offset, e.length, cid, oid);
eafe8130 7195 }
11fdf7f2 7196
eafe8130
TL
7197 if (e.end() > bdev->get_size()) {
7198 derr << "fsck error: " << oid << " extent " << e
7199 << " past end of block device" << dendl;
7200 ++errors;
7201 }
7c673cae
FG
7202 }
7203 }
7204 return errors;
7205}
7206
11fdf7f2
TL
7207void BlueStore::_fsck_check_pool_statfs(
7208 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7209 int64_t& errors,
7210 int64_t& warnings,
11fdf7f2
TL
7211 BlueStoreRepairer* repairer)
7212{
f67539c2 7213 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
7214 if (it) {
7215 for (it->lower_bound(string()); it->valid(); it->next()) {
7216 string key = it->key();
7217 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7218 if (repairer) {
eafe8130
TL
7219 ++errors;
7220 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7221 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7222 << dendl;
7223 }
7224 continue;
7225 }
11fdf7f2
TL
7226 uint64_t pool_id;
7227 if (get_key_pool_stat(key, &pool_id) < 0) {
7228 derr << "fsck error: bad key " << key
7229 << "in statfs namespece" << dendl;
7230 if (repairer) {
7231 repairer->remove_key(db, PREFIX_STAT, key);
7232 }
7233 ++errors;
7234 continue;
7235 }
7236
7237 volatile_statfs vstatfs;
7238 bufferlist bl = it->value();
7239 auto blp = bl.cbegin();
7240 try {
7241 vstatfs.decode(blp);
f67539c2 7242 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
7243 derr << "fsck error: failed to decode Pool StatFS record"
7244 << pretty_binary_string(key) << dendl;
7245 if (repairer) {
7246 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7247 << pretty_binary_string(key)
7248 << "', removing" << dendl;
7249 repairer->remove_key(db, PREFIX_STAT, key);
7250 }
7251 ++errors;
7252 vstatfs.reset();
7253 }
7254 auto stat_it = expected_pool_statfs.find(pool_id);
7255 if (stat_it == expected_pool_statfs.end()) {
7256 if (vstatfs.is_empty()) {
7257 // we don't consider that as an error since empty pool statfs
7258 // are left in DB for now
7259 dout(20) << "fsck info: found empty stray Pool StatFS record for pool id 0x"
7260 << std::hex << pool_id << std::dec << dendl;
7261 if (repairer) {
7262 // but we need to increment error count in case of repair
7263 // to have proper counters at the end
7264 // (as repairer increments recovery counter anyway).
7265 ++errors;
7266 }
7267 } else {
7268 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7269 << std::hex << pool_id << std::dec << dendl;
7270 ++errors;
7271 }
7272 if (repairer) {
522d829b 7273 repairer->remove_key(db, PREFIX_STAT, key);
11fdf7f2
TL
7274 }
7275 continue;
7276 }
7277 store_statfs_t statfs;
7278 vstatfs.publish(&statfs);
7279 if (!(stat_it->second == statfs)) {
7280 derr << "fsck error: actual " << statfs
7281 << " != expected " << stat_it->second
7282 << " for pool "
7283 << std::hex << pool_id << std::dec << dendl;
7284 if (repairer) {
7285 repairer->fix_statfs(db, key, stat_it->second);
7286 }
7287 ++errors;
7288 }
7289 expected_pool_statfs.erase(stat_it);
7290 }
7291 } // if (it)
eafe8130
TL
7292 for (auto& s : expected_pool_statfs) {
7293 if (s.second.is_zero()) {
11fdf7f2
TL
7294 // we might lack empty statfs recs in DB
7295 continue;
7296 }
7297 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7298 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7299 if (repairer) {
7300 string key;
eafe8130
TL
7301 get_pool_stat_key(s.first, &key);
7302 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7303 }
7304 ++errors;
7305 }
eafe8130 7306 if (!per_pool_stat_collection &&
eafe8130
TL
7307 repairer) {
7308 // by virtue of running this method, we correct the top-level
7309 // error of having global stats
7310 repairer->inc_repaired();
7311 }
11fdf7f2
TL
7312}
7313
eafe8130
TL
7314BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7315 BlueStore::FSCKDepth depth,
7316 int64_t pool_id,
7317 BlueStore::CollectionRef c,
7318 const ghobject_t& oid,
7319 const string& key,
7320 const bufferlist& value,
9f95a23c 7321 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
7322 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7323 const BlueStore::FSCK_ObjectCtx& ctx)
7324{
7325 auto& errors = ctx.errors;
7326 auto& num_objects = ctx.num_objects;
7327 auto& num_extents = ctx.num_extents;
7328 auto& num_blobs = ctx.num_blobs;
7329 auto& num_sharded_objects = ctx.num_sharded_objects;
7330 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7331 auto used_blocks = ctx.used_blocks;
7332 auto sb_info_lock = ctx.sb_info_lock;
7333 auto& sb_info = ctx.sb_info;
7334 auto repairer = ctx.repairer;
7335
7336 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7337 &ctx.expected_pool_statfs[pool_id] :
7338 &ctx.expected_store_statfs;
7339
7340 dout(10) << __func__ << " " << oid << dendl;
7341 OnodeRef o;
7342 o.reset(Onode::decode(c, oid, key, value));
7343 ++num_objects;
7c673cae 7344
eafe8130 7345 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 7346
eafe8130
TL
7347 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7348 _dump_onode<30>(cct, *o);
7349 // shards
7350 if (!o->extent_map.shards.empty()) {
7351 ++num_sharded_objects;
7352 if (depth != FSCK_SHALLOW) {
9f95a23c 7353 ceph_assert(expecting_shards);
eafe8130
TL
7354 for (auto& s : o->extent_map.shards) {
7355 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 7356 expecting_shards->push_back(string());
eafe8130 7357 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 7358 &expecting_shards->back());
eafe8130
TL
7359 if (s.shard_info->offset >= o->onode.size) {
7360 derr << "fsck error: " << oid << " shard 0x" << std::hex
7361 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7362 << std::dec << dendl;
7363 ++errors;
7364 }
7365 }
7366 }
7367 }
7c673cae 7368
eafe8130
TL
7369 // lextents
7370 uint64_t pos = 0;
7371 mempool::bluestore_fsck::map<BlobRef,
7372 bluestore_blob_use_tracker_t> ref_map;
7373 for (auto& l : o->extent_map.extent_map) {
7374 dout(20) << __func__ << " " << l << dendl;
7375 if (l.logical_offset < pos) {
7376 derr << "fsck error: " << oid << " lextent at 0x"
7377 << std::hex << l.logical_offset
7378 << " overlaps with the previous, which ends at 0x" << pos
7379 << std::dec << dendl;
7380 ++errors;
7381 }
7382 if (depth != FSCK_SHALLOW &&
7383 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7384 derr << "fsck error: " << oid << " lextent at 0x"
7385 << std::hex << l.logical_offset << "~" << l.length
7386 << " spans a shard boundary"
7387 << std::dec << dendl;
7388 ++errors;
7389 }
7390 pos = l.logical_offset + l.length;
7391 res_statfs->data_stored += l.length;
7392 ceph_assert(l.blob);
7393 const bluestore_blob_t& blob = l.blob->get_blob();
7394
7395 auto& ref = ref_map[l.blob];
7396 if (ref.is_empty()) {
7397 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7398 uint32_t l = blob.get_logical_length();
7399 ref.init(l, min_release_size);
7400 }
7401 ref.get(
7402 l.blob_offset,
7403 l.length);
7404 ++num_extents;
7405 if (depth != FSCK_SHALLOW &&
7406 blob.has_unused()) {
7407 ceph_assert(referenced);
7408 auto p = referenced->find(l.blob);
7409 bluestore_blob_t::unused_t* pu;
7410 if (p == referenced->end()) {
7411 pu = &(*referenced)[l.blob];
7412 }
7413 else {
7414 pu = &p->second;
7415 }
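 // blob.unused is a bitmap with one bit per blob_len/(8*sizeof(unused))
 // byte chunk; mark every chunk this lextent touches so the unused flags
 // can be cross-checked against non-zero csums later.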
7416 uint64_t blob_len = blob.get_logical_length();
7417 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7418 ceph_assert(l.blob_offset + l.length <= blob_len);
7419 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7420 uint64_t start = l.blob_offset / chunk_size;
7421 uint64_t end =
7422 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7423 for (auto i = start; i < end; ++i) {
7424 (*pu) |= (1u << i);
7425 }
7426 }
7427 } //for (auto& l : o->extent_map.extent_map)
7428
7429 for (auto& i : ref_map) {
7430 ++num_blobs;
7431 const bluestore_blob_t& blob = i.first->get_blob();
7432 bool equal =
7433 depth == FSCK_SHALLOW ? true :
7434 i.first->get_blob_use_tracker().equal(i.second);
7435 if (!equal) {
7436 derr << "fsck error: " << oid << " blob " << *i.first
7437 << " doesn't match expected ref_map " << i.second << dendl;
7438 ++errors;
7439 }
7440 if (blob.is_compressed()) {
7441 res_statfs->data_compressed += blob.get_compressed_payload_length();
7442 res_statfs->data_compressed_original +=
7443 i.first->get_referenced_bytes();
7444 }
7445 if (blob.is_shared()) {
7446 if (i.first->shared_blob->get_sbid() > blobid_max) {
7447 derr << "fsck error: " << oid << " blob " << blob
7448 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7449 << blobid_max << dendl;
7450 ++errors;
7451 }
7452 else if (i.first->shared_blob->get_sbid() == 0) {
7453 derr << "fsck error: " << oid << " blob " << blob
7454 << " marked as shared but has uninitialized sbid"
7455 << dendl;
7456 ++errors;
7457 }
7458 // the below lock is optional and provided in multithreading mode only
7459 if (sb_info_lock) {
7460 sb_info_lock->lock();
7461 }
7462 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7463 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7464 ceph_assert(sbi.pool_id == INT64_MIN ||
7465 sbi.pool_id == oid.hobj.get_logical_pool());
7466 sbi.cid = c->cid;
7467 sbi.pool_id = oid.hobj.get_logical_pool();
7468 sbi.sb = i.first->shared_blob;
7469 sbi.oids.push_back(oid);
7470 sbi.compressed = blob.is_compressed();
7471 for (auto e : blob.get_extents()) {
7472 if (e.is_valid()) {
7473 sbi.ref_map.get(e.offset, e.length);
7474 }
7475 }
7476 if (sb_info_lock) {
7477 sb_info_lock->unlock();
7478 }
7479 } else if (depth != FSCK_SHALLOW) {
7480 ceph_assert(used_blocks);
7481 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7482 blob.is_compressed(),
7483 *used_blocks,
7484 fm->get_alloc_size(),
7485 repairer,
7486 *res_statfs,
7487 depth);
7488 } else {
7489 errors += _fsck_sum_extents(
7490 blob.get_extents(),
7491 blob.is_compressed(),
7492 *res_statfs);
7493 }
7494 } // for (auto& i : ref_map)
9f95a23c 7495
adb31ebb
TL
7496 {
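 // Every spanning blob must be referenced by at least one lextent;
 // entries absent from ref_map are stale ("zombie") records and, in
 // repair mode, are dropped by re-recording the onode.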
7497 auto &sbm = o->extent_map.spanning_blob_map;
7498 size_t broken = 0;
7499 BlobRef first_broken;
7500 for (auto it = sbm.begin(); it != sbm.end();) {
7501 auto it1 = it++;
7502 if (ref_map.count(it1->second) == 0) {
7503 if (!broken) {
7504 first_broken = it1->second;
7505 ++errors;
7506 }
7507 broken++;
7508 if (repairer) {
7509 sbm.erase(it1);
7510 }
7511 }
7512 }
7513 if (broken) {
7514 derr << "fsck error: " << oid << " - " << broken
7515 << " zombie spanning blob(s) found, the first one: "
7516 << *first_broken << dendl;
7517 if(repairer) {
b3b6e05e
TL
7518 repairer->fix_spanning_blobs(
7519 db,
7520 [&](KeyValueDB::Transaction txn) {
7521 _record_onode(o, txn);
7522 });
adb31ebb
TL
7523 }
7524 }
7525 }
7526
9f95a23c
TL
7527 if (o->onode.has_omap()) {
7528 _fsck_check_object_omap(depth, o, ctx);
7529 }
7530
eafe8130
TL
7531 return o;
7532}
7533
7534#include "common/WorkQueue.h"
7535
7536class ShallowFSCKThreadPool : public ThreadPool
7537{
7538public:
7539 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7540 ThreadPool(cct_, nm, tn, n) {
7541 }
7542 void worker(ThreadPool::WorkThread* wt) override {
7543 int next_wq = 0;
7544 while (!_stop) {
7545 next_wq %= work_queues.size();
7546 WorkQueue_ *wq = work_queues[next_wq++];
7547
7548 void* item = wq->_void_dequeue();
7549 if (item) {
7550 processing++;
7551 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7552 wq->_void_process(item, tp_handle);
7553 processing--;
7554 }
7555 }
7556 }
7557 template <size_t BatchLen>
7558 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7559 {
7560 struct Entry {
7561 int64_t pool_id;
7562 BlueStore::CollectionRef c;
7563 ghobject_t oid;
7564 string key;
7565 bufferlist value;
7566 };
7567 struct Batch {
7568 std::atomic<size_t> running = { 0 };
7569 size_t entry_count = 0;
7570 std::array<Entry, BatchLen> entries;
7571
7572 int64_t errors = 0;
7573 int64_t warnings = 0;
7574 uint64_t num_objects = 0;
7575 uint64_t num_extents = 0;
7576 uint64_t num_blobs = 0;
7577 uint64_t num_sharded_objects = 0;
7578 uint64_t num_spanning_blobs = 0;
7579 store_statfs_t expected_store_statfs;
7580 BlueStore::per_pool_statfs expected_pool_statfs;
7581 };
7582
7583 size_t batchCount;
7584 BlueStore* store = nullptr;
7585
eafe8130
TL
7586 ceph::mutex* sb_info_lock = nullptr;
7587 BlueStore::sb_info_map_t* sb_info = nullptr;
7588 BlueStoreRepairer* repairer = nullptr;
7589
7590 Batch* batches = nullptr;
7591 size_t last_batch_pos = 0;
7592 bool batch_acquired = false;
7593
7594 FSCKWorkQueue(std::string n,
7595 size_t _batchCount,
7596 BlueStore* _store,
eafe8130
TL
7597 ceph::mutex* _sb_info_lock,
7598 BlueStore::sb_info_map_t& _sb_info,
7599 BlueStoreRepairer* _repairer) :
f67539c2 7600 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
7601 batchCount(_batchCount),
7602 store(_store),
eafe8130
TL
7603 sb_info_lock(_sb_info_lock),
7604 sb_info(&_sb_info),
7605 repairer(_repairer)
7606 {
7607 batches = new Batch[batchCount];
7608 }
7609 ~FSCKWorkQueue() {
7610 delete[] batches;
7611 }
7612
7613 /// Remove all work items from the queue.
7614 void _clear() override {
7615 //do nothing
7616 }
7617 /// Check whether there is anything to do.
7618 bool _empty() override {
7619 ceph_assert(false);
7620 }
7621
7622 /// Get the next work item to process.
7623 void* _void_dequeue() override {
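 // Start at a random batch to spread contention: fetch_add(1)
 // atomically claims a batch, a claimed-but-empty batch is released
 // at once, and a full wrap-around without work returns nullptr.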
7624 size_t pos = rand() % batchCount;
7625 size_t pos0 = pos;
7626 do {
7627 auto& batch = batches[pos];
7628 if (batch.running.fetch_add(1) == 0) {
7629 if (batch.entry_count) {
7630 return &batch;
7631 }
7632 }
7633 batch.running--;
7634 pos++;
7635 pos %= batchCount;
7636 } while (pos != pos0);
7637 return nullptr;
7638 }
7639 /** @brief Process the work item.
7640 * This function will be called several times in parallel
7641 * and must therefore be thread-safe. */
7642 void _void_process(void* item, TPHandle& handle) override {
7643 Batch* batch = (Batch*)item;
7644
7645 BlueStore::FSCK_ObjectCtx ctx(
7646 batch->errors,
7647 batch->warnings,
7648 batch->num_objects,
7649 batch->num_extents,
7650 batch->num_blobs,
7651 batch->num_sharded_objects,
7652 batch->num_spanning_blobs,
7653 nullptr, // used_blocks
9f95a23c 7654 nullptr, //used_omap_head
eafe8130
TL
7655 sb_info_lock,
7656 *sb_info,
7657 batch->expected_store_statfs,
7658 batch->expected_pool_statfs,
7659 repairer);
7660
7661 for (size_t i = 0; i < batch->entry_count; i++) {
7662 auto& entry = batch->entries[i];
7663
7664 store->fsck_check_objects_shallow(
7665 BlueStore::FSCK_SHALLOW,
7666 entry.pool_id,
7667 entry.c,
7668 entry.oid,
7669 entry.key,
7670 entry.value,
9f95a23c 7671 nullptr, // expecting_shards - this would need protection if passed
eafe8130
TL
7672 nullptr, // referenced
7673 ctx);
7674 }
7675 //std::cout << "processed " << batch << std::endl;
7676 batch->entry_count = 0;
7677 batch->running--;
7678 }
7679 /** @brief Synchronously finish processing a work item.
7680 * This function is called after _void_process with the global thread pool lock held,
7681 * so at most one copy will execute simultaneously for a given thread pool.
7682 * It can be used for non-thread-safe finalization. */
7683 void _void_process_finish(void*) override {
7684 ceph_assert(false);
7685 }
7686
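 /// Append one object entry to the currently acquired batch, acquiring
 /// a new one when needed. Returns false when every batch is busy or
 /// full; the caller then processes the object inline (see
 /// processed_myself in _fsck_check_objects).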
7687 bool queue(
7688 int64_t pool_id,
7689 BlueStore::CollectionRef c,
7690 const ghobject_t& oid,
7691 const string& key,
7692 const bufferlist& value) {
7693 bool res = false;
7694 size_t pos0 = last_batch_pos;
7695 if (!batch_acquired) {
7696 do {
7697 auto& batch = batches[last_batch_pos];
7698 if (batch.running.fetch_add(1) == 0) {
7699 if (batch.entry_count < BatchLen) {
7700 batch_acquired = true;
7701 break;
7702 }
7703 }
7704 batch.running.fetch_sub(1);
7705 last_batch_pos++;
7706 last_batch_pos %= batchCount;
7707 } while (last_batch_pos != pos0);
7708 }
7709 if (batch_acquired) {
7710 auto& batch = batches[last_batch_pos];
7711 ceph_assert(batch.running);
7712 ceph_assert(batch.entry_count < BatchLen);
7713
7714 auto& entry = batch.entries[batch.entry_count];
7715 entry.pool_id = pool_id;
7716 entry.c = c;
7717 entry.oid = oid;
7718 entry.key = key;
7719 entry.value = value;
7720
7721 ++batch.entry_count;
7722 if (batch.entry_count == BatchLen) {
7723 batch_acquired = false;
7724 batch.running.fetch_sub(1);
7725 last_batch_pos++;
7726 last_batch_pos %= batchCount;
7727 }
7728 res = true;
7729 }
7730 return res;
7731 }
7732
7733 void finalize(ThreadPool& tp,
7734 BlueStore::FSCK_ObjectCtx& ctx) {
7735 if (batch_acquired) {
7736 auto& batch = batches[last_batch_pos];
7737 ceph_assert(batch.running);
7738 batch.running.fetch_sub(1);
7739 }
7740 tp.stop();
7741
7742 for (size_t i = 0; i < batchCount; i++) {
7743 auto& batch = batches[i];
7744
7745 //process leftovers if any
7746 if (batch.entry_count) {
7747 TPHandle tp_handle(store->cct,
7748 nullptr,
7749 timeout_interval,
7750 suicide_interval);
7751 ceph_assert(batch.running == 0);
7752
7753 batch.running++; // just to be on-par with the regular call
7754 _void_process(&batch, tp_handle);
7755 }
7756 ceph_assert(batch.entry_count == 0);
7757
7758 ctx.errors += batch.errors;
7759 ctx.warnings += batch.warnings;
7760 ctx.num_objects += batch.num_objects;
7761 ctx.num_extents += batch.num_extents;
7762 ctx.num_blobs += batch.num_blobs;
7763 ctx.num_sharded_objects += batch.num_sharded_objects;
7764 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 7765
eafe8130
TL
7766 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7767
7768 for (auto it = batch.expected_pool_statfs.begin();
7769 it != batch.expected_pool_statfs.end();
7770 it++) {
7771 ctx.expected_pool_statfs[it->first].add(it->second);
7772 }
7773 }
7774 }
7775 };
7776};
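// Typical wiring (sketch; mirrors _fsck_check_objects below):
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   std::unique_ptr<WQ> wq(new WQ("FSCKWorkQueue", (n ? : 1) * 32, this,
//                                 sb_info_lock, sb_info, repairer));
//   tp.add_work_queue(wq.get());
//   tp.start(); // shallow mode only
//   ... wq->queue(pool_id, c, oid, key, value) per onode ...
//   wq->finalize(tp, ctx); // drain leftovers, merge per-batch counters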
7777
9f95a23c
TL
7778void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
7779 OnodeRef& o,
7780 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 7781{
9f95a23c
TL
7782 auto& errors = ctx.errors;
7783 auto& warnings = ctx.warnings;
7784 auto repairer = ctx.repairer;
7785
7786 ceph_assert(o->onode.has_omap());
7787 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 7788 if (per_pool_omap == OMAP_PER_POOL) {
9f95a23c
TL
7789 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7790 << "fsck error: " << o->oid
7791 << " has omap that is not per-pool or pgmeta"
7792 << fsck_dendl;
7793 ++errors;
7794 } else {
7795 const char* w;
7796 int64_t num;
7797 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
7798 ++errors;
7799 num = errors;
7800 w = "error";
7801 } else {
7802 ++warnings;
7803 num = warnings;
7804 w = "warning";
7805 }
7806 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7807 << "fsck " << w << ": " << o->oid
7808 << " has omap that is not per-pool or pgmeta"
7809 << fsck_dendl;
7810 }
f67539c2
TL
7811 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
7812 if (per_pool_omap == OMAP_PER_PG) {
7813 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7814 << "fsck error: " << o->oid
7815 << " has omap that is not per-pg or pgmeta"
7816 << fsck_dendl;
7817 ++errors;
7818 } else {
7819 const char* w;
7820 int64_t num;
7821 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
7822 ++errors;
7823 num = errors;
7824 w = "error";
7825 } else {
7826 ++warnings;
7827 num = warnings;
7828 w = "warning";
7829 }
7830 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7831 << "fsck " << w << ": " << o->oid
7832 << " has omap that is not per-pg or pgmeta"
7833 << fsck_dendl;
7834 }
9f95a23c
TL
7835 }
7836 if (repairer &&
f67539c2 7837 !o->onode.is_perpg_omap() &&
9f95a23c 7838 !o->onode.is_pgmeta_omap()) {
f67539c2 7839 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
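 // Two-phase conversion: first copy header/tail/values into the new
 // per-pg prefix using cost-bounded (~16MB) transactions, then drop the
 // legacy keys and persist the updated onode flags in one final commit.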
522d829b 7840 bufferlist header;
9f95a23c 7841 map<string, bufferlist> kv;
522d829b
TL
7842 {
7843 KeyValueDB::Transaction txn = db->get_transaction();
7844 uint64_t txn_cost = 0;
7845 const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
7846 uint8_t new_flags = o->onode.flags |
7847 bluestore_onode_t::FLAG_PERPOOL_OMAP |
7848 bluestore_onode_t::FLAG_PERPG_OMAP;
7849 const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
7850
7851 KeyValueDB::Iterator it = db->get_iterator(prefix);
7852 string head, tail;
7853 o->get_omap_header(&head);
7854 o->get_omap_tail(&tail);
7855 it->lower_bound(head);
7856 // head
7857 if (it->valid() && it->key() == head) {
7858 dout(30) << __func__ << " got header" << dendl;
7859 header = it->value();
7860 if (header.length()) {
7861 string new_head;
7862 Onode::calc_omap_header(new_flags, o.get(), &new_head);
7863 txn->set(new_omap_prefix, new_head, header);
7864 txn_cost += new_head.length() + header.length();
7865 }
a4b75251 7866 it->next();
522d829b
TL
7867 }
7868 // tail
7869 {
7870 string new_tail;
7871 Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
7872 bufferlist empty;
7873 txn->set(new_omap_prefix, new_tail, empty);
7874 txn_cost += new_tail.length() + empty.length();
7875 }
7876 // values
7877 string final_key;
7878 Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
7879 size_t base_key_len = final_key.size();
7880 while (it->valid() && it->key() < tail) {
7881 string user_key;
7882 o->decode_omap_key(it->key(), &user_key);
7883 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7884 << " -> " << user_key << dendl;
7885
7886 final_key.resize(base_key_len);
a4b75251 7887 final_key += user_key;
522d829b
TL
7888 auto v = it->value();
7889 txn->set(new_omap_prefix, final_key, v);
7890 txn_cost += final_key.length() + v.length();
7891
7892 // submit a portion if cost exceeds 16MB
7893 if (txn_cost >= 16 * (1 << 20) ) {
7894 db->submit_transaction_sync(txn);
7895 txn = db->get_transaction();
7896 txn_cost = 0;
7897 }
7898 it->next();
7899 }
7900 if (txn_cost > 0) {
7901 db->submit_transaction_sync(txn);
7902 }
7903 }
7904 // finalize: remove legacy data
7905 {
9f95a23c
TL
7906 KeyValueDB::Transaction txn = db->get_transaction();
7907 // remove old keys
7908 const string& old_omap_prefix = o->get_omap_prefix();
7909 string old_head, old_tail;
7910 o->get_omap_header(&old_head);
7911 o->get_omap_tail(&old_tail);
7912 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
7913 txn->rmkey(old_omap_prefix, old_tail);
7914 // set flag
f67539c2 7915 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c 7916 _record_onode(o, txn);
9f95a23c
TL
7917 db->submit_transaction_sync(txn);
7918 repairer->inc_repaired();
522d829b 7919 repairer->request_compaction();
9f95a23c 7920 }
eafe8130 7921 }
9f95a23c 7922}
eafe8130 7923
9f95a23c
TL
7924void BlueStore::_fsck_check_objects(FSCKDepth depth,
7925 BlueStore::FSCK_ObjectCtx& ctx)
7926{
eafe8130 7927 auto& errors = ctx.errors;
eafe8130
TL
7928 auto sb_info_lock = ctx.sb_info_lock;
7929 auto& sb_info = ctx.sb_info;
7930 auto repairer = ctx.repairer;
7931
7932 uint64_t_btree_t used_nids;
7933
7934 size_t processed_myself = 0;
7935
f67539c2 7936 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
7937 mempool::bluestore_fsck::list<string> expecting_shards;
7938 if (it) {
7939 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7940 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7941 std::unique_ptr<WQ> wq(
7942 new WQ(
7943 "FSCKWorkQueue",
7944 (thread_count ? : 1) * 32,
7945 this,
eafe8130
TL
7946 sb_info_lock,
7947 sb_info,
7948 repairer));
7949
7950 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7951
7952 thread_pool.add_work_queue(wq.get());
7953 if (depth == FSCK_SHALLOW && thread_count > 0) {
7954 //not the best place but let's check anyway
7955 ceph_assert(sb_info_lock);
7956 thread_pool.start();
7957 }
7958
7959 //fill global if not overridden below
7960 CollectionRef c;
7961 int64_t pool_id = -1;
7962 spg_t pgid;
7963 for (it->lower_bound(string()); it->valid(); it->next()) {
7964 dout(30) << __func__ << " key "
7965 << pretty_binary_string(it->key()) << dendl;
7966 if (is_extent_shard_key(it->key())) {
7967 if (depth == FSCK_SHALLOW) {
7968 continue;
7969 }
7970 while (!expecting_shards.empty() &&
7971 expecting_shards.front() < it->key()) {
7972 derr << "fsck error: missing shard key "
7973 << pretty_binary_string(expecting_shards.front())
7974 << dendl;
7975 ++errors;
7976 expecting_shards.pop_front();
7977 }
7978 if (!expecting_shards.empty() &&
7979 expecting_shards.front() == it->key()) {
7980 // all good
7981 expecting_shards.pop_front();
7982 continue;
7983 }
7984
7985 uint32_t offset;
7986 string okey;
7987 get_key_extent_shard(it->key(), &okey, &offset);
7988 derr << "fsck error: stray shard 0x" << std::hex << offset
7989 << std::dec << dendl;
7990 if (expecting_shards.empty()) {
7991 derr << "fsck error: " << pretty_binary_string(it->key())
7992 << " is unexpected" << dendl;
7993 ++errors;
7994 continue;
7995 }
7996 while (expecting_shards.front() > it->key()) {
7997 derr << "fsck error: saw " << pretty_binary_string(it->key())
7998 << dendl;
7999 derr << "fsck error: exp "
8000 << pretty_binary_string(expecting_shards.front()) << dendl;
8001 ++errors;
8002 expecting_shards.pop_front();
8003 if (expecting_shards.empty()) {
8004 break;
8005 }
8006 }
8007 continue;
8008 }
8009
8010 ghobject_t oid;
8011 int r = get_key_object(it->key(), &oid);
8012 if (r < 0) {
8013 derr << "fsck error: bad object key "
8014 << pretty_binary_string(it->key()) << dendl;
8015 ++errors;
8016 continue;
8017 }
8018 if (!c ||
8019 oid.shard_id != pgid.shard ||
8020 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8021 !c->contains(oid)) {
8022 c = nullptr;
8023 for (auto& p : coll_map) {
8024 if (p.second->contains(oid)) {
8025 c = p.second;
8026 break;
8027 }
8028 }
8029 if (!c) {
8030 derr << "fsck error: stray object " << oid
8031 << " not owned by any collection" << dendl;
8032 ++errors;
8033 continue;
8034 }
8035 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8036 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8037 << dendl;
8038 }
8039
8040 if (depth != FSCK_SHALLOW &&
8041 !expecting_shards.empty()) {
8042 for (auto& k : expecting_shards) {
8043 derr << "fsck error: missing shard key "
8044 << pretty_binary_string(k) << dendl;
8045 }
8046 ++errors;
8047 expecting_shards.clear();
8048 }
8049
8050 bool queued = false;
8051 if (depth == FSCK_SHALLOW && thread_count > 0) {
8052 queued = wq->queue(
8053 pool_id,
8054 c,
8055 oid,
8056 it->key(),
8057 it->value());
8058 }
8059 OnodeRef o;
8060 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8061
8062 if (!queued) {
8063 ++processed_myself;
8064
8065 o = fsck_check_objects_shallow(
8066 depth,
8067 pool_id,
8068 c,
8069 oid,
8070 it->key(),
8071 it->value(),
9f95a23c 8072 &expecting_shards,
eafe8130
TL
8073 &referenced,
8074 ctx);
8075 }
8076
8077 if (depth != FSCK_SHALLOW) {
8078 ceph_assert(o != nullptr);
8079 if (o->onode.nid) {
8080 if (o->onode.nid > nid_max) {
8081 derr << "fsck error: " << oid << " nid " << o->onode.nid
8082 << " > nid_max " << nid_max << dendl;
8083 ++errors;
8084 }
8085 if (used_nids.count(o->onode.nid)) {
8086 derr << "fsck error: " << oid << " nid " << o->onode.nid
8087 << " already in use" << dendl;
8088 ++errors;
8089 continue; // go for next object
8090 }
8091 used_nids.insert(o->onode.nid);
8092 }
8093 for (auto& i : referenced) {
8094 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8095 << std::dec << " for " << *i.first << dendl;
8096 const bluestore_blob_t& blob = i.first->get_blob();
8097 if (i.second & blob.unused) {
8098 derr << "fsck error: " << oid << " blob claims unused 0x"
8099 << std::hex << blob.unused
8100 << " but extents reference 0x" << i.second << std::dec
8101 << " on blob " << *i.first << dendl;
8102 ++errors;
8103 }
8104 if (blob.has_csum()) {
8105 uint64_t blob_len = blob.get_logical_length();
8106 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8107 unsigned csum_count = blob.get_csum_count();
8108 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8109 for (unsigned p = 0; p < csum_count; ++p) {
8110 unsigned pos = p * csum_chunk_size;
8111 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8112 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8113 unsigned mask = 1u << firstbit;
8114 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8115 mask |= 1u << b;
8116 }
8117 if ((blob.unused & mask) == mask) {
8118 // this csum chunk region is marked unused
8119 if (blob.get_csum_item(p) != 0) {
8120 derr << "fsck error: " << oid
8121 << " blob claims csum chunk 0x" << std::hex << pos
8122 << "~" << csum_chunk_size
8123 << " is unused (mask 0x" << mask << " of unused 0x"
8124 << blob.unused << ") but csum is non-zero 0x"
8125 << blob.get_csum_item(p) << std::dec << " on blob "
8126 << *i.first << dendl;
8127 ++errors;
8128 }
8129 }
8130 }
8131 }
8132 }
8133 // omap
8134 if (o->onode.has_omap()) {
9f95a23c
TL
8135 ceph_assert(ctx.used_omap_head);
8136 if (ctx.used_omap_head->count(o->onode.nid)) {
8137 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8138 << " already in use" << dendl;
eafe8130
TL
8139 ++errors;
8140 } else {
9f95a23c 8141 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8142 }
9f95a23c 8143 } // if (o->onode.has_omap())
eafe8130
TL
8144 if (depth == FSCK_DEEP) {
8145 bufferlist bl;
8146 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8147 uint64_t offset = 0;
8148 do {
8149 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8150 int r = _do_read(c.get(), o, offset, l, bl,
8151 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8152 if (r < 0) {
8153 ++errors;
8154 derr << "fsck error: " << oid << std::hex
8155 << " error during read: "
8156 << " " << offset << "~" << l
8157 << " " << cpp_strerror(r) << std::dec
8158 << dendl;
8159 break;
8160 }
8161 offset += l;
8162 } while (offset < o->onode.size);
8163 } // deep
8164 } //if (depth != FSCK_SHALLOW)
8165 } // for (it->lower_bound(string()); it->valid(); it->next())
8166 if (depth == FSCK_SHALLOW && thread_count > 0) {
8167 wq->finalize(thread_pool, ctx);
8168 if (processed_myself) {
8169 // maybe we need more threads?
8170 dout(0) << __func__ << " partial offload"
8171 << ", done myself " << processed_myself
8172 << " of " << ctx.num_objects
8173 << " objects, threads " << thread_count
8174 << dendl;
8175 }
8176 }
8177 } // if (it)
8178}
8179/**
8180An overview of the currently implemented repair logic,
8181performed in fsck in two stages: detection (+preparation) and commit.
8182Detection stage (in processing order):
8183 (Issue -> Repair action to schedule)
8184 - Detect undecodable keys for Shared Blobs -> Remove
8185 - Detect undecodable records for Shared Blobs -> Remove
8186 (might trigger missed Shared Blob detection below)
8187 - Detect stray records for Shared Blobs -> Remove
8188 - Detect misreferenced pextents -> Fix
8189 Prepare Bloom-like filter to track cid/oid -> pextent
8190 Prepare list of extents that are improperly referenced
8191 Enumerate Onode records that might use 'misreferenced' pextents
8192 (Bloom-like filter applied to reduce computation)
8193 For each questionable Onode enumerate all blobs and identify broken ones
8194 (i.e. blobs having 'misreferences')
8195 Rewrite each broken blob's data by allocating new extents and
8196 copying the data there
8197 If blob is shared - unshare it and mark corresponding Shared Blob
8198 for removal
8199 Release previously allocated space
8200 Update Extent Map
8201 - Detect missed Shared Blobs -> Recreate
8202 - Detect undecodable deferred transaction -> Remove
8203 - Detect Freelist Manager's 'false free' entries -> Mark as used
8204 - Detect Freelist Manager's leaked entries -> Mark as free
8205 - Detect statfs inconsistency -> Update
8206 Commit stage (separate DB commit per each step):
8207 - Apply leaked FM entries fix
8208 - Apply 'false free' FM entries fix
8209 - Apply 'Remove' actions
8210 - Apply fix for misreference pextents
8211 - Apply Shared Blob recreate
8212 (can be merged with the step above if misreferences were detected)
8213 - Apply StatFS update
8214*/
8215int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8216{
8217 dout(1) << __func__
8218 << (repair ? " repair" : " check")
8219 << (depth == FSCK_DEEP ? " (deep)" :
8220 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8221 << dendl;
8222
8223 // in deep mode we need R/W access to be able to replay deferred ops
8224 bool read_only = !(repair || depth == FSCK_DEEP);
8225
f67539c2 8226 int r = _open_db_and_around(read_only);
eafe8130
TL
8227 if (r < 0)
8228 return r;
7c673cae 8229
11fdf7f2
TL
8230 if (!read_only) {
8231 r = _upgrade_super();
8232 if (r < 0) {
8233 goto out_db;
8234 }
8235 }
7c673cae 8236
eafe8130 8237 r = _open_collections();
7c673cae 8238 if (r < 0)
11fdf7f2 8239 goto out_db;
7c673cae
FG
8240
8241 mempool_thread.init();
8242
11fdf7f2
TL
8243 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8244 // enable in repair or deep modes only
8245 if (!read_only) {
8246 _kv_start();
8247 r = _deferred_replay();
8248 _kv_stop();
8249 }
7c673cae
FG
8250 if (r < 0)
8251 goto out_scan;
8252
eafe8130
TL
8253 r = _fsck_on_open(depth, repair);
8254
8255out_scan:
8256 mempool_thread.shutdown();
f6b5b4d7 8257 _shutdown_cache();
eafe8130 8258out_db:
1911f103 8259 _close_db_and_around(false);
eafe8130
TL
8260
8261 return r;
8262}
8263
8264int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8265{
8266 dout(1) << __func__
8267 << " <<<START>>>"
8268 << (repair ? " repair" : " check")
8269 << (depth == FSCK_DEEP ? " (deep)" :
8270 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8271 << " start" << dendl;
8272 int64_t errors = 0;
8273 int64_t warnings = 0;
8274 unsigned repaired = 0;
8275
8276 uint64_t_btree_t used_omap_head;
eafe8130
TL
8277 uint64_t_btree_t used_sbids;
8278
f67539c2 8279 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130
TL
8280 KeyValueDB::Iterator it;
8281 store_statfs_t expected_store_statfs, actual_statfs;
8282 per_pool_statfs expected_pool_statfs;
8283
8284 sb_info_map_t sb_info;
8285
8286 uint64_t num_objects = 0;
8287 uint64_t num_extents = 0;
8288 uint64_t num_blobs = 0;
8289 uint64_t num_spanning_blobs = 0;
8290 uint64_t num_shared_blobs = 0;
8291 uint64_t num_sharded_objects = 0;
8292 BlueStoreRepairer repairer;
8293
f67539c2
TL
8294 auto alloc_size = fm->get_alloc_size();
8295
eafe8130
TL
8296 utime_t start = ceph_clock_now();
8297
8298 _fsck_collections(&errors);
b32b8144 8299 used_blocks.resize(fm->get_alloc_units());
7c673cae
FG
8300
8301 if (bluefs) {
f67539c2 8302 interval_set<uint64_t> bluefs_extents;
11fdf7f2 8303
f67539c2
TL
8304 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
8305 ceph_assert(r == 0);
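 // Pre-mark every allocation unit owned by BlueFS on the shared device
 // so that object extents colliding with BlueFS space get reported as
 // misreferenced by the checks below.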
8306 for (auto [start, len] : bluefs_extents) {
8307 apply_for_bitset_range(start, len, alloc_size, used_blocks,
8308 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
8309 ceph_assert(pos < bs.size());
7c673cae 8310 bs.set(pos);
f67539c2
TL
8311 }
8312 );
8313 }
8314 }
8315
8316 bluefs_used_blocks = used_blocks;
8317
8318 apply_for_bitset_range(
8319 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
8320 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8321 bs.set(pos);
7c673cae 8322 }
f67539c2
TL
8323 );
8324
8325
8326 if (repair) {
b3b6e05e 8327 repairer.init_space_usage_tracker(
f67539c2
TL
8328 bdev->get_size(),
8329 min_alloc_size);
8330 }
8331
8332 if (bluefs) {
eafe8130 8333 int r = bluefs->fsck();
7c673cae 8334 if (r < 0) {
eafe8130 8335 return r;
7c673cae
FG
8336 }
8337 if (r > 0)
8338 errors += r;
8339 }
8340
eafe8130
TL
8341 if (!per_pool_stat_collection) {
8342 const char *w;
8343 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8344 w = "error";
8345 ++errors;
8346 } else {
8347 w = "warning";
8348 ++warnings;
8349 }
8350 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8351 << dendl;
8352 }
f67539c2 8353 if (per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
8354 const char *w;
8355 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8356 w = "error";
8357 ++errors;
8358 } else {
8359 w = "warning";
8360 ++warnings;
8361 }
f67539c2 8362 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9f95a23c
TL
8363 << dendl;
8364 }
8365
11fdf7f2 8366 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
8367 // structs
8368 statfs(&actual_statfs);
11fdf7f2
TL
8369 actual_statfs.total = 0;
8370 actual_statfs.internally_reserved = 0;
8371 actual_statfs.available = 0;
8372 actual_statfs.internal_metadata = 0;
8373 actual_statfs.omap_allocated = 0;
8374
eafe8130
TL
8375 if (g_conf()->bluestore_debug_fsck_abort) {
8376 dout(1) << __func__ << " debug abort" << dendl;
8377 goto out_scan;
8378 }
7c673cae 8379 // walk PREFIX_OBJ
eafe8130
TL
8380 {
8381 dout(1) << __func__ << " walking object keyspace" << dendl;
8382 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8383 BlueStore::FSCK_ObjectCtx ctx(
8384 errors,
8385 warnings,
8386 num_objects,
8387 num_extents,
8388 num_blobs,
8389 num_sharded_objects,
8390 num_spanning_blobs,
8391 &used_blocks,
8392 &used_omap_head,
9f95a23c
TL
8393 //no need for the below lock when in non-shallow mode as
8394 // there is no multithreading in this case
8395 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130
TL
8396 sb_info,
8397 expected_store_statfs,
8398 expected_pool_statfs,
8399 repair ? &repairer : nullptr);
9f95a23c
TL
8400
8401 _fsck_check_objects(depth, ctx);
eafe8130 8402 }
11fdf7f2 8403
7c673cae 8404 dout(1) << __func__ << " checking shared_blobs" << dendl;
f67539c2 8405 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 8406 if (it) {
eafe8130
TL
8407 // FIXME minor: perhaps simplify for shallow mode?
8408 // fill global if not overridden below
8409 auto expected_statfs = &expected_store_statfs;
11fdf7f2 8410
7c673cae
FG
8411 for (it->lower_bound(string()); it->valid(); it->next()) {
8412 string key = it->key();
8413 uint64_t sbid;
8414 if (get_key_shared_blob(key, &sbid)) {
3efd9988 8415 derr << "fsck error: bad key '" << key
7c673cae 8416 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
8417 if (repair) {
8418 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8419 }
7c673cae
FG
8420 ++errors;
8421 continue;
8422 }
8423 auto p = sb_info.find(sbid);
8424 if (p == sb_info.end()) {
3efd9988 8425 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 8426 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
8427 if (repair) {
8428 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8429 }
7c673cae
FG
8430 ++errors;
8431 } else {
8432 ++num_shared_blobs;
8433 sb_info_t& sbi = p->second;
8434 bluestore_shared_blob_t shared_blob(sbid);
8435 bufferlist bl = it->value();
11fdf7f2
TL
8436 auto blp = bl.cbegin();
8437 try {
8438 decode(shared_blob, blp);
f67539c2 8439 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
8440 ++errors;
8441 // Force update and don't report as missing
8442 sbi.updated = sbi.passed = true;
8443
8444 derr << "fsck error: failed to decode Shared Blob"
8445 << pretty_binary_string(it->key()) << dendl;
8446 if (repair) {
8447 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8448 << pretty_binary_string(it->key())
8449 << "', removing" << dendl;
8450 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8451 }
8452 continue;
8453 }
7c673cae
FG
8454 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8455 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 8456 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
8457 << std::dec << " ref_map " << shared_blob.ref_map
8458 << " != expected " << sbi.ref_map << dendl;
8459 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
8460 ++errors;
8461 }
8462 PExtentVector extents;
8463 for (auto &r : shared_blob.ref_map.ref_map) {
8464 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8465 }
eafe8130 8466 if (per_pool_stat_collection || repair) {
11fdf7f2
TL
8467 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8468 }
8469 errors += _fsck_check_extents(sbi.cid,
a4b75251 8470 sbi.oids.front(),
7c673cae 8471 extents,
a4b75251 8472 sbi.compressed,
b32b8144
FG
8473 used_blocks,
8474 fm->get_alloc_size(),
11fdf7f2 8475 repair ? &repairer : nullptr,
eafe8130
TL
8476 *expected_statfs,
8477 depth);
11fdf7f2
TL
8478 sbi.passed = true;
8479 }
8480 }
8481 } // if (it)
8482
8483 if (repair && repairer.preprocess_misreference(db)) {
8484
8485 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
11fdf7f2
TL
8486 auto& misref_extents = repairer.get_misreferences();
8487 interval_set<uint64_t> to_release;
f67539c2 8488 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 8489 if (it) {
eafe8130
TL
8490 // fill global if not overridden below
8491 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
8492
8493 CollectionRef c;
8494 spg_t pgid;
8495 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
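 // Walk the onodes again, but only those flagged by the Bloom-like
 // cid/oid filter; any blob whose pextents intersect the misreferenced
 // set is rewritten wholesale to freshly allocated space.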
8496 bool bypass_rest = false;
8497 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8498 it->next()) {
8499 dout(30) << __func__ << " key "
8500 << pretty_binary_string(it->key()) << dendl;
8501 if (is_extent_shard_key(it->key())) {
8502 continue;
8503 }
8504
8505 ghobject_t oid;
8506 int r = get_key_object(it->key(), &oid);
b3b6e05e 8507 if (r < 0 || !repairer.is_used(oid)) {
11fdf7f2
TL
8508 continue;
8509 }
8510
8511 if (!c ||
8512 oid.shard_id != pgid.shard ||
8513 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8514 !c->contains(oid)) {
8515 c = nullptr;
8516 for (auto& p : coll_map) {
8517 if (p.second->contains(oid)) {
8518 c = p.second;
8519 break;
8520 }
8521 }
8522 if (!c) {
8523 continue;
8524 }
eafe8130
TL
8525 if (per_pool_stat_collection || repair) {
8526 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
8527 expected_statfs = &expected_pool_statfs[pool_id];
8528 }
8529 }
b3b6e05e 8530 if (!repairer.is_used(c->cid)) {
11fdf7f2
TL
8531 continue;
8532 }
8533
8534 dout(20) << __func__ << " check misreference for col:" << c->cid
8535 << " obj:" << oid << dendl;
8536
eafe8130
TL
8537 OnodeRef o;
8538 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
8539 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8540 mempool::bluestore_fsck::set<BlobRef> blobs;
8541
8542 for (auto& e : o->extent_map.extent_map) {
8543 blobs.insert(e.blob);
8544 }
8545 bool need_onode_update = false;
8546 bool first_dump = true;
8547 for(auto b : blobs) {
8548 bool broken_blob = false;
8549 auto& pextents = b->dirty_blob().dirty_extents();
8550 for (auto& e : pextents) {
8551 if (!e.is_valid()) {
8552 continue;
8553 }
8554 // for the sake of simplicity and proper shared blob handling
8555 // always rewrite the whole blob even when it's partially
8556 // misreferenced.
8557 if (misref_extents.intersects(e.offset, e.length)) {
8558 if (first_dump) {
8559 first_dump = false;
81eedcae 8560 _dump_onode<10>(cct, *o);
11fdf7f2
TL
8561 }
8562 broken_blob = true;
8563 break;
8564 }
8565 }
8566 if (!broken_blob)
8567 continue;
8568 bool compressed = b->get_blob().is_compressed();
8569 need_onode_update = true;
8570 dout(10) << __func__
8571 << " fix misreferences in oid:" << oid
8572 << " " << *b << dendl;
8573 uint64_t b_off = 0;
8574 PExtentVector pext_to_release;
8575 pext_to_release.reserve(pextents.size());
8576 // rewriting all valid pextents
8577 for (auto e = pextents.begin(); e != pextents.end();
a4b75251
TL
8578 e++) {
8579 auto b_off_cur = b_off;
8580 b_off += e->length;
11fdf7f2
TL
8581 if (!e->is_valid()) {
8582 continue;
8583 }
8584 PExtentVector exts;
f67539c2
TL
8585 int64_t alloc_len =
8586 shared_alloc.a->allocate(e->length, min_alloc_size,
8587 0, 0, &exts);
eafe8130 8588 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
8589 derr << __func__
8590 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8591 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 8592 << " min_alloc_size 0x" << min_alloc_size
f67539c2 8593 << " available 0x " << shared_alloc.a->get_free()
11fdf7f2
TL
8594 << std::dec << dendl;
8595 if (alloc_len > 0) {
f67539c2 8596 shared_alloc.a->release(exts);
11fdf7f2
TL
8597 }
8598 bypass_rest = true;
8599 break;
8600 }
8601 expected_statfs->allocated += e->length;
8602 if (compressed) {
8603 expected_statfs->data_compressed_allocated += e->length;
8604 }
8605
8606 bufferlist bl;
8607 IOContext ioc(cct, NULL, true); // allow EIO
8608 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8609 if (r < 0) {
8610 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8611 <<"~" << e->length << std::dec << dendl;
8612 ceph_abort_msg("read failed, wtf");
8613 }
8614 pext_to_release.push_back(*e);
8615 e = pextents.erase(e);
8616 e = pextents.insert(e, exts.begin(), exts.end());
8617 b->get_blob().map_bl(
a4b75251
TL
8618 b_off_cur,
8619 bl,
11fdf7f2
TL
8620 [&](uint64_t offset, bufferlist& t) {
8621 int r = bdev->write(offset, t, false);
8622 ceph_assert(r == 0);
8623 });
8624 e += exts.size() - 1;
8625 for (auto& p : exts) {
8626 fm->allocate(p.offset, p.length, txn);
8627 }
8628 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8629
8630 if (b->get_blob().is_shared()) {
8631 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8632
8633 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8634 ceph_assert(sb_it != sb_info.end());
8635 sb_info_t& sbi = sb_it->second;
8636
8637 for (auto& r : sbi.ref_map.ref_map) {
8638 expected_statfs->allocated -= r.second.length;
8639 if (sbi.compressed) {
8640 // NB: it's crucial to use compressed flag from sb_info_t
8641 // as we originally used that value while accumulating
8642 // expected_statfs
8643 expected_statfs->data_compressed_allocated -= r.second.length;
8644 }
8645 }
8646 sbi.updated = sbi.passed = true;
8647 sbi.ref_map.clear();
8648
8649 // relying on blob's pextents to decide what to release.
8650 for (auto& p : pext_to_release) {
8651 to_release.union_insert(p.offset, p.length);
8652 }
8653 } else {
8654 for (auto& p : pext_to_release) {
8655 expected_statfs->allocated -= p.length;
8656 if (compressed) {
8657 expected_statfs->data_compressed_allocated -= p.length;
8658 }
8659 to_release.union_insert(p.offset, p.length);
8660 }
8661 }
8662 if (bypass_rest) {
8663 break;
8664 }
8665 } // for(auto b : blobs)
8666 if (need_onode_update) {
8667 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8668 _record_onode(o, txn);
8669 }
8670 } // for (it->lower_bound(string()); it->valid(); it->next())
8671
8672 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8673 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8674 << "~" << it.get_len() << std::dec << dendl;
8675 fm->release(it.get_start(), it.get_len(), txn);
8676 }
f67539c2 8677 shared_alloc.a->release(to_release);
11fdf7f2
TL
8678 to_release.clear();
8679 } // if (it) {
8680 } //if (repair && repairer.preprocess_misreference()) {
8681
eafe8130
TL
8682 if (depth != FSCK_SHALLOW) {
8683 for (auto &p : sb_info) {
8684 sb_info_t& sbi = p.second;
8685 if (!sbi.passed) {
8686 derr << "fsck error: missing " << *sbi.sb << dendl;
8687 ++errors;
8688 }
8689 if (repair && (!sbi.passed || sbi.updated)) {
8690 auto sbid = p.first;
8691 if (sbi.ref_map.empty()) {
8692 ceph_assert(sbi.passed);
8693 dout(20) << __func__ << " " << *sbi.sb
8694 << " is empty, removing" << dendl;
8695 repairer.fix_shared_blob(db, sbid, nullptr);
8696 } else {
8697 bufferlist bl;
8698 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8699 encode(persistent, bl);
8700 dout(20) << __func__ << " " << *sbi.sb
a4b75251
TL
8701 << " is " << bl.length() << " bytes, updating"
8702 << dendl;
11fdf7f2 8703
eafe8130 8704 repairer.fix_shared_blob(db, sbid, &bl);
a4b75251
TL
8705 // we need to account for shared blob pextents at both
8706 // stats and used blocks to avoid related errors.
8707 PExtentVector extents;
8708 for (auto& r : persistent.ref_map.ref_map) {
8709 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8710 }
8711 auto* expected_statfs = &expected_pool_statfs[sbi.pool_id];
8712 int errors = _fsck_check_extents(sbi.cid,
8713 ghobject_t(), // doesn't matter
8714 extents,
8715 sbi.compressed,
8716 used_blocks,
8717 fm->get_alloc_size(),
8718 nullptr,
8719 *expected_statfs,
8720 depth);
8721 if (errors) {
8722 derr << __func__ << " " << errors
8723 << " unexpected error(s) after missed shared blob repair,"
8724 << " perhaps worth one more repair attempt"
8725 << dendl;
8726 }
eafe8130 8727 }
7c673cae
FG
8728 }
8729 }
8730 }
11fdf7f2
TL
8731 sb_info.clear();
8732
eafe8130
TL
8733 // check global stats only if fscking (not repairing) w/o per-pool stats
8734 if (!per_pool_stat_collection &&
8735 !repair &&
8736 !(actual_statfs == expected_store_statfs)) {
8737 derr << "fsck error: actual " << actual_statfs
8738 << " != expected " << expected_store_statfs << dendl;
8739 if (repair) {
8740 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8741 expected_store_statfs);
11fdf7f2 8742 }
eafe8130 8743 ++errors;
7c673cae
FG
8744 }
8745
eafe8130
TL
8746 dout(1) << __func__ << " checking pool_statfs" << dendl;
8747 _fsck_check_pool_statfs(expected_pool_statfs,
8748 errors, warnings, repair ? &repairer : nullptr);
8749
8750 if (depth != FSCK_SHALLOW) {
9f95a23c 8751 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 8752 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8753 if (it) {
9f95a23c 8754 uint64_t last_omap_head = 0;
eafe8130
TL
8755 for (it->lower_bound(string()); it->valid(); it->next()) {
8756 uint64_t omap_head;
f67539c2 8757
eafe8130 8758 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 8759
9f95a23c 8760 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8761 omap_head != last_omap_head) {
9f95a23c
TL
8762 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8763 << "fsck error: found stray omap data on omap_head "
f67539c2
TL
8764 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8765 ++errors;
8766 last_omap_head = omap_head;
eafe8130 8767 }
8768 }
8769 }
f67539c2 8770 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8771 if (it) {
9f95a23c 8772 uint64_t last_omap_head = 0;
8773 for (it->lower_bound(string()); it->valid(); it->next()) {
8774 uint64_t omap_head;
8775 _key_decode_u64(it->key().c_str(), &omap_head);
8776 if (used_omap_head.count(omap_head) == 0 &&
8777 omap_head != last_omap_head) {
8778 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8779 << "fsck error: found stray (pgmeta) omap data on omap_head "
8780 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8781 last_omap_head = omap_head;
8782 ++errors;
8783 }
8784 }
8785 }
f67539c2 8786 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8787 if (it) {
8788 uint64_t last_omap_head = 0;
8789 for (it->lower_bound(string()); it->valid(); it->next()) {
8790 uint64_t pool;
8791 uint64_t omap_head;
8792 string k = it->key();
8793 const char *c = k.c_str();
8794 c = _key_decode_u64(c, &pool);
8795 c = _key_decode_u64(c, &omap_head);
8796 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8797 omap_head != last_omap_head) {
8798 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8799 << "fsck error: found stray (per-pool) omap data on omap_head "
8800 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8801 ++errors;
8802 last_omap_head = omap_head;
8803 }
8804 }
8805 }
8806 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8807 if (it) {
8808 uint64_t last_omap_head = 0;
8809 for (it->lower_bound(string()); it->valid(); it->next()) {
8810 uint64_t pool;
8811 uint32_t hash;
8812 uint64_t omap_head;
8813 string k = it->key();
8814 const char* c = k.c_str();
8815 c = _key_decode_u64(c, &pool);
8816 c = _key_decode_u32(c, &hash);
8817 c = _key_decode_u64(c, &omap_head);
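 // key layout, as implied by the decode sequence above:
 //   u64 pool | u32 hash | u64 omap_head | <per-key suffix>
 // keys sharing an omap_head therefore sort together, so tracking
 // last_omap_head below is enough to report each stray head once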
8818 if (used_omap_head.count(omap_head) == 0 &&
8819 omap_head != last_omap_head) {
8820 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8821 << "fsck error: found stray (per-pg) omap data on omap_head "
8822 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8823 ++errors;
8824 last_omap_head = omap_head;
8825 }
8826 }
8827 }
eafe8130 8828 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 8829 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
8830 if (it) {
8831 for (it->lower_bound(string()); it->valid(); it->next()) {
8832 bufferlist bl = it->value();
8833 auto p = bl.cbegin();
8834 bluestore_deferred_transaction_t wt;
8835 try {
8836 decode(wt, p);
f67539c2 8837 } catch (ceph::buffer::error& e) {
8838 derr << "fsck error: failed to decode deferred txn "
8839 << pretty_binary_string(it->key()) << dendl;
8840 if (repair) {
8841 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8842 << pretty_binary_string(it->key())
8843 << "', removing" << dendl;
8844 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8845 }
8846 continue;
8847 }
8848 dout(20) << __func__ << " deferred " << wt.seq
8849 << " ops " << wt.ops.size()
8850 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8851 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 8852 apply_for_bitset_range(
f67539c2 8853 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 8854 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8855 bs.set(pos);
8856 }
8857 );
8858 }
7c673cae 8859 }
8860 }
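 // note: extents whose release is still pending in an unreplayed
 // deferred txn are marked used above, so the freelist cross-check
 // below does not misreport them as leaked or false-free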
8861
8862 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8863 {
8864 fm->enumerate_reset();
8865 uint64_t offset, length;
8866 while (fm->enumerate_next(db, &offset, &length)) {
8867 bool intersects = false;
9f95a23c 8868 apply_for_bitset_range(
f67539c2 8869 offset, length, alloc_size, used_blocks,
eafe8130 8870 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8871 ceph_assert(pos < bs.size());
8872 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
8873 if (offset == SUPER_RESERVED &&
8874 length == min_alloc_size - SUPER_RESERVED) {
8875 // this is due to the change just after luminous to min_alloc_size
8876 // granularity allocations, and our baked in assumption at the top
8877 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8878 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8879 // since we will never allocate this region below min_alloc_size.
8880 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8881 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8882 << length << std::dec << dendl;
8883 } else {
8884 intersects = true;
8885 if (repair) {
8886 repairer.fix_false_free(db, fm,
8887 pos * min_alloc_size,
8888 min_alloc_size);
8889 }
11fdf7f2 8890 }
8891 } else {
8892 bs.set(pos);
8893 }
7c673cae 8894 }
8895 );
8896 if (intersects) {
8897 derr << "fsck error: free extent 0x" << std::hex << offset
8898 << "~" << length << std::dec
8899 << " intersects allocated blocks" << dendl;
8900 ++errors;
7c673cae 8901 }
b5b8bbf5 8902 }
8903 fm->enumerate_reset();
8904 size_t count = used_blocks.count();
8905 if (used_blocks.size() != count) {
8906 ceph_assert(used_blocks.size() > count);
8907 used_blocks.flip();
8908 size_t start = used_blocks.find_first();
8909 while (start != decltype(used_blocks)::npos) {
8910 size_t cur = start;
8911 while (true) {
8912 size_t next = used_blocks.find_next(cur);
8913 if (next != cur + 1) {
8914 ++errors;
8915 derr << "fsck error: leaked extent 0x" << std::hex
8916 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8917 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8918 << dendl;
8919 if (repair) {
8920 repairer.fix_leaked(db,
8921 fm,
8922 start * min_alloc_size,
8923 (cur + 1 - start) * min_alloc_size);
8924 }
8925 start = next;
8926 break;
11fdf7f2 8927 }
eafe8130 8928 cur = next;
b5b8bbf5 8929 }
8930 }
8931 used_blocks.flip();
b5b8bbf5 8932 }
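 // sketch of the flip trick above: at this point every block that is
 // either allocated or legitimately free has its bit set, so flip()
 // leaves exactly the leaked blocks set; e.g. with a 64K alloc unit
 // and bits {5,6,7} set after the flip, a single leaked extent
 // 0x50000~0x30000 is reported (and fixed when repair is enabled)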
8933 }
8934 }
11fdf7f2 8935 if (repair) {
8936 if (per_pool_omap != OMAP_PER_PG) {
8937 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
8938 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
8939 }
8940
8941 dout(5) << __func__ << " applying repair results" << dendl;
8942 repaired = repairer.apply(db);
8943 dout(5) << __func__ << " repair applied" << dendl;
8944 }
7c673cae 8945
eafe8130 8946out_scan:
8947 dout(2) << __func__ << " " << num_objects << " objects, "
8948 << num_sharded_objects << " of them sharded. "
8949 << dendl;
8950 dout(2) << __func__ << " " << num_extents << " extents to "
8951 << num_blobs << " blobs, "
8952 << num_spanning_blobs << " spanning, "
8953 << num_shared_blobs << " shared."
8954 << dendl;
8955
8956 utime_t duration = ceph_clock_now() - start;
8957 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8958 << warnings << " warnings, "
8959 << repaired << " repaired, "
8960 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 8961 << duration << " seconds" << dendl;
8962
8963 // In non-repair mode return the error count only, since it alone
8964 // indicates whether the store status is OK.
8965 // In repair mode both errors and warnings are taken into account,
8966 // since the repaired counter relates to them both.
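 // e.g. errors = 3, warnings = 2, repaired = 4: repair mode returns 1
 // (one problem left), while plain fsck returns 3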
8967 return repair ? errors + warnings - (int)repaired : errors;
8968}
8969
8970/// methods to inject various errors fsck can repair
8971void BlueStore::inject_broken_shared_blob_key(const string& key,
8972 const bufferlist& bl)
8973{
8974 KeyValueDB::Transaction txn;
8975 txn = db->get_transaction();
8976 txn->set(PREFIX_SHARED_BLOB, key, bl);
8977 db->submit_transaction_sync(txn);
8978};
8979
8980void BlueStore::inject_no_shared_blob_key()
8981{
8982 KeyValueDB::Transaction txn;
8983 txn = db->get_transaction();
8984 ceph_assert(blobid_last > 0);
8985 // kill the last used sbid; in rare cases blobid preallocation makes
8986 // this miss, which we leave as-is for the sake of simplicity
8987 uint64_t sbid = blobid_last;
8988
8989 string key;
8990 dout(5) << __func__<< " " << sbid << dendl;
8991 get_shared_blob_key(sbid, &key);
8992 txn->rmkey(PREFIX_SHARED_BLOB, key);
8993 db->submit_transaction_sync(txn);
8994};
8995
8996
8997void BlueStore::inject_leaked(uint64_t len)
8998{
8999 KeyValueDB::Transaction txn;
9000 txn = db->get_transaction();
9001
9002 PExtentVector exts;
f67539c2 9003 int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
9004 min_alloc_size * 256, 0, &exts);
9005 ceph_assert(alloc_len >= (int64_t)len);
9006 for (auto& p : exts) {
9007 fm->allocate(p.offset, p.length, txn);
9008 }
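 // the extents are recorded as allocated in the freelist but never
 // referenced by any onode, which is exactly the "leaked extent"
 // condition fsck detects and can repair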
9009 db->submit_transaction_sync(txn);
9010}
9011
9012void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
9013{
9014 KeyValueDB::Transaction txn;
9015 OnodeRef o;
9016 CollectionRef c = _get_collection(cid);
9017 ceph_assert(c);
9018 {
9f95a23c 9019 std::unique_lock l{c->lock}; // just to avoid internal asserts
9020 o = c->get_onode(oid, false);
9021 ceph_assert(o);
9022 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9023 }
9024
9025 bool injected = false;
9026 txn = db->get_transaction();
9027 auto& em = o->extent_map.extent_map;
9028 std::vector<const PExtentVector*> v;
9029 if (em.size()) {
9030 v.push_back(&em.begin()->blob->get_blob().get_extents());
9031 }
9032 if (em.size() > 1) {
9033 auto it = em.end();
9034 --it;
9035 v.push_back(&(it->blob->get_blob().get_extents()));
9036 }
9037 for (auto pext : v) {
9038 if (pext->size()) {
9039 auto p = pext->begin();
9040 while (p != pext->end()) {
9041 if (p->is_valid()) {
9042 dout(20) << __func__ << " release 0x" << std::hex << p->offset
9043 << "~" << p->length << std::dec << dendl;
9044 fm->release(p->offset, p->length, txn);
9045 injected = true;
9046 break;
9047 }
9048 ++p;
9049 }
9050 }
9051 }
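 // releasing a pextent that an onode still references makes the freelist
 // consider it free while it is actually in use; that is the "false free"
 // condition fsck detects and can repair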
9052 ceph_assert(injected);
9053 db->submit_transaction_sync(txn);
9054}
9055
9056void BlueStore::inject_legacy_omap()
9057{
9058 dout(1) << __func__ << dendl;
f67539c2 9059 per_pool_omap = OMAP_BULK;
9060 KeyValueDB::Transaction txn;
9061 txn = db->get_transaction();
9062 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
9063 db->submit_transaction_sync(txn);
9064}
9065
9066void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9067{
9068 dout(1) << __func__ << " "
9069 << cid << " " << oid
9070 << dendl;
9071 KeyValueDB::Transaction txn;
9072 OnodeRef o;
9073 CollectionRef c = _get_collection(cid);
9074 ceph_assert(c);
9075 {
9076 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9077 o = c->get_onode(oid, false);
9078 ceph_assert(o);
9079 }
9080 o->onode.clear_flag(
9081 bluestore_onode_t::FLAG_PERPG_OMAP |
9082 bluestore_onode_t::FLAG_PERPOOL_OMAP |
9083 bluestore_onode_t::FLAG_PGMETA_OMAP);
9084 txn = db->get_transaction();
9085 _record_onode(o, txn);
9086 db->submit_transaction_sync(txn);
9087}
9088
9089
9090void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9091{
9092 BlueStoreRepairer repairer;
9093 repairer.fix_statfs(db, key, new_statfs);
9094 repairer.apply(db);
9095}
9096
9097void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9098{
9099 KeyValueDB::Transaction t = db->get_transaction();
9100 volatile_statfs v;
9101 v = new_statfs;
9102 bufferlist bl;
9103 v.encode(bl);
9104 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9105 db->submit_transaction_sync(t);
9106}
9107
9108void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9109 coll_t cid2, ghobject_t oid2,
9110 uint64_t offset)
9111{
9112 OnodeRef o1;
9113 CollectionRef c1 = _get_collection(cid1);
9114 ceph_assert(c1);
9115 {
9f95a23c 9116 std::unique_lock l{c1->lock}; // just to avoid internal asserts
9117 o1 = c1->get_onode(oid1, false);
9118 ceph_assert(o1);
9119 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9120 }
9121 OnodeRef o2;
9122 CollectionRef c2 = _get_collection(cid2);
9123 ceph_assert(c2);
9124 {
9f95a23c 9125 std::unique_lock l{c2->lock}; // just to avoid internal asserts
9126 o2 = c2->get_onode(oid2, false);
9127 ceph_assert(o2);
9128 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9129 }
9130 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9131 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9132
9133 // require onode/extent layout to be the same (and simple)
9134 // to make things easier
9135 ceph_assert(o1->onode.extent_map_shards.empty());
9136 ceph_assert(o2->onode.extent_map_shards.empty());
9137 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9138 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9139 ceph_assert(e1.logical_offset == e2.logical_offset);
9140 ceph_assert(e1.length == e2.length);
9141 ceph_assert(e1.blob_offset == e2.blob_offset);
9142
9143 KeyValueDB::Transaction txn;
9144 txn = db->get_transaction();
9145
9146 // along with the misreference error this will create space leak errors
9147 e2.blob->dirty_blob() = e1.blob->get_blob();
9148 o2->extent_map.dirty_range(offset, e2.length);
9149 o2->extent_map.update(txn, false);
9150
9151 _record_onode(o2, txn);
9152 db->submit_transaction_sync(txn);
9153}
9154
9155void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9156 int16_t blob_id)
9157{
9158 OnodeRef o;
9159 CollectionRef c = _get_collection(cid);
9160 ceph_assert(c);
9161 {
9162 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9163 o = c->get_onode(oid, false);
9164 ceph_assert(o);
9165 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9166 }
9167
9168 BlobRef b = c->new_blob();
9169 b->id = blob_id;
9170 o->extent_map.spanning_blob_map[blob_id] = b;
9171
9172 KeyValueDB::Transaction txn;
9173 txn = db->get_transaction();
9174
9175 _record_onode(o, txn);
9176 db->submit_transaction_sync(txn);
9177}
9178
9179void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
9180{
9181 ceph_assert(bluefs);
9182
9183 BlueFS::FileWriter* p_handle = nullptr;
9184 auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
9185 ceph_assert(ret == 0);
9186
9187 std::string s(new_size, '0'); // (count, fill) argument order; ('0', new_size) built a wrong, 48-char string
9188 bufferlist bl;
9189 bl.append(s);
9190 p_handle->append(bl);
9191
9192 bluefs->fsync(p_handle);
9193 bluefs->close_writer(p_handle);
9194}
9195
9196void BlueStore::collect_metadata(map<string,string> *pm)
9197{
9198 dout(10) << __func__ << dendl;
9199 bdev->collect_metadata("bluestore_bdev_", pm);
9200 if (bluefs) {
9201 (*pm)["bluefs"] = "1";
9202 // this value is for backward compatibility only
9203 (*pm)["bluefs_single_shared_device"] = \
9204 stringify((int)bluefs_layout.single_shared_device());
9205 (*pm)["bluefs_dedicated_db"] = \
9206 stringify((int)bluefs_layout.dedicated_db);
9207 (*pm)["bluefs_dedicated_wal"] = \
9208 stringify((int)bluefs_layout.dedicated_wal);
9209 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
9210 } else {
9211 (*pm)["bluefs"] = "0";
9212 }
9213
9214 // report numa mapping for underlying devices
9215 int node = -1;
9216 set<int> nodes;
9217 set<string> failed;
9218 int r = get_numa_node(&node, &nodes, &failed);
9219 if (r >= 0) {
9220 if (!failed.empty()) {
9221 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9222 }
9223 if (!nodes.empty()) {
9224 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9225 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9226 }
9227 if (node >= 0) {
9228 (*pm)["objectstore_numa_node"] = stringify(node);
9229 }
9230 }
9231}
9232
9233int BlueStore::get_numa_node(
9234 int *final_node,
9235 set<int> *out_nodes,
9236 set<string> *out_failed)
9237{
9238 int node = -1;
9239 set<string> devices;
9240 get_devices(&devices);
9241 set<int> nodes;
9242 set<string> failed;
9243 for (auto& devname : devices) {
9244 int n;
9245 BlkDev bdev(devname);
9246 int r = bdev.get_numa_node(&n);
9247 if (r < 0) {
9248 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9249 << dendl;
9250 failed.insert(devname);
9251 continue;
9252 }
9253 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9254 << dendl;
9255 nodes.insert(n);
9256 if (node < 0) {
9257 node = n;
9258 }
9259 }
9260 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9261 *final_node = node;
9262 }
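 // *final_node is only set when every device reported the same node and
 // none failed detection; callers can treat an unchanged *final_node as
 // unknown or mixed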
9263 if (out_nodes) {
9264 *out_nodes = nodes;
9265 }
9266 if (out_failed) {
9267 *out_failed = failed;
9268 }
9269 return 0;
9270}
9271
9272int BlueStore::get_devices(set<string> *ls)
9273{
9274 if (bdev) {
9275 bdev->get_devices(ls);
9276 if (bluefs) {
9277 bluefs->get_devices(ls);
9278 }
9279 return 0;
9280 }
9281
9282 // grumble, we haven't started up yet.
9283 int r = _open_path();
9284 if (r < 0)
9285 goto out;
9286 r = _open_fsid(false);
9287 if (r < 0)
9288 goto out_path;
9289 r = _read_fsid(&fsid);
9290 if (r < 0)
9291 goto out_fsid;
9292 r = _lock_fsid();
9293 if (r < 0)
9294 goto out_fsid;
9295 r = _open_bdev(false);
9296 if (r < 0)
9297 goto out_fsid;
9298 r = _minimal_open_bluefs(false);
9299 if (r < 0)
9300 goto out_bdev;
9301 bdev->get_devices(ls);
9302 if (bluefs) {
9303 bluefs->get_devices(ls);
9304 }
9305 r = 0;
9306 _minimal_close_bluefs();
9307 out_bdev:
9308 _close_bdev();
9309 out_fsid:
9310 _close_fsid();
9311 out_path:
9312 _close_path();
9313 out:
9314 return r;
9315}
9316
11fdf7f2 9317void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
9318{
9319 buf->reset();
11fdf7f2 9320
9321 auto prefix = per_pool_omap == OMAP_BULK ?
9322 PREFIX_OMAP :
9323 per_pool_omap == OMAP_PER_POOL ?
9324 PREFIX_PERPOOL_OMAP :
9325 PREFIX_PERPG_OMAP;
9f95a23c 9326 buf->omap_allocated =
f67539c2 9327 db->estimate_prefix_size(prefix, string());
11fdf7f2 9328
f67539c2 9329 uint64_t bfree = shared_alloc.a->get_free();
9330
9331 if (bluefs) {
f67539c2 9332 buf->internally_reserved = 0;
11fdf7f2 9333 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9334 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9335 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9336 }
9337 // call any non-omap bluefs space "internal metadata"
9338 buf->internal_metadata =
f67539c2 9339 bluefs->get_used()
11fdf7f2 9340 - buf->omap_allocated;
9341 }
9342
9343 uint64_t thin_total, thin_avail;
9344 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9345 buf->total += thin_total;
9346
9347 // we are limited by both the size of the virtual device and the
9348 // underlying physical device.
9349 bfree = std::min(bfree, thin_avail);
9350
9351 buf->allocated = thin_total - thin_avail;
9352 } else {
9353 buf->total += bdev->get_size();
9354 }
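 // worked example (hypothetical numbers): a thin-provisioned device with
 // thin_total = 1 TiB and thin_avail = 200 GiB contributes
 // allocated = 824 GiB and caps the reported free space at 200 GiB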
9355 buf->available = bfree;
9356}
9357
9358int BlueStore::statfs(struct store_statfs_t *buf,
9359 osd_alert_list_t* alerts)
9360{
9361 if (alerts) {
9362 alerts->clear();
9363 _log_alerts(*alerts);
9364 }
9365 _get_statfs_overall(buf);
31f18b77 9366 {
11fdf7f2 9367 std::lock_guard l(vstatfs_lock);
31f18b77 9368 buf->allocated = vstatfs.allocated();
9369 buf->data_stored = vstatfs.stored();
9370 buf->data_compressed = vstatfs.compressed();
9371 buf->data_compressed_original = vstatfs.compressed_original();
9372 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9373 }
9374
9375 dout(20) << __func__ << " " << *buf << dendl;
9376 return 0;
9377}
9378
9379int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9380 bool *out_per_pool_omap)
9381{
9382 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 9383
9384 if (!per_pool_stat_collection) {
9385 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9386 return -ENOTSUP;
7c673cae 9387 }
11fdf7f2 9388 buf->reset();
7c673cae 9389
9390 {
9391 std::lock_guard l(vstatfs_lock);
9392 osd_pools[pool_id].publish(buf);
9393 }
9394
9395 string key_prefix;
9396 _key_encode_u64(pool_id, &key_prefix);
9397 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
9398 if (*out_per_pool_omap) {
9399 auto prefix = per_pool_omap == OMAP_PER_POOL ?
9400 PREFIX_PERPOOL_OMAP :
9401 PREFIX_PERPG_OMAP;
9402 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
9403 }
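 // per-pool/per-pg omap keys start with the encoded pool id, so an
 // estimate over that key prefix approximates this pool's omap footprint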
9f95a23c 9404
11fdf7f2 9405 dout(10) << __func__ << *buf << dendl;
9406 return 0;
9407}
9408
9409void BlueStore::_check_legacy_statfs_alert()
9410{
9411 string s;
9412 if (!per_pool_stat_collection &&
eafe8130 9413 cct->_conf->bluestore_warn_on_legacy_statfs) {
9414 s = "legacy statfs reporting detected, "
9415 "suggest to run store repair to get consistent statistic reports";
9416 }
9417 std::lock_guard l(qlock);
9418 legacy_statfs_alert = s;
9419}
9420
f67539c2 9421void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9f95a23c 9422{
9423 string per_pg, per_pool;
9424 if (per_pool_omap != OMAP_PER_PG) {
9425 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
9426 per_pg = "legacy (not per-pg) omap detected, "
9427 "suggest to run store repair to benefit from faster PG removal";
9428 }
9429 if (per_pool_omap != OMAP_PER_POOL) {
9430 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9431 per_pool = "legacy (not per-pool) omap detected, "
9432 "suggest to run store repair to benefit from per-pool omap usage statistics";
9433 }
9434 }
9435 }
9436 std::lock_guard l(qlock);
9437 no_per_pg_omap_alert = per_pg;
9438 no_per_pool_omap_alert = per_pool;
9439}
9440
9441// ---------------
9442// cache
9443
9444BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9445{
9f95a23c 9446 std::shared_lock l(coll_lock);
9447 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9448 if (cp == coll_map.end())
9449 return CollectionRef();
9450 return cp->second;
9451}
9452
9453void BlueStore::_queue_reap_collection(CollectionRef& c)
9454{
9455 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9456 // _reap_collections and this run in the same thread,
9457 // so no lock is needed.
9458 removed_collections.push_back(c);
9459}
9460
9461void BlueStore::_reap_collections()
9462{
94b18763 9463
9464 list<CollectionRef> removed_colls;
9465 {
9466 // _queue_reap_collection and this run in the same thread,
9467 // so no lock is needed.
9468 if (!removed_collections.empty())
9469 removed_colls.swap(removed_collections);
9470 else
9471 return;
9472 }
9473
9474 list<CollectionRef>::iterator p = removed_colls.begin();
9475 while (p != removed_colls.end()) {
9476 CollectionRef c = *p;
9477 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
adb31ebb 9478 if (c->onode_map.map_any([&](Onode* o) {
11fdf7f2 9479 ceph_assert(!o->exists);
9480 if (o->flushing_count.load()) {
9481 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9482 << " flush_txns " << o->flushing_count << dendl;
94b18763 9483 return true;
7c673cae 9484 }
94b18763 9485 return false;
7c673cae 9486 })) {
94b18763 9487 ++p;
9488 continue;
9489 }
9490 c->onode_map.clear();
94b18763 9491 p = removed_colls.erase(p);
9492 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9493 }
94b18763 9494 if (removed_colls.empty()) {
7c673cae 9495 dout(10) << __func__ << " all reaped" << dendl;
9496 } else {
9497 removed_collections.splice(removed_collections.begin(), removed_colls);
9498 }
9499}
9500
9501void BlueStore::_update_cache_logger()
9502{
9503 uint64_t num_onodes = 0;
9f95a23c 9504 uint64_t num_pinned_onodes = 0;
9505 uint64_t num_extents = 0;
9506 uint64_t num_blobs = 0;
9507 uint64_t num_buffers = 0;
9508 uint64_t num_buffer_bytes = 0;
9509 for (auto c : onode_cache_shards) {
9510 c->add_stats(&num_onodes, &num_pinned_onodes);
9511 }
9512 for (auto c : buffer_cache_shards) {
9513 c->add_stats(&num_extents, &num_blobs,
9514 &num_buffers, &num_buffer_bytes);
9515 }
9516 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9517 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
9518 logger->set(l_bluestore_extents, num_extents);
9519 logger->set(l_bluestore_blobs, num_blobs);
9520 logger->set(l_bluestore_buffers, num_buffers);
9521 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9522}
9523
9524// ---------------
9525// read operations
9526
9527ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9528{
9529 return _get_collection(cid);
9530}
9531
9532ObjectStore::CollectionHandle BlueStore::create_new_collection(
9533 const coll_t& cid)
7c673cae 9534{
9535 std::unique_lock l{coll_lock};
9536 auto c = ceph::make_ref<Collection>(
11fdf7f2 9537 this,
9538 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9539 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
9540 cid);
9541 new_coll_map[cid] = c;
9f95a23c 9542 _osr_attach(c.get());
9543 return c;
9544}
9545
9546void BlueStore::set_collection_commit_queue(
9547 const coll_t& cid,
9548 ContextQueue *commit_queue)
9549{
9550 if (commit_queue) {
9f95a23c 9551 std::shared_lock l(coll_lock);
9552 if (coll_map.count(cid)) {
9553 coll_map[cid]->commit_queue = commit_queue;
9554 } else if (new_coll_map.count(cid)) {
9555 new_coll_map[cid]->commit_queue = commit_queue;
9556 }
9557 }
9558}
9559
11fdf7f2 9560
9561bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9562{
9563 Collection *c = static_cast<Collection *>(c_.get());
9564 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9565 if (!c->exists)
9566 return false;
9567
9568 bool r = true;
9569
9570 {
9f95a23c 9571 std::shared_lock l(c->lock);
7c673cae
FG
9572 OnodeRef o = c->get_onode(oid, false);
9573 if (!o || !o->exists)
9574 r = false;
9575 }
9576
9577 return r;
9578}
9579
9580int BlueStore::stat(
9581 CollectionHandle &c_,
9582 const ghobject_t& oid,
9583 struct stat *st,
9584 bool allow_eio)
9585{
9586 Collection *c = static_cast<Collection *>(c_.get());
9587 if (!c->exists)
9588 return -ENOENT;
9589 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9590
9591 {
9f95a23c 9592 std::shared_lock l(c->lock);
9593 OnodeRef o = c->get_onode(oid, false);
9594 if (!o || !o->exists)
9595 return -ENOENT;
9596 st->st_size = o->onode.size;
9597 st->st_blksize = 4096;
9598 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9599 st->st_nlink = 1;
9600 }
9601
9602 int r = 0;
9603 if (_debug_mdata_eio(oid)) {
9604 r = -EIO;
9605 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9606 }
9607 return r;
9608}
9609int BlueStore::set_collection_opts(
11fdf7f2 9610 CollectionHandle& ch,
9611 const pool_opts_t& opts)
9612{
7c673cae 9613 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9614 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
9615 if (!c->exists)
9616 return -ENOENT;
9f95a23c 9617 std::unique_lock l{c->lock};
9618 c->pool_opts = opts;
9619 return 0;
9620}
9621
9622int BlueStore::read(
9623 CollectionHandle &c_,
9624 const ghobject_t& oid,
9625 uint64_t offset,
9626 size_t length,
9627 bufferlist& bl,
224ce89b 9628 uint32_t op_flags)
7c673cae 9629{
11fdf7f2 9630 auto start = mono_clock::now();
7c673cae
FG
9631 Collection *c = static_cast<Collection *>(c_.get());
9632 const coll_t &cid = c->get_cid();
9633 dout(15) << __func__ << " " << cid << " " << oid
9634 << " 0x" << std::hex << offset << "~" << length << std::dec
9635 << dendl;
9636 if (!c->exists)
9637 return -ENOENT;
9638
9639 bl.clear();
9640 int r;
9641 {
9f95a23c 9642 std::shared_lock l(c->lock);
11fdf7f2 9643 auto start1 = mono_clock::now();
7c673cae 9644 OnodeRef o = c->get_onode(oid, false);
9645 log_latency("get_onode@read",
9646 l_bluestore_read_onode_meta_lat,
9647 mono_clock::now() - start1,
9648 cct->_conf->bluestore_log_op_age);
9649 if (!o || !o->exists) {
9650 r = -ENOENT;
9651 goto out;
9652 }
9653
9654 if (offset == length && offset == 0)
9655 length = o->onode.size;
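 // note: (offset == 0, length == 0) is the "read the whole object"
 // convention here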
9656
9657 r = _do_read(c, o, offset, length, bl, op_flags);
9658 if (r == -EIO) {
9659 logger->inc(l_bluestore_read_eio);
9660 }
9661 }
9662
9663 out:
28e407b8 9664 if (r >= 0 && _debug_data_eio(oid)) {
9665 r = -EIO;
9666 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9667 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9668 cct->_conf->bluestore_debug_random_read_err &&
9669 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9670 100.0)) == 0) {
9671 dout(0) << __func__ << ": inject random EIO" << dendl;
9672 r = -EIO;
9673 }
9674 dout(10) << __func__ << " " << cid << " " << oid
9675 << " 0x" << std::hex << offset << "~" << length << std::dec
9676 << " = " << r << dendl;
9677 log_latency(__func__,
9678 l_bluestore_read_lat,
9679 mono_clock::now() - start,
9680 cct->_conf->bluestore_log_op_age);
9681 return r;
9682}
9683
9f95a23c 9684void BlueStore::_read_cache(
9685 OnodeRef o,
9686 uint64_t offset,
9687 size_t length,
9688 int read_cache_policy,
9689 ready_regions_t& ready_regions,
9690 blobs2read_t& blobs2read)
7c673cae 9691{
7c673cae 9692 // build a blob-wise list of the stuff to read (that isn't cached)
9693 unsigned left = length;
9694 uint64_t pos = offset;
9695 auto lp = o->extent_map.seek_lextent(offset);
9696 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9697 if (pos < lp->logical_offset) {
9698 unsigned hole = lp->logical_offset - pos;
9699 if (hole >= left) {
9f95a23c 9700 break;
9701 }
9702 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9703 << std::dec << dendl;
9704 pos += hole;
9705 left -= hole;
9706 }
94b18763 9707 BlobRef& bptr = lp->blob;
9708 unsigned l_off = pos - lp->logical_offset;
9709 unsigned b_off = l_off + lp->blob_offset;
9710 unsigned b_len = std::min(left, lp->length - l_off);
9711
9712 ready_regions_t cache_res;
9713 interval_set<uint32_t> cache_interval;
9714 bptr->shared_blob->bc.read(
9715 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9716 read_cache_policy);
7c673cae 9717 dout(20) << __func__ << " blob " << *bptr << std::hex
9718 << " need 0x" << b_off << "~" << b_len
9719 << " cache has 0x" << cache_interval
9720 << std::dec << dendl;
9721
9722 auto pc = cache_res.begin();
11fdf7f2 9723 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9724 while (b_len > 0) {
9725 unsigned l;
9726 if (pc != cache_res.end() &&
9727 pc->first == b_off) {
9728 l = pc->second.length();
f67539c2 9729 ready_regions[pos] = std::move(pc->second);
9730 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9731 << b_off << "~" << l << std::dec << dendl;
9732 ++pc;
7c673cae 9733 } else {
9734 l = b_len;
9735 if (pc != cache_res.end()) {
9736 ceph_assert(pc->first > b_off);
9737 l = pc->first - b_off;
9738 }
9739 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9740 << b_off << "~" << l << std::dec << dendl;
9741 // merge regions
9742 {
9743 uint64_t r_off = b_off;
9744 uint64_t r_len = l;
9745 uint64_t front = r_off % chunk_size;
9746 if (front) {
9747 r_off -= front;
9748 r_len += front;
9749 }
9750 unsigned tail = r_len % chunk_size;
9751 if (tail) {
9752 r_len += chunk_size - tail;
9753 }
9754 bool merged = false;
9755 regions2read_t& r2r = blobs2read[bptr];
9756 if (r2r.size()) {
9757 read_req_t& pre = r2r.back();
9758 if (r_off <= (pre.r_off + pre.r_len)) {
9759 front += (r_off - pre.r_off);
9760 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9761 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9762 merged = true;
9763 }
9764 }
9765 if (!merged) {
9766 read_req_t req(r_off, r_len);
9767 req.regs.emplace_back(region_t(pos, b_off, l, front));
9768 r2r.emplace_back(std::move(req));
9769 }
9770 }
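 // alignment sketch: with a 4K chunk_size, a wanted span
 // b_off = 0x1100, l = 0x200 widens to r_off = 0x1000,
 // r_len = 0x1000 with front = 0x100; requests whose widened
 // ranges touch are merged above into a single larger read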
9771 }
9772 pos += l;
9773 b_off += l;
9774 left -= l;
9775 b_len -= l;
9776 }
9777 ++lp;
9778 }
9f95a23c 9779}
7c673cae 9780
9781int BlueStore::_prepare_read_ioc(
9782 blobs2read_t& blobs2read,
9783 vector<bufferlist>* compressed_blob_bls,
9784 IOContext* ioc)
9785{
7c673cae 9786 for (auto& p : blobs2read) {
94b18763 9787 const BlobRef& bptr = p.first;
11fdf7f2 9788 regions2read_t& r2r = p.second;
7c673cae 9789 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9790 << " need " << r2r << std::dec << dendl;
9791 if (bptr->get_blob().is_compressed()) {
9792 // read the whole thing
9793 if (compressed_blob_bls->empty()) {
9794 // ensure we avoid any reallocation on subsequent blobs
9795 compressed_blob_bls->reserve(blobs2read.size());
9796 }
9797 compressed_blob_bls->push_back(bufferlist());
9798 bufferlist& bl = compressed_blob_bls->back();
9799 auto r = bptr->get_blob().map(
9800 0, bptr->get_blob().get_ondisk_length(),
9801 [&](uint64_t offset, uint64_t length) {
9802 int r = bdev->aio_read(offset, length, &bl, ioc);
9803 if (r < 0)
9804 return r;
9805 return 0;
9f95a23c 9806 });
9807 if (r < 0) {
9808 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9809 if (r == -EIO) {
9810 // propagate EIO to caller
9811 return r;
9812 }
11fdf7f2 9813 ceph_assert(r == 0);
b32b8144 9814 }
9815 } else {
9816 // read the pieces
11fdf7f2 9817 for (auto& req : r2r) {
9818 dout(20) << __func__ << " region 0x" << std::hex
9819 << req.regs.front().logical_offset
9820 << ": 0x" << req.regs.front().blob_xoffset
9821 << " reading 0x" << req.r_off
9822 << "~" << req.r_len << std::dec
9823 << dendl;
7c673cae 9824
9825 // read it
9826 auto r = bptr->get_blob().map(
9827 req.r_off, req.r_len,
9828 [&](uint64_t offset, uint64_t length) {
9829 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9830 if (r < 0)
9831 return r;
9832 return 0;
9f95a23c 9833 });
9834 if (r < 0) {
9835 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9836 << dendl;
9837 if (r == -EIO) {
9838 // propagate EIO to caller
9839 return r;
9840 }
11fdf7f2 9841 ceph_assert(r == 0);
b32b8144 9842 }
9f95a23c 9843 ceph_assert(req.bl.length() == req.r_len);
9844 }
9845 }
9846 }
9847 return 0;
9848}
11fdf7f2 9849
9850int BlueStore::_generate_read_result_bl(
9851 OnodeRef o,
9852 uint64_t offset,
9853 size_t length,
9854 ready_regions_t& ready_regions,
9855 vector<bufferlist>& compressed_blob_bls,
9856 blobs2read_t& blobs2read,
9857 bool buffered,
9858 bool* csum_error,
9859 bufferlist& bl)
9860{
9861 // enumerate and decompress desired blobs
9862 auto p = compressed_blob_bls.begin();
9863 blobs2read_t::iterator b2r_it = blobs2read.begin();
9864 while (b2r_it != blobs2read.end()) {
94b18763 9865 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9866 regions2read_t& r2r = b2r_it->second;
7c673cae 9867 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9868 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9869 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9870 ceph_assert(p != compressed_blob_bls.end());
9871 bufferlist& compressed_bl = *p++;
9872 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9873 r2r.front().regs.front().logical_offset) < 0) {
9874 *csum_error = true;
9875 return -EIO;
9876 }
9877 bufferlist raw_bl;
9f95a23c 9878 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9879 if (r < 0)
9f95a23c 9880 return r;
7c673cae 9881 if (buffered) {
9882 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9883 raw_bl);
7c673cae 9884 }
9885 for (auto& req : r2r) {
9886 for (auto& r : req.regs) {
9887 ready_regions[r.logical_offset].substr_of(
9888 raw_bl, r.blob_xoffset, r.length);
9889 }
9890 }
9891 } else {
11fdf7f2 9892 for (auto& req : r2r) {
9893 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9894 req.regs.front().logical_offset) < 0) {
9895 *csum_error = true;
9896 return -EIO;
9897 }
9898 if (buffered) {
9899 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9900 req.r_off, req.bl);
9901 }
7c673cae 9902
9903 // prune and keep result
9904 for (const auto& r : req.regs) {
9905 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 9906 }
9907 }
9908 }
9909 ++b2r_it;
9910 }
9911
9912 // generate a resulting buffer
9913 auto pr = ready_regions.begin();
9914 auto pr_end = ready_regions.end();
9f95a23c 9915 uint64_t pos = 0;
9916 while (pos < length) {
9917 if (pr != pr_end && pr->first == pos + offset) {
9918 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9919 << ": data from 0x" << pr->first << "~" << pr->second.length()
9920 << std::dec << dendl;
9921 pos += pr->second.length();
9922 bl.claim_append(pr->second);
9923 ++pr;
9924 } else {
9925 uint64_t l = length - pos;
9926 if (pr != pr_end) {
11fdf7f2 9927 ceph_assert(pr->first > pos + offset);
9f95a23c 9928 l = pr->first - (pos + offset);
9929 }
9930 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9931 << ": zeros for 0x" << (pos + offset) << "~" << l
9932 << std::dec << dendl;
9933 bl.append_zero(l);
9934 pos += l;
9935 }
9936 }
9937 ceph_assert(bl.length() == length);
9938 ceph_assert(pos == length);
9939 ceph_assert(pr == pr_end);
9940 return 0;
9941}
9942
9943int BlueStore::_do_read(
9944 Collection *c,
9945 OnodeRef o,
9946 uint64_t offset,
9947 size_t length,
9948 bufferlist& bl,
9949 uint32_t op_flags,
9950 uint64_t retry_count)
9951{
9952 FUNCTRACE(cct);
9953 int r = 0;
9954 int read_cache_policy = 0; // do not bypass clean or dirty cache
9955
9956 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9957 << " size 0x" << o->onode.size << " (" << std::dec
9958 << o->onode.size << ")" << dendl;
9959 bl.clear();
9960
9961 if (offset >= o->onode.size) {
9962 return r;
9963 }
9964
9965 // generally, don't buffer anything, unless the client explicitly requests
9966 // it.
9967 bool buffered = false;
9968 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9969 dout(20) << __func__ << " will do buffered read" << dendl;
9970 buffered = true;
9971 } else if (cct->_conf->bluestore_default_buffered_read &&
9972 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9973 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9974 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9975 buffered = true;
9976 }
9977
9978 if (offset + length > o->onode.size) {
9979 length = o->onode.size - offset;
9980 }
9981
9982 auto start = mono_clock::now();
9983 o->extent_map.fault_range(db, offset, length);
9984 log_latency(__func__,
9985 l_bluestore_read_onode_meta_lat,
9986 mono_clock::now() - start,
9987 cct->_conf->bluestore_log_op_age);
9988 _dump_onode<30>(cct, *o);
9989
9990 // for deep-scrub, we only read dirty cache and bypass clean cache in
9991 // order to read underlying block device in case there are silent disk errors.
9992 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9993 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9994 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9995 }
9996
9997 // build a blob-wise list of the stuff to read (that isn't cached)
9998 ready_regions_t ready_regions;
9999 blobs2read_t blobs2read;
10000 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
10001
10002
10003 // read raw blob data.
10004 start = mono_clock::now(); // for simplicity we measure the whole
10005 // block below; the resulting error
10006 // is negligible
10007 vector<bufferlist> compressed_blob_bls;
10008 IOContext ioc(cct, NULL, true); // allow EIO
10009 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
10010 // we always issue aio for reading, so errors other than EIO are not allowed
10011 if (r < 0)
10012 return r;
10013
f67539c2 10014 int64_t num_ios = blobs2read.size();
9f95a23c 10015 if (ioc.has_pending_aios()) {
f67539c2 10016 num_ios = ioc.get_num_ios();
10017 bdev->aio_submit(&ioc);
10018 dout(20) << __func__ << " waiting for aio" << dendl;
10019 ioc.aio_wait();
10020 r = ioc.get_return_value();
10021 if (r < 0) {
10022 ceph_assert(r == -EIO); // no other errors allowed
10023 return -EIO;
10024 }
10025 }
10026 log_latency_fn(__func__,
10027 l_bluestore_read_wait_aio_lat,
10028 mono_clock::now() - start,
10029 cct->_conf->bluestore_log_op_age,
10030 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10031 );
10032
10033 bool csum_error = false;
10034 r = _generate_read_result_bl(o, offset, length, ready_regions,
10035 compressed_blob_bls, blobs2read,
10036 buffered, &csum_error, bl);
10037 if (csum_error) {
10038 // Handles spurious read errors caused by a kernel bug.
10039 // We sometimes get all-zero pages as a result of the read under
10040 // high memory pressure. Retrying the failing read succeeds in most
10041 // cases.
10042 // See also: http://tracker.ceph.com/issues/22464
10043 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10044 return -EIO;
10045 }
10046 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
10047 }
7c673cae 10048 r = bl.length();
10049 if (retry_count) {
10050 logger->inc(l_bluestore_reads_with_retries);
10051 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
10052 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
10053 stringstream s;
10054 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
10055 _set_spurious_read_errors_alert(s.str());
f64942e4 10056 }
10057 return r;
10058}
10059
10060int BlueStore::_verify_csum(OnodeRef& o,
10061 const bluestore_blob_t* blob, uint64_t blob_xoffset,
10062 const bufferlist& bl,
10063 uint64_t logical_offset) const
10064{
10065 int bad;
10066 uint64_t bad_csum;
11fdf7f2 10067 auto start = mono_clock::now();
7c673cae 10068 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
10069 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
10070 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
10071 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
10072 bad = blob_xoffset;
10073 r = -1;
10074 bad_csum = 0xDEADBEEF;
10075 }
10076 if (r < 0) {
10077 if (r == -1) {
10078 PExtentVector pex;
10079 blob->map(
10080 bad,
10081 blob->get_csum_chunk_size(),
10082 [&](uint64_t offset, uint64_t length) {
10083 pex.emplace_back(bluestore_pextent_t(offset, length));
10084 return 0;
10085 });
10086 derr << __func__ << " bad "
10087 << Checksummer::get_csum_type_string(blob->csum_type)
10088 << "/0x" << std::hex << blob->get_csum_chunk_size()
10089 << " checksum at blob offset 0x" << bad
10090 << ", got 0x" << bad_csum << ", expected 0x"
10091 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
10092 << ", device location " << pex
10093 << ", logical extent 0x" << std::hex
10094 << (logical_offset + bad - blob_xoffset) << "~"
10095 << blob->get_csum_chunk_size() << std::dec
10096 << ", object " << o->oid
10097 << dendl;
10098 } else {
10099 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
10100 }
10101 }
10102 log_latency(__func__,
10103 l_bluestore_csum_lat,
10104 mono_clock::now() - start,
10105 cct->_conf->bluestore_log_op_age);
10106 if (cct->_conf->bluestore_ignore_data_csum) {
10107 return 0;
10108 }
10109 return r;
10110}
10111
10112int BlueStore::_decompress(bufferlist& source, bufferlist* result)
10113{
10114 int r = 0;
10115 auto start = mono_clock::now();
10116 auto i = source.cbegin();
7c673cae 10117 bluestore_compression_header_t chdr;
11fdf7f2 10118 decode(chdr, i);
10119 int alg = int(chdr.type);
10120 CompressorRef cp = compressor;
10121 if (!cp || (int)cp->get_type() != alg) {
10122 cp = Compressor::create(cct, alg);
10123 }
10124
10125 if (!cp.get()) {
10126 // if the decompressor isn't available we cannot return
10127 // decompressed data, so fail with EIO
10128
10129 const char* alg_name = Compressor::get_comp_alg_name(alg);
10130 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10131 _set_compression_alert(false, alg_name);
10132 r = -EIO;
10133 } else {
f67539c2 10134 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
10135 if (r < 0) {
10136 derr << __func__ << " decompression failed with exit code " << r << dendl;
10137 r = -EIO;
10138 }
10139 }
10140 log_latency(__func__,
10141 l_bluestore_decompress_lat,
10142 mono_clock::now() - start,
10143 cct->_conf->bluestore_log_op_age);
10144 return r;
10145}
10146
10147 // this variant stores the fiemap result into an interval_set; the
10148 // other overloads use it internally
10149int BlueStore::_fiemap(
10150 CollectionHandle &c_,
10151 const ghobject_t& oid,
10152 uint64_t offset,
10153 size_t length,
10154 interval_set<uint64_t>& destset)
10155{
10156 Collection *c = static_cast<Collection *>(c_.get());
10157 if (!c->exists)
10158 return -ENOENT;
10159 {
9f95a23c 10160 std::shared_lock l(c->lock);
10161
10162 OnodeRef o = c->get_onode(oid, false);
10163 if (!o || !o->exists) {
10164 return -ENOENT;
10165 }
81eedcae 10166 _dump_onode<30>(cct, *o);
10167
10168 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10169 << " size 0x" << o->onode.size << std::dec << dendl;
10170
10171 boost::intrusive::set<Extent>::iterator ep, eend;
10172 if (offset >= o->onode.size)
10173 goto out;
10174
10175 if (offset + length > o->onode.size) {
10176 length = o->onode.size - offset;
10177 }
10178
10179 o->extent_map.fault_range(db, offset, length);
10180 eend = o->extent_map.extent_map.end();
10181 ep = o->extent_map.seek_lextent(offset);
10182 while (length > 0) {
10183 dout(20) << __func__ << " offset " << offset << dendl;
10184 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10185 ++ep;
10186 continue;
10187 }
10188
10189 uint64_t x_len = length;
10190 if (ep != eend && ep->logical_offset <= offset) {
10191 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10192 x_len = std::min(x_len, ep->length - x_off);
10193 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10194 << x_len << std::dec << " blob " << ep->blob << dendl;
10195 destset.insert(offset, x_len);
10196 length -= x_len;
10197 offset += x_len;
10198 if (x_off + x_len == ep->length)
10199 ++ep;
10200 continue;
10201 }
10202 if (ep != eend &&
10203 ep->logical_offset > offset &&
10204 ep->logical_offset - offset < x_len) {
10205 x_len = ep->logical_offset - offset;
10206 }
10207 offset += x_len;
10208 length -= x_len;
10209 }
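 // example: an object of size 0x4000 whose only lextent covers
 // 0x1000~0x1000 yields destset = {0x1000~0x1000}; the holes on
 // either side are simply skipped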
10210 }
10211
10212 out:
10213 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10214 << " size = 0x(" << destset << ")" << std::dec << dendl;
10215 return 0;
10216}
10217
10218int BlueStore::fiemap(
10219 CollectionHandle &c_,
10220 const ghobject_t& oid,
10221 uint64_t offset,
10222 size_t length,
10223 bufferlist& bl)
10224{
10225 interval_set<uint64_t> m;
10226 int r = _fiemap(c_, oid, offset, length, m);
10227 if (r >= 0) {
10228 encode(m, bl);
10229 }
10230 return r;
10231}
10232
10233int BlueStore::fiemap(
10234 CollectionHandle &c_,
10235 const ghobject_t& oid,
10236 uint64_t offset,
10237 size_t length,
10238 map<uint64_t, uint64_t>& destmap)
10239{
10240 interval_set<uint64_t> m;
10241 int r = _fiemap(c_, oid, offset, length, m);
10242 if (r >= 0) {
10243 destmap = std::move(m).detach();
10244 }
10245 return r;
10246}
10247
10248int BlueStore::readv(
10249 CollectionHandle &c_,
10250 const ghobject_t& oid,
10251 interval_set<uint64_t>& m,
10252 bufferlist& bl,
10253 uint32_t op_flags)
10254{
10255 auto start = mono_clock::now();
10256 Collection *c = static_cast<Collection *>(c_.get());
10257 const coll_t &cid = c->get_cid();
10258 dout(15) << __func__ << " " << cid << " " << oid
10259 << " fiemap " << m
10260 << dendl;
10261 if (!c->exists)
10262 return -ENOENT;
10263
10264 bl.clear();
10265 int r;
10266 {
10267 std::shared_lock l(c->lock);
10268 auto start1 = mono_clock::now();
10269 OnodeRef o = c->get_onode(oid, false);
10270 log_latency("get_onode@read",
10271 l_bluestore_read_onode_meta_lat,
10272 mono_clock::now() - start1,
10273 cct->_conf->bluestore_log_op_age);
10274 if (!o || !o->exists) {
10275 r = -ENOENT;
10276 goto out;
10277 }
10278
10279 if (m.empty()) {
10280 r = 0;
10281 goto out;
10282 }
10283
10284 r = _do_readv(c, o, m, bl, op_flags);
10285 if (r == -EIO) {
10286 logger->inc(l_bluestore_read_eio);
10287 }
10288 }
10289
10290 out:
10291 if (r >= 0 && _debug_data_eio(oid)) {
10292 r = -EIO;
10293 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10294 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10295 cct->_conf->bluestore_debug_random_read_err &&
10296 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10297 100.0)) == 0) {
10298 dout(0) << __func__ << ": inject random EIO" << dendl;
10299 r = -EIO;
10300 }
10301 dout(10) << __func__ << " " << cid << " " << oid
10302 << " fiemap " << m << std::dec
10303 << " = " << r << dendl;
10304 log_latency(__func__,
10305 l_bluestore_read_lat,
10306 mono_clock::now() - start,
10307 cct->_conf->bluestore_log_op_age);
10308 return r;
10309}
10310
10311int BlueStore::_do_readv(
10312 Collection *c,
10313 OnodeRef o,
10314 const interval_set<uint64_t>& m,
10315 bufferlist& bl,
10316 uint32_t op_flags,
10317 uint64_t retry_count)
10318{
10319 FUNCTRACE(cct);
10320 int r = 0;
10321 int read_cache_policy = 0; // do not bypass clean or dirty cache
10322
10323 dout(20) << __func__ << " fiemap " << m << std::hex
10324 << " size 0x" << o->onode.size << " (" << std::dec
10325 << o->onode.size << ")" << dendl;
10326
10327 // generally, don't buffer anything, unless the client explicitly requests
10328 // it.
10329 bool buffered = false;
10330 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10331 dout(20) << __func__ << " will do buffered read" << dendl;
10332 buffered = true;
10333 } else if (cct->_conf->bluestore_default_buffered_read &&
10334 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10335 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10336 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10337 buffered = true;
10338 }
10339 // this method must be idempotent since we may call it several times
10340 // before we finally read the expected result.
10341 bl.clear();
10342
10343 // call fiemap first!
10344 ceph_assert(m.range_start() <= o->onode.size);
10345 ceph_assert(m.range_end() <= o->onode.size);
10346 auto start = mono_clock::now();
10347 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10348 log_latency(__func__,
10349 l_bluestore_read_onode_meta_lat,
10350 mono_clock::now() - start,
10351 cct->_conf->bluestore_log_op_age);
10352 _dump_onode<30>(cct, *o);
10353
10354 IOContext ioc(cct, NULL, true); // allow EIO
10355 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10356 raw_results.reserve(m.num_intervals());
10357 int i = 0;
10358 for (auto p = m.begin(); p != m.end(); p++, i++) {
10359 raw_results.push_back({});
10360 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10361 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10362 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10363 // we always issue aio for reading, so errors other than EIO are not allowed
10364 if (r < 0)
10365 return r;
10366 }
10367
10368 auto num_ios = m.size();
10369 if (ioc.has_pending_aios()) {
10370 num_ios = ioc.get_num_ios();
10371 bdev->aio_submit(&ioc);
10372 dout(20) << __func__ << " waiting for aio" << dendl;
10373 ioc.aio_wait();
10374 r = ioc.get_return_value();
10375 if (r < 0) {
10376 ceph_assert(r == -EIO); // no other errors allowed
10377 return -EIO;
10378 }
10379 }
10380 log_latency_fn(__func__,
10381 l_bluestore_read_wait_aio_lat,
10382 mono_clock::now() - start,
10383 cct->_conf->bluestore_log_op_age,
10384 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10385 );
10386
10387 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10388 i = 0;
10389 for (auto p = m.begin(); p != m.end(); p++, i++) {
10390 bool csum_error = false;
10391 bufferlist t;
10392 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10393 std::get<0>(raw_results[i]),
10394 std::get<1>(raw_results[i]),
10395 std::get<2>(raw_results[i]),
10396 buffered, &csum_error, t);
10397 if (csum_error) {
10398 // Handles spurious read errors caused by a kernel bug.
10399 // We sometimes get all-zero pages as a result of the read under
10400 // high memory pressure. Retrying the failing read succeeds in most
10401 // cases.
10402 // See also: http://tracker.ceph.com/issues/22464
10403 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10404 return -EIO;
10405 }
10406 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10407 }
10408 bl.claim_append(t);
10409 }
10410 if (retry_count) {
10411 logger->inc(l_bluestore_reads_with_retries);
10412 dout(5) << __func__ << " read fiemap " << m
10413 << " failed " << retry_count << " times before succeeding"
10414 << dendl;
10415 }
10416 return bl.length();
10417}
10418
9f95a23c 10419int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10420 const ghobject_t& oid,
10421 const string& section_name,
10422 Formatter *f)
7c673cae 10423{
10424 Collection *c = static_cast<Collection *>(c_.get());
10425 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10426 if (!c->exists)
10427 return -ENOENT;
7c673cae 10428
10429 int r;
10430 {
10431 std::shared_lock l(c->lock);
10432
10433 OnodeRef o = c->get_onode(oid, false);
10434 if (!o || !o->exists) {
10435 r = -ENOENT;
10436 goto out;
10437 }
10438 // FIXME minor: actually the next line isn't enough to
10439 // load shared blobs. Leaving as-is for now.
10440 //
10441 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10442
10443 _dump_onode<0>(cct, *o);
10444 f->open_object_section(section_name.c_str());
10445 o->dump(f);
10446 f->close_section();
10447 r = 0;
7c673cae 10448 }
10449 out:
10450 dout(10) << __func__ << " " << c->cid << " " << oid
10451 << " = " << r << dendl;
10452 return r;
10453}
10454
10455int BlueStore::getattr(
10456 CollectionHandle &c_,
10457 const ghobject_t& oid,
10458 const char *name,
10459 bufferptr& value)
10460{
10461 Collection *c = static_cast<Collection *>(c_.get());
10462 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10463 if (!c->exists)
10464 return -ENOENT;
10465
10466 int r;
10467 {
9f95a23c 10468 std::shared_lock l(c->lock);
f91f0fd5 10469 mempool::bluestore_cache_meta::string k(name);
10470
10471 OnodeRef o = c->get_onode(oid, false);
10472 if (!o || !o->exists) {
10473 r = -ENOENT;
10474 goto out;
10475 }
10476
10477 if (!o->onode.attrs.count(k)) {
10478 r = -ENODATA;
10479 goto out;
10480 }
10481 value = o->onode.attrs[k];
10482 r = 0;
10483 }
10484 out:
10485 if (r == 0 && _debug_mdata_eio(oid)) {
10486 r = -EIO;
10487 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10488 }
10489 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10490 << " = " << r << dendl;
10491 return r;
10492}
10493
10494int BlueStore::getattrs(
10495 CollectionHandle &c_,
10496 const ghobject_t& oid,
10497 map<string,bufferptr>& aset)
10498{
10499 Collection *c = static_cast<Collection *>(c_.get());
10500 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10501 if (!c->exists)
10502 return -ENOENT;
10503
10504 int r;
10505 {
9f95a23c 10506 std::shared_lock l(c->lock);
7c673cae
FG
10507
10508 OnodeRef o = c->get_onode(oid, false);
10509 if (!o || !o->exists) {
10510 r = -ENOENT;
10511 goto out;
10512 }
10513 for (auto& i : o->onode.attrs) {
10514 aset.emplace(i.first.c_str(), i.second);
10515 }
10516 r = 0;
10517 }
10518
10519 out:
7c673cae
FG
10520 if (r == 0 && _debug_mdata_eio(oid)) {
10521 r = -EIO;
10522 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10523 }
10524 dout(10) << __func__ << " " << c->cid << " " << oid
10525 << " = " << r << dendl;
10526 return r;
10527}
10528
10529int BlueStore::list_collections(vector<coll_t>& ls)
10530{
9f95a23c 10531 std::shared_lock l(coll_lock);
11fdf7f2 10532 ls.reserve(coll_map.size());
7c673cae
FG
10533 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10534 p != coll_map.end();
10535 ++p)
10536 ls.push_back(p->first);
10537 return 0;
10538}
10539
10540bool BlueStore::collection_exists(const coll_t& c)
10541{
9f95a23c 10542 std::shared_lock l(coll_lock);
7c673cae
FG
10543 return coll_map.count(c);
10544}
10545
11fdf7f2 10546int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 10547{
11fdf7f2 10548 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
10549 vector<ghobject_t> ls;
10550 ghobject_t next;
11fdf7f2 10551 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
10552 &ls, &next);
10553 if (r < 0) {
10554 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10555 << dendl;
10556 return r;
10557 }
10558 *empty = ls.empty();
11fdf7f2 10559 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
10560 return 0;
10561}
10562
11fdf7f2 10563int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 10564{
11fdf7f2
TL
10565 dout(15) << __func__ << " " << ch->cid << dendl;
10566 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 10567 std::shared_lock l(c->lock);
11fdf7f2 10568 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
10569 return c->cnode.bits;
10570}
10571
7c673cae
FG
10572int BlueStore::collection_list(
10573 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10574 vector<ghobject_t> *ls, ghobject_t *pnext)
10575{
10576 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 10577 c->flush();
7c673cae
FG
10578 dout(15) << __func__ << " " << c->cid
10579 << " start " << start << " end " << end << " max " << max << dendl;
10580 int r;
10581 {
9f95a23c 10582 std::shared_lock l(c->lock);
f91f0fd5
TL
10583 r = _collection_list(c, start, end, max, false, ls, pnext);
10584 }
10585
10586 dout(10) << __func__ << " " << c->cid
10587 << " start " << start << " end " << end << " max " << max
10588 << " = " << r << ", ls.size() = " << ls->size()
10589 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10590 return r;
10591}
10592
10593int BlueStore::collection_list_legacy(
10594 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10595 vector<ghobject_t> *ls, ghobject_t *pnext)
10596{
10597 Collection *c = static_cast<Collection *>(c_.get());
10598 c->flush();
10599 dout(15) << __func__ << " " << c->cid
10600 << " start " << start << " end " << end << " max " << max << dendl;
10601 int r;
10602 {
10603 std::shared_lock l(c->lock);
10604 r = _collection_list(c, start, end, max, true, ls, pnext);
7c673cae
FG
10605 }
10606
7c673cae
FG
10607 dout(10) << __func__ << " " << c->cid
10608 << " start " << start << " end " << end << " max " << max
10609 << " = " << r << ", ls.size() = " << ls->size()
10610 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10611 return r;
10612}
10613
10614int BlueStore::_collection_list(
10615 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
f91f0fd5 10616 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
7c673cae
FG
10617{
10618
10619 if (!c->exists)
10620 return -ENOENT;
10621
7c673cae 10622 ghobject_t static_next;
f91f0fd5
TL
10623 std::unique_ptr<CollectionListIterator> it;
10624 ghobject_t coll_range_temp_start, coll_range_temp_end;
10625 ghobject_t coll_range_start, coll_range_end;
f91f0fd5 10626 ghobject_t pend;
7c673cae
FG
10627 bool temp;
10628
10629 if (!pnext)
10630 pnext = &static_next;
10631
a4b75251
TL
10632 auto log_latency = make_scope_guard(
10633 [&, start_time = mono_clock::now(), func_name = __func__] {
10634 log_latency_fn(
10635 func_name,
10636 l_bluestore_remove_lat,
10637 mono_clock::now() - start_time,
10638 cct->_conf->bluestore_log_collection_list_age,
10639 [&](const ceph::timespan& lat) {
10640 ostringstream ostr;
10641 ostr << ", lat = " << timespan_str(lat)
10642 << " cid =" << c->cid
10643 << " start " << start << " end " << end
10644 << " max " << max;
10645 return ostr.str();
10646 });
10647 });
10648
11fdf7f2 10649 if (start.is_max() || start.hobj.is_max()) {
a4b75251
TL
10650 *pnext = ghobject_t::get_max();
10651 return 0;
7c673cae 10652 }
f91f0fd5 10653 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
a4b75251 10654 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
7c673cae 10655 dout(20) << __func__
f91f0fd5
TL
10656 << " range " << coll_range_temp_start
10657 << " to " << coll_range_temp_end
10658 << " and " << coll_range_start
10659 << " to " << coll_range_end
7c673cae 10660 << " start " << start << dendl;
f91f0fd5
TL
10661 if (legacy) {
10662 it = std::make_unique<SimpleCollectionListIterator>(
10663 cct, db->get_iterator(PREFIX_OBJ));
10664 } else {
10665 it = std::make_unique<SortedCollectionListIterator>(
10666 db->get_iterator(PREFIX_OBJ));
10667 }
7c673cae
FG
10668 if (start == ghobject_t() ||
10669 start.hobj == hobject_t() ||
10670 start == c->cid.get_min_hobj()) {
f91f0fd5 10671 it->upper_bound(coll_range_temp_start);
7c673cae
FG
10672 temp = true;
10673 } else {
7c673cae
FG
10674 if (start.hobj.is_temp()) {
10675 temp = true;
f91f0fd5 10676 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
7c673cae
FG
10677 } else {
10678 temp = false;
f91f0fd5 10679 ceph_assert(start >= coll_range_start && start < coll_range_end);
7c673cae 10680 }
f91f0fd5
TL
10681 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10682 it->lower_bound(start);
7c673cae
FG
10683 }
10684 if (end.hobj.is_max()) {
f91f0fd5 10685 pend = temp ? coll_range_temp_end : coll_range_end;
7c673cae 10686 } else {
7c673cae 10687 if (end.hobj.is_temp()) {
a4b75251 10688 if (temp) {
f91f0fd5 10689 pend = end;
a4b75251
TL
10690 } else {
10691 *pnext = ghobject_t::get_max();
10692 return 0;
10693 }
7c673cae 10694 } else {
f91f0fd5 10695 pend = temp ? coll_range_temp_end : end;
7c673cae
FG
10696 }
10697 }
f91f0fd5 10698 dout(20) << __func__ << " pend " << pend << dendl;
7c673cae 10699 while (true) {
adb31ebb 10700 if (!it->valid() || it->is_ge(pend)) {
7c673cae
FG
10701 if (!it->valid())
10702 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10703 else
f91f0fd5 10704 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
7c673cae
FG
10705 if (temp) {
10706 if (end.hobj.is_temp()) {
adb31ebb 10707 if (it->valid() && it->is_lt(coll_range_temp_end)) {
f91f0fd5 10708 *pnext = it->oid();
a4b75251 10709 return 0;
f91f0fd5 10710 }
7c673cae
FG
10711 break;
10712 }
10713 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10714 temp = false;
f91f0fd5
TL
10715 it->upper_bound(coll_range_start);
10716 if (end.hobj.is_max())
10717 pend = coll_range_end;
10718 else
10719 pend = end;
10720 dout(30) << __func__ << " pend " << pend << dendl;
7c673cae
FG
10721 continue;
10722 }
adb31ebb 10723 if (it->valid() && it->is_lt(coll_range_end)) {
f91f0fd5 10724 *pnext = it->oid();
a4b75251 10725 return 0;
f91f0fd5 10726 }
7c673cae
FG
10727 break;
10728 }
f91f0fd5 10729 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
7c673cae
FG
10730 if (ls->size() >= (unsigned)max) {
10731 dout(20) << __func__ << " reached max " << max << dendl;
f91f0fd5 10732 *pnext = it->oid();
a4b75251 10733 return 0;
7c673cae 10734 }
f91f0fd5 10735 ls->push_back(it->oid());
7c673cae
FG
10736 it->next();
10737 }
a4b75251
TL
10738 *pnext = ghobject_t::get_max();
10739 return 0;
7c673cae
FG
10740}
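// Scan sketch (reader's summary of the loop above): object keys live in two
// disjoint ranges, temp and non-temp, so a full listing is at most two seeks:
//
//   1. start in the temp range (or wherever 'start' lands) and emit oids
//      until 'pend' is reached;
//   2. if the end bound is not in the temp namespace, reseek to
//      coll_range_start and continue until coll_range_end or 'end';
//   3. once 'max' entries are collected, *pnext is set to the next oid so a
//      later call can resume; otherwise *pnext ends as ghobject_t::get_max().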
10741
7c673cae
FG
10742int BlueStore::omap_get(
10743 CollectionHandle &c_, ///< [in] Collection containing oid
10744 const ghobject_t &oid, ///< [in] Object containing omap
10745 bufferlist *header, ///< [out] omap header
10746 map<string, bufferlist> *out ///< [out] Key to value map
10747 )
10748{
10749 Collection *c = static_cast<Collection *>(c_.get());
9f95a23c
TL
10750 return _omap_get(c, oid, header, out);
10751}
10752
10753int BlueStore::_omap_get(
10754 Collection *c, ///< [in] Collection containing oid
10755 const ghobject_t &oid, ///< [in] Object containing omap
10756 bufferlist *header, ///< [out] omap header
10757 map<string, bufferlist> *out ///< [out] Key to value map
10758 )
10759{
7c673cae
FG
10760 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10761 if (!c->exists)
10762 return -ENOENT;
9f95a23c 10763 std::shared_lock l(c->lock);
7c673cae
FG
10764 int r = 0;
10765 OnodeRef o = c->get_onode(oid, false);
10766 if (!o || !o->exists) {
10767 r = -ENOENT;
10768 goto out;
10769 }
9f95a23c
TL
10770 r = _onode_omap_get(o, header, out);
10771 out:
10772 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10773 << dendl;
10774 return r;
10775}
10776
10777int BlueStore::_onode_omap_get(
10778 const OnodeRef &o, ///< [in] Object containing omap
10779 bufferlist *header, ///< [out] omap header
10780 map<string, bufferlist> *out ///< [out] Key to value map
10781)
10782{
10783 int r = 0;
10784 if (!o || !o->exists) {
10785 r = -ENOENT;
10786 goto out;
10787 }
7c673cae
FG
10788 if (!o->onode.has_omap())
10789 goto out;
10790 o->flush();
10791 {
9f95a23c 10792 const string& prefix = o->get_omap_prefix();
11fdf7f2 10793 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10794 string head, tail;
9f95a23c
TL
10795 o->get_omap_header(&head);
10796 o->get_omap_tail(&tail);
7c673cae
FG
10797 it->lower_bound(head);
10798 while (it->valid()) {
10799 if (it->key() == head) {
9f95a23c
TL
10800 dout(30) << __func__ << " got header" << dendl;
10801 *header = it->value();
7c673cae 10802 } else if (it->key() >= tail) {
9f95a23c
TL
10803 dout(30) << __func__ << " reached tail" << dendl;
10804 break;
7c673cae 10805 } else {
9f95a23c
TL
10806 string user_key;
10807 o->decode_omap_key(it->key(), &user_key);
10808 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10809 << " -> " << user_key << dendl;
10810 (*out)[user_key] = it->value();
7c673cae
FG
10811 }
10812 it->next();
10813 }
10814 }
9f95a23c 10815out:
7c673cae
FG
10816 return r;
10817}
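// Key-layout sketch (illustrative; the exact byte encoding is delegated to
// get_omap_header/get_omap_key/get_omap_tail): one object's omap occupies a
// single contiguous key range inside its omap prefix, which is why a single
// lower_bound(head) plus a linear scan terminated by 'tail' sees everything:
//
//   <head>              omap header, if any
//   <head><user key>    one db entry per user key, in sorted order
//   <tail>              upper bound delimiting this object's range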
10818
7c673cae
FG
10819int BlueStore::omap_get_header(
10820 CollectionHandle &c_, ///< [in] Collection containing oid
10821 const ghobject_t &oid, ///< [in] Object containing omap
10822 bufferlist *header, ///< [out] omap header
10823 bool allow_eio ///< [in] don't assert on eio
10824 )
10825{
10826 Collection *c = static_cast<Collection *>(c_.get());
10827 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10828 if (!c->exists)
10829 return -ENOENT;
9f95a23c 10830 std::shared_lock l(c->lock);
7c673cae
FG
10831 int r = 0;
10832 OnodeRef o = c->get_onode(oid, false);
10833 if (!o || !o->exists) {
10834 r = -ENOENT;
10835 goto out;
10836 }
10837 if (!o->onode.has_omap())
10838 goto out;
10839 o->flush();
10840 {
10841 string head;
9f95a23c
TL
10842 o->get_omap_header(&head);
10843 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
7c673cae
FG
10844 dout(30) << __func__ << " got header" << dendl;
10845 } else {
10846 dout(30) << __func__ << " no header" << dendl;
10847 }
10848 }
10849 out:
10850 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10851 << dendl;
10852 return r;
10853}
10854
7c673cae
FG
10855int BlueStore::omap_get_keys(
10856 CollectionHandle &c_, ///< [in] Collection containing oid
10857 const ghobject_t &oid, ///< [in] Object containing omap
10858 set<string> *keys ///< [out] Keys defined on oid
10859 )
10860{
10861 Collection *c = static_cast<Collection *>(c_.get());
10862 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10863 if (!c->exists)
10864 return -ENOENT;
adb31ebb 10865 auto start1 = mono_clock::now();
9f95a23c 10866 std::shared_lock l(c->lock);
7c673cae
FG
10867 int r = 0;
10868 OnodeRef o = c->get_onode(oid, false);
10869 if (!o || !o->exists) {
10870 r = -ENOENT;
10871 goto out;
10872 }
10873 if (!o->onode.has_omap())
10874 goto out;
10875 o->flush();
10876 {
9f95a23c 10877 const string& prefix = o->get_omap_prefix();
11fdf7f2 10878 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10879 string head, tail;
9f95a23c
TL
10880 o->get_omap_key(string(), &head);
10881 o->get_omap_tail(&tail);
7c673cae
FG
10882 it->lower_bound(head);
10883 while (it->valid()) {
10884 if (it->key() >= tail) {
10885 dout(30) << __func__ << " reached tail" << dendl;
10886 break;
10887 }
10888 string user_key;
9f95a23c 10889 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 10890 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
10891 << " -> " << user_key << dendl;
10892 keys->insert(user_key);
10893 it->next();
11fdf7f2
TL
10894 }
10895 }
10896 out:
adb31ebb
TL
10897 c->store->log_latency(
10898 __func__,
10899 l_bluestore_omap_get_keys_lat,
10900 mono_clock::now() - start1,
10901 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10902
11fdf7f2
TL
10903 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10904 << dendl;
10905 return r;
7c673cae
FG
10906}
10907
10908int BlueStore::omap_get_values(
10909 CollectionHandle &c_, ///< [in] Collection containing oid
10910 const ghobject_t &oid, ///< [in] Object containing omap
10911 const set<string> &keys, ///< [in] Keys to get
10912 map<string, bufferlist> *out ///< [out] Returned keys and values
10913 )
10914{
10915 Collection *c = static_cast<Collection *>(c_.get());
10916 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10917 if (!c->exists)
10918 return -ENOENT;
9f95a23c 10919 std::shared_lock l(c->lock);
adb31ebb 10920 auto start1 = mono_clock::now();
7c673cae
FG
10921 int r = 0;
10922 string final_key;
10923 OnodeRef o = c->get_onode(oid, false);
10924 if (!o || !o->exists) {
10925 r = -ENOENT;
10926 goto out;
10927 }
9f95a23c 10928 if (!o->onode.has_omap()) {
7c673cae 10929 goto out;
9f95a23c
TL
10930 }
10931 o->flush();
11fdf7f2 10932 {
9f95a23c
TL
10933 const string& prefix = o->get_omap_prefix();
10934 o->get_omap_key(string(), &final_key);
10935 size_t base_key_len = final_key.size();
11fdf7f2 10936 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10937 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
10938 final_key += *p;
10939 bufferlist val;
10940 if (db->get(prefix, final_key, &val) >= 0) {
10941 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10942 << " -> " << *p << dendl;
10943 out->insert(make_pair(*p, val));
10944 }
7c673cae
FG
10945 }
10946 }
10947 out:
adb31ebb
TL
10948 c->store->log_latency(
10949 __func__,
10950 l_bluestore_omap_get_values_lat,
10951 mono_clock::now() - start1,
10952 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10953
7c673cae
FG
10954 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10955 << dendl;
10956 return r;
10957}
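// The resize-and-append idiom above builds each db key without re-deriving the
// shared object prefix. A minimal standalone sketch (hypothetical names):
//
//   std::string final_key = omap_prefix_for(onode); // e.g. get_omap_key("")
//   const size_t base = final_key.size();
//   for (const auto& k : keys) {
//     final_key.resize(base); // keep the prefix, drop the previous user key
//     final_key += k;         // append the next user key
//     lookup(final_key);      // point query against the kv store
//   }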
10958
9f95a23c
TL
10959#ifdef WITH_SEASTAR
10960int BlueStore::omap_get_values(
10961 CollectionHandle &c_, ///< [in] Collection containing oid
10962 const ghobject_t &oid, ///< [in] Object containing omap
10963 const std::optional<string> &start_after, ///< [in] Return only keys after this one
10964 map<string, bufferlist> *output ///< [out] Returned keys and values
10965 )
10966{
10967 Collection *c = static_cast<Collection *>(c_.get());
10968 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10969 if (!c->exists)
10970 return -ENOENT;
10971 std::shared_lock l(c->lock);
10972 int r = 0;
10973 OnodeRef o = c->get_onode(oid, false);
10974 if (!o || !o->exists) {
10975 r = -ENOENT;
10976 goto out;
10977 }
10978 if (!o->onode.has_omap()) {
10979 goto out;
10980 }
10981 o->flush();
10982 {
10983 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10984 if (!iter) {
10985 r = -ENOENT;
10986 goto out;
10987 }
10988 iter->upper_bound(*start_after);
10989 for (; iter->valid(); iter->next()) {
10990 output->insert(make_pair(iter->key(), iter->value()));
10991 }
10992 }
10993
10994out:
10995 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10996 << dendl;
10997 return r;
10998}
10999#endif
11000
7c673cae
FG
11001int BlueStore::omap_check_keys(
11002 CollectionHandle &c_, ///< [in] Collection containing oid
11003 const ghobject_t &oid, ///< [in] Object containing omap
11004 const set<string> &keys, ///< [in] Keys to check
11005 set<string> *out ///< [out] Subset of keys defined on oid
11006 )
11007{
11008 Collection *c = static_cast<Collection *>(c_.get());
11009 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
11010 if (!c->exists)
11011 return -ENOENT;
9f95a23c 11012 std::shared_lock l(c->lock);
7c673cae
FG
11013 int r = 0;
11014 string final_key;
11015 OnodeRef o = c->get_onode(oid, false);
11016 if (!o || !o->exists) {
11017 r = -ENOENT;
11018 goto out;
11019 }
9f95a23c 11020 if (!o->onode.has_omap()) {
7c673cae 11021 goto out;
9f95a23c
TL
11022 }
11023 o->flush();
11fdf7f2 11024 {
9f95a23c
TL
11025 const string& prefix = o->get_omap_prefix();
11026 o->get_omap_key(string(), &final_key);
11027 size_t base_key_len = final_key.size();
11fdf7f2 11028 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 11029 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
11030 final_key += *p;
11031 bufferlist val;
11032 if (db->get(prefix, final_key, &val) >= 0) {
11033 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
11034 << " -> " << *p << dendl;
11035 out->insert(*p);
11036 } else {
11037 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
11038 << " -> " << *p << dendl;
11039 }
7c673cae
FG
11040 }
11041 }
11042 out:
11043 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
11044 << dendl;
11045 return r;
11046}
11047
7c673cae
FG
11048ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
11049 CollectionHandle &c_, ///< [in] collection
11050 const ghobject_t &oid ///< [in] object
11051 )
11052{
11053 Collection *c = static_cast<Collection *>(c_.get());
11054 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
11055 if (!c->exists) {
11056 return ObjectMap::ObjectMapIterator();
11057 }
9f95a23c 11058 std::shared_lock l(c->lock);
7c673cae
FG
11059 OnodeRef o = c->get_onode(oid, false);
11060 if (!o || !o->exists) {
11061 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
11062 return ObjectMap::ObjectMapIterator();
11063 }
11064 o->flush();
11065 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
9f95a23c 11066 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
7c673cae
FG
11067 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
11068}
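// Usage sketch (illustrative, not from this file; assumes the standard
// ObjectMap iterator interface of seek_to_first/valid/next/key/value):
//
//   ObjectMap::ObjectMapIterator it = store->get_omap_iterator(ch, oid);
//   for (it->seek_to_first(); it->valid(); it->next()) {
//     std::cout << it->key() << " -> " << it->value().length() << " bytes\n";
//   }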
11069
11070// -----------------
11071// write helpers
11072
11fdf7f2 11073uint64_t BlueStore::_get_ondisk_reserved() const {
f67539c2 11074 ceph_assert(min_alloc_size);
11fdf7f2
TL
11075 return round_up_to(
11076 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
11077}
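// Worked example (values assumed for illustration): if SUPER_RESERVED were
// 8192, a 4 KiB min_alloc_size reserves round_up_to(max(8192, 4096), 4096)
// = 8192 bytes, while a 64 KiB min_alloc_size reserves
// round_up_to(max(8192, 65536), 65536) = 65536 bytes.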
11078
7c673cae
FG
11079void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
11080{
11081 dout(10) << __func__ << " ondisk_format " << ondisk_format
11082 << " min_compat_ondisk_format " << min_compat_ondisk_format
11083 << dendl;
11fdf7f2 11084 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
11085 {
11086 bufferlist bl;
11fdf7f2 11087 encode(ondisk_format, bl);
7c673cae
FG
11088 t->set(PREFIX_SUPER, "ondisk_format", bl);
11089 }
11090 {
11091 bufferlist bl;
11fdf7f2 11092 encode(min_compat_ondisk_format, bl);
7c673cae
FG
11093 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
11094 }
11095}
11096
11097int BlueStore::_open_super_meta()
11098{
11099 // nid
11100 {
11101 nid_max = 0;
11102 bufferlist bl;
11103 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 11104 auto p = bl.cbegin();
7c673cae
FG
11105 try {
11106 uint64_t v;
11fdf7f2 11107 decode(v, p);
7c673cae 11108 nid_max = v;
f67539c2 11109 } catch (ceph::buffer::error& e) {
7c673cae
FG
11110 derr << __func__ << " unable to read nid_max" << dendl;
11111 return -EIO;
11112 }
f67539c2 11113 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
7c673cae
FG
11114 nid_last = nid_max.load();
11115 }
11116
11117 // blobid
11118 {
11119 blobid_max = 0;
11120 bufferlist bl;
11121 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 11122 auto p = bl.cbegin();
7c673cae
FG
11123 try {
11124 uint64_t v;
11fdf7f2 11125 decode(v, p);
7c673cae 11126 blobid_max = v;
f67539c2 11127 } catch (ceph::buffer::error& e) {
7c673cae
FG
11128 derr << __func__ << " unable to read blobid_max" << dendl;
11129 return -EIO;
11130 }
f67539c2 11131 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
7c673cae
FG
11132 blobid_last = blobid_max.load();
11133 }
11134
11135 // freelist
11136 {
11137 bufferlist bl;
11138 db->get(PREFIX_SUPER, "freelist_type", &bl);
11139 if (bl.length()) {
11140 freelist_type = std::string(bl.c_str(), bl.length());
f67539c2 11141 dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
7c673cae 11142 } else {
11fdf7f2 11143 ceph_abort_msg("unsupported extent freelist manager");
7c673cae 11144 }
7c673cae
FG
11145 }
11146
11147 // ondisk format
11148 int32_t compat_ondisk_format = 0;
11149 {
11150 bufferlist bl;
11151 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11152 if (r < 0) {
11153 // base case: kraken bluestore is v1 and readable by v1
11154 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11155 << dendl;
11156 ondisk_format = 1;
11157 compat_ondisk_format = 1;
11158 } else {
11fdf7f2 11159 auto p = bl.cbegin();
7c673cae 11160 try {
11fdf7f2 11161 decode(ondisk_format, p);
f67539c2 11162 } catch (ceph::buffer::error& e) {
7c673cae
FG
11163 derr << __func__ << " unable to read ondisk_format" << dendl;
11164 return -EIO;
11165 }
11166 bl.clear();
11167 {
11168 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
11169 ceph_assert(!r);
11170 auto p = bl.cbegin();
7c673cae 11171 try {
11fdf7f2 11172 decode(compat_ondisk_format, p);
f67539c2 11173 } catch (ceph::buffer::error& e) {
7c673cae
FG
11174 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11175 return -EIO;
11176 }
11177 }
11178 }
f67539c2 11179 dout(1) << __func__ << " ondisk_format " << ondisk_format
7c673cae
FG
11180 << " compat_ondisk_format " << compat_ondisk_format
11181 << dendl;
11182 }
11183
11184 if (latest_ondisk_format < compat_ondisk_format) {
11185 derr << __func__ << " compat_ondisk_format is "
11186 << compat_ondisk_format << " but we only understand version "
11187 << latest_ondisk_format << dendl;
11188 return -EPERM;
11189 }
7c673cae
FG
11190
11191 {
11192 bufferlist bl;
11193 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 11194 auto p = bl.cbegin();
7c673cae
FG
11195 try {
11196 uint64_t val;
11fdf7f2 11197 decode(val, p);
7c673cae 11198 min_alloc_size = val;
224ce89b 11199 min_alloc_size_order = ctz(val);
11fdf7f2 11200 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
f67539c2 11201 } catch (ceph::buffer::error& e) {
7c673cae
FG
11202 derr << __func__ << " unable to read min_alloc_size" << dendl;
11203 return -EIO;
11204 }
f67539c2 11205 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7c673cae
FG
11206 << std::dec << dendl;
11207 }
9f95a23c
TL
11208
11209 _set_per_pool_omap();
11210
224ce89b 11211 _open_statfs();
7c673cae
FG
11212 _set_alloc_sizes();
11213 _set_throttle_params();
11214
11215 _set_csum();
11216 _set_compression();
11217 _set_blob_size();
11218
11fdf7f2 11219 _validate_bdev();
7c673cae
FG
11220 return 0;
11221}
11222
11223int BlueStore::_upgrade_super()
11224{
11225 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11226 << latest_ondisk_format << dendl;
11fdf7f2
TL
11227 if (ondisk_format < latest_ondisk_format) {
11228 ceph_assert(ondisk_format > 0);
11229 ceph_assert(ondisk_format < latest_ondisk_format);
11230
1911f103 11231 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
11232 if (ondisk_format == 1) {
11233 // changes:
11234 // - super: added ondisk_format
11235 // - super: added min_readable_ondisk_format
11236 // - super: added min_compat_ondisk_format
11237 // - super: added min_alloc_size
11238 // - super: removed min_min_alloc_size
11fdf7f2
TL
11239 {
11240 bufferlist bl;
11241 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11242 auto p = bl.cbegin();
11243 try {
11244 uint64_t val;
11245 decode(val, p);
11246 min_alloc_size = val;
f67539c2 11247 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
11248 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11249 return -EIO;
11250 }
11251 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11252 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 11253 }
11fdf7f2 11254 ondisk_format = 2;
7c673cae 11255 }
9f95a23c
TL
11256 if (ondisk_format == 2) {
11257 // changes:
f67539c2
TL
11258 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
11259 // onodes are using the per-pool prefix until a repair is run; at that
9f95a23c
TL
11260 // point the per_pool_omap=1 key will be set.
11261 // - super: added per_pool_omap key, which indicates that *all* objects
11262 // are using the new prefix and key format
11263 ondisk_format = 3;
1911f103
TL
11264 }
11265 if (ondisk_format == 3) {
11266 // changes:
11267 // - FreelistManager keeps meta within bdev label
11268 int r = _write_out_fm_meta(0);
9f95a23c 11269 ceph_assert(r == 0);
1911f103 11270 ondisk_format = 4;
9f95a23c 11271 }
1911f103
TL
11272 // This must be the last operation
11273 _prepare_ondisk_format_super(t);
11274 int r = db->submit_transaction_sync(t);
11275 ceph_assert(r == 0);
7c673cae 11276 }
7c673cae
FG
11277 // done
11278 dout(1) << __func__ << " done" << dendl;
11279 return 0;
11280}
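// Upgrade-ladder note (summary of the flow above): each "if (ondisk_format ==
// N)" block performs exactly one version step and then sets ondisk_format to
// N+1, so a v1 store walks v1->v2->v3->v4 in a single call; only the final
// state is persisted, by _prepare_ondisk_format_super() in one synchronous
// transaction.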
11281
11282void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11283{
224ce89b 11284 if (o->onode.nid) {
11fdf7f2 11285 ceph_assert(o->exists);
7c673cae 11286 return;
224ce89b 11287 }
7c673cae
FG
11288 uint64_t nid = ++nid_last;
11289 dout(20) << __func__ << " " << nid << dendl;
11290 o->onode.nid = nid;
11291 txc->last_nid = nid;
224ce89b 11292 o->exists = true;
7c673cae
FG
11293}
11294
11295uint64_t BlueStore::_assign_blobid(TransContext *txc)
11296{
11297 uint64_t bid = ++blobid_last;
11298 dout(20) << __func__ << " " << bid << dendl;
11299 txc->last_blobid = bid;
11300 return bid;
11301}
11302
11303void BlueStore::get_db_statistics(Formatter *f)
11304{
11305 db->get_statistics(f);
11306}
11307
11fdf7f2
TL
11308BlueStore::TransContext *BlueStore::_txc_create(
11309 Collection *c, OpSequencer *osr,
f67539c2
TL
11310 list<Context*> *on_commits,
11311 TrackedOpRef osd_op)
7c673cae 11312{
11fdf7f2 11313 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae 11314 txc->t = db->get_transaction();
f67539c2
TL
11315
11316#ifdef WITH_BLKIN
11317 if (osd_op && osd_op->pg_trace) {
11318 txc->trace.init("TransContext", &trace_endpoint,
11319 &osd_op->pg_trace);
11320 txc->trace.event("txc create");
11321 txc->trace.keyval("txc seq", txc->seq);
11322 }
11323#endif
11324
7c673cae
FG
11325 osr->queue_new(txc);
11326 dout(20) << __func__ << " osr " << osr << " = " << txc
11327 << " seq " << txc->seq << dendl;
11328 return txc;
11329}
11330
11331void BlueStore::_txc_calc_cost(TransContext *txc)
11332{
11fdf7f2
TL
11333 // one "io" for the kv commit
11334 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
11335 auto cost = throttle_cost_per_io.load();
11336 txc->cost = ios * cost + txc->bytes;
9f95a23c 11337 txc->ios = ios;
7c673cae
FG
11338 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11339 << ios << " ios * " << cost << " + " << txc->bytes
11340 << " bytes)" << dendl;
11341}
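// Worked example (illustrative numbers): a txc with 3 data aios and 8192
// payload bytes, at throttle_cost_per_io = 4000, costs
// (3 + 1 /* kv commit io */) * 4000 + 8192 = 24192. The byte term keeps
// large writes from being under-costed when they need only a few ios.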
11342
11343void BlueStore::_txc_update_store_statfs(TransContext *txc)
11344{
11345 if (txc->statfs_delta.is_empty())
11346 return;
11347
11348 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11349 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11350 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11351 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11352 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11353
11354 bufferlist bl;
11355 txc->statfs_delta.encode(bl);
11fdf7f2
TL
11356 if (per_pool_stat_collection) {
11357 string key;
11358 get_pool_stat_key(txc->osd_pool_id, &key);
11359 txc->t->merge(PREFIX_STAT, key, bl);
11360
11361 std::lock_guard l(vstatfs_lock);
11362 auto& stats = osd_pools[txc->osd_pool_id];
11363 stats += txc->statfs_delta;
11364
11365 vstatfs += txc->statfs_delta; //non-persistent in this mode
11366
11367 } else {
11368 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 11369
11fdf7f2
TL
11370 std::lock_guard l(vstatfs_lock);
11371 vstatfs += txc->statfs_delta;
11372 }
7c673cae
FG
11373 txc->statfs_delta.reset();
11374}
11375
11376void BlueStore::_txc_state_proc(TransContext *txc)
11377{
11378 while (true) {
11379 dout(10) << __func__ << " txc " << txc
11380 << " " << txc->get_state_name() << dendl;
f67539c2 11381 switch (txc->get_state()) {
7c673cae 11382 case TransContext::STATE_PREPARE:
9f95a23c 11383 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae 11384 if (txc->ioc.has_pending_aios()) {
f67539c2
TL
11385 txc->set_state(TransContext::STATE_AIO_WAIT);
11386#ifdef WITH_BLKIN
11387 if (txc->trace) {
11388 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
11389 }
11390#endif
7c673cae
FG
11391 txc->had_ios = true;
11392 _txc_aio_submit(txc);
11393 return;
11394 }
11395 // ** fall-thru **
11396
11397 case TransContext::STATE_AIO_WAIT:
11fdf7f2 11398 {
9f95a23c
TL
11399 mono_clock::duration lat = throttle.log_state_latency(
11400 *txc, logger, l_bluestore_state_aio_wait_lat);
11401 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11fdf7f2
TL
11402 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11403 << ", latency = " << lat
11404 << dendl;
11405 }
11406 }
11407
7c673cae
FG
11408 _txc_finish_io(txc); // may trigger blocked txc's too
11409 return;
11410
11411 case TransContext::STATE_IO_DONE:
11fdf7f2 11412 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
11413 if (txc->had_ios) {
11414 ++txc->osr->txc_with_unstable_io;
11415 }
9f95a23c 11416 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
f67539c2 11417 txc->set_state(TransContext::STATE_KV_QUEUED);
7c673cae
FG
11418 if (cct->_conf->bluestore_sync_submit_transaction) {
11419 if (txc->last_nid >= nid_max ||
11420 txc->last_blobid >= blobid_max) {
11421 dout(20) << __func__
11422 << " last_{nid,blobid} exceeds max, submit via kv thread"
11423 << dendl;
11424 } else if (txc->osr->kv_committing_serially) {
11425 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11426 << dendl;
11427 // note: this is starvation-prone. once we have a txc in a busy
11428 // sequencer that is committing serially it is possible to keep
11429 // submitting new transactions fast enough that we get stuck doing
11430 // so. the alternative is to block here... fixme?
11431 } else if (txc->osr->txc_with_unstable_io) {
11432 dout(20) << __func__ << " prior txc(s) with unstable ios "
11433 << txc->osr->txc_with_unstable_io.load() << dendl;
11434 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11435 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11436 == 0) {
11437 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11438 << dendl;
11439 } else {
9f95a23c 11440 _txc_apply_kv(txc, true);
7c673cae
FG
11441 }
11442 }
11443 {
11fdf7f2 11444 std::lock_guard l(kv_lock);
7c673cae 11445 kv_queue.push_back(txc);
9f95a23c
TL
11446 if (!kv_sync_in_progress) {
11447 kv_sync_in_progress = true;
11448 kv_cond.notify_one();
11449 }
f67539c2 11450 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
7c673cae
FG
11451 kv_queue_unsubmitted.push_back(txc);
11452 ++txc->osr->kv_committing_serially;
11453 }
31f18b77
FG
11454 if (txc->had_ios)
11455 kv_ios++;
11456 kv_throttle_costs += txc->cost;
7c673cae
FG
11457 }
11458 return;
11459 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
11460 _txc_committed_kv(txc);
11461 // ** fall-thru **
11462
11463 case TransContext::STATE_KV_DONE:
9f95a23c 11464 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae 11465 if (txc->deferred_txn) {
f67539c2 11466 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
7c673cae
FG
11467 _deferred_queue(txc);
11468 return;
11469 }
f67539c2 11470 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
11471 break;
11472
11473 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 11474 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
f67539c2 11475 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
11476 // ** fall-thru **
11477
11478 case TransContext::STATE_FINISHING:
9f95a23c 11479 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
7c673cae
FG
11480 _txc_finish(txc);
11481 return;
11482
11483 default:
11484 derr << __func__ << " unexpected txc " << txc
11485 << " state " << txc->get_state_name() << dendl;
11fdf7f2 11486 ceph_abort_msg("unexpected txc state");
7c673cae
FG
11487 return;
11488 }
11489 }
11490}
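// State-machine sketch (derived from the switch above; deferred-write states
// in parentheses):
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//     (-> DEFERRED_QUEUED -> DEFERRED_CLEANUP) -> FINISHING -> DONE
//
// _txc_state_proc() is re-entered on each asynchronous completion and either
// advances the txc or parks it until the event it is waiting for arrives.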
11491
11492void BlueStore::_txc_finish_io(TransContext *txc)
11493{
11494 dout(20) << __func__ << " " << txc << dendl;
11495
11496 /*
11497 * we need to preserve the order of kv transactions,
11498 * even though aio will complete in any order.
11499 */
11500
11501 OpSequencer *osr = txc->osr.get();
11fdf7f2 11502 std::lock_guard l(osr->qlock);
f67539c2 11503 txc->set_state(TransContext::STATE_IO_DONE);
11fdf7f2 11504 txc->ioc.release_running_aios();
7c673cae
FG
11505 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11506 while (p != osr->q.begin()) {
11507 --p;
f67539c2 11508 if (p->get_state() < TransContext::STATE_IO_DONE) {
7c673cae
FG
11509 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11510 << p->get_state_name() << dendl;
11511 return;
11512 }
f67539c2 11513 if (p->get_state() > TransContext::STATE_IO_DONE) {
7c673cae
FG
11514 ++p;
11515 break;
11516 }
11517 }
11518 do {
11519 _txc_state_proc(&*p++);
11520 } while (p != osr->q.end() &&
f67539c2 11521 p->get_state() == TransContext::STATE_IO_DONE);
7c673cae 11522
11fdf7f2 11523 if (osr->kv_submitted_waiters) {
7c673cae
FG
11524 osr->qcond.notify_all();
11525 }
11526}
11527
11528void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11529{
11530 dout(20) << __func__ << " txc " << txc
11531 << " onodes " << txc->onodes
11532 << " shared_blobs " << txc->shared_blobs
11533 << dendl;
11534
11535 // finalize onodes
11536 for (auto o : txc->onodes) {
11fdf7f2 11537 _record_onode(o, t);
7c673cae
FG
11538 o->flushing_count++;
11539 }
11540
11541 // objects we modified but didn't affect the onode
11542 auto p = txc->modified_objects.begin();
11543 while (p != txc->modified_objects.end()) {
11544 if (txc->onodes.count(*p) == 0) {
11545 (*p)->flushing_count++;
11546 ++p;
11547 } else {
11548 // drop entries duplicated in the onodes list to avoid problems in _txc_finish
11549 p = txc->modified_objects.erase(p);
11550 }
11551 }
11552
11553 // finalize shared_blobs
11554 for (auto sb : txc->shared_blobs) {
11555 string key;
11556 auto sbid = sb->get_sbid();
11557 get_shared_blob_key(sbid, &key);
11558 if (sb->persistent->empty()) {
11fdf7f2
TL
11559 dout(20) << __func__ << " shared_blob 0x"
11560 << std::hex << sbid << std::dec
7c673cae
FG
11561 << " is empty" << dendl;
11562 t->rmkey(PREFIX_SHARED_BLOB, key);
11563 } else {
11564 bufferlist bl;
11fdf7f2
TL
11565 encode(*(sb->persistent), bl);
11566 dout(20) << __func__ << " shared_blob 0x"
11567 << std::hex << sbid << std::dec
31f18b77 11568 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
11569 t->set(PREFIX_SHARED_BLOB, key, bl);
11570 }
11571 }
11572}
11573
11574void BlueStore::BSPerfTracker::update_from_perfcounters(
11575 PerfCounters &logger)
11576{
11fdf7f2
TL
11577 os_commit_latency_ns.consume_next(
11578 logger.get_tavg_ns(
7c673cae 11579 l_bluestore_commit_lat));
11fdf7f2
TL
11580 os_apply_latency_ns.consume_next(
11581 logger.get_tavg_ns(
7c673cae
FG
11582 l_bluestore_commit_lat));
11583}
11584
f67539c2
TL
11585// For every object we maintain <zone_num+oid, offset> tuple in the key-value
11586 // store. When a new object is written to a zone, we insert the corresponding
11587// tuple to the database. When an object is truncated, we remove the
11588// corresponding tuple. When an object is overwritten, we remove the old tuple
11589// and insert a new tuple corresponding to the new location of the object. The
11590// cleaner can now identify live objects within the zone <zone_num> by
11591// enumerating all the keys starting with <zone_num> prefix.
11592void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
11593 for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
11594 std::string key;
11595 get_object_key(cct, o->oid, &key);
11596 for (auto offset : offsets) {
11597 if (offset > 0) {
11598 bufferlist offset_bl;
11599 encode(offset, offset_bl);
11600 txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
11601 } else {
11602 txc->t->rmkey(_zoned_get_prefix(-offset), key);
11603 }
11604 }
11605 }
11606}
11607
11608std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
11609 uint64_t zone_num = offset / bdev->get_zone_size();
11610 std::string zone_key;
11611 _key_encode_u64(zone_num, &zone_key);
11612 return PREFIX_ZONED_CL_INFO + zone_key;
11613}
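// Example (assumed values for illustration): with a 256 MiB zone, an object
// written at offset 768 MiB lies in zone 3, so its cleaning record goes under
// PREFIX_ZONED_CL_INFO + _key_encode_u64(3) -- the zone number encoded as a
// fixed-width big-endian u64, not an ASCII digit -- and the cleaner can list
// zone 3's live objects with a single prefix scan.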
11614
11615// For now, to avoid interface changes we piggyback zone_size (in MiB) and the
11616// first sequential zone number onto min_alloc_size and pass it to functions
11617// Allocator::create and FreelistManager::create.
11618uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
11619 uint64_t zone_size = bdev->get_zone_size();
11620 uint64_t zone_size_mb = zone_size / (1024 * 1024);
11621 uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
11622 min_alloc_size |= (zone_size_mb << 32);
11623 min_alloc_size |= (first_seq_zone << 48);
11624 return min_alloc_size;
11625}
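// Packing sketch (worked example with assumed geometry): a 256 MiB zone and a
// 512 MiB conventional region give zone_size_mb = 256 and first_seq_zone = 2:
//
//   bits  0..31  min_alloc_size (unchanged)
//   bits 32..47  zone_size_mb    -> 256ull << 32 == 0x0000010000000000
//   bits 48..63  first_seq_zone  ->   2ull << 48 == 0x0002000000000000
//
// The consumer on the Allocator/FreelistManager side must unpack the same
// field layout implied by the shifts above.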
11626
11627int BlueStore::_zoned_check_config_settings() {
11628 if (cct->_conf->bluestore_allocator != "zoned") {
11629 dout(1) << __func__ << " The drive is HM-SMR but "
11630 << cct->_conf->bluestore_allocator << " allocator is specified. "
11631 << "Only zoned allocator can be used with HM-SMR drive." << dendl;
11632 return -EINVAL;
11633 }
11634
11635 // At least for now we want to use large min_alloc_size with HM-SMR drives.
11636 // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
11637 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
11638 if (min_alloc_size < 64 * 1024) {
11639 dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
11640 << min_alloc_size << ". "
11641 << "Please set to at least 64 KiB." << dendl;
11642 return -EINVAL;
11643 }
11644
11645 // We don't want to defer writes with HM-SMR because that violates the
11646 // sequential write requirement.
11647 if (prefer_deferred_size) {
11648 dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
11649 << prefer_deferred_size << ". "
11650 << "Please set to 0." << dendl;
11651 return -EINVAL;
11652 }
11653 return 0;
11654}
11655
7c673cae
FG
11656void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11657{
11658 dout(20) << __func__ << " txc " << txc << std::hex
11659 << " allocated 0x" << txc->allocated
11660 << " released 0x" << txc->released
11661 << std::dec << dendl;
11662
11663 // We have to handle the case where we allocate *and* deallocate the
11664 // same region in this transaction. The freelist doesn't like that.
11665 // (Actually, the only thing that cares is the BitmapFreelistManager
11666 // debug check. But that's important.)
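// Example (illustrative ranges): if this txc allocated [0x10000,0x20000) and
// released [0x18000,0x28000), the overlap [0x18000,0x20000) is subtracted
// from both sets below, so the freelist sees allocate [0x10000,0x18000) and
// release [0x20000,0x28000) with no double-touched region.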
11667 interval_set<uint64_t> tmp_allocated, tmp_released;
11668 interval_set<uint64_t> *pallocated = &txc->allocated;
11669 interval_set<uint64_t> *preleased = &txc->released;
11670 if (!txc->allocated.empty() && !txc->released.empty()) {
11671 interval_set<uint64_t> overlap;
11672 overlap.intersection_of(txc->allocated, txc->released);
11673 if (!overlap.empty()) {
11674 tmp_allocated = txc->allocated;
11675 tmp_allocated.subtract(overlap);
11676 tmp_released = txc->released;
11677 tmp_released.subtract(overlap);
11678 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11679 << ", new allocated 0x" << tmp_allocated
11680 << " released 0x" << tmp_released << std::dec
11681 << dendl;
11682 pallocated = &tmp_allocated;
11683 preleased = &tmp_released;
11684 }
11685 }
11686
11687 // update freelist with non-overlap sets
11688 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11689 p != pallocated->end();
11690 ++p) {
11691 fm->allocate(p.get_start(), p.get_len(), t);
11692 }
11693 for (interval_set<uint64_t>::iterator p = preleased->begin();
11694 p != preleased->end();
11695 ++p) {
11696 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11697 << "~" << p.get_len() << std::dec << dendl;
11698 fm->release(p.get_start(), p.get_len(), t);
11699 }
11700
f67539c2
TL
11701 if (bdev->is_smr()) {
11702 _zoned_update_cleaning_metadata(txc);
11703 }
11704
7c673cae
FG
11705 _txc_update_store_statfs(txc);
11706}
11707
9f95a23c 11708void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 11709{
f67539c2 11710 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
9f95a23c
TL
11711 {
11712#if defined(WITH_LTTNG)
11713 auto start = mono_clock::now();
11714#endif
11715
f67539c2
TL
11716#ifdef WITH_BLKIN
11717 if (txc->trace) {
11718 txc->trace.event("db async submit");
11719 }
11720#endif
11721
9f95a23c
TL
11722 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11723 ceph_assert(r == 0);
f67539c2 11724 txc->set_state(TransContext::STATE_KV_SUBMITTED);
9f95a23c
TL
11725 if (txc->osr->kv_submitted_waiters) {
11726 std::lock_guard l(txc->osr->qlock);
11727 txc->osr->qcond.notify_all();
11728 }
11729
11730#if defined(WITH_LTTNG)
11731 if (txc->tracing) {
11732 tracepoint(
11733 bluestore,
11734 transaction_kv_submit_latency,
11735 txc->osr->get_sequencer_id(),
11736 txc->seq,
11737 sync_submit_transaction,
11738 ceph::to_seconds<double>(mono_clock::now() - start));
11739 }
11740#endif
11741 }
11742
7c673cae
FG
11743 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11744 for (auto& o : *ls) {
11745 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11746 << dendl;
9f95a23c 11747 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 11748 std::lock_guard l(o->flush_lock);
7c673cae
FG
11749 o->flush_cond.notify_all();
11750 }
11751 }
11752 }
11753}
11754
11755void BlueStore::_txc_committed_kv(TransContext *txc)
11756{
11757 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 11758 throttle.complete_kv(*txc);
1adf2230 11759 {
11fdf7f2 11760 std::lock_guard l(txc->osr->qlock);
f67539c2 11761 txc->set_state(TransContext::STATE_KV_DONE);
11fdf7f2
TL
11762 if (txc->ch->commit_queue) {
11763 txc->ch->commit_queue->queue(txc->oncommits);
11764 } else {
11765 finisher.queue(txc->oncommits);
1adf2230 11766 }
7c673cae 11767 }
9f95a23c 11768 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
11769 log_latency_fn(
11770 __func__,
11771 l_bluestore_commit_lat,
9f95a23c 11772 mono_clock::now() - txc->start,
494da23a
TL
11773 cct->_conf->bluestore_log_op_age,
11774 [&](auto lat) {
11775 return ", txc = " + stringify(txc);
11776 }
11fdf7f2 11777 );
7c673cae
FG
11778}
11779
11780void BlueStore::_txc_finish(TransContext *txc)
11781{
11782 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
f67539c2 11783 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
7c673cae
FG
11784
11785 for (auto& sb : txc->shared_blobs_written) {
f64942e4 11786 sb->finish_write(txc->seq);
7c673cae
FG
11787 }
11788 txc->shared_blobs_written.clear();
11789
11790 while (!txc->removed_collections.empty()) {
11791 _queue_reap_collection(txc->removed_collections.front());
11792 txc->removed_collections.pop_front();
11793 }
11794
11795 OpSequencerRef osr = txc->osr;
7c673cae 11796 bool empty = false;
31f18b77 11797 bool submit_deferred = false;
7c673cae
FG
11798 OpSequencer::q_list_t releasing_txc;
11799 {
11fdf7f2 11800 std::lock_guard l(osr->qlock);
f67539c2 11801 txc->set_state(TransContext::STATE_DONE);
7c673cae
FG
11802 bool notify = false;
11803 while (!osr->q.empty()) {
11804 TransContext *txc = &osr->q.front();
11805 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11806 << dendl;
f67539c2
TL
11807 if (txc->get_state() != TransContext::STATE_DONE) {
11808 if (txc->get_state() == TransContext::STATE_PREPARE &&
7c673cae
FG
11809 deferred_aggressive) {
11810 // for _osr_drain_preceding()
11811 notify = true;
11812 }
f67539c2 11813 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 11814 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
11815 submit_deferred = true;
11816 }
7c673cae
FG
11817 break;
11818 }
11819
7c673cae
FG
11820 osr->q.pop_front();
11821 releasing_txc.push_back(*txc);
7c673cae 11822 }
9f95a23c 11823
7c673cae
FG
11824 if (osr->q.empty()) {
11825 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11826 empty = true;
11827 }
9f95a23c
TL
11828
11829 // only drain()/drain_preceding() need wakeup,
11830 // other cases use kv_submitted_waiters
11831 if (notify || empty) {
11832 osr->qcond.notify_all();
11833 }
7c673cae 11834 }
9f95a23c 11835
7c673cae
FG
11836 while (!releasing_txc.empty()) {
11837 // release to allocator only after all preceding txc's have also
11838 // finished any deferred writes that potentially land in these
11839 // blocks
11840 auto txc = &releasing_txc.front();
11841 _txc_release_alloc(txc);
11842 releasing_txc.pop_front();
9f95a23c
TL
11843 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11844 throttle.complete(*txc);
7c673cae
FG
11845 delete txc;
11846 }
11847
31f18b77
FG
11848 if (submit_deferred) {
11849 // we're pinning memory; flush! we could be more fine-grained here but
11850 // i'm not sure it's worth the bother.
11851 deferred_try_submit();
7c673cae
FG
11852 }
11853
7c673cae 11854 if (empty && osr->zombie) {
11fdf7f2
TL
11855 std::lock_guard l(zombie_osr_lock);
11856 if (zombie_osr_set.erase(osr->cid)) {
11857 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11858 } else {
11859 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11860 << dendl;
11861 }
7c673cae 11862 }
9f95a23c 11863}
7c673cae
FG
11864
11865void BlueStore::_txc_release_alloc(TransContext *txc)
11866{
a8e16298 11867 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
11868 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11869 int r = 0;
11870 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11871 r = bdev->queue_discard(txc->released);
11872 if (r == 0) {
11873 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11874 << txc->released << std::dec << dendl;
11875 goto out;
11876 }
11877 } else if (cct->_conf->bdev_enable_discard) {
11878 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11879 bdev->discard(p.get_start(), p.get_len());
11880 }
11881 }
11882 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 11883 << txc->released << std::dec << dendl;
f67539c2 11884 shared_alloc.a->release(txc->released);
7c673cae
FG
11885 }
11886
11fdf7f2 11887out:
7c673cae
FG
11888 txc->allocated.clear();
11889 txc->released.clear();
11890}
11891
11fdf7f2
TL
11892void BlueStore::_osr_attach(Collection *c)
11893{
11894 // note: caller has RWLock on coll_map
11895 auto q = coll_map.find(c->cid);
11896 if (q != coll_map.end()) {
11897 c->osr = q->second->osr;
11898 ldout(cct, 10) << __func__ << " " << c->cid
11899 << " reusing osr " << c->osr << " from existing coll "
11900 << q->second << dendl;
11901 } else {
11902 std::lock_guard l(zombie_osr_lock);
11903 auto p = zombie_osr_set.find(c->cid);
11904 if (p == zombie_osr_set.end()) {
9f95a23c 11905 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11fdf7f2
TL
11906 ldout(cct, 10) << __func__ << " " << c->cid
11907 << " fresh osr " << c->osr << dendl;
11908 } else {
11909 c->osr = p->second;
11910 zombie_osr_set.erase(p);
11911 ldout(cct, 10) << __func__ << " " << c->cid
11912 << " resurrecting zombie osr " << c->osr << dendl;
11913 c->osr->zombie = false;
11914 }
11915 }
11916}
11917
11918void BlueStore::_osr_register_zombie(OpSequencer *osr)
11919{
11920 std::lock_guard l(zombie_osr_lock);
11921 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11922 osr->zombie = true;
11923 auto i = zombie_osr_set.emplace(osr->cid, osr);
11924 // this is either a new insertion or the same osr is already there
11925 ceph_assert(i.second || i.first->second == osr);
11926}
11927
7c673cae
FG
11928void BlueStore::_osr_drain_preceding(TransContext *txc)
11929{
11930 OpSequencer *osr = txc->osr.get();
11931 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11932 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11933 {
11934 // submit anything pending
f67539c2 11935 osr->deferred_lock.lock();
11fdf7f2 11936 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
11937 _deferred_submit_unlock(osr);
11938 } else {
f67539c2 11939 osr->deferred_lock.unlock();
7c673cae
FG
11940 }
11941 }
11942 {
11943 // wake up any previously finished deferred events
11fdf7f2 11944 std::lock_guard l(kv_lock);
9f95a23c
TL
11945 if (!kv_sync_in_progress) {
11946 kv_sync_in_progress = true;
11947 kv_cond.notify_one();
11948 }
7c673cae
FG
11949 }
11950 osr->drain_preceding(txc);
11951 --deferred_aggressive;
11952 dout(10) << __func__ << " " << osr << " done" << dendl;
11953}
11954
11fdf7f2
TL
11955void BlueStore::_osr_drain(OpSequencer *osr)
11956{
11957 dout(10) << __func__ << " " << osr << dendl;
11958 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11959 {
11960 // submit anything pending
f67539c2 11961 osr->deferred_lock.lock();
11fdf7f2
TL
11962 if (osr->deferred_pending && !osr->deferred_running) {
11963 _deferred_submit_unlock(osr);
11964 } else {
f67539c2 11965 osr->deferred_lock.unlock();
11fdf7f2
TL
11966 }
11967 }
11968 {
11969 // wake up any previously finished deferred events
11970 std::lock_guard l(kv_lock);
9f95a23c
TL
11971 if (!kv_sync_in_progress) {
11972 kv_sync_in_progress = true;
11973 kv_cond.notify_one();
11974 }
11fdf7f2
TL
11975 }
11976 osr->drain();
11977 --deferred_aggressive;
11978 dout(10) << __func__ << " " << osr << " done" << dendl;
11979}
11980
7c673cae
FG
11981void BlueStore::_osr_drain_all()
11982{
11983 dout(10) << __func__ << dendl;
11984
11985 set<OpSequencerRef> s;
11fdf7f2
TL
11986 vector<OpSequencerRef> zombies;
11987 {
9f95a23c 11988 std::shared_lock l(coll_lock);
11fdf7f2
TL
11989 for (auto& i : coll_map) {
11990 s.insert(i.second->osr);
11991 }
11992 }
7c673cae 11993 {
11fdf7f2
TL
11994 std::lock_guard l(zombie_osr_lock);
11995 for (auto& i : zombie_osr_set) {
11996 s.insert(i.second);
11997 zombies.push_back(i.second);
11998 }
7c673cae
FG
11999 }
12000 dout(20) << __func__ << " osr_set " << s << dendl;
12001
12002 ++deferred_aggressive;
12003 {
12004 // submit anything pending
224ce89b 12005 deferred_try_submit();
7c673cae
FG
12006 }
12007 {
12008 // wake up any previously finished deferred events
11fdf7f2 12009 std::lock_guard l(kv_lock);
7c673cae
FG
12010 kv_cond.notify_one();
12011 }
31f18b77 12012 {
11fdf7f2 12013 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
12014 kv_finalize_cond.notify_one();
12015 }
7c673cae
FG
12016 for (auto osr : s) {
12017 dout(20) << __func__ << " drain " << osr << dendl;
12018 osr->drain();
12019 }
12020 --deferred_aggressive;
12021
7c673cae 12022 {
11fdf7f2
TL
12023 std::lock_guard l(zombie_osr_lock);
12024 for (auto& osr : zombies) {
12025 if (zombie_osr_set.erase(osr->cid)) {
12026 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
12027 ceph_assert(osr->q.empty());
12028 } else if (osr->zombie) {
12029 dout(10) << __func__ << " empty zombie osr " << osr
12030 << " already reaped" << dendl;
12031 ceph_assert(osr->q.empty());
12032 } else {
12033 dout(10) << __func__ << " empty zombie osr " << osr
12034 << " resurrected" << dendl;
12035 }
7c673cae
FG
12036 }
12037 }
11fdf7f2
TL
12038
12039 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
12040}
12041
11fdf7f2 12042
31f18b77
FG
12043void BlueStore::_kv_start()
12044{
12045 dout(10) << __func__ << dendl;
12046
11fdf7f2 12047 finisher.start();
31f18b77
FG
12048 kv_sync_thread.create("bstore_kv_sync");
12049 kv_finalize_thread.create("bstore_kv_final");
12050}
12051
12052void BlueStore::_kv_stop()
12053{
12054 dout(10) << __func__ << dendl;
12055 {
9f95a23c 12056 std::unique_lock l{kv_lock};
31f18b77
FG
12057 while (!kv_sync_started) {
12058 kv_cond.wait(l);
12059 }
12060 kv_stop = true;
12061 kv_cond.notify_all();
12062 }
12063 {
9f95a23c 12064 std::unique_lock l{kv_finalize_lock};
31f18b77
FG
12065 while (!kv_finalize_started) {
12066 kv_finalize_cond.wait(l);
12067 }
12068 kv_finalize_stop = true;
12069 kv_finalize_cond.notify_all();
12070 }
12071 kv_sync_thread.join();
12072 kv_finalize_thread.join();
11fdf7f2 12073 ceph_assert(removed_collections.empty());
31f18b77 12074 {
11fdf7f2 12075 std::lock_guard l(kv_lock);
31f18b77
FG
12076 kv_stop = false;
12077 }
12078 {
11fdf7f2 12079 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
12080 kv_finalize_stop = false;
12081 }
12082 dout(10) << __func__ << " stopping finishers" << dendl;
11fdf7f2
TL
12083 finisher.wait_for_empty();
12084 finisher.stop();
31f18b77
FG
12085 dout(10) << __func__ << " stopped" << dendl;
12086}
12087
7c673cae
FG
12088void BlueStore::_kv_sync_thread()
12089{
12090 dout(10) << __func__ << " start" << dendl;
11fdf7f2 12091 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 12092 std::unique_lock l{kv_lock};
11fdf7f2 12093 ceph_assert(!kv_sync_started);
31f18b77
FG
12094 kv_sync_started = true;
12095 kv_cond.notify_all();
adb31ebb
TL
12096
12097 auto t0 = mono_clock::now();
12098 timespan twait = ceph::make_timespan(0);
12099 size_t kv_submitted = 0;
12100
7c673cae 12101 while (true) {
adb31ebb
TL
12102 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
12103 auto observation_period =
12104 ceph::make_timespan(period);
12105 auto elapsed = mono_clock::now() - t0;
12106 if (period && elapsed >= observation_period) {
12107 dout(5) << __func__ << " utilization: idle "
12108 << twait << " of " << elapsed
12109 << ", submitted: " << kv_submitted
12110 << dendl;
12111 t0 = mono_clock::now();
12112 twait = ceph::make_timespan(0);
12113 kv_submitted = 0;
12114 }
11fdf7f2 12115 ceph_assert(kv_committing.empty());
7c673cae
FG
12116 if (kv_queue.empty() &&
12117 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 12118 !deferred_aggressive)) {
7c673cae
FG
12119 if (kv_stop)
12120 break;
12121 dout(20) << __func__ << " sleep" << dendl;
adb31ebb 12122 auto t = mono_clock::now();
9f95a23c 12123 kv_sync_in_progress = false;
11fdf7f2 12124 kv_cond.wait(l);
adb31ebb
TL
12125 twait += mono_clock::now() - t;
12126
7c673cae
FG
12127 dout(20) << __func__ << " wake" << dendl;
12128 } else {
12129 deque<TransContext*> kv_submitting;
12130 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
12131 uint64_t aios = 0, costs = 0;
12132
7c673cae
FG
12133 dout(20) << __func__ << " committing " << kv_queue.size()
12134 << " submitting " << kv_queue_unsubmitted.size()
12135 << " deferred done " << deferred_done_queue.size()
12136 << " stable " << deferred_stable_queue.size()
12137 << dendl;
12138 kv_committing.swap(kv_queue);
12139 kv_submitting.swap(kv_queue_unsubmitted);
12140 deferred_done.swap(deferred_done_queue);
12141 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
12142 aios = kv_ios;
12143 costs = kv_throttle_costs;
12144 kv_ios = 0;
12145 kv_throttle_costs = 0;
7c673cae
FG
12146 l.unlock();
12147
12148 dout(30) << __func__ << " committing " << kv_committing << dendl;
12149 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12150 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12151 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12152
11fdf7f2
TL
12153 auto start = mono_clock::now();
12154
7c673cae
FG
12155 bool force_flush = false;
12156 // if bluefs is sharing the same device as data (only), then we
12157 // can rely on the bluefs commit to flush the device and make
12158 // deferred aios stable. that means that if we do have done deferred
12159 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 12160 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 12161 if (aios) {
7c673cae 12162 force_flush = true;
11fdf7f2 12163 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
12164 force_flush = true; // there's nothing else to commit!
12165 } else if (deferred_aggressive) {
12166 force_flush = true;
12167 }
11fdf7f2
TL
12168 } else {
12169 if (aios || !deferred_done.empty()) {
12170 force_flush = true;
12171 } else {
12172 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12173 }
12174 }
7c673cae
FG
12175
12176 if (force_flush) {
31f18b77 12177 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
12178 << " force_flush=" << (int)force_flush
12179 << ", flushing, deferred done->stable" << dendl;
12180 // flush/barrier on block device
12181 bdev->flush();
12182
12183 // if we flush then deferred done are now deferred stable
12184 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12185 deferred_done.end());
12186 deferred_done.clear();
12187 }
11fdf7f2 12188 auto after_flush = mono_clock::now();
7c673cae
FG
12189
12190 // we will use one final transaction to force a sync
12191 KeyValueDB::Transaction synct = db->get_transaction();
12192
12193 // increase {nid,blobid}_max? note that this covers both the
12194 // case where we are approaching the max and the case we passed
12195 // it. in either case, we increase the max in the earlier txn
12196 // we submit.
12197 uint64_t new_nid_max = 0, new_blobid_max = 0;
12198 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12199 KeyValueDB::Transaction t =
12200 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12201 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12202 bufferlist bl;
11fdf7f2 12203 encode(new_nid_max, bl);
7c673cae
FG
12204 t->set(PREFIX_SUPER, "nid_max", bl);
12205 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12206 }
12207 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12208 KeyValueDB::Transaction t =
12209 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12210 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12211 bufferlist bl;
11fdf7f2 12212 encode(new_blobid_max, bl);
7c673cae
FG
12213 t->set(PREFIX_SUPER, "blobid_max", bl);
12214 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12215 }
c07f9fc5
FG
12216
12217 for (auto txc : kv_committing) {
9f95a23c 12218 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
f67539c2 12219 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
adb31ebb 12220 ++kv_submitted;
9f95a23c 12221 _txc_apply_kv(txc, false);
c07f9fc5 12222 --txc->osr->kv_committing_serially;
c07f9fc5 12223 } else {
f67539c2 12224 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 12225 }
7c673cae
FG
12226 if (txc->had_ios) {
12227 --txc->osr->txc_with_unstable_io;
12228 }
7c673cae
FG
12229 }
12230
31f18b77
FG
12231 // release throttle *before* we commit. this allows new ops
12232 // to be prepared and enter pipeline while we are waiting on
12233 // the kv commit sync/flush. then hopefully on the next
12234 // iteration there will already be ops awake. otherwise, we
12235 // end up going to sleep, and then wake up when the very first
12236 // transaction is ready for commit.
9f95a23c 12237 throttle.release_kv_throttle(costs);
31f18b77 12238
7c673cae
FG
12239 // cleanup sync deferred keys
12240 for (auto b : deferred_stable) {
12241 for (auto& txc : b->txcs) {
12242 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 12243 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
12244 string key;
12245 get_deferred_key(wt.seq, &key);
12246 synct->rm_single_key(PREFIX_DEFERRED, key);
12247 }
12248 }
12249
9f95a23c
TL
12250#if defined(WITH_LTTNG)
12251 auto sync_start = mono_clock::now();
12252#endif
7c673cae 12253 // submit synct synchronously (block and wait for it to commit)
31f18b77 12254 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
12255 ceph_assert(r == 0);
12256
f67539c2
TL
12257#ifdef WITH_BLKIN
12258 for (auto txc : kv_committing) {
12259 if (txc->trace) {
12260 txc->trace.event("db sync submit");
12261 txc->trace.keyval("kv_committing size", kv_committing.size());
12262 }
12263 }
12264#endif
12265
9f95a23c
TL
12266 int committing_size = kv_committing.size();
12267 int deferred_size = deferred_stable.size();
12268
12269#if defined(WITH_LTTNG)
12270 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12271 for (auto txc: kv_committing) {
12272 if (txc->tracing) {
12273 tracepoint(
12274 bluestore,
12275 transaction_kv_sync_latency,
12276 txc->osr->get_sequencer_id(),
12277 txc->seq,
12278 kv_committing.size(),
12279 deferred_done.size(),
12280 deferred_stable.size(),
12281 sync_latency);
12282 }
12283 }
12284#endif
12285
11fdf7f2 12286 {
9f95a23c 12287 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
12288 if (kv_committing_to_finalize.empty()) {
12289 kv_committing_to_finalize.swap(kv_committing);
12290 } else {
12291 kv_committing_to_finalize.insert(
12292 kv_committing_to_finalize.end(),
12293 kv_committing.begin(),
12294 kv_committing.end());
12295 kv_committing.clear();
12296 }
12297 if (deferred_stable_to_finalize.empty()) {
12298 deferred_stable_to_finalize.swap(deferred_stable);
12299 } else {
12300 deferred_stable_to_finalize.insert(
12301 deferred_stable_to_finalize.end(),
12302 deferred_stable.begin(),
12303 deferred_stable.end());
12304 deferred_stable.clear();
12305 }
9f95a23c
TL
12306 if (!kv_finalize_in_progress) {
12307 kv_finalize_in_progress = true;
12308 kv_finalize_cond.notify_one();
12309 }
11fdf7f2 12310 }
7c673cae
FG
12311
12312 if (new_nid_max) {
12313 nid_max = new_nid_max;
12314 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12315 }
12316 if (new_blobid_max) {
12317 blobid_max = new_blobid_max;
12318 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12319 }
12320
224ce89b 12321 {
11fdf7f2
TL
12322 auto finish = mono_clock::now();
12323 ceph::timespan dur_flush = after_flush - start;
12324 ceph::timespan dur_kv = finish - after_flush;
12325 ceph::timespan dur = finish - start;
9f95a23c
TL
12326 dout(20) << __func__ << " committed " << committing_size
12327 << " cleaned " << deferred_size
224ce89b
WB
12328 << " in " << dur
12329 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12330 << dendl;
494da23a
TL
12331 log_latency("kv_flush",
12332 l_bluestore_kv_flush_lat,
12333 dur_flush,
12334 cct->_conf->bluestore_log_op_age);
12335 log_latency("kv_commit",
12336 l_bluestore_kv_commit_lat,
12337 dur_kv,
12338 cct->_conf->bluestore_log_op_age);
12339 log_latency("kv_sync",
12340 l_bluestore_kv_sync_lat,
12341 dur,
12342 cct->_conf->bluestore_log_op_age);
7c673cae 12343 }
31f18b77 12344
31f18b77
FG
12345 l.lock();
12346 // previously deferred "done" are now "stable" by virtue of this
12347 // commit cycle.
12348 deferred_stable_queue.swap(deferred_done);
12349 }
12350 }
12351 dout(10) << __func__ << " finish" << dendl;
12352 kv_sync_started = false;
12353}
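
// Editor's note: an illustrative, standalone sketch (not BlueStore code) of
// the batching idiom _kv_sync_thread() relies on above -- swap the shared
// queues into local ones while holding the lock, then drop the lock and do
// the expensive commit work on the local copies so producers are never
// blocked behind it. The demo_* names are hypothetical.
#if 0
#include <deque>
#include <mutex>

std::mutex demo_lock;
std::deque<int> demo_queue;          // filled by producers under demo_lock

void demo_consume_batch() {
  std::deque<int> batch;
  {
    std::lock_guard l{demo_lock};
    batch.swap(demo_queue);          // O(1); producers can refill immediately
  }
  for (int item : batch) {
    (void)item;                      // slow work happens without the lock held
  }
}
#endif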

void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
	deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
	break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
	TransContext *txc = kv_committed.front();
	ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
	_txc_state_proc(txc);
	kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
	auto p = b->txcs.begin();
	while (p != b->txcs.end()) {
	  TransContext *txc = &*p;
	  p = b->txcs.erase(p); // unlink here because
	  _txc_state_proc(txc); // this may destroy txc
	}
	delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
	if (deferred_queue_size >= deferred_batch_ops.load() ||
	    throttle.should_submit_deferred()) {
	  deferred_try_submit();
	}
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
	  (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));

      log_latency("kv_final",
	l_bluestore_kv_final_lat,
	mono_clock::now() - start,
	cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
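
// Editor's note: a small standalone sketch (not BlueStore code) of the
// "unlink, then process" pattern used when draining b->txcs above: each
// element is removed from the container *before* the processing step runs,
// because processing may free it. Demonstrated here with std::list and
// invented DemoNode/demo_drain names.
#if 0
#include <list>

struct DemoNode { int v; };

void demo_drain(std::list<DemoNode*>& l) {
  auto p = l.begin();
  while (p != l.end()) {
    DemoNode *n = *p;
    p = l.erase(p);   // unlink first: the iterator stays valid,
    delete n;         // and destroying n cannot corrupt the traversal
  }
}
#endif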

void BlueStore::_zoned_cleaner_start() {
  dout(10) << __func__ << dendl;

  zoned_cleaner_thread.create("bstore_zcleaner");
}

void BlueStore::_zoned_cleaner_stop() {
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{zoned_cleaner_lock};
    while (!zoned_cleaner_started) {
      zoned_cleaner_cond.wait(l);
    }
    zoned_cleaner_stop = true;
    zoned_cleaner_cond.notify_all();
  }
  zoned_cleaner_thread.join();
  {
    std::lock_guard l{zoned_cleaner_lock};
    zoned_cleaner_stop = false;
  }
  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_zoned_cleaner_thread() {
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  std::deque<uint64_t> zones_to_clean;
  while (true) {
    if (zoned_cleaner_queue.empty()) {
      if (zoned_cleaner_stop) {
	break;
      }
      dout(20) << __func__ << " sleep" << dendl;
      zoned_cleaner_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      zones_to_clean.swap(zoned_cleaner_queue);
      l.unlock();
      while (!zones_to_clean.empty()) {
	_zoned_clean_zone(zones_to_clean.front());
	zones_to_clean.pop_front();
      }
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}

void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
  dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
}

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, uint64_t len)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  logger->inc(l_bluestore_write_deferred);
  logger->inc(l_bluestore_write_deferred_bytes, len);
  return &txc->deferred_txn->ops.back();
}

void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;

  DeferredBatch *tmp;
  txc->osr->deferred_lock.lock();
  {
    if (!txc->osr->deferred_pending) {
      tmp = new DeferredBatch(cct, txc->osr.get());
    } else {
      tmp = txc->osr->deferred_pending;
    }
  }

  tmp->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
    }
  }

  {
    ++deferred_queue_size;
    txc->osr->deferred_pending = tmp;
    // the condition "tmp->txcs.size() == 1" means deferred_pending was
    // originally empty, so we should add this osr to deferred_queue.
    if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
      deferred_lock.lock();
      deferred_queue.push_back(*txc->osr);
      deferred_lock.unlock();
    }

    if (deferred_aggressive &&
	!txc->osr->deferred_running) {
      _deferred_submit_unlock(txc->osr.get());
    } else {
      txc->osr->deferred_lock.unlock();
    }
  }
}
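
// Editor's note: a standalone sketch (not the real DeferredBatch) of the
// accumulation step above -- small writes are parked in an offset-keyed,
// sorted map per sequencer so a later submit pass can coalesce adjacent
// ranges. DemoBatch and its members are invented names; the real
// prepare_write() also merges/trims overlapping ranges, which is omitted.
#if 0
#include <cstdint>
#include <map>
#include <vector>

struct DemoBatch {
  // offset -> payload; std::map keeps the ranges sorted for the submit pass
  std::map<uint64_t, std::vector<char>> iomap;

  void prepare_write(uint64_t off, std::vector<char> data) {
    iomap[off] = std::move(data);
  }
};
#endif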

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
	   << deferred_queue_size << " txcs" << dendl;
  vector<OpSequencerRef> osrs;

  {
    std::lock_guard l(deferred_lock);
    osrs.reserve(deferred_queue.size());
    for (auto& osr : deferred_queue) {
      osrs.push_back(&osr);
    }
  }

  for (auto& osr : osrs) {
    osr->deferred_lock.lock();
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
	_deferred_submit_unlock(osr.get());
      } else {
	osr->deferred_lock.unlock();
	dout(20) << __func__ << " osr " << osr << " already has running"
		 << dendl;
      }
    } else {
      osr->deferred_lock.unlock();
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  {
    std::lock_guard l(deferred_lock);
    deferred_last_submitted = ceph_clock_now();
  }
}

void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
	   << " " << osr->deferred_pending->iomap.size() << " ios pending "
	   << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  osr->deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
	dout(20) << __func__ << " write 0x" << std::hex
		 << start << "~" << bl.length()
		 << " crc " << bl.crc32c(-1) << std::dec << dendl;
	if (!g_conf()->bluestore_debug_omit_block_device_write) {
	  logger->inc(l_bluestore_deferred_write_ops);
	  logger->inc(l_bluestore_deferred_write_bytes, bl.length());
	  int r = bdev->aio_write(start, bl, &b->ioc, false);
	  ceph_assert(r == 0);
	}
      }
      if (i == b->iomap.end()) {
	break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
	     << std::hex << pos << "~" << i->second.bl.length() << std::dec
	     << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}
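
// Editor's note: a self-contained sketch (not BlueStore code) of the
// coalescing loop above -- walk an offset-sorted map and merge runs whose
// next entry starts exactly where the previous one ended, issuing one write
// per contiguous run. demo_write() is a hypothetical stand-in for
// bdev->aio_write().
#if 0
#include <cstdint>
#include <map>
#include <string>

void demo_write(uint64_t off, const std::string& data);  // hypothetical sink

void demo_coalesce(const std::map<uint64_t, std::string>& iomap) {
  uint64_t start = 0, pos = 0;
  std::string run;
  auto i = iomap.begin();
  while (true) {
    if (i == iomap.end() || i->first != pos) {  // run broken (or input done)
      if (!run.empty())
        demo_write(start, run);                 // flush the accumulated run
      if (i == iomap.end())
        break;
      pos = i->first;
      run.clear();
    }
    if (run.empty())
      start = pos;                              // a new run begins here
    pos += i->second.size();
    run += i->second;
    ++i;
  }
}
#endif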

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};

void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    osr->deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      {
	deferred_lock.lock();
	auto q = deferred_queue.iterator_to(*osr);
	deferred_queue.erase(q);
	deferred_lock.unlock();
      }
      osr->deferred_lock.unlock();
    } else {
      osr->deferred_lock.unlock();
      if (deferred_aggressive) {
	dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
	finisher.queue(new C_DeferredTrySubmit(this));
      } else {
	dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      for (auto& i : b->txcs) {
	TransContext *txc = &i;
	throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
	txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
	costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}

int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
	     << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
	   << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->set_state(TransContext::STATE_KV_DONE);
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
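
// Editor's note: a minimal sketch (not BlueStore code) of the replay idiom
// above -- iterate a key range, decode each record, and fail the whole replay
// on the first undecodable record rather than silently skipping it. The
// DemoKV/demo_decode/demo_replay names are hypothetical stand-ins for the
// KeyValueDB iterator and the decode() call.
#if 0
#include <cerrno>
#include <map>
#include <optional>
#include <string>

using DemoKV = std::map<std::string, std::string>;  // key -> serialized record

std::optional<int> demo_decode(const std::string& v);  // hypothetical decoder

int demo_replay(const DemoKV& kv) {
  int count = 0;
  for (auto& [key, val] : kv) {
    auto rec = demo_decode(val);
    if (!rec) {
      return -EIO;  // one bad record poisons the replay, as above
    }
    ++count;        // the real code re-queues the decoded txn here
  }
  return count;
}
#endif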

// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
				  &on_commit, op);

  // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
  // submission to happen atomically because if I/O submission happens in a
  // different order than I/O allocation, we end up issuing non-sequential
  // writes to the drive.  This is a temporary solution until ZONE APPEND
  // support matures in the kernel.  For more information please see:
  // https://www.usenix.org/conference/vault20/presentation/bjorling
  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.lock();
  }
  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc encode finished");
  }
#endif

  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
	*db,
	*txc,
	tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
	     << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
	kv_sync_in_progress = true;
	kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.unlock();
  }

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc applied");
  }
#endif

  log_latency("submit_transact",
    l_bluestore_submit_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
    l_bluestore_throttle_lat,
    tend - tstart,
    cct->_conf->bluestore_log_op_age);
  return 0;
}
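
// Editor's note: a standalone sketch (not BlueStore code) of the throttle
// fallback above -- if admission fails, actively kick queued background work
// to free budget and then block until admitted, instead of waiting passively.
// DemoThrottle and demo_* are invented names.
#if 0
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct DemoThrottle {
  std::mutex lock;
  std::condition_variable cond;
  uint64_t used = 0, limit;
  explicit DemoThrottle(uint64_t l) : limit(l) {}

  bool try_get(uint64_t cost) {
    std::lock_guard l{lock};
    if (used + cost > limit) return false;
    used += cost;
    return true;
  }
  void get_blocking(uint64_t cost) {
    std::unique_lock l{lock};
    while (used + cost > limit) cond.wait(l);
    used += cost;
  }
  void put(uint64_t cost) {
    std::lock_guard l{lock};
    used -= cost;
    cond.notify_all();
  }
};

void demo_flush_background();  // hypothetical: push deferred work to release budget

void demo_admit(DemoThrottle& t, uint64_t cost) {
  if (!t.try_get(cost)) {
    demo_flush_background();   // free up budget, as queue_transactions() does
    t.get_blocking(cost);      // then block until admitted
  }
}
#endif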

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
		  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
	const coll_t &cid = i.get_cid(op->cid);
	r = _remove_collection(txc, cid, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
	ceph_assert(!c);
	const coll_t &cid = i.get_cid(op->cid);
	r = _create_collection(txc, cid, op->split_bits, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
	uint32_t bits = op->split_bits;
	uint32_t rem = op->split_rem;
	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
	uint32_t bits = op->split_bits;
	r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
	uint32_t type = op->hint;
	bufferlist hint;
	i.decode_bl(hint);
	auto hiter = hint.cbegin();
	if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
	  uint32_t pg_num;
	  uint64_t num_objs;
	  decode(pg_num, hiter);
	  decode(num_objs, hiter);
	  dout(10) << __func__ << " collection hint objects is a no-op, "
		   << " pg_num " << pg_num << " num_objects " << num_objs
		   << dendl;
	} else {
	  // ignore the hint
	  dout(10) << __func__ << " unknown collection hint " << type << dendl;
	}
	continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
	   << " not handled on operation " << op->op
	   << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
	op->op == Transaction::OP_CREATE ||
	op->op == Transaction::OP_WRITE ||
	op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
	       << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	uint32_t fadvise_flags = i.get_fadvise_flags();
	bufferlist bl;
	i.decode_bl(bl);
	r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
	// deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
	uint64_t off = op->off;
	r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
	r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
	string name = i.decode_string();
	bufferptr bp;
	i.decode_bp(bp);
	r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
	map<string, bufferptr> aset;
	i.decode_attrset(aset);
	r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
	string name = i.decode_string();
	r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
	r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	uint64_t srcoff = op->off;
	uint64_t len = op->len;
	uint64_t dstoff = op->dest_off;
	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
	ceph_assert(op->cid == op->dest_cid);
	const ghobject_t& noid = i.get_oid(op->dest_oid);
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  no = c->get_onode(noid, false);
	}
	r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
	r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
	bufferlist aset_bl;
	i.decode_attrset_bl(&aset_bl);
	r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
	bufferlist keys_bl;
	i.decode_keyset_bl(&keys_bl);
	r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
	string first, last;
	first = i.decode_string();
	last = i.decode_string();
	r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
	bufferlist bl;
	i.decode_bl(bl);
	r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
	r = _set_alloc_hint(txc, c, o,
			    op->expected_object_size,
			    op->expected_write_size,
			    op->hint);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
			    op->op == Transaction::OP_CLONE ||
			    op->op == Transaction::OP_CLONERANGE2 ||
			    op->op == Transaction::OP_COLL_ADD ||
			    op->op == Transaction::OP_SETATTR ||
			    op->op == Transaction::OP_SETATTRS ||
			    op->op == Transaction::OP_RMATTR ||
			    op->op == Transaction::OP_OMAP_SETKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
			    op->op == Transaction::OP_OMAP_SETHEADER))
	// -ENOENT is usually okay
	ok = true;
      if (r == -ENODATA)
	ok = true;

      if (!ok) {
	const char *msg = "unexpected error code";

	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
			     op->op == Transaction::OP_CLONE ||
			     op->op == Transaction::OP_CLONERANGE2))
	  msg = "ENOENT on clone suggests osd bug";

	if (r == -ENOSPC)
	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
	  // by partially applying transactions.
	  msg = "ENOSPC from bluestore, misconfigured cluster";

	if (r == -ENOTEMPTY) {
	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
	}

	derr << __func__ << " error " << cpp_strerror(r)
	     << " not handled on operation " << op->op
	     << " (op " << pos << ", counting from 0)"
	     << dendl;
	derr << msg << dendl;
	_dump_transaction<0>(cct, t);
	ceph_abort_msg("unexpected error");
      }
    }
  }
}

// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
	   << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
	   << back_pad << " on front/back, now 0x" << *offset << "~"
	   << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
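
// Editor's note: a small self-contained illustration (not BlueStore code) of
// the power-of-two alignment arithmetic behind _pad_zeros() and the helpers
// used throughout the write path (p2align/p2phase/p2nphase from
// include/intarith.h), with one worked case for a 4 KiB chunk.
#if 0
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t chunk = 0x1000;        // 4 KiB chunk size
  const uint64_t off   = 0x2345;

  uint64_t aligned = off & ~(chunk - 1);    // p2align:  0x2000
  uint64_t head    = off & (chunk - 1);     // p2phase:  0x345 (front pad)
  uint64_t tail    = (-off) & (chunk - 1);  // p2nphase: 0xcbb (to next boundary)

  assert(aligned == 0x2000);
  assert(head == 0x345);
  assert(tail == 0xcbb);
  assert(aligned + head == off);            // front pad fills back to offset
  assert((off + tail) % chunk == 0);        // tail pad reaches the boundary
  return 0;
}
#endif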

void BlueStore::_do_write_small(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << std::dec << dendl;
  ceph_assert(length < min_alloc_size);

  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = p2align<uint64_t>(offset, alloc_len);

  bool any_change;

  // search for a suitable extent in both the forward and reverse directions
  // within the [offset - target_max_blob_size, offset + target_max_blob_size]
  // range, then check whether the blob can be reused via can_reuse_blob(), or
  // apply a direct/deferred write (the latter for extents including or above
  // 'offset' only).
  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);

  // On zoned devices, the first goal is to support non-overwrite workloads,
  // such as RGW, with large, aligned objects. Therefore, for user writes
  // _do_write_small should not trigger. OSDs, however, write and update a tiny
  // amount of metadata, such as OSD maps, to disk. For those cases, we
  // temporarily just pad them to min_alloc_size and write them to a new place
  // on every update.
  if (bdev->is_smr()) {
    BlobRef b = c->new_blob();
    uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
    uint64_t b_off0 = b_off;
    _pad_zeros(&bl, &b_off0, min_alloc_size);
    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
    return;
  }

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = end;
  if (ep != begin) {
    prev_ep = ep;
    --prev_ep;
  }

  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
  // we don't want to inspect more blobs than the number of min-alloc
  // units that fit into two max-size blobs
  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
  bool above_blob_threshold = false;

  inspected_blobs.reserve(blob_threshold);

  uint64_t max_off = 0;
  auto start_ep = ep;
  auto end_ep = ep; // exclusive
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
	inspected_blobs.insert(&b->get_blob());
	above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
	dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
	dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
		 ep->blob_offset % min_alloc_size) {
	dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
	uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	// can we pad our head/tail out with zeros?
	uint64_t head_pad, tail_pad;
	head_pad = p2phase(offset, chunk_size);
	tail_pad = p2nphase(end_offs, chunk_size);
	if (head_pad || tail_pad) {
	  o->extent_map.fault_range(db, offset - head_pad,
				    end_offs - offset + head_pad + tail_pad);
	}
	if (head_pad &&
	    o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
	  head_pad = 0;
	}
	if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
	  tail_pad = 0;
	}

	uint64_t b_off = offset - head_pad - bstart;
	uint64_t b_len = length + head_pad + tail_pad;

	// direct write into unused blocks of an existing mutable blob?
	if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
	    b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b->get_blob().is_unused(b_off, b_len) &&
	    b->get_blob().is_allocated(b_off, b_len)) {
	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << " write to unused 0x" << std::hex
		   << b_off << "~" << b_len
		   << " pad 0x" << head_pad << " + 0x" << tail_pad
		   << std::dec << " of mutable " << *b << dendl;
	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  if (!g_conf()->bluestore_debug_omit_block_device_write) {
	    if (b_len < prefer_deferred_size) {
	      dout(20) << __func__ << " deferring small 0x" << std::hex
		       << b_len << std::dec << " unused write via deferred" << dendl;
	      bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
	      op->op = bluestore_deferred_op_t::OP_WRITE;
	      b->get_blob().map(
		b_off, b_len,
		[&](uint64_t offset, uint64_t length) {
		  op->extents.emplace_back(bluestore_pextent_t(offset, length));
		  return 0;
		});
	      op->data = bl;
	    } else {
	      b->get_blob().map_bl(
		b_off, bl,
		[&](uint64_t offset, bufferlist& t) {
		  bdev->aio_write(offset, t,
				  &txc->ioc, wctx->buffered);
		});
	    }
	  }
	  b->dirty_blob().calc_csum(b_off, bl);
	  dout(20) << __func__ << " lex old " << *ep << dendl;
	  Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
						 b,
						 &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);

	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << " lex " << *le << dendl;
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
	// read some data to fill out the chunk?
	uint64_t head_read = p2phase(b_off, chunk_size);
	uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
	if ((head_read || tail_read) &&
	    (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
	    head_read + tail_read < min_alloc_size) {
	  b_off -= head_read;
	  b_len += head_read + tail_read;

	} else {
	  head_read = tail_read = 0;
	}

	// chunk-aligned deferred overwrite?
	if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b_off % chunk_size == 0 &&
	    b_len % chunk_size == 0 &&
	    b->get_blob().is_allocated(b_off, b_len)) {

	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << " reading head 0x" << std::hex << head_read
		   << " and tail 0x" << tail_read << std::dec << dendl;
	  if (head_read) {
	    bufferlist head_bl;
	    int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
			     head_bl, 0);
	    ceph_assert(r >= 0 && r <= (int)head_read);
	    size_t zlen = head_read - r;
	    if (zlen) {
	      head_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    head_bl.claim_append(bl);
	    bl.swap(head_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  if (tail_read) {
	    bufferlist tail_bl;
	    int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
			     tail_bl, 0);
	    ceph_assert(r >= 0 && r <= (int)tail_read);
	    size_t zlen = tail_read - r;
	    if (zlen) {
	      tail_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    bl.claim_append(tail_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  logger->inc(l_bluestore_write_small_pre_read);

	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  b->dirty_blob().calc_csum(b_off, bl);

	  if (!g_conf()->bluestore_debug_omit_block_device_write) {
	    bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
	    op->op = bluestore_deferred_op_t::OP_WRITE;
	    int r = b->get_blob().map(
	      b_off, b_len,
	      [&](uint64_t offset, uint64_t length) {
		op->extents.emplace_back(bluestore_pextent_t(offset, length));
		return 0;
	      });
	    ceph_assert(r == 0);
	    op->data = std::move(bl);
	    dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
		     << b_len << std::dec << " of mutable " << *b
		     << " at " << op->extents << dendl;
	  }

	  Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
						 b, &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);
	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << " lex " << *le << dendl;
	  return;
	}
	// try to reuse the blob if we can
	if (b->can_reuse_blob(min_alloc_size,
			      max_bsize,
			      offset0 - bstart,
			      &alloc_len)) {
	  ceph_assert(alloc_len == min_alloc_size); // expecting data to always
						    // fit into the reused blob
	  // Need to check for pending writes desiring to
	  // reuse the same pextent. The rationale is that during GC two chunks
	  // from garbage blobs (compressed?) can share logical space within
	  // the same AU. That, in turn, might be caused by an unaligned len in
	  // clone_range2. Hence the second write would fail in an attempt to
	  // reuse the blob at do_alloc_write().
	  if (!wctx->has_conflict(b,
				  offset0,
				  offset0 + alloc_len,
				  min_alloc_size)) {

	    // we can't reuse pad_head/pad_tail since they might be truncated
	    // due to existing extents
	    uint64_t b_off = offset - bstart;
	    uint64_t b_off0 = b_off;
	    _pad_zeros(&bl, &b_off0, chunk_size);

	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off0 << "~" << bl.length() << ")"
		     << " (0x" << b_off << "~" << length << ")"
		     << std::dec << dendl;

	    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
			false, false);
	    logger->inc(l_bluestore_write_small_unused);
	    return;
	  }
	}
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
	inspected_blobs.insert(&b->get_blob());
	above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
			    max_bsize,
			    offset0 - bstart,
			    &alloc_len)) {
	ceph_assert(alloc_len == min_alloc_size); // expecting data to always
						  // fit into the reused blob
	// Need to check for pending writes desiring to
	// reuse the same pextent. The rationale is that during GC two chunks
	// from garbage blobs (compressed?) can share logical space within
	// the same AU. That, in turn, might be caused by an unaligned len in
	// clone_range2. Hence the second write would fail in an attempt to
	// reuse the blob at do_alloc_write().
	if (!wctx->has_conflict(b,
				offset0,
				offset0 + alloc_len,
				min_alloc_size)) {

	  uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	  uint64_t b_off = offset - bstart;
	  uint64_t b_off0 = b_off;
	  _pad_zeros(&bl, &b_off0, chunk_size);

	  dout(20) << __func__ << " reuse blob " << *b << std::hex
		   << " (0x" << b_off0 << "~" << bl.length() << ")"
		   << " (0x" << b_off << "~" << length << ")"
		   << std::dec << dendl;

	  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
		      false, false);
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
      }
      if (prev_ep != begin) {
	--prev_ep;
	any_change = true;
      } else {
	prev_ep = end; // to avoid a useless re-check of the first extent
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
	     << " " << std::hex << min_off << "~" << max_off << std::dec
	     << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
	       << std::hex << ep->logical_offset << "~" << ep->length
	       << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert the newly written extent for GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
	     << std::hex << offset << "~" << length
	     << std::dec << dendl;
  }
  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
	      min_alloc_size != block_size, // use the 'unused' bitmap only when
					    // alloc granularity doesn't match
					    // the disk block size
	      true);

  return;
}
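
// Editor's note: a compact sketch (not BlueStore code) of the bidirectional
// scan _do_write_small() performs above -- alternate one step forward and one
// step backward from the seek position, within [min_off, max_off), until
// neither direction makes progress. DemoMap/demo_scan are invented names.
#if 0
#include <cstdint>
#include <iterator>
#include <map>
#include <optional>

using DemoMap = std::map<uint64_t, int>;  // logical offset -> extent id

std::optional<int> demo_scan(DemoMap& m, uint64_t off,
                             uint64_t min_off, uint64_t max_off,
                             bool (*usable)(int)) {
  auto fwd = m.lower_bound(off);
  auto rev = (fwd == m.begin()) ? m.end() : std::prev(fwd);
  bool any_change;
  do {
    any_change = false;
    if (fwd != m.end() && fwd->first < max_off) {
      if (usable(fwd->second))
        return fwd->second;
      ++fwd;
      any_change = true;
    }
    if (rev != m.end() && rev->first >= min_off) {
      if (usable(rev->second))
        return rev->second;
      if (rev != m.begin()) {
        --rev;
        any_change = true;
      } else {
        rev = m.end();  // done scanning backwards, as with prev_ep = end
      }
    }
  } while (any_change);
  return std::nullopt;
}
#endif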

bool BlueStore::BigDeferredWriteContext::can_defer(
    BlueStore::extent_map_t::iterator ep,
    uint64_t prefer_deferred_size,
    uint64_t block_size,
    uint64_t offset,
    uint64_t l)
{
  bool res = false;
  auto& blob = ep->blob->get_blob();
  if (offset >= ep->blob_start() &&
      blob.is_mutable()) {
    off = offset;
    b_off = offset - ep->blob_start();
    uint64_t chunk_size = blob.get_chunk_size(block_size);
    uint64_t ondisk = blob.get_ondisk_length();
    used = std::min(l, ondisk - b_off);

    // will we read some data to fill out the chunk?
    head_read = p2phase<uint64_t>(b_off, chunk_size);
    tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
    b_off -= head_read;

    ceph_assert(b_off % chunk_size == 0);
    ceph_assert(blob_aligned_len() % chunk_size == 0);

    res = blob_aligned_len() < prefer_deferred_size &&
      blob_aligned_len() <= ondisk &&
      blob.is_allocated(b_off, blob_aligned_len());
    if (res) {
      blob_ref = ep->blob;
      blob_start = ep->blob_start();
    }
  }
  return res;
}

bool BlueStore::BigDeferredWriteContext::apply_defer()
{
  int r = blob_ref->get_blob().map(
    b_off, blob_aligned_len(),
    [&](const bluestore_pextent_t& pext,
	uint64_t offset,
	uint64_t length) {
      // apply deferred only if the overwrite breaks blob continuity;
      // if it totally overlaps some pextent, fall back to a regular write
      if (pext.offset < offset ||
	  pext.end() > offset + length) {
	res_extents.emplace_back(bluestore_pextent_t(offset, length));
	return 0;
      }
      return -1;
    });
  return r >= 0;
}
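
// Editor's note: a tiny standalone illustration (not BlueStore code) of the
// abort-on-negative visitor convention apply_defer() uses with blob.map()
// above: the traversal stops at the first callback that returns a negative
// value and propagates it. demo_map/demo_all_even are invented names.
#if 0
#include <functional>
#include <vector>

int demo_map(const std::vector<int>& segs,
             std::function<int(int)> visit) {
  for (int s : segs) {
    int r = visit(s);
    if (r < 0)
      return r;     // first rejection aborts the traversal, as in map()
  }
  return 0;
}

bool demo_all_even(const std::vector<int>& segs) {
  // mirrors apply_defer(): accept only if no segment vetoes
  return demo_map(segs, [](int s) { return (s % 2 == 0) ? 0 : -1; }) >= 0;
}
#endif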
13755
13756void BlueStore::_do_write_big_apply_deferred(
13757 TransContext* txc,
13758 CollectionRef& c,
13759 OnodeRef o,
13760 BlueStore::BigDeferredWriteContext& dctx,
13761 bufferlist::iterator& blp,
13762 WriteContext* wctx)
13763{
13764 bufferlist bl;
13765 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
13766 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
13767 if (dctx.head_read) {
13768 int r = _do_read(c.get(), o,
13769 dctx.off - dctx.head_read,
13770 dctx.head_read,
13771 bl,
13772 0);
13773 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
13774 size_t zlen = dctx.head_read - r;
13775 if (zlen) {
13776 bl.append_zero(zlen);
13777 logger->inc(l_bluestore_write_pad_bytes, zlen);
13778 }
13779 logger->inc(l_bluestore_write_penalty_read_ops);
13780 }
13781 blp.copy(dctx.used, bl);
13782
13783 if (dctx.tail_read) {
13784 bufferlist tail_bl;
13785 int r = _do_read(c.get(), o,
13786 dctx.off + dctx.used, dctx.tail_read,
13787 tail_bl, 0);
13788 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
13789 size_t zlen = dctx.tail_read - r;
13790 if (zlen) {
13791 tail_bl.append_zero(zlen);
13792 logger->inc(l_bluestore_write_pad_bytes, zlen);
13793 }
13794 bl.claim_append(tail_bl);
13795 logger->inc(l_bluestore_write_penalty_read_ops);
13796 }
13797 auto& b0 = dctx.blob_ref;
13798 _buffer_cache_write(txc, b0, dctx.b_off, bl,
13799 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13800
13801 b0->dirty_blob().calc_csum(dctx.b_off, bl);
13802
13803 Extent* le = o->extent_map.set_lextent(c, dctx.off,
13804 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
13805
13806 // in fact this is a no-op for big writes, but it is kept for
13807 // uniformity and so it won't be missed by some future refactor.
13808 b0->dirty_blob().mark_used(le->blob_offset, le->length);
13809 txc->statfs_delta.stored() += le->length;
13810
13811 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 13812 bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
f67539c2
TL
13813 op->op = bluestore_deferred_op_t::OP_WRITE;
13814 op->extents.swap(dctx.res_extents);
13815 op->data = std::move(bl);
13816 }
13817}
13818
7c673cae
FG
13819void BlueStore::_do_write_big(
13820 TransContext *txc,
13821 CollectionRef &c,
13822 OnodeRef o,
13823 uint64_t offset, uint64_t length,
13824 bufferlist::iterator& blp,
13825 WriteContext *wctx)
13826{
13827 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13828 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13829 << " compress " << (int)wctx->compress
13830 << dendl;
13831 logger->inc(l_bluestore_write_big);
13832 logger->inc(l_bluestore_write_big_bytes, length);
11fdf7f2 13833 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
f67539c2 13834 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
7c673cae
FG
13835 while (length > 0) {
13836 bool new_blob = false;
7c673cae
FG
13837 BlobRef b;
13838 uint32_t b_off = 0;
522d829b 13839 uint32_t l = 0;
7c673cae
FG
13840
13841 // attempt to reuse an existing blob
13842 if (!wctx->compress) {
522d829b
TL
13843 // enforce target blob alignment with max_bsize
13844 l = max_bsize - p2phase(offset, max_bsize);
13845 l = std::min(uint64_t(l), length);
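// e.g. (hypothetical numbers) with max_bsize = 0x10000, a write at
// offset 0x1c000 of length 0x20000 gives
//   l = 0x10000 - p2phase(0x1c000, 0x10000) = 0x4000
// so this iteration only runs to the next max_bsize boundary; the rest
// of the write is consumed by later iterations of the loop.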
13846
7c673cae 13847 auto end = o->extent_map.extent_map.end();
f67539c2 13848
522d829b
TL
13849 dout(20) << __func__ << " maybe defer: 0x" << std::hex
13850 << offset << "~" << l
13851 << std::dec << dendl;
13852
f67539c2
TL
13853 if (prefer_deferred_size_snapshot &&
13854 l <= prefer_deferred_size_snapshot * 2) {
13855 // A single write that spans two adjacent existing blobs can result
13856 // in up to two deferred blocks of 'prefer_deferred_size'.
13857 // So we try to minimize the number of resulting blobs
13858 // and preserve the 2 existing blobs rather than insert one more in between.
13859 // E.g. for a write 0x10000~20000 over existing blobs
13860 // (0x0~20000 and 0x20000~20000) it is better (from the subsequent read
13861 // performance point of view) to issue two deferred writes to the
13862 // existing blobs than to end up with 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
13863
13864 // look for an existing mutable blob we can write into
13865 auto ep = o->extent_map.seek_lextent(offset);
13866 auto ep_next = end;
13867 BigDeferredWriteContext head_info, tail_info;
13868
13869 bool will_defer = ep != end ?
13870 head_info.can_defer(ep,
13871 prefer_deferred_size_snapshot,
13872 block_size,
13873 offset,
13874 l) :
13875 false;
13876 auto offset_next = offset + head_info.used;
13877 auto remaining = l - head_info.used;
13878 if (will_defer && remaining) {
13879 will_defer = false;
13880 if (remaining <= prefer_deferred_size_snapshot) {
13881 ep_next = o->extent_map.seek_lextent(offset_next);
13882 // check if we can defer remaining totally
13883 will_defer = ep_next == end ?
13884 false :
13885 tail_info.can_defer(ep_next,
13886 prefer_deferred_size_snapshot,
13887 block_size,
13888 offset_next,
13889 remaining);
13890 will_defer = will_defer && remaining == tail_info.used;
13891 }
13892 }
13893 if (will_defer) {
13894 dout(20) << __func__ << " " << *(head_info.blob_ref)
13895 << " deferring big " << std::hex
13896 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
13897 << std::dec << " write via deferred"
13898 << dendl;
13899 if (remaining) {
13900 dout(20) << __func__ << " " << *(tail_info.blob_ref)
13901 << " deferring big " << std::hex
13902 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
13903 << std::dec << " write via deferred"
13904 << dendl;
13905 }
13906
13907 will_defer = head_info.apply_defer();
13908 if (!will_defer) {
13909 dout(20) << __func__
13910 << " deferring big fell back, head isn't continuous"
13911 << dendl;
13912 } else if (remaining) {
13913 will_defer = tail_info.apply_defer();
13914 if (!will_defer) {
13915 dout(20) << __func__
13916 << " deferring big fell back, tail isn't continuous"
13917 << dendl;
13918 }
13919 }
13920 }
13921 if (will_defer) {
13922 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
13923 if (remaining) {
13924 _do_write_big_apply_deferred(txc, c, o, tail_info,
13925 blp, wctx);
13926 }
522d829b
TL
13927 dout(20) << __func__ << " defer big: 0x" << std::hex
13928 << offset << "~" << l
13929 << std::dec << dendl;
f67539c2
TL
13930 offset += l;
13931 length -= l;
13932 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
13933 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
13934 continue;
13935 }
13936 }
522d829b 13937 dout(20) << __func__ << " looking for blocks to reuse..." << dendl;
f67539c2
TL
13938
13939 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13940
13941 // seek again as punch_hole could invalidate ep
7c673cae 13942 auto ep = o->extent_map.seek_lextent(offset);
f67539c2
TL
13943 auto begin = o->extent_map.extent_map.begin();
13944 auto prev_ep = end;
13945 if (ep != begin) {
13946 prev_ep = ep;
7c673cae 13947 --prev_ep;
7c673cae 13948 }
f67539c2 13949
7c673cae
FG
13950 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13951 // search for a suitable extent in both forward and reverse directions in
13952 // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
224ce89b 13953 // then check if the blob can be reused via the can_reuse_blob func.
7c673cae
FG
13954 bool any_change;
13955 do {
13956 any_change = false;
13957 if (ep != end && ep->logical_offset < offset + max_bsize) {
522d829b
TL
13958 dout(20) << __func__ << " considering " << *ep
13959 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
f67539c2
TL
13960
13961 if (offset >= ep->blob_start() &&
224ce89b 13962 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13963 offset - ep->blob_start(),
13964 &l)) {
13965 b = ep->blob;
f67539c2 13966 b_off = offset - ep->blob_start();
7c673cae
FG
13967 prev_ep = end; // to avoid check below
13968 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13969 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13970 } else {
13971 ++ep;
13972 any_change = true;
13973 }
13974 }
13975
13976 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
522d829b
TL
13977 dout(20) << __func__ << " considering rev " << *prev_ep
13978 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
f67539c2 13979 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13980 offset - prev_ep->blob_start(),
13981 &l)) {
13982 b = prev_ep->blob;
13983 b_off = offset - prev_ep->blob_start();
13984 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13985 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13986 } else if (prev_ep != begin) {
13987 --prev_ep;
13988 any_change = true;
13989 } else {
13990 prev_ep = end; // to avoid useless first extent re-check
13991 }
13992 }
13993 } while (b == nullptr && any_change);
f67539c2 13994 } else {
522d829b
TL
13995 // try to use as large a chunk as permitted in the compression case.
13996 l = std::min(max_bsize, length);
f67539c2
TL
13997 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13998 } // if (!wctx->compress)
13999
7c673cae
FG
14000 if (b == nullptr) {
14001 b = c->new_blob();
14002 b_off = 0;
14003 new_blob = true;
14004 }
7c673cae
FG
14005 bufferlist t;
14006 blp.copy(l, t);
14007 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
522d829b
TL
14008 dout(20) << __func__ << " schedule write big: 0x"
14009 << std::hex << offset << "~" << l << std::dec
14010 << (new_blob ? " new " : " reuse ")
14011 << *b << dendl;
7c673cae
FG
14012 offset += l;
14013 length -= l;
14014 logger->inc(l_bluestore_write_big_blobs);
14015 }
14016}
14017
14018int BlueStore::_do_alloc_write(
14019 TransContext *txc,
14020 CollectionRef coll,
14021 OnodeRef o,
14022 WriteContext *wctx)
14023{
14024 dout(20) << __func__ << " txc " << txc
14025 << " " << wctx->writes.size() << " blobs"
14026 << dendl;
3efd9988
FG
14027 if (wctx->writes.empty()) {
14028 return 0;
7c673cae
FG
14029 }
14030
7c673cae
FG
14031 CompressorRef c;
14032 double crr = 0;
14033 if (wctx->compress) {
14034 c = select_option(
14035 "compression_algorithm",
14036 compressor,
14037 [&]() {
14038 string val;
14039 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
14040 CompressorRef cp = compressor;
14041 if (!cp || cp->get_type_name() != val) {
14042 cp = Compressor::create(cct, val);
11fdf7f2
TL
14043 if (!cp) {
14044 if (_set_compression_alert(false, val.c_str())) {
14045 derr << __func__ << " unable to initialize " << val.c_str()
14046 << " compressor" << dendl;
14047 }
14048 }
7c673cae
FG
14049 }
14050 return boost::optional<CompressorRef>(cp);
14051 }
14052 return boost::optional<CompressorRef>();
14053 }
14054 );
14055
14056 crr = select_option(
14057 "compression_required_ratio",
14058 cct->_conf->bluestore_compression_required_ratio,
14059 [&]() {
14060 double val;
3efd9988 14061 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
14062 return boost::optional<double>(val);
14063 }
14064 return boost::optional<double>();
14065 }
14066 );
14067 }
14068
14069 // checksum
11fdf7f2 14070 int64_t csum = csum_type.load();
7c673cae
FG
14071 csum = select_option(
14072 "csum_type",
14073 csum,
14074 [&]() {
11fdf7f2 14075 int64_t val;
3efd9988 14076 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 14077 return boost::optional<int64_t>(val);
7c673cae 14078 }
11fdf7f2 14079 return boost::optional<int64_t>();
7c673cae
FG
14080 }
14081 );
14082
3efd9988
FG
14083 // compress (as needed) and calc needed space
14084 uint64_t need = 0;
11fdf7f2 14085 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 14086 for (auto& wi : wctx->writes) {
3efd9988 14087 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 14088 auto start = mono_clock::now();
7c673cae
FG
14089
14090 // compress
11fdf7f2
TL
14091 ceph_assert(wi.b_off == 0);
14092 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 14093
7c673cae
FG
14094 // FIXME: memory alignment here is bad
14095 bufferlist t;
f67539c2
TL
14096 boost::optional<int32_t> compressor_message;
14097 int r = c->compress(wi.bl, t, compressor_message);
3efd9988 14098 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 14099 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
a8e16298
TL
14100 bool rejected = false;
14101 uint64_t compressed_len = t.length();
14102 // do an approximate (fast) estimate of the resulting blob size
14103 // that doesn't take header overhead into account
11fdf7f2 14104 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
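// Worked example (hypothetical numbers): with wi.blob_length = 0x10000,
// crr = 0.875 and min_alloc_size = 0x1000, want_len rounds to 0xe000;
// a 0xb500-byte compressed buffer gives result_len = 0xc000, which passes
// the checks below, and the blob is then zero-padded up to result_len so
// it occupies whole allocation units.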
a8e16298
TL
14105 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
14106 bluestore_compression_header_t chdr;
14107 chdr.type = c->get_type();
14108 chdr.length = t.length();
f67539c2 14109 chdr.compressor_message = compressor_message;
a8e16298
TL
14110 encode(chdr, wi.compressed_bl);
14111 wi.compressed_bl.claim_append(t);
14112
14113 compressed_len = wi.compressed_bl.length();
11fdf7f2 14114 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
14115 if (result_len <= want_len && result_len < wi.blob_length) {
14116 // Cool. We compressed at least as much as we were hoping to.
14117 // pad out to min_alloc_size
14118 wi.compressed_bl.append_zero(result_len - compressed_len);
14119 wi.compressed_len = compressed_len;
14120 wi.compressed = true;
14121 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
14122 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
14123 << " -> 0x" << compressed_len << " => 0x" << result_len
14124 << " with " << c->get_type()
14125 << std::dec << dendl;
14126 txc->statfs_delta.compressed() += compressed_len;
14127 txc->statfs_delta.compressed_original() += wi.blob_length;
14128 txc->statfs_delta.compressed_allocated() += result_len;
14129 logger->inc(l_bluestore_compress_success_count);
14130 need += result_len;
14131 } else {
14132 rejected = true;
14133 }
14134 } else if (r != 0) {
14135 dout(5) << __func__ << std::hex << " compressing 0x" << wi.blob_length
14136 << " bytes with " << c->get_type_name()
14137 << std::dec
14138 << " failed with errcode = " << r
14139 << ", leaving uncompressed"
14140 << dendl;
14141 logger->inc(l_bluestore_compress_rejected_count);
14142 need += wi.blob_length;
7c673cae 14143 } else {
a8e16298
TL
14144 rejected = true;
14145 }
14146
14147 if (rejected) {
3efd9988 14148 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 14149 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
14150 << " with " << c->get_type()
14151 << ", which is more than required 0x" << want_len_raw
7c673cae 14152 << " -> 0x" << want_len
3efd9988
FG
14153 << ", leaving uncompressed"
14154 << std::dec << dendl;
14155 logger->inc(l_bluestore_compress_rejected_count);
14156 need += wi.blob_length;
7c673cae 14157 }
494da23a
TL
14158 log_latency("compress@_do_alloc_write",
14159 l_bluestore_compress_lat,
14160 mono_clock::now() - start,
14161 cct->_conf->bluestore_log_op_age );
3efd9988
FG
14162 } else {
14163 need += wi.blob_length;
7c673cae 14164 }
3efd9988 14165 }
a8e16298 14166 PExtentVector prealloc;
3efd9988 14167 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 14168 int64_t prealloc_left = 0;
f67539c2 14169 prealloc_left = shared_alloc.a->allocate(
3efd9988
FG
14170 need, min_alloc_size, need,
14171 0, &prealloc);
eafe8130 14172 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 14173 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 14174 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2 14175 << " min_alloc_size 0x" << min_alloc_size
f67539c2 14176 << " available 0x " << shared_alloc.a->get_free()
11fdf7f2
TL
14177 << std::dec << dendl;
14178 if (prealloc.size()) {
f67539c2 14179 shared_alloc.a->release(prealloc);
11fdf7f2 14180 }
a8e16298
TL
14181 return -ENOSPC;
14182 }
9f95a23c 14183 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
a8e16298 14184
f67539c2
TL
14185 if (bdev->is_smr()) {
14186 std::deque<uint64_t> zones_to_clean;
14187 if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
14188 std::lock_guard l{zoned_cleaner_lock};
14189 zoned_cleaner_queue.swap(zones_to_clean);
14190 zoned_cleaner_cond.notify_one();
14191 }
14192 }
14193
3efd9988
FG
14194 dout(20) << __func__ << " prealloc " << prealloc << dendl;
14195 auto prealloc_pos = prealloc.begin();
522d829b
TL
14196 ceph_assert(prealloc_pos != prealloc.end());
14197 uint64_t prealloc_pos_length = prealloc_pos->length;
3efd9988
FG
14198
14199 for (auto& wi : wctx->writes) {
522d829b 14200 bluestore_blob_t& dblob = wi.b->dirty_blob();
3efd9988
FG
14201 uint64_t b_off = wi.b_off;
14202 bufferlist *l = &wi.bl;
14203 uint64_t final_length = wi.blob_length;
14204 uint64_t csum_length = wi.blob_length;
3efd9988
FG
14205 if (wi.compressed) {
14206 final_length = wi.compressed_bl.length();
14207 csum_length = final_length;
adb31ebb 14208 unsigned csum_order = ctz(csum_length);
3efd9988
FG
14209 l = &wi.compressed_bl;
14210 dblob.set_compressed(wi.blob_length, wi.compressed_len);
adb31ebb 14211 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
14212 dout(20) << __func__
14213 << " initialize csum setting for compressed blob " << *wi.b
adb31ebb
TL
14214 << " csum_type " << Checksummer::get_csum_type_string(csum)
14215 << " csum_order " << csum_order
14216 << " csum_length 0x" << std::hex << csum_length
14217 << " blob_length 0x" << wi.blob_length
14218 << " compressed_length 0x" << wi.compressed_len << std::dec
14219 << dendl;
14220 dblob.init_csum(csum, csum_order, csum_length);
14221 }
3efd9988 14222 } else if (wi.new_blob) {
adb31ebb 14223 unsigned csum_order;
7c673cae 14224 // initialize newly created blob only
11fdf7f2 14225 ceph_assert(dblob.is_mutable());
7c673cae
FG
14226 if (l->length() != wi.blob_length) {
14227 // hrm, maybe we could do better here, but let's not bother.
14228 dout(20) << __func__ << " forcing csum_order to block_size_order "
14229 << block_size_order << dendl;
31f18b77 14230 csum_order = block_size_order;
7c673cae
FG
14231 } else {
14232 csum_order = std::min(wctx->csum_order, ctz(l->length()));
14233 }
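// e.g. (hypothetical) a 0x6000-byte buffer has ctz(0x6000) == 13, so with
// wctx->csum_order == 12 the checksum chunk stays at 4 KiB; the order can
// never exceed the largest power of two dividing the buffer length.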
14234 // try to align blob with max_blob_size to improve
14235 // its reuse ratio, e.g. in case of reverse write
14236 uint32_t suggested_boff =
14237 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14238 if ((suggested_boff % (1 << csum_order)) == 0 &&
14239 suggested_boff + final_length <= max_bsize &&
14240 suggested_boff > b_off) {
181888fb 14241 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 14242 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 14243 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
14244 csum_length += suggested_boff - b_off;
14245 b_off = suggested_boff;
14246 }
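// Worked example (hypothetical numbers, csum_order = 12): writing 0x1000
// bytes at logical offset 0x13000 into a fresh blob with b_off == 0 gives
// suggested_boff = 0x3000; placing the data at blob offset 0x3000 leaves
// room for a later "reverse" write at 0x10000~0x3000 to land in the same
// blob instead of forcing a new one.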
181888fb 14247 if (csum != Checksummer::CSUM_NONE) {
522d829b
TL
14248 dout(20) << __func__
14249 << " initialize csum setting for new blob " << *wi.b
181888fb
FG
14250 << " csum_type " << Checksummer::get_csum_type_string(csum)
14251 << " csum_order " << csum_order
14252 << " csum_length 0x" << std::hex << csum_length << std::dec
14253 << dendl;
14254 dblob.init_csum(csum, csum_order, csum_length);
14255 }
7c673cae
FG
14256 }
14257
a8e16298 14258 PExtentVector extents;
3efd9988 14259 int64_t left = final_length;
522d829b
TL
14260 bool has_chunk2defer = false;
14261 auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
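// carve this blob's physical extents out of the shared preallocation,
// splitting the current prealloc entry when it is larger than what this
// blob still needs.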
3efd9988 14262 while (left > 0) {
11fdf7f2 14263 ceph_assert(prealloc_left > 0);
522d829b 14264 has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
3efd9988
FG
14265 if (prealloc_pos->length <= left) {
14266 prealloc_left -= prealloc_pos->length;
14267 left -= prealloc_pos->length;
14268 txc->statfs_delta.allocated() += prealloc_pos->length;
14269 extents.push_back(*prealloc_pos);
14270 ++prealloc_pos;
522d829b
TL
14271 if (prealloc_pos != prealloc.end()) {
14272 prealloc_pos_length = prealloc_pos->length;
14273 }
3efd9988
FG
14274 } else {
14275 extents.emplace_back(prealloc_pos->offset, left);
14276 prealloc_pos->offset += left;
14277 prealloc_pos->length -= left;
14278 prealloc_left -= left;
14279 txc->statfs_delta.allocated() += left;
14280 left = 0;
14281 break;
14282 }
14283 }
7c673cae 14284 for (auto& p : extents) {
3efd9988 14285 txc->allocated.insert(p.offset, p.length);
7c673cae 14286 }
11fdf7f2 14287 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 14288
522d829b 14289 dout(20) << __func__ << " blob " << *wi.b << dendl;
181888fb 14290 if (dblob.has_csum()) {
7c673cae
FG
14291 dblob.calc_csum(b_off, *l);
14292 }
181888fb 14293
7c673cae 14294 if (wi.mark_unused) {
1911f103 14295 ceph_assert(!dblob.is_compressed());
7c673cae
FG
14296 auto b_end = b_off + wi.bl.length();
14297 if (b_off) {
14298 dblob.add_unused(0, b_off);
14299 }
1911f103
TL
14300 uint64_t llen = dblob.get_logical_length();
14301 if (b_end < llen) {
14302 dblob.add_unused(b_end, llen - b_end);
7c673cae
FG
14303 }
14304 }
14305
14306 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14307 b_off + (wi.b_off0 - wi.b_off),
14308 wi.length0,
14309 wi.b,
14310 nullptr);
14311 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14312 txc->statfs_delta.stored() += le->length;
14313 dout(20) << __func__ << " lex " << *le << dendl;
14314 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14315 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14316
14317 // queue io
11fdf7f2 14318 if (!g_conf()->bluestore_debug_omit_block_device_write) {
522d829b 14319 if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
f67539c2 14320 dout(20) << __func__ << " deferring 0x" << std::hex
7c673cae 14321 << l->length() << std::dec << " write via deferred" << dendl;
522d829b 14322 bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
7c673cae 14323 op->op = bluestore_deferred_op_t::OP_WRITE;
522d829b 14324 int r = wi.b->get_blob().map(
7c673cae
FG
14325 b_off, l->length(),
14326 [&](uint64_t offset, uint64_t length) {
14327 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14328 return 0;
14329 });
11fdf7f2 14330 ceph_assert(r == 0);
7c673cae
FG
14331 op->data = *l;
14332 } else {
522d829b 14333 wi.b->get_blob().map_bl(
7c673cae
FG
14334 b_off, *l,
14335 [&](uint64_t offset, bufferlist& t) {
14336 bdev->aio_write(offset, t, &txc->ioc, false);
14337 });
f67539c2 14338 logger->inc(l_bluestore_write_new);
7c673cae
FG
14339 }
14340 }
14341 }
11fdf7f2
TL
14342 ceph_assert(prealloc_pos == prealloc.end());
14343 ceph_assert(prealloc_left == 0);
7c673cae
FG
14344 return 0;
14345}
14346
14347void BlueStore::_wctx_finish(
14348 TransContext *txc,
14349 CollectionRef& c,
14350 OnodeRef o,
31f18b77
FG
14351 WriteContext *wctx,
14352 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
14353{
14354 auto oep = wctx->old_extents.begin();
14355 while (oep != wctx->old_extents.end()) {
14356 auto &lo = *oep;
14357 oep = wctx->old_extents.erase(oep);
14358 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14359 BlobRef b = lo.e.blob;
14360 const bluestore_blob_t& blob = b->get_blob();
14361 if (blob.is_compressed()) {
14362 if (lo.blob_empty) {
14363 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14364 }
14365 txc->statfs_delta.compressed_original() -= lo.e.length;
14366 }
14367 auto& r = lo.r;
14368 txc->statfs_delta.stored() -= lo.e.length;
14369 if (!r.empty()) {
f67539c2 14370 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
7c673cae
FG
14371 if (blob.is_shared()) {
14372 PExtentVector final;
14373 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
14374 bool unshare = false;
14375 bool* unshare_ptr =
14376 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 14377 for (auto e : r) {
31f18b77
FG
14378 b->shared_blob->put_ref(
14379 e.offset, e.length, &final,
11fdf7f2
TL
14380 unshare_ptr);
14381 }
14382 if (unshare) {
14383 ceph_assert(maybe_unshared_blobs);
14384 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
14385 }
14386 dout(20) << __func__ << " shared_blob release " << final
14387 << " from " << *b->shared_blob << dendl;
14388 txc->write_shared_blob(b->shared_blob);
14389 r.clear();
14390 r.swap(final);
14391 }
14392 }
14393 // we can't invalidate our logical extents as we drop them because
14394 // other lextents (either in our onode or others) may still
14395 // reference them. but we can throw out anything that is no
14396 // longer allocated. Note that this will leave behind edge bits
14397 // that are no longer referenced but not deallocated (until they
14398 // age out of the cache naturally).
14399 b->discard_unallocated(c.get());
14400 for (auto e : r) {
14401 dout(20) << __func__ << " release " << e << dendl;
14402 txc->released.insert(e.offset, e.length);
14403 txc->statfs_delta.allocated() -= e.length;
14404 if (blob.is_compressed()) {
14405 txc->statfs_delta.compressed_allocated() -= e.length;
14406 }
14407 }
9f95a23c
TL
14408
14409 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
7c673cae
FG
14410 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14411 << dendl;
14412 o->extent_map.spanning_blob_map.erase(b->id);
14413 }
9f95a23c 14414 delete &lo;
7c673cae
FG
14415 }
14416}
14417
14418void BlueStore::_do_write_data(
14419 TransContext *txc,
14420 CollectionRef& c,
14421 OnodeRef o,
14422 uint64_t offset,
14423 uint64_t length,
14424 bufferlist& bl,
14425 WriteContext *wctx)
14426{
14427 uint64_t end = offset + length;
14428 bufferlist::iterator p = bl.begin();
14429
14430 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14431 (length != min_alloc_size)) {
14432 // we fall within the same min_alloc_size unit
14433 _do_write_small(txc, c, o, offset, length, p, wctx);
14434 } else {
14435 uint64_t head_offset, head_length;
14436 uint64_t middle_offset, middle_length;
14437 uint64_t tail_offset, tail_length;
14438
14439 head_offset = offset;
11fdf7f2 14440 head_length = p2nphase(offset, min_alloc_size);
7c673cae 14441
11fdf7f2
TL
14442 tail_offset = p2align(end, min_alloc_size);
14443 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
14444
14445 middle_offset = head_offset + head_length;
14446 middle_length = length - head_length - tail_length;
14447
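// e.g. (hypothetical numbers) with min_alloc_size = 0x10000, a write
// 0x1c000~0x2a000 splits into head 0x1c000~0x4000 (small-write path),
// middle 0x20000~0x20000 (big-write path) and tail 0x40000~0x6000
// (small-write path again); head and tail are the unaligned fringes.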
14448 if (head_length) {
14449 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14450 }
14451
f67539c2 14452 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
7c673cae
FG
14453
14454 if (tail_length) {
14455 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14456 }
14457 }
14458}
14459
31f18b77
FG
14460void BlueStore::_choose_write_options(
14461 CollectionRef& c,
14462 OnodeRef o,
14463 uint32_t fadvise_flags,
14464 WriteContext *wctx)
7c673cae 14465{
7c673cae
FG
14466 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14467 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 14468 wctx->buffered = true;
7c673cae
FG
14469 } else if (cct->_conf->bluestore_default_buffered_write &&
14470 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14471 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14472 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 14473 wctx->buffered = true;
7c673cae
FG
14474 }
14475
31f18b77
FG
14476 // apply basic csum block size
14477 wctx->csum_order = block_size_order;
7c673cae
FG
14478
14479 // compression parameters
14480 unsigned alloc_hints = o->onode.alloc_hint_flags;
14481 auto cm = select_option(
14482 "compression_mode",
31f18b77 14483 comp_mode.load(),
7c673cae
FG
14484 [&]() {
14485 string val;
11fdf7f2 14486 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
14487 return boost::optional<Compressor::CompressionMode>(
14488 Compressor::get_comp_mode_type(val));
7c673cae
FG
14489 }
14490 return boost::optional<Compressor::CompressionMode>();
14491 }
14492 );
31f18b77
FG
14493
14494 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
14495 ((cm == Compressor::COMP_FORCE) ||
14496 (cm == Compressor::COMP_AGGRESSIVE &&
14497 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14498 (cm == Compressor::COMP_PASSIVE &&
14499 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
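// i.e. 'force' compresses everything, 'aggressive' skips only data the
// client hinted INCOMPRESSIBLE, and 'passive' compresses only data the
// client hinted COMPRESSIBLE.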
14500
14501 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14502 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
14503 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14504 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 14505 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 14506
7c673cae 14507 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 14508
7c673cae 14509 if (o->onode.expected_write_size) {
224ce89b 14510 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 14511 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 14512 } else {
224ce89b 14513 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
14514 }
14515
31f18b77
FG
14516 if (wctx->compress) {
14517 wctx->target_blob_size = select_option(
7c673cae 14518 "compression_max_blob_size",
31f18b77 14519 comp_max_blob_size.load(),
7c673cae 14520 [&]() {
11fdf7f2
TL
14521 int64_t val;
14522 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
14523 return boost::optional<uint64_t>((uint64_t)val);
14524 }
14525 return boost::optional<uint64_t>();
14526 }
14527 );
14528 }
14529 } else {
31f18b77
FG
14530 if (wctx->compress) {
14531 wctx->target_blob_size = select_option(
7c673cae 14532 "compression_min_blob_size",
31f18b77 14533 comp_min_blob_size.load(),
7c673cae 14534 [&]() {
11fdf7f2
TL
14535 int64_t val;
14536 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
14537 return boost::optional<uint64_t>((uint64_t)val);
14538 }
14539 return boost::optional<uint64_t>();
14540 }
14541 );
14542 }
14543 }
31f18b77 14544
7c673cae 14545 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
14546 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14547 wctx->target_blob_size = max_bsize;
7c673cae 14548 }
31f18b77 14549
7c673cae
FG
14550 // set the min blob size floor at 2x the min_alloc_size, or else we
14551 // won't be able to allocate a smaller extent for the compressed
14552 // data.
31f18b77
FG
14553 if (wctx->compress &&
14554 wctx->target_blob_size < min_alloc_size * 2) {
14555 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 14556 }
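// e.g. with min_alloc_size = 0x1000 a 0x1000-byte blob could never compress
// into fewer allocation units, so 0x2000 is the smallest target where a
// one-AU compressed result is a win.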
31f18b77
FG
14557
14558 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14559 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
14560 << " compress=" << (int)wctx->compress
14561 << " buffered=" << (int)wctx->buffered
31f18b77
FG
14562 << std::dec << dendl;
14563}
14564
14565int BlueStore::_do_gc(
14566 TransContext *txc,
14567 CollectionRef& c,
14568 OnodeRef o,
31f18b77
FG
14569 const WriteContext& wctx,
14570 uint64_t *dirty_start,
14571 uint64_t *dirty_end)
14572{
31f18b77 14573
1adf2230 14574 bool dirty_range_updated = false;
31f18b77 14575 WriteContext wctx_gc;
7c673cae 14576 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 14577
eafe8130 14578 auto & extents_to_collect = wctx.extents_to_gc;
31f18b77
FG
14579 for (auto it = extents_to_collect.begin();
14580 it != extents_to_collect.end();
14581 ++it) {
14582 bufferlist bl;
eafe8130
TL
14583 auto offset = (*it).first;
14584 auto length = (*it).second;
14585 dout(20) << __func__ << " processing " << std::hex
14586 << offset << "~" << length << std::dec
14587 << dendl;
14588 int r = _do_read(c.get(), o, offset, length, bl, 0);
14589 ceph_assert(r == (int)length);
31f18b77 14590
eafe8130
TL
14591 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14592 logger->inc(l_bluestore_gc_merged, length);
31f18b77 14593
eafe8130
TL
14594 if (*dirty_start > offset) {
14595 *dirty_start = offset;
1adf2230 14596 dirty_range_updated = true;
31f18b77
FG
14597 }
14598
eafe8130
TL
14599 if (*dirty_end < offset + length) {
14600 *dirty_end = offset + length;
1adf2230 14601 dirty_range_updated = true;
31f18b77
FG
14602 }
14603 }
1adf2230
AA
14604 if (dirty_range_updated) {
14605 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14606 }
31f18b77
FG
14607
14608 dout(30) << __func__ << " alloc write" << dendl;
14609 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14610 if (r < 0) {
14611 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14612 << dendl;
14613 return r;
14614 }
14615
14616 _wctx_finish(txc, c, o, &wctx_gc);
14617 return 0;
14618}
14619
14620int BlueStore::_do_write(
14621 TransContext *txc,
14622 CollectionRef& c,
14623 OnodeRef o,
14624 uint64_t offset,
14625 uint64_t length,
14626 bufferlist& bl,
14627 uint32_t fadvise_flags)
14628{
14629 int r = 0;
14630
14631 dout(20) << __func__
14632 << " " << o->oid
14633 << " 0x" << std::hex << offset << "~" << length
14634 << " - have 0x" << o->onode.size
14635 << " (" << std::dec << o->onode.size << ")"
f67539c2
TL
14636 << " bytes" << std::hex
14637 << " fadvise_flags 0x" << fadvise_flags
14638 << " alloc_hint 0x" << o->onode.alloc_hint_flags
14639 << " expected_object_size " << o->onode.expected_object_size
14640 << " expected_write_size " << o->onode.expected_write_size
14641 << std::dec
31f18b77 14642 << dendl;
81eedcae 14643 _dump_onode<30>(cct, *o);
31f18b77
FG
14644
14645 if (length == 0) {
14646 return 0;
14647 }
14648
14649 uint64_t end = offset + length;
14650
14651 GarbageCollector gc(c->store->cct);
eafe8130 14652 int64_t benefit = 0;
31f18b77
FG
14653 auto dirty_start = offset;
14654 auto dirty_end = end;
14655
14656 WriteContext wctx;
14657 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
14658 o->extent_map.fault_range(db, offset, length);
14659 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
14660 r = _do_alloc_write(txc, c, o, &wctx);
14661 if (r < 0) {
14662 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14663 << dendl;
14664 goto out;
14665 }
14666
eafe8130
TL
14667 if (wctx.extents_to_gc.empty() ||
14668 wctx.extents_to_gc.range_start() > offset ||
14669 wctx.extents_to_gc.range_end() < offset + length) {
14670 benefit = gc.estimate(offset,
14671 length,
14672 o->extent_map,
14673 wctx.old_extents,
14674 min_alloc_size);
14675 }
14676
f67539c2
TL
14677 if (bdev->is_smr()) {
14678 if (wctx.old_extents.empty()) {
14679 txc->zoned_note_new_object(o);
14680 } else {
14681 int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14682 txc->zoned_note_updated_object(o, old_ondisk_offset);
14683 }
14684 }
14685
31f18b77
FG
14686 // NB: _wctx_finish() will empty old_extents
14687 // so we must do gc estimation before that
7c673cae
FG
14688 _wctx_finish(txc, c, o, &wctx);
14689 if (end > o->onode.size) {
14690 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 14691 << std::dec << dendl;
7c673cae
FG
14692 o->onode.size = end;
14693 }
14694
11fdf7f2 14695 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
eafe8130
TL
14696 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14697 dout(20) << __func__
14698 << " perform garbage collection for compressed extents, "
14699 << "expected benefit = " << benefit << " AUs" << dendl;
14700 }
14701 if (!wctx.extents_to_gc.empty()) {
14702 dout(20) << __func__ << " perform garbage collection" << dendl;
14703
14704 r = _do_gc(txc, c, o,
14705 wctx,
14706 &dirty_start, &dirty_end);
14707 if (r < 0) {
14708 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14709 << dendl;
14710 goto out;
7c673cae 14711 }
eafe8130
TL
14712 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14713 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 14714 }
7c673cae 14715 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
14716 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14717
7c673cae
FG
14718 r = 0;
14719
14720 out:
14721 return r;
14722}
14723
14724int BlueStore::_write(TransContext *txc,
14725 CollectionRef& c,
14726 OnodeRef& o,
31f18b77
FG
14727 uint64_t offset, size_t length,
14728 bufferlist& bl,
14729 uint32_t fadvise_flags)
7c673cae
FG
14730{
14731 dout(15) << __func__ << " " << c->cid << " " << o->oid
14732 << " 0x" << std::hex << offset << "~" << length << std::dec
14733 << dendl;
35e4c445
FG
14734 int r = 0;
14735 if (offset + length >= OBJECT_MAX_SIZE) {
14736 r = -E2BIG;
14737 } else {
14738 _assign_nid(txc, o);
14739 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14740 txc->write_onode(o);
14741 }
7c673cae
FG
14742 dout(10) << __func__ << " " << c->cid << " " << o->oid
14743 << " 0x" << std::hex << offset << "~" << length << std::dec
14744 << " = " << r << dendl;
14745 return r;
14746}
14747
14748int BlueStore::_zero(TransContext *txc,
14749 CollectionRef& c,
14750 OnodeRef& o,
14751 uint64_t offset, size_t length)
14752{
14753 dout(15) << __func__ << " " << c->cid << " " << o->oid
14754 << " 0x" << std::hex << offset << "~" << length << std::dec
14755 << dendl;
35e4c445
FG
14756 int r = 0;
14757 if (offset + length >= OBJECT_MAX_SIZE) {
14758 r = -E2BIG;
14759 } else {
14760 _assign_nid(txc, o);
14761 r = _do_zero(txc, c, o, offset, length);
14762 }
7c673cae
FG
14763 dout(10) << __func__ << " " << c->cid << " " << o->oid
14764 << " 0x" << std::hex << offset << "~" << length << std::dec
14765 << " = " << r << dendl;
14766 return r;
14767}
14768
14769int BlueStore::_do_zero(TransContext *txc,
14770 CollectionRef& c,
14771 OnodeRef& o,
14772 uint64_t offset, size_t length)
14773{
14774 dout(15) << __func__ << " " << c->cid << " " << o->oid
14775 << " 0x" << std::hex << offset << "~" << length << std::dec
14776 << dendl;
14777 int r = 0;
14778
81eedcae 14779 _dump_onode<30>(cct, *o);
7c673cae
FG
14780
14781 WriteContext wctx;
14782 o->extent_map.fault_range(db, offset, length);
14783 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14784 o->extent_map.dirty_range(offset, length);
7c673cae
FG
14785 _wctx_finish(txc, c, o, &wctx);
14786
b32b8144 14787 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
14788 o->onode.size = offset + length;
14789 dout(20) << __func__ << " extending size to " << offset + length
14790 << dendl;
14791 }
14792 txc->write_onode(o);
14793
14794 dout(10) << __func__ << " " << c->cid << " " << o->oid
14795 << " 0x" << std::hex << offset << "~" << length << std::dec
14796 << " = " << r << dendl;
14797 return r;
14798}
14799
14800void BlueStore::_do_truncate(
31f18b77
FG
14801 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14802 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
14803{
14804 dout(15) << __func__ << " " << c->cid << " " << o->oid
14805 << " 0x" << std::hex << offset << std::dec << dendl;
14806
81eedcae 14807 _dump_onode<30>(cct, *o);
7c673cae
FG
14808
14809 if (offset == o->onode.size)
31f18b77 14810 return;
7c673cae 14811
f67539c2 14812 WriteContext wctx;
7c673cae 14813 if (offset < o->onode.size) {
7c673cae
FG
14814 uint64_t length = o->onode.size - offset;
14815 o->extent_map.fault_range(db, offset, length);
14816 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
14817 o->extent_map.dirty_range(offset, length);
14818 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
14819
14820 // if we have shards past EOF, ask for a reshard
14821 if (!o->onode.extent_map_shards.empty() &&
14822 o->onode.extent_map_shards.back().offset >= offset) {
14823 dout(10) << __func__ << " request reshard past EOF" << dendl;
14824 if (offset) {
14825 o->extent_map.request_reshard(offset - 1, offset + length);
14826 } else {
14827 o->extent_map.request_reshard(0, length);
14828 }
14829 }
14830 }
14831
14832 o->onode.size = offset;
14833
f67539c2
TL
14834 if (bdev->is_smr()) {
14835 // On zoned devices, we currently support only removing an object or
14836 // truncating it to zero size, both of which fall through this code path.
14837 ceph_assert(offset == 0 && !wctx.old_extents.empty());
14838 int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14839 txc->zoned_note_truncated_object(o, ondisk_offset);
14840 }
14841
7c673cae
FG
14842 txc->write_onode(o);
14843}
14844
35e4c445 14845int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
14846 CollectionRef& c,
14847 OnodeRef& o,
14848 uint64_t offset)
14849{
14850 dout(15) << __func__ << " " << c->cid << " " << o->oid
14851 << " 0x" << std::hex << offset << std::dec
14852 << dendl;
35e4c445
FG
14853 int r = 0;
14854 if (offset >= OBJECT_MAX_SIZE) {
14855 r = -E2BIG;
14856 } else {
14857 _do_truncate(txc, c, o, offset);
14858 }
14859 dout(10) << __func__ << " " << c->cid << " " << o->oid
14860 << " 0x" << std::hex << offset << std::dec
14861 << " = " << r << dendl;
14862 return r;
7c673cae
FG
14863}
14864
14865int BlueStore::_do_remove(
14866 TransContext *txc,
14867 CollectionRef& c,
14868 OnodeRef o)
14869{
31f18b77 14870 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
14871 bool is_gen = !o->oid.is_no_gen();
14872 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
14873 if (o->onode.has_omap()) {
14874 o->flush();
9f95a23c 14875 _do_omap_clear(txc, o);
7c673cae
FG
14876 }
14877 o->exists = false;
14878 string key;
14879 for (auto &s : o->extent_map.shards) {
14880 dout(20) << __func__ << " removing shard 0x" << std::hex
14881 << s.shard_info->offset << std::dec << dendl;
14882 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14883 [&](const string& final_key) {
14884 txc->t->rmkey(PREFIX_OBJ, final_key);
14885 }
14886 );
14887 }
14888 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14889 txc->note_removed_object(o);
7c673cae
FG
14890 o->extent_map.clear();
14891 o->onode = bluestore_onode_t();
14892 _debug_obj_on_delete(o->oid);
31f18b77 14893
224ce89b
WB
14894 if (!is_gen || maybe_unshared_blobs.empty()) {
14895 return 0;
14896 }
31f18b77 14897
224ce89b
WB
14898 // see if we can unshare blobs still referenced by the head
14899 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14900 << maybe_unshared_blobs << dendl;
14901 ghobject_t nogen = o->oid;
14902 nogen.generation = ghobject_t::NO_GEN;
f67539c2 14903 OnodeRef h = c->get_onode(nogen, false);
224ce89b
WB
14904
14905 if (!h || !h->exists) {
14906 return 0;
14907 }
14908
14909 dout(20) << __func__ << " checking for unshareable blobs on " << h
14910 << " " << h->oid << dendl;
14911 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
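// build the ref map each candidate shared blob would have if the head
// object were its only remaining user; a blob whose persistent ref_map
// matches this exactly can be unshared below.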
14912 for (auto& e : h->extent_map.extent_map) {
14913 const bluestore_blob_t& b = e.blob->get_blob();
14914 SharedBlob *sb = e.blob->shared_blob.get();
14915 if (b.is_shared() &&
14916 sb->loaded &&
14917 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
14918 if (b.is_compressed()) {
14919 expect[sb].get(0, b.get_ondisk_length());
14920 } else {
14921 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14922 expect[sb].get(off, len);
14923 return 0;
14924 });
14925 }
224ce89b
WB
14926 }
14927 }
31f18b77 14928
224ce89b
WB
14929 vector<SharedBlob*> unshared_blobs;
14930 unshared_blobs.reserve(maybe_unshared_blobs.size());
14931 for (auto& p : expect) {
14932 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14933 if (p.first->persistent->ref_map == p.second) {
14934 SharedBlob *sb = p.first;
14935 dout(20) << __func__ << " unsharing " << *sb << dendl;
14936 unshared_blobs.push_back(sb);
14937 txc->unshare_blob(sb);
14938 uint64_t sbid = c->make_blob_unshared(sb);
14939 string key;
14940 get_shared_blob_key(sbid, &key);
14941 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14942 }
14943 }
14944
14945 if (unshared_blobs.empty()) {
14946 return 0;
14947 }
14948
224ce89b
WB
14949 for (auto& e : h->extent_map.extent_map) {
14950 const bluestore_blob_t& b = e.blob->get_blob();
14951 SharedBlob *sb = e.blob->shared_blob.get();
14952 if (b.is_shared() &&
14953 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14954 sb) != unshared_blobs.end()) {
14955 dout(20) << __func__ << " unsharing " << e << dendl;
14956 bluestore_blob_t& blob = e.blob->dirty_blob();
14957 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14958 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
14959 }
14960 }
224ce89b
WB
14961 txc->write_onode(h);
14962
7c673cae
FG
14963 return 0;
14964}
14965
14966int BlueStore::_remove(TransContext *txc,
14967 CollectionRef& c,
14968 OnodeRef &o)
14969{
11fdf7f2
TL
14970 dout(15) << __func__ << " " << c->cid << " " << o->oid
14971 << " onode " << o.get()
14972 << " txc "<< txc << dendl;
adb31ebb
TL
14973
14974 auto start_time = mono_clock::now();
7c673cae 14975 int r = _do_remove(txc, c, o);
adb31ebb
TL
14976 log_latency_fn(
14977 __func__,
14978 l_bluestore_remove_lat,
14979 mono_clock::now() - start_time,
14980 cct->_conf->bluestore_log_op_age,
14981 [&](const ceph::timespan& lat) {
14982 ostringstream ostr;
14983 ostr << ", lat = " << timespan_str(lat)
14984 << " cid =" << c->cid
14985 << " oid =" << o->oid;
14986 return ostr.str();
14987 }
14988 );
14989
7c673cae
FG
14990 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14991 return r;
14992}
14993
14994int BlueStore::_setattr(TransContext *txc,
14995 CollectionRef& c,
14996 OnodeRef& o,
14997 const string& name,
14998 bufferptr& val)
14999{
15000 dout(15) << __func__ << " " << c->cid << " " << o->oid
15001 << " " << name << " (" << val.length() << " bytes)"
15002 << dendl;
15003 int r = 0;
3efd9988
FG
15004 if (val.is_partial()) {
15005 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
15006 val.length());
f91f0fd5 15007 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
15008 } else {
15009 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 15010 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 15011 }
7c673cae
FG
15012 txc->write_onode(o);
15013 dout(10) << __func__ << " " << c->cid << " " << o->oid
15014 << " " << name << " (" << val.length() << " bytes)"
15015 << " = " << r << dendl;
15016 return r;
15017}
15018
15019int BlueStore::_setattrs(TransContext *txc,
15020 CollectionRef& c,
15021 OnodeRef& o,
15022 const map<string,bufferptr>& aset)
15023{
15024 dout(15) << __func__ << " " << c->cid << " " << o->oid
15025 << " " << aset.size() << " keys"
15026 << dendl;
15027 int r = 0;
15028 for (map<string,bufferptr>::const_iterator p = aset.begin();
15029 p != aset.end(); ++p) {
3efd9988
FG
15030 if (p->second.is_partial()) {
15031 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 15032 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 15033 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
15034 } else {
15035 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 15036 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 15037 }
7c673cae
FG
15038 }
15039 txc->write_onode(o);
15040 dout(10) << __func__ << " " << c->cid << " " << o->oid
15041 << " " << aset.size() << " keys"
15042 << " = " << r << dendl;
15043 return r;
15044}
15045
15046
15047int BlueStore::_rmattr(TransContext *txc,
15048 CollectionRef& c,
15049 OnodeRef& o,
15050 const string& name)
15051{
15052 dout(15) << __func__ << " " << c->cid << " " << o->oid
15053 << " " << name << dendl;
15054 int r = 0;
15055 auto it = o->onode.attrs.find(name.c_str());
15056 if (it == o->onode.attrs.end())
15057 goto out;
15058
15059 o->onode.attrs.erase(it);
15060 txc->write_onode(o);
15061
15062 out:
15063 dout(10) << __func__ << " " << c->cid << " " << o->oid
15064 << " " << name << " = " << r << dendl;
15065 return r;
15066}
15067
15068int BlueStore::_rmattrs(TransContext *txc,
15069 CollectionRef& c,
15070 OnodeRef& o)
15071{
15072 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15073 int r = 0;
15074
15075 if (o->onode.attrs.empty())
15076 goto out;
15077
15078 o->onode.attrs.clear();
15079 txc->write_onode(o);
15080
15081 out:
15082 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15083 return r;
15084}
15085
9f95a23c 15086void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 15087{
9f95a23c 15088 const string& omap_prefix = o->get_omap_prefix();
7c673cae 15089 string prefix, tail;
9f95a23c
TL
15090 o->get_omap_header(&prefix);
15091 o->get_omap_tail(&tail);
11fdf7f2 15092 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 15093 txc->t->rmkey(omap_prefix, tail);
11fdf7f2
TL
15094 dout(20) << __func__ << " remove range start: "
15095 << pretty_binary_string(prefix) << " end: "
15096 << pretty_binary_string(tail) << dendl;
7c673cae
FG
15097}
15098
15099int BlueStore::_omap_clear(TransContext *txc,
15100 CollectionRef& c,
15101 OnodeRef& o)
15102{
15103 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15104 int r = 0;
15105 if (o->onode.has_omap()) {
15106 o->flush();
9f95a23c 15107 _do_omap_clear(txc, o);
7c673cae
FG
15108 o->onode.clear_omap_flag();
15109 txc->write_onode(o);
15110 }
15111 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15112 return r;
15113}
15114
15115int BlueStore::_omap_setkeys(TransContext *txc,
15116 CollectionRef& c,
15117 OnodeRef& o,
15118 bufferlist &bl)
15119{
15120 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15121 int r;
11fdf7f2 15122 auto p = bl.cbegin();
7c673cae
FG
15123 __u32 num;
15124 if (!o->onode.has_omap()) {
11fdf7f2 15125 if (o->oid.is_pgmeta()) {
9f95a23c
TL
15126 o->onode.set_omap_flags_pgmeta();
15127 } else {
522d829b 15128 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 15129 }
7c673cae 15130 txc->write_onode(o);
494da23a 15131
9f95a23c 15132 const string& prefix = o->get_omap_prefix();
494da23a
TL
15133 string key_tail;
15134 bufferlist tail;
9f95a23c 15135 o->get_omap_tail(&key_tail);
494da23a 15136 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
15137 } else {
15138 txc->note_modified_object(o);
15139 }
9f95a23c 15140 const string& prefix = o->get_omap_prefix();
7c673cae 15141 string final_key;
9f95a23c
TL
15142 o->get_omap_key(string(), &final_key);
15143 size_t base_key_len = final_key.size();
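// final_key now holds just this object's omap key prefix; each user key
// below is appended after truncating back to base_key_len, reusing the
// buffer instead of rebuilding the prefix per key.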
11fdf7f2 15144 decode(num, p);
7c673cae
FG
15145 while (num--) {
15146 string key;
15147 bufferlist value;
11fdf7f2
TL
15148 decode(key, p);
15149 decode(value, p);
9f95a23c 15150 final_key.resize(base_key_len); // keep prefix
7c673cae 15151 final_key += key;
11fdf7f2 15152 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 15153 << " <- " << key << dendl;
11fdf7f2 15154 txc->t->set(prefix, final_key, value);
7c673cae
FG
15155 }
15156 r = 0;
15157 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15158 return r;
15159}
15160
15161int BlueStore::_omap_setheader(TransContext *txc,
15162 CollectionRef& c,
15163 OnodeRef &o,
15164 bufferlist& bl)
15165{
15166 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15167 int r;
15168 string key;
15169 if (!o->onode.has_omap()) {
11fdf7f2 15170 if (o->oid.is_pgmeta()) {
9f95a23c
TL
15171 o->onode.set_omap_flags_pgmeta();
15172 } else {
522d829b 15173 o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
11fdf7f2 15174 }
7c673cae 15175 txc->write_onode(o);
494da23a 15176
9f95a23c 15177 const string& prefix = o->get_omap_prefix();
494da23a
TL
15178 string key_tail;
15179 bufferlist tail;
9f95a23c 15180 o->get_omap_tail(&key_tail);
494da23a 15181 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
15182 } else {
15183 txc->note_modified_object(o);
15184 }
9f95a23c
TL
15185 const string& prefix = o->get_omap_prefix();
15186 o->get_omap_header(&key);
11fdf7f2 15187 txc->t->set(prefix, key, bl);
7c673cae
FG
15188 r = 0;
15189 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15190 return r;
15191}
15192
15193int BlueStore::_omap_rmkeys(TransContext *txc,
15194 CollectionRef& c,
15195 OnodeRef& o,
15196 bufferlist& bl)
15197{
15198 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15199 int r = 0;
11fdf7f2 15200 auto p = bl.cbegin();
7c673cae
FG
15201 __u32 num;
15202 string final_key;
15203
15204 if (!o->onode.has_omap()) {
15205 goto out;
15206 }
11fdf7f2 15207 {
9f95a23c
TL
15208 const string& prefix = o->get_omap_prefix();
15209 o->get_omap_key(string(), &final_key);
15210 size_t base_key_len = final_key.size();
11fdf7f2
TL
15211 decode(num, p);
15212 while (num--) {
15213 string key;
15214 decode(key, p);
9f95a23c 15215 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
15216 final_key += key;
15217 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
15218 << " <- " << key << dendl;
15219 txc->t->rmkey(prefix, final_key);
15220 }
7c673cae
FG
15221 }
15222 txc->note_modified_object(o);
15223
15224 out:
15225 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15226 return r;
15227}
15228
15229int BlueStore::_omap_rmkey_range(TransContext *txc,
15230 CollectionRef& c,
15231 OnodeRef& o,
15232 const string& first, const string& last)
15233{
15234 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
15235 string key_first, key_last;
15236 int r = 0;
15237 if (!o->onode.has_omap()) {
15238 goto out;
15239 }
11fdf7f2 15240 {
9f95a23c 15241 const string& prefix = o->get_omap_prefix();
11fdf7f2 15242 o->flush();
9f95a23c
TL
15243 o->get_omap_key(first, &key_first);
15244 o->get_omap_key(last, &key_last);
11fdf7f2
TL
15245 txc->t->rm_range_keys(prefix, key_first, key_last);
15246 dout(20) << __func__ << " remove range start: "
15247 << pretty_binary_string(key_first) << " end: "
15248 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
15249 }
15250 txc->note_modified_object(o);
15251
15252 out:
15253 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15254 return r;
15255}
15256
15257int BlueStore::_set_alloc_hint(
15258 TransContext *txc,
15259 CollectionRef& c,
15260 OnodeRef& o,
15261 uint64_t expected_object_size,
15262 uint64_t expected_write_size,
15263 uint32_t flags)
15264{
15265 dout(15) << __func__ << " " << c->cid << " " << o->oid
15266 << " object_size " << expected_object_size
15267 << " write_size " << expected_write_size
15268 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15269 << dendl;
15270 int r = 0;
15271 o->onode.expected_object_size = expected_object_size;
15272 o->onode.expected_write_size = expected_write_size;
15273 o->onode.alloc_hint_flags = flags;
15274 txc->write_onode(o);
15275 dout(10) << __func__ << " " << c->cid << " " << o->oid
15276 << " object_size " << expected_object_size
15277 << " write_size " << expected_write_size
15278 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15279 << " = " << r << dendl;
15280 return r;
15281}
15282
15283int BlueStore::_clone(TransContext *txc,
15284 CollectionRef& c,
15285 OnodeRef& oldo,
15286 OnodeRef& newo)
15287{
15288 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15289 << newo->oid << dendl;
15290 int r = 0;
15291 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15292 derr << __func__ << " mismatched hash on " << oldo->oid
15293 << " and " << newo->oid << dendl;
15294 return -EINVAL;
15295 }
15296
7c673cae
FG
15297 _assign_nid(txc, newo);
15298
15299 // clone data
15300 oldo->flush();
15301 _do_truncate(txc, c, newo, 0);
15302 if (cct->_conf->bluestore_clone_cow) {
15303 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15304 } else {
15305 bufferlist bl;
15306 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15307 if (r < 0)
15308 goto out;
15309 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15310 if (r < 0)
15311 goto out;
15312 }
15313
15314 // clone attrs
15315 newo->onode.attrs = oldo->onode.attrs;
15316
15317 // clone omap
15318 if (newo->onode.has_omap()) {
15319 dout(20) << __func__ << " clearing old omap data" << dendl;
15320 newo->flush();
9f95a23c 15321 _do_omap_clear(txc, newo);
494da23a 15322 newo->onode.clear_omap_flag();
7c673cae
FG
15323 }
15324 if (oldo->onode.has_omap()) {
15325 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 15326 if (newo->oid.is_pgmeta()) {
9f95a23c
TL
15327 newo->onode.set_omap_flags_pgmeta();
15328 } else {
522d829b 15329 newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
7c673cae 15330 }
9f95a23c 15331 const string& prefix = newo->get_omap_prefix();
11fdf7f2 15332 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 15333 string head, tail;
9f95a23c
TL
15334 oldo->get_omap_header(&head);
15335 oldo->get_omap_tail(&tail);
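// walk the old object's omap rows between its header and tail sentinels,
// rewriting each key's per-object prefix so the copy lands under the new
// onode.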
7c673cae
FG
15336 it->lower_bound(head);
15337 while (it->valid()) {
15338 if (it->key() >= tail) {
15339 dout(30) << __func__ << " reached tail" << dendl;
15340 break;
15341 } else {
15342 dout(30) << __func__ << " got header/data "
15343 << pretty_binary_string(it->key()) << dendl;
15344 string key;
9f95a23c 15345 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 15346 txc->t->set(prefix, key, it->value());
7c673cae
FG
15347 }
15348 it->next();
15349 }
494da23a
TL
15350 string new_tail;
15351 bufferlist new_tail_value;
9f95a23c 15352 newo->get_omap_tail(&new_tail);
494da23a 15353 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
15354 }
15355
15356 txc->write_onode(newo);
15357 r = 0;
15358
15359 out:
15360 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15361 << newo->oid << " = " << r << dendl;
15362 return r;
15363}
15364
15365int BlueStore::_do_clone_range(
15366 TransContext *txc,
15367 CollectionRef& c,
15368 OnodeRef& oldo,
15369 OnodeRef& newo,
224ce89b
WB
15370 uint64_t srcoff,
15371 uint64_t length,
15372 uint64_t dstoff)
7c673cae
FG
15373{
15374 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15375 << newo->oid
15376 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15377 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15378 oldo->extent_map.fault_range(db, srcoff, length);
15379 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
15380 _dump_onode<30>(cct, *oldo);
15381 _dump_onode<30>(cct, *newo);
7c673cae 15382
11fdf7f2 15383 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
81eedcae
TL
15384 _dump_onode<30>(cct, *oldo);
15385 _dump_onode<30>(cct, *newo);
7c673cae
FG
15386 return 0;
15387}
15388
15389int BlueStore::_clone_range(TransContext *txc,
15390 CollectionRef& c,
15391 OnodeRef& oldo,
15392 OnodeRef& newo,
15393 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15394{
15395 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15396 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15397 << " to offset 0x" << dstoff << std::dec << dendl;
15398 int r = 0;
15399
35e4c445
FG
15400 if (srcoff + length >= OBJECT_MAX_SIZE ||
15401 dstoff + length >= OBJECT_MAX_SIZE) {
15402 r = -E2BIG;
15403 goto out;
15404 }
7c673cae
FG
15405 if (srcoff + length > oldo->onode.size) {
15406 r = -EINVAL;
15407 goto out;
15408 }
15409
7c673cae
FG
15410 _assign_nid(txc, newo);
15411
15412 if (length > 0) {
15413 if (cct->_conf->bluestore_clone_cow) {
15414 _do_zero(txc, c, newo, dstoff, length);
15415 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15416 } else {
15417 bufferlist bl;
15418 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15419 if (r < 0)
15420 goto out;
15421 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15422 if (r < 0)
15423 goto out;
15424 }
15425 }
15426
15427 txc->write_onode(newo);
15428 r = 0;
15429
15430 out:
15431 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15432 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15433 << " to offset 0x" << dstoff << std::dec
15434 << " = " << r << dendl;
15435 return r;
15436}
15437
15438int BlueStore::_rename(TransContext *txc,
15439 CollectionRef& c,
15440 OnodeRef& oldo,
15441 OnodeRef& newo,
15442 const ghobject_t& new_oid)
15443{
15444 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15445 << new_oid << dendl;
15446 int r;
15447 ghobject_t old_oid = oldo->oid;
f91f0fd5 15448 mempool::bluestore_cache_meta::string new_okey;
7c673cae
FG
15449
15450 if (newo) {
15451 if (newo->exists) {
15452 r = -EEXIST;
15453 goto out;
15454 }
11fdf7f2 15455 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
15456 }
15457
15458 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15459
15460 // rewrite shards
15461 {
15462 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15463 get_object_key(cct, new_oid, &new_okey);
15464 string key;
15465 for (auto &s : oldo->extent_map.shards) {
15466 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15467 [&](const string& final_key) {
15468 txc->t->rmkey(PREFIX_OBJ, final_key);
15469 }
15470 );
15471 s.dirty = true;
15472 }
15473 }
15474
15475 newo = oldo;
15476 txc->write_onode(newo);
15477
 15478 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
15479 // Onode in the old slot
15480 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15481 r = 0;
15482
f64942e4
AA
15483 // hold a ref to new Onode in old name position, to ensure we don't drop
15484 // it from the cache before this txc commits (or else someone may come along
15485 // and read newo's metadata via the old name).
15486 txc->note_modified_object(oldo);
15487
7c673cae
FG
15488 out:
15489 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15490 << new_oid << " = " << r << dendl;
15491 return r;
15492}
15493
15494// collections
15495
15496int BlueStore::_create_collection(
15497 TransContext *txc,
15498 const coll_t &cid,
15499 unsigned bits,
15500 CollectionRef *c)
15501{
15502 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15503 int r;
15504 bufferlist bl;
15505
15506 {
9f95a23c 15507 std::unique_lock l(coll_lock);
7c673cae
FG
15508 if (*c) {
15509 r = -EEXIST;
15510 goto out;
15511 }
11fdf7f2
TL
15512 auto p = new_coll_map.find(cid);
15513 ceph_assert(p != new_coll_map.end());
15514 *c = p->second;
7c673cae
FG
15515 (*c)->cnode.bits = bits;
15516 coll_map[cid] = *c;
11fdf7f2 15517 new_coll_map.erase(p);
7c673cae 15518 }
11fdf7f2 15519 encode((*c)->cnode, bl);
7c673cae
FG
15520 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15521 r = 0;
15522
15523 out:
15524 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15525 return r;
15526}
15527
15528int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15529 CollectionRef *c)
15530{
15531 dout(15) << __func__ << " " << cid << dendl;
15532 int r;
15533
11fdf7f2 15534 (*c)->flush_all_but_last();
7c673cae 15535 {
9f95a23c 15536 std::unique_lock l(coll_lock);
7c673cae
FG
15537 if (!*c) {
15538 r = -ENOENT;
15539 goto out;
15540 }
15541 size_t nonexistent_count = 0;
11fdf7f2 15542 ceph_assert((*c)->exists);
adb31ebb 15543 if ((*c)->onode_map.map_any([&](Onode* o) {
f67539c2
TL
15544 if (o->exists) {
15545 dout(1) << __func__ << " " << o->oid << " " << o
15546 << " exists in onode_map" << dendl;
7c673cae 15547 return true;
f67539c2
TL
15548 }
15549 ++nonexistent_count;
15550 return false;
15551 })) {
7c673cae
FG
15552 r = -ENOTEMPTY;
15553 goto out;
15554 }
7c673cae
FG
15555 vector<ghobject_t> ls;
15556 ghobject_t next;
 15558 // Enumerate onodes in db, up to nonexistent_count + 1,
 15558 // then check whether all of them are marked as non-existent.
11fdf7f2 15559 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 15560 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 15561 nonexistent_count + 1, false, &ls, &next);
7c673cae 15562 if (r >= 0) {
11fdf7f2
TL
 15563 // If true, the collection has more objects than nonexistent_count,
 15564 // so bypass the check.
15565 bool exists = (!next.is_max());
7c673cae
FG
15566 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15567 dout(10) << __func__ << " oid " << *it << dendl;
15568 auto onode = (*c)->onode_map.lookup(*it);
15569 exists = !onode || onode->exists;
15570 if (exists) {
494da23a 15571 dout(1) << __func__ << " " << *it
f67539c2
TL
15572 << " exists in db, "
15573 << (!onode ? "not present in ram" : "present in ram")
15574 << dendl;
7c673cae
FG
15575 }
15576 }
15577 if (!exists) {
f67539c2 15578 _do_remove_collection(txc, c);
7c673cae
FG
15579 r = 0;
15580 } else {
15581 dout(10) << __func__ << " " << cid
15582 << " is non-empty" << dendl;
f67539c2 15583 r = -ENOTEMPTY;
7c673cae
FG
15584 }
15585 }
15586 }
f67539c2 15587out:
7c673cae
FG
15588 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15589 return r;
15590}
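// Editor's note (illustrative sketch, not BlueStore code): the emptiness test
// in _remove_collection() lists at most nonexistent_count + 1 onodes from the
// DB; the collection is removable only if the listing was exhausted (next
// reached max) and every listed oid is cached and marked !exists (a tombstone
// whose deletion has not yet committed). Simplified:
#include <vector>

struct OnodeStub { bool exists; };

template <typename Lookup>  // Lookup: oid -> const OnodeStub* (nullptr if uncached)
bool collection_is_empty(const std::vector<int>& listed_oids,
                         bool listing_exhausted,
                         Lookup lookup)
{
  if (!listing_exhausted)
    return false;                    // more objects than known tombstones
  for (int oid : listed_oids) {
    const OnodeStub* o = lookup(oid);
    if (!o || o->exists)
      return false;                  // uncached, or genuinely live
  }
  return true;
}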
15591
11fdf7f2
TL
15592void BlueStore::_do_remove_collection(TransContext *txc,
15593 CollectionRef *c)
15594{
15595 coll_map.erase((*c)->cid);
15596 txc->removed_collections.push_back(*c);
15597 (*c)->exists = false;
15598 _osr_register_zombie((*c)->osr.get());
15599 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
15600 c->reset();
15601}
15602
7c673cae
FG
15603int BlueStore::_split_collection(TransContext *txc,
15604 CollectionRef& c,
15605 CollectionRef& d,
15606 unsigned bits, int rem)
15607{
15608 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
15609 << " bits " << bits << dendl;
9f95a23c
TL
15610 std::unique_lock l(c->lock);
15611 std::unique_lock l2(d->lock);
7c673cae
FG
15612 int r;
15613
15614 // flush all previous deferred writes on this sequencer. this is a bit
15615 // heavyweight, but we need to make sure all deferred writes complete
15616 // before we split as the new collection's sequencer may need to order
15617 // this after those writes, and we don't bother with the complexity of
15618 // moving those TransContexts over to the new osr.
15619 _osr_drain_preceding(txc);
15620
15621 // move any cached items (onodes and referenced shared blobs) that will
15622 // belong to the child collection post-split. leave everything else behind.
15623 // this may include things that don't strictly belong to the now-smaller
15624 // parent split, but the OSD will always send us a split for every new
15625 // child.
15626
15627 spg_t pgid, dest_pgid;
15628 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 15629 ceph_assert(is_pg);
7c673cae 15630 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 15631 ceph_assert(is_pg);
7c673cae
FG
15632
15633 // the destination should initially be empty.
11fdf7f2
TL
15634 ceph_assert(d->onode_map.empty());
15635 ceph_assert(d->shared_blob_set.empty());
15636 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
15637
15638 c->split_cache(d.get());
15639
15640 // adjust bits. note that this will be redundant for all but the first
15641 // split call for this parent (first child).
15642 c->cnode.bits = bits;
11fdf7f2 15643 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
15644 r = 0;
15645
15646 bufferlist bl;
11fdf7f2 15647 encode(c->cnode, bl);
7c673cae
FG
15648 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
15649
15650 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
15651 << " bits " << bits << " = " << r << dendl;
15652 return r;
15653}
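// Editor's note (worked example; editor's reading of hobject_t::match): a
// split at `bits` keys objects on the low `bits` bits of the 32-bit object
// hash, which is why both parent and child cnode.bits are set to the same
// value above. E.g. with bits = 3, a child seeded at 5 owns hash 0x1d
// (0b11101), since 0b11101 & 0b111 == 0b101 == 5:
static_assert((0x1du & ((1u << 3) - 1)) == 5u, "hash 0x1d maps to child 5@3");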
15654
11fdf7f2
TL
15655int BlueStore::_merge_collection(
15656 TransContext *txc,
15657 CollectionRef *c,
15658 CollectionRef& d,
15659 unsigned bits)
15660{
15661 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
15662 << " bits " << bits << dendl;
9f95a23c
TL
15663 std::unique_lock l((*c)->lock);
15664 std::unique_lock l2(d->lock);
11fdf7f2
TL
15665 int r;
15666
15667 coll_t cid = (*c)->cid;
15668
15669 // flush all previous deferred writes on the source collection to ensure
15670 // that all deferred writes complete before we merge as the target collection's
15671 // sequencer may need to order new ops after those writes.
15672
15673 _osr_drain((*c)->osr.get());
15674
 15675 // move the cached items (onodes and referenced shared blobs) from the
 15676 // source collection over to the target collection. the source is
 15677 // removed below, so everything it has cached must end up owned by
 15678 // the target's cache shards; with the target's post-merge (coarser)
 15679 // bits, every item from the source matches it.
15680
15681 spg_t pgid, dest_pgid;
15682 bool is_pg = cid.is_pg(&pgid);
15683 ceph_assert(is_pg);
15684 is_pg = d->cid.is_pg(&dest_pgid);
15685 ceph_assert(is_pg);
15686
15687 // adjust bits. note that this will be redundant for all but the first
15688 // merge call for the parent/target.
15689 d->cnode.bits = bits;
15690
 15691 // behavior depends on the target's (d) bits, so do this after they are updated.
15692 (*c)->split_cache(d.get());
15693
15694 // remove source collection
15695 {
9f95a23c 15696 std::unique_lock l3(coll_lock);
11fdf7f2
TL
15697 _do_remove_collection(txc, c);
15698 }
15699
15700 r = 0;
15701
15702 bufferlist bl;
15703 encode(d->cnode, bl);
15704 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15705
15706 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15707 << " bits " << bits << " = " << r << dendl;
15708 return r;
15709}
15710
494da23a
TL
15711void BlueStore::log_latency(
15712 const char* name,
15713 int idx,
15714 const ceph::timespan& l,
15715 double lat_threshold,
15716 const char* info) const
15717{
15718 logger->tinc(idx, l);
15719 if (lat_threshold > 0.0 &&
15720 l >= make_timespan(lat_threshold)) {
15721 dout(0) << __func__ << " slow operation observed for " << name
15722 << ", latency = " << l
15723 << info
15724 << dendl;
15725 }
15726}
15727
11fdf7f2 15728void BlueStore::log_latency_fn(
494da23a 15729 const char* name,
11fdf7f2
TL
15730 int idx,
15731 const ceph::timespan& l,
494da23a
TL
15732 double lat_threshold,
15733 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 15734{
494da23a
TL
15735 logger->tinc(idx, l);
15736 if (lat_threshold > 0.0 &&
15737 l >= make_timespan(lat_threshold)) {
15738 dout(0) << __func__ << " slow operation observed for " << name
15739 << ", latency = " << l
15740 << fn(l)
15741 << dendl;
15742 }
11fdf7f2
TL
15743}
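// Editor's note (illustrative sketch, not BlueStore code): log_latency_fn
// defers building the extra detail string until an op actually crosses the
// threshold; the formatter lambda runs only on the slow path. A generic
// analogue using std::chrono:
#include <chrono>
#include <functional>
#include <iostream>
#include <string>

using timespan_d = std::chrono::duration<double>;

void log_latency_fn_sketch(const char* name, timespan_d l, double threshold_s,
                           std::function<std::string(timespan_d)> fn)
{
  if (threshold_s > 0.0 && l.count() >= threshold_s) {
    // fn() is evaluated only here, so it costs nothing on the fast path
    std::cout << "slow operation observed for " << name
              << ", latency = " << l.count() << "s" << fn(l) << "\n";
  }
}
// usage: log_latency_fn_sketch("commit", l, 5.0,
//          [&](timespan_d) { return std::string(", seq = 42"); });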
15744
9f95a23c
TL
15745#if defined(WITH_LTTNG)
15746void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15747 KeyValueDB &db,
15748 TransContext &txc,
15749 mono_clock::time_point start_throttle_acquire)
15750{
15751 pending_kv_ios += txc.ios;
15752 if (txc.deferred_txn) {
15753 pending_deferred_ios += txc.ios;
15754 }
15755
15756 uint64_t started = 0;
15757 uint64_t completed = 0;
15758 if (should_trace(&started, &completed)) {
15759 txc.tracing = true;
15760 uint64_t rocksdb_base_level,
15761 rocksdb_estimate_pending_compaction_bytes,
15762 rocksdb_cur_size_all_mem_tables,
15763 rocksdb_compaction_pending,
15764 rocksdb_mem_table_flush_pending,
15765 rocksdb_num_running_compactions,
15766 rocksdb_num_running_flushes,
15767 rocksdb_actual_delayed_write_rate;
15768 db.get_property(
15769 "rocksdb.base-level",
15770 &rocksdb_base_level);
15771 db.get_property(
15772 "rocksdb.estimate-pending-compaction-bytes",
15773 &rocksdb_estimate_pending_compaction_bytes);
15774 db.get_property(
15775 "rocksdb.cur-size-all-mem-tables",
15776 &rocksdb_cur_size_all_mem_tables);
15777 db.get_property(
15778 "rocksdb.compaction-pending",
15779 &rocksdb_compaction_pending);
15780 db.get_property(
15781 "rocksdb.mem-table-flush-pending",
15782 &rocksdb_mem_table_flush_pending);
15783 db.get_property(
15784 "rocksdb.num-running-compactions",
15785 &rocksdb_num_running_compactions);
15786 db.get_property(
15787 "rocksdb.num-running-flushes",
15788 &rocksdb_num_running_flushes);
15789 db.get_property(
15790 "rocksdb.actual-delayed-write-rate",
15791 &rocksdb_actual_delayed_write_rate);
15792
15793
15794 tracepoint(
15795 bluestore,
15796 transaction_initial_state,
15797 txc.osr->get_sequencer_id(),
15798 txc.seq,
15799 throttle_bytes.get_current(),
15800 throttle_deferred_bytes.get_current(),
15801 pending_kv_ios,
15802 pending_deferred_ios,
15803 started,
15804 completed,
15805 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15806
15807 tracepoint(
15808 bluestore,
15809 transaction_initial_state_rocksdb,
15810 txc.osr->get_sequencer_id(),
15811 txc.seq,
15812 rocksdb_base_level,
15813 rocksdb_estimate_pending_compaction_bytes,
15814 rocksdb_cur_size_all_mem_tables,
15815 rocksdb_compaction_pending,
15816 rocksdb_mem_table_flush_pending,
15817 rocksdb_num_running_compactions,
15818 rocksdb_num_running_flushes,
15819 rocksdb_actual_delayed_write_rate);
15820 }
15821}
15822#endif
15823
15824mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15825 TransContext &txc, PerfCounters *logger, int state)
15826{
15827 mono_clock::time_point now = mono_clock::now();
15828 mono_clock::duration lat = now - txc.last_stamp;
15829 logger->tinc(state, lat);
15830#if defined(WITH_LTTNG)
15831 if (txc.tracing &&
15832 state >= l_bluestore_state_prepare_lat &&
15833 state <= l_bluestore_state_done_lat) {
15834 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15835 tracepoint(
15836 bluestore,
15837 transaction_state_duration,
15838 txc.osr->get_sequencer_id(),
15839 txc.seq,
15840 state,
15841 ceph::to_seconds<double>(lat));
15842 }
15843#endif
15844 txc.last_stamp = now;
15845 return lat;
15846}
15847
15848bool BlueStore::BlueStoreThrottle::try_start_transaction(
15849 KeyValueDB &db,
15850 TransContext &txc,
15851 mono_clock::time_point start_throttle_acquire)
15852{
15853 throttle_bytes.get(txc.cost);
15854
15855 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15856 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15857 return true;
15858 } else {
15859 return false;
15860 }
15861}
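// Editor's note (illustrative sketch, not BlueStore code): note the asymmetry
// above: the main byte throttle is taken unconditionally (blocking), while
// the deferred-bytes throttle is only *tried*; on failure the caller falls
// back to finish_start_transaction(), which blocks for the deferred budget.
// A minimal analogue with a plain counting budget:
#include <cstdint>

struct BudgetStub {
  int64_t avail;
  void get(int64_t c) { avail -= c; }     // blocking take (simplified here)
  bool get_or_fail(int64_t c) {           // non-blocking take
    if (avail < c) return false;
    avail -= c;
    return true;
  }
};

bool try_start(BudgetStub& bytes, BudgetStub& deferred,
               int64_t cost, bool is_deferred)
{
  bytes.get(cost);                        // always charge the main throttle
  return !is_deferred || deferred.get_or_fail(cost);
}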
15862
15863void BlueStore::BlueStoreThrottle::finish_start_transaction(
15864 KeyValueDB &db,
15865 TransContext &txc,
15866 mono_clock::time_point start_throttle_acquire)
15867{
15868 ceph_assert(txc.deferred_txn);
15869 throttle_deferred_bytes.get(txc.cost);
15870 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15871}
15872
15873#if defined(WITH_LTTNG)
15874void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15875{
15876 pending_kv_ios -= 1;
15877 ios_completed_since_last_traced++;
15878 if (txc.tracing) {
15879 tracepoint(
15880 bluestore,
15881 transaction_commit_latency,
15882 txc.osr->get_sequencer_id(),
15883 txc.seq,
15884 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15885 }
15886}
15887#endif
15888
15889#if defined(WITH_LTTNG)
15890void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15891{
15892 if (txc.deferred_txn) {
15893 pending_deferred_ios -= 1;
15894 }
15895 if (txc.tracing) {
15896 mono_clock::time_point now = mono_clock::now();
15897 mono_clock::duration lat = now - txc.start;
15898 tracepoint(
15899 bluestore,
15900 transaction_total_duration,
15901 txc.osr->get_sequencer_id(),
15902 txc.seq,
15903 ceph::to_seconds<double>(lat));
15904 }
15905}
15906#endif
11fdf7f2 15907
7c673cae
FG
15908// DB key value Histogram
15909#define KEY_SLAB 32
15910#define VALUE_SLAB 64
15911
15912const string prefix_onode = "o";
15913const string prefix_onode_shard = "x";
15914const string prefix_other = "Z";
15915
15916int BlueStore::DBHistogram::get_key_slab(size_t sz)
15917{
15918 return (sz/KEY_SLAB);
15919}
15920
15921string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15922{
15923 int lower_bound = slab * KEY_SLAB;
15924 int upper_bound = (slab + 1) * KEY_SLAB;
15925 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15926 return ret;
15927}
15928
15929int BlueStore::DBHistogram::get_value_slab(size_t sz)
15930{
15931 return (sz/VALUE_SLAB);
15932}
15933
15934string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15935{
15936 int lower_bound = slab * VALUE_SLAB;
15937 int upper_bound = (slab + 1) * VALUE_SLAB;
15938 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15939 return ret;
15940}
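// Editor's note (worked example): the slab math above buckets sizes into
// fixed-width ranges. With KEY_SLAB = 32, a 70-byte key lands in slab
// 70 / 32 == 2, reported as "[64,96)"; with VALUE_SLAB = 64, a 130-byte
// value also lands in slab 2, reported as "[128,192)":
static_assert(70 / 32 == 2, "key slab for a 70-byte key");
static_assert(130 / 64 == 2, "value slab for a 130-byte value");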
15941
15942void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15943 const string &prefix, size_t key_size, size_t value_size)
15944{
15945 uint32_t key_slab = get_key_slab(key_size);
15946 uint32_t value_slab = get_value_slab(value_size);
15947 key_hist[prefix][key_slab].count++;
11fdf7f2
TL
15948 key_hist[prefix][key_slab].max_len =
15949 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
7c673cae
FG
15950 key_hist[prefix][key_slab].val_map[value_slab].count++;
15951 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11fdf7f2
TL
15952 std::max<size_t>(value_size,
15953 key_hist[prefix][key_slab].val_map[value_slab].max_len);
7c673cae
FG
15954}
15955
15956void BlueStore::DBHistogram::dump(Formatter *f)
15957{
15958 f->open_object_section("rocksdb_value_distribution");
15959 for (auto i : value_hist) {
15960 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15961 }
15962 f->close_section();
15963
15964 f->open_object_section("rocksdb_key_value_histogram");
15965 for (auto i : key_hist) {
15966 f->dump_string("prefix", i.first);
15967 f->open_object_section("key_hist");
15968 for ( auto k : i.second) {
15969 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15970 f->dump_unsigned("max_len", k.second.max_len);
15971 f->open_object_section("value_hist");
15972 for ( auto j : k.second.val_map) {
15973 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15974 f->dump_unsigned("max_len", j.second.max_len);
15975 }
15976 f->close_section();
15977 }
15978 f->close_section();
15979 }
15980 f->close_section();
15981}
15982
 15983 // Iterates through the db and collects the stats
15984void BlueStore::generate_db_histogram(Formatter *f)
15985{
15986 //globals
15987 uint64_t num_onodes = 0;
15988 uint64_t num_shards = 0;
15989 uint64_t num_super = 0;
15990 uint64_t num_coll = 0;
15991 uint64_t num_omap = 0;
11fdf7f2 15992 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
15993 uint64_t num_deferred = 0;
15994 uint64_t num_alloc = 0;
15995 uint64_t num_stat = 0;
15996 uint64_t num_others = 0;
15997 uint64_t num_shared_shards = 0;
 15998 size_t max_key_size = 0, max_value_size = 0;
15999 uint64_t total_key_size = 0, total_value_size = 0;
16000 size_t key_size = 0, value_size = 0;
16001 DBHistogram hist;
16002
11fdf7f2 16003 auto start = coarse_mono_clock::now();
7c673cae 16004
11fdf7f2 16005 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
16006 iter->seek_to_first();
16007 while (iter->valid()) {
16008 dout(30) << __func__ << " Key: " << iter->key() << dendl;
16009 key_size = iter->key_size();
16010 value_size = iter->value_size();
16011 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
16012 max_key_size = std::max(max_key_size, key_size);
16013 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
16014 total_key_size += key_size;
16015 total_value_size += value_size;
16016
16017 pair<string,string> key(iter->raw_key());
16018
16019 if (key.first == PREFIX_SUPER) {
16020 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
16021 num_super++;
16022 } else if (key.first == PREFIX_STAT) {
16023 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
16024 num_stat++;
16025 } else if (key.first == PREFIX_COLL) {
16026 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
16027 num_coll++;
16028 } else if (key.first == PREFIX_OBJ) {
16029 if (key.second.back() == ONODE_KEY_SUFFIX) {
16030 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
16031 num_onodes++;
16032 } else {
16033 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
16034 num_shards++;
16035 }
16036 } else if (key.first == PREFIX_OMAP) {
16037 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
16038 num_omap++;
f67539c2
TL
16039 } else if (key.first == PREFIX_PERPOOL_OMAP) {
16040 hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
16041 num_omap++;
16042 } else if (key.first == PREFIX_PERPG_OMAP) {
16043 hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
16044 num_omap++;
11fdf7f2
TL
16045 } else if (key.first == PREFIX_PGMETA_OMAP) {
16046 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
16047 num_pgmeta_omap++;
7c673cae
FG
16048 } else if (key.first == PREFIX_DEFERRED) {
16049 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
16050 num_deferred++;
11fdf7f2 16051 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
16052 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
16053 num_alloc++;
16054 } else if (key.first == PREFIX_SHARED_BLOB) {
16055 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
16056 num_shared_shards++;
16057 } else {
16058 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
16059 num_others++;
16060 }
16061 iter->next();
16062 }
16063
11fdf7f2 16064 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
16065 f->open_object_section("rocksdb_key_value_stats");
16066 f->dump_unsigned("num_onodes", num_onodes);
16067 f->dump_unsigned("num_shards", num_shards);
16068 f->dump_unsigned("num_super", num_super);
16069 f->dump_unsigned("num_coll", num_coll);
16070 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 16071 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
16072 f->dump_unsigned("num_deferred", num_deferred);
16073 f->dump_unsigned("num_alloc", num_alloc);
16074 f->dump_unsigned("num_stat", num_stat);
16075 f->dump_unsigned("num_shared_shards", num_shared_shards);
16076 f->dump_unsigned("num_others", num_others);
16077 f->dump_unsigned("max_key_size", max_key_size);
16078 f->dump_unsigned("max_value_size", max_value_size);
16079 f->dump_unsigned("total_key_size", total_key_size);
16080 f->dump_unsigned("total_value_size", total_value_size);
16081 f->close_section();
16082
16083 hist.dump(f);
16084
16085 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
16086
16087}
16088
f6b5b4d7 16089void BlueStore::_shutdown_cache()
7c673cae
FG
16090{
16091 dout(10) << __func__ << dendl;
9f95a23c
TL
16092 for (auto i : buffer_cache_shards) {
16093 i->flush();
11fdf7f2 16094 ceph_assert(i->empty());
7c673cae
FG
16095 }
16096 for (auto& p : coll_map) {
f6b5b4d7 16097 p.second->onode_map.clear();
3efd9988
FG
16098 if (!p.second->shared_blob_set.empty()) {
16099 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 16100 p.second->shared_blob_set.dump<0>(cct);
3efd9988 16101 }
11fdf7f2
TL
16102 ceph_assert(p.second->onode_map.empty());
16103 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
16104 }
16105 coll_map.clear();
f6b5b4d7
TL
16106 for (auto i : onode_cache_shards) {
16107 ceph_assert(i->empty());
16108 }
7c673cae
FG
16109}
16110
31f18b77
FG
16111// For external caller.
16112// We use a best-effort policy instead, e.g.,
16113// we don't care if there are still some pinned onodes/data in the cache
16114// after this command is completed.
11fdf7f2 16115int BlueStore::flush_cache(ostream *os)
31f18b77
FG
16116{
16117 dout(10) << __func__ << dendl;
9f95a23c
TL
16118 for (auto i : onode_cache_shards) {
16119 i->flush();
16120 }
16121 for (auto i : buffer_cache_shards) {
16122 i->flush();
31f18b77 16123 }
11fdf7f2
TL
16124
16125 return 0;
31f18b77
FG
16126}
16127
7c673cae
FG
16128void BlueStore::_apply_padding(uint64_t head_pad,
16129 uint64_t tail_pad,
7c673cae
FG
16130 bufferlist& padded)
16131{
7c673cae 16132 if (head_pad) {
224ce89b 16133 padded.prepend_zero(head_pad);
7c673cae
FG
16134 }
16135 if (tail_pad) {
16136 padded.append_zero(tail_pad);
16137 }
16138 if (head_pad || tail_pad) {
16139 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
16140 << " tail 0x" << tail_pad << std::dec << dendl;
16141 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
16142 }
16143}
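// Editor's note (illustrative sketch, not the caller's exact computation):
// head_pad/tail_pad extend a write out to chunk boundaries, and are typically
// derived from the write's offset and length like this:
#include <cstdint>

struct Pads { uint64_t head, tail; };

Pads compute_pads(uint64_t offset, uint64_t length, uint64_t chunk_size)
{
  uint64_t head = offset % chunk_size;   // bytes of zero-fill before offset
  uint64_t end = offset + length;
  uint64_t tail = (chunk_size - end % chunk_size) % chunk_size; // after end
  return {head, tail};
}
// e.g. offset 0x1100, length 0x300, chunk 0x1000 -> head 0x100, tail 0xc00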
16144
11fdf7f2
TL
16145void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
16146{
16147 // finalize extent_map shards
16148 o->extent_map.update(txn, false);
16149 if (o->extent_map.needs_reshard()) {
16150 o->extent_map.reshard(db, txn);
16151 o->extent_map.update(txn, true);
16152 if (o->extent_map.needs_reshard()) {
16153 dout(20) << __func__ << " warning: still wants reshard, check options?"
16154 << dendl;
16155 o->extent_map.clear_needs_reshard();
16156 }
16157 logger->inc(l_bluestore_onode_reshard);
16158 }
16159
16160 // bound encode
16161 size_t bound = 0;
16162 denc(o->onode, bound);
16163 o->extent_map.bound_encode_spanning_blobs(bound);
16164 if (o->onode.extent_map_shards.empty()) {
16165 denc(o->extent_map.inline_bl, bound);
16166 }
16167
16168 // encode
16169 bufferlist bl;
16170 unsigned onode_part, blob_part, extent_part;
16171 {
16172 auto p = bl.get_contiguous_appender(bound, true);
16173 denc(o->onode, p);
16174 onode_part = p.get_logical_offset();
16175 o->extent_map.encode_spanning_blobs(p);
16176 blob_part = p.get_logical_offset() - onode_part;
16177 if (o->onode.extent_map_shards.empty()) {
16178 denc(o->extent_map.inline_bl, p);
16179 }
16180 extent_part = p.get_logical_offset() - onode_part - blob_part;
16181 }
16182
16183 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
16184 << " (" << onode_part << " bytes onode + "
16185 << blob_part << " bytes spanning blobs + "
16186 << extent_part << " bytes inline extents)"
16187 << dendl;
16188
16189
16190 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
16191}
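// Editor's note (illustrative sketch, not BlueStore code): the encode above
// is two-pass. Pass 1 (denc with a size_t) computes a worst-case bound; the
// contiguous appender then reserves that much once, and pass 2 writes into
// it, so the encoders never reallocate mid-stream. The same shape with plain
// std:: containers:
#include <string>
#include <vector>

size_t bound_encode(const std::vector<std::string>& parts)
{
  size_t bound = 0;
  for (const auto& s : parts) bound += s.size();  // pass 1: measure only
  return bound;
}

std::vector<char> encode_parts(const std::vector<std::string>& parts)
{
  std::vector<char> buf;
  buf.reserve(bound_encode(parts));   // single allocation, like the appender
  for (const auto& s : parts)
    buf.insert(buf.end(), s.begin(), s.end());    // pass 2: actually write
  return buf;
}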
16192
16193void BlueStore::_log_alerts(osd_alert_list_t& alerts)
16194{
16195 std::lock_guard l(qlock);
16196
522d829b
TL
16197 if (!spurious_read_errors_alert.empty() &&
16198 cct->_conf->bluestore_warn_on_spurious_read_errors) {
f67539c2
TL
16199 alerts.emplace(
16200 "BLUESTORE_SPURIOUS_READ_ERRORS",
16201 spurious_read_errors_alert);
16202 }
81eedcae
TL
16203 if (!disk_size_mismatch_alert.empty()) {
16204 alerts.emplace(
16205 "BLUESTORE_DISK_SIZE_MISMATCH",
16206 disk_size_mismatch_alert);
16207 }
16208 if (!legacy_statfs_alert.empty()) {
16209 alerts.emplace(
16210 "BLUESTORE_LEGACY_STATFS",
16211 legacy_statfs_alert);
16212 }
11fdf7f2
TL
16213 if (!spillover_alert.empty() &&
16214 cct->_conf->bluestore_warn_on_bluefs_spillover) {
16215 alerts.emplace(
16216 "BLUEFS_SPILLOVER",
16217 spillover_alert);
16218 }
f67539c2
TL
16219 if (!no_per_pg_omap_alert.empty()) {
16220 alerts.emplace(
16221 "BLUESTORE_NO_PER_PG_OMAP",
16222 no_per_pg_omap_alert);
16223 }
9f95a23c
TL
16224 if (!no_per_pool_omap_alert.empty()) {
16225 alerts.emplace(
16226 "BLUESTORE_NO_PER_POOL_OMAP",
16227 no_per_pool_omap_alert);
16228 }
11fdf7f2
TL
16229 string s0(failed_cmode);
16230
16231 if (!failed_compressors.empty()) {
16232 if (!s0.empty()) {
16233 s0 += ", ";
16234 }
16235 s0 += "unable to load:";
16236 bool first = true;
16237 for (auto& s : failed_compressors) {
16238 if (first) {
16239 first = false;
16240 } else {
16241 s0 += ", ";
16242 }
16243 s0 += s;
16244 }
16245 alerts.emplace(
16246 "BLUESTORE_NO_COMPRESSION",
16247 s0);
16248 }
16249}
16250
9f95a23c
TL
16251void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
16252 size_t extents)
16253{
16254 alloc_stats_count++;
16255 alloc_stats_fragments += extents;
16256 alloc_stats_size += need;
16257}
16258
16259void BlueStore::_record_allocation_stats()
16260{
16261 // don't care about data consistency,
16262 // fields can be partially modified while making the tuple
16263 auto t0 = std::make_tuple(
16264 alloc_stats_count.exchange(0),
16265 alloc_stats_fragments.exchange(0),
16266 alloc_stats_size.exchange(0));
16267
16268 dout(0) << " allocation stats probe "
16269 << probe_count << ":"
16270 << " cnt: " << std::get<0>(t0)
16271 << " frags: " << std::get<1>(t0)
16272 << " size: " << std::get<2>(t0)
16273 << dendl;
16274
16275
16276 //
16277 // Keep the history for probes from the power-of-two sequence:
16278 // -1, -2, -4, -8, -16
16279 //
16280 size_t base = 1;
16281 for (auto& t : alloc_stats_history) {
16282 dout(0) << " probe -"
16283 << base + (probe_count % base) << ": "
16284 << std::get<0>(t)
16285 << ", " << std::get<1>(t)
16286 << ", " << std::get<2>(t)
16287 << dendl;
16288 base <<= 1;
16289 }
16290 dout(0) << "------------" << dendl;
16291
f67539c2 16292 ++probe_count;
9f95a23c 16293
f67539c2
TL
16294 for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
16295 if ((probe_count % (1 << i)) == 0) {
16296 alloc_stats_history[i] = alloc_stats_history[i - 1];
16297 }
9f95a23c
TL
16298 }
16299 alloc_stats_history[0].swap(t0);
16300}
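// Editor's note (worked example): after ++probe_count, history slot i
// inherits slot i-1 whenever probe_count % (1 << i) == 0, so slot i always
// holds a probe roughly 2^i probes old (the -1, -2, -4, -8, -16 sequence).
// E.g. at probe_count == 8, slots 1..3 shift (8 is divisible by 2, 4 and 8)
// while slot 4 (divisor 16) does not. The cascade in isolation:
#include <array>
#include <cstddef>

template <typename T, std::size_t N>   // assumes N >= 1
void shift_history(std::array<T, N>& hist, unsigned probe_count, T latest)
{
  for (std::size_t i = N - 1; i > 0; --i) {
    if (probe_count % (1u << i) == 0)
      hist[i] = hist[i - 1];
  }
  hist[0] = latest;
}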
16301
7c673cae 16302// ===========================================
11fdf7f2
TL
16303// BlueStoreRepairer
16304
16305size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
16306 const interval_set<uint64_t>& extents)
16307{
16308 ceph_assert(granularity); // initialized
16309 // can't call for the second time
16310 ceph_assert(!was_filtered_out);
16311 ceph_assert(collections_bfs.size() == objects_bfs.size());
16312
16313 uint64_t prev_pos = 0;
16314 uint64_t npos = collections_bfs.size();
16315
16316 bloom_vector collections_reduced;
16317 bloom_vector objects_reduced;
16318
16319 for (auto e : extents) {
16320 if (e.second == 0) {
16321 continue;
16322 }
16323 uint64_t pos = max(e.first / granularity, prev_pos);
16324 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
16325 while (pos != npos && pos < end_pos) {
16326 ceph_assert( collections_bfs[pos].element_count() ==
16327 objects_bfs[pos].element_count());
16328 if (collections_bfs[pos].element_count()) {
16329 collections_reduced.push_back(std::move(collections_bfs[pos]));
16330 objects_reduced.push_back(std::move(objects_bfs[pos]));
16331 }
16332 ++pos;
16333 }
16334 prev_pos = end_pos;
16335 }
16336 collections_reduced.swap(collections_bfs);
16337 objects_reduced.swap(objects_bfs);
16338 was_filtered_out = true;
16339 return collections_bfs.size();
16340}
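// Editor's note (worked example): filter_out maps a byte extent onto bloom
// filter buckets of `granularity` bytes: the first bucket is
// offset / granularity and the one-past-the-end bucket is
// 1 + (offset + length - 1) / granularity. With granularity 0x1000, the
// extent (offset 0x1800, length 0x1000) covers bytes 0x1800..0x27ff, i.e.
// buckets [1, 3):
static_assert(0x1800 / 0x1000 == 1, "first bucket");
static_assert(1 + (0x1800 + 0x1000 - 1) / 0x1000 == 3, "one-past-end bucket");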
16341
16342bool BlueStoreRepairer::remove_key(KeyValueDB *db,
16343 const string& prefix,
16344 const string& key)
16345{
b3b6e05e 16346 std::lock_guard l(lock);
11fdf7f2
TL
16347 if (!remove_key_txn) {
16348 remove_key_txn = db->get_transaction();
16349 }
16350 ++to_repair_cnt;
16351 remove_key_txn->rmkey(prefix, key);
16352
16353 return true;
16354}
16355
f67539c2 16356void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
9f95a23c 16357{
b3b6e05e
TL
16358 std::lock_guard l(lock); // possibly redundant
16359 ceph_assert(fix_per_pool_omap_txn == nullptr);
9f95a23c
TL
16360 fix_per_pool_omap_txn = db->get_transaction();
16361 ++to_repair_cnt;
16362 bufferlist bl;
f67539c2 16363 bl.append(stringify(val));
9f95a23c
TL
16364 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
16365}
16366
11fdf7f2
TL
16367bool BlueStoreRepairer::fix_shared_blob(
16368 KeyValueDB *db,
16369 uint64_t sbid,
16370 const bufferlist* bl)
16371{
b3b6e05e 16372 std::lock_guard l(lock); // possibly redundant
11fdf7f2
TL
16373 KeyValueDB::Transaction txn;
16374 if (fix_misreferences_txn) { // reuse this txn
16375 txn = fix_misreferences_txn;
16376 } else {
16377 if (!fix_shared_blob_txn) {
16378 fix_shared_blob_txn = db->get_transaction();
16379 }
16380 txn = fix_shared_blob_txn;
16381 }
16382 string key;
16383 get_shared_blob_key(sbid, &key);
16384
16385 ++to_repair_cnt;
16386 if (bl) {
16387 txn->set(PREFIX_SHARED_BLOB, key, *bl);
16388 } else {
16389 txn->rmkey(PREFIX_SHARED_BLOB, key);
16390 }
16391 return true;
16392}
16393
16394bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
16395 const string& key,
16396 const store_statfs_t& new_statfs)
16397{
b3b6e05e 16398 std::lock_guard l(lock);
11fdf7f2
TL
16399 if (!fix_statfs_txn) {
16400 fix_statfs_txn = db->get_transaction();
16401 }
16402 BlueStore::volatile_statfs vstatfs;
16403 vstatfs = new_statfs;
16404 bufferlist bl;
16405 vstatfs.encode(bl);
16406 ++to_repair_cnt;
16407 fix_statfs_txn->set(PREFIX_STAT, key, bl);
16408 return true;
16409}
16410
16411bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
16412 FreelistManager* fm,
16413 uint64_t offset, uint64_t len)
16414{
b3b6e05e 16415 std::lock_guard l(lock);
11fdf7f2
TL
16416 if (!fix_fm_leaked_txn) {
16417 fix_fm_leaked_txn = db->get_transaction();
16418 }
16419 ++to_repair_cnt;
16420 fm->release(offset, len, fix_fm_leaked_txn);
16421 return true;
16422}
16423bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
16424 FreelistManager* fm,
16425 uint64_t offset, uint64_t len)
16426{
b3b6e05e 16427 std::lock_guard l(lock);
11fdf7f2
TL
16428 if (!fix_fm_false_free_txn) {
16429 fix_fm_false_free_txn = db->get_transaction();
16430 }
16431 ++to_repair_cnt;
16432 fm->allocate(offset, len, fix_fm_false_free_txn);
16433 return true;
16434}
16435
b3b6e05e
TL
16436bool BlueStoreRepairer::fix_spanning_blobs(
16437 KeyValueDB* db,
16438 std::function<void(KeyValueDB::Transaction)> f)
adb31ebb 16439{
b3b6e05e 16440 std::lock_guard l(lock);
adb31ebb
TL
16441 if (!fix_onode_txn) {
16442 fix_onode_txn = db->get_transaction();
16443 }
b3b6e05e 16444 f(fix_onode_txn);
adb31ebb 16445 ++to_repair_cnt;
b3b6e05e 16446 return true;
adb31ebb
TL
16447}
16448
11fdf7f2
TL
16449bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
16450{
b3b6e05e 16451 // NB: not for use in multithreaded mode!
11fdf7f2
TL
16452 if (misreferenced_extents.size()) {
16453 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
16454 ceph_assert(n > 0);
16455 if (!fix_misreferences_txn) {
16456 fix_misreferences_txn = db->get_transaction();
16457 }
16458 return true;
16459 }
16460 return false;
16461}
16462
16463unsigned BlueStoreRepairer::apply(KeyValueDB* db)
16464{
b3b6e05e 16465 // NB: not for use in multithreaded mode!
9f95a23c
TL
16466 if (fix_per_pool_omap_txn) {
16467 db->submit_transaction_sync(fix_per_pool_omap_txn);
16468 fix_per_pool_omap_txn = nullptr;
16469 }
11fdf7f2
TL
16470 if (fix_fm_leaked_txn) {
16471 db->submit_transaction_sync(fix_fm_leaked_txn);
16472 fix_fm_leaked_txn = nullptr;
16473 }
16474 if (fix_fm_false_free_txn) {
16475 db->submit_transaction_sync(fix_fm_false_free_txn);
16476 fix_fm_false_free_txn = nullptr;
16477 }
16478 if (remove_key_txn) {
16479 db->submit_transaction_sync(remove_key_txn);
16480 remove_key_txn = nullptr;
16481 }
16482 if (fix_misreferences_txn) {
16483 db->submit_transaction_sync(fix_misreferences_txn);
16484 fix_misreferences_txn = nullptr;
16485 }
adb31ebb
TL
16486 if (fix_onode_txn) {
16487 db->submit_transaction_sync(fix_onode_txn);
16488 fix_onode_txn = nullptr;
16489 }
11fdf7f2
TL
16490 if (fix_shared_blob_txn) {
16491 db->submit_transaction_sync(fix_shared_blob_txn);
16492 fix_shared_blob_txn = nullptr;
16493 }
16494
16495 if (fix_statfs_txn) {
16496 db->submit_transaction_sync(fix_statfs_txn);
16497 fix_statfs_txn = nullptr;
16498 }
522d829b
TL
16499 if (need_compact) {
16500 db->compact();
16501 need_compact = false;
16502 }
11fdf7f2
TL
16503 unsigned repaired = to_repair_cnt;
16504 to_repair_cnt = 0;
16505 return repaired;
16506}
16507
16508// =======================================================
9f95a23c
TL
16509// RocksDBBlueFSVolumeSelector
16510
16511uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
16512 ceph_assert(h != nullptr);
16513 uint64_t hint = reinterpret_cast<uint64_t>(h);
16514 uint8_t res;
16515 switch (hint) {
16516 case LEVEL_SLOW:
16517 res = BlueFS::BDEV_SLOW;
16518 if (db_avail4slow > 0) {
16519 // considering statically available db space vs.
16520 // - observed maximums on DB dev for DB/WAL/UNSORTED data
16521 // - observed maximum spillovers
16522 uint64_t max_db_use = 0; // max db usage we potentially observed
f6b5b4d7 16523 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
9f95a23c
TL
16524 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
16525 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
16526 // this could go to db hence using it in the estimation
16527 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
16528
16529 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
16530 uint64_t avail = min(
16531 db_avail4slow,
16532 max_db_use < db_total ? db_total - max_db_use : 0);
16533
16534 // considering current DB dev usage for SLOW data
16535 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
16536 res = BlueFS::BDEV_DB;
16537 }
16538 }
16539 break;
f6b5b4d7 16540 case LEVEL_LOG:
9f95a23c
TL
16541 case LEVEL_WAL:
16542 res = BlueFS::BDEV_WAL;
16543 break;
16544 case LEVEL_DB:
16545 default:
16546 res = BlueFS::BDEV_DB;
16547 break;
16548 }
16549 return res;
16550}
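// Editor's note (worked example, made-up numbers): the LEVEL_SLOW branch
// redirects SLOW data to the fast DB device only while projected headroom
// remains. The headroom estimate in isolation:
#include <algorithm>
#include <cstdint>

uint64_t slow_headroom(uint64_t db_avail4slow, uint64_t db_total,
                       uint64_t max_db_use)
{
  uint64_t left = max_db_use < db_total ? db_total - max_db_use : 0;
  return std::min(db_avail4slow, left);
}
// e.g. (units GiB) slow_headroom(40, 100, 70) == 30; SLOW data keeps going
// to BDEV_DB while its current usage on that device stays below 30.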
16551
16552void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
16553{
a4b75251
TL
16554 auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
16555 res.emplace_back(base, db_size);
16556 auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
16557 if (slow_size == 0) {
16558 slow_size = db_size;
16559 }
16560 res.emplace_back(base + ".slow", slow_size);
9f95a23c
TL
16561}
16562
b3b6e05e 16563void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
16564 uint8_t res = LEVEL_DB;
16565 if (dirname.length() > 5) {
16566 // the "db.slow" and "db.wal" directory names are hard-coded at
16567 // match up with bluestore. the slow device is always the second
16568 // one (when a dedicated block.db device is present and used at
16569 // bdev 0). the wal device is always last.
16570 if (boost::algorithm::ends_with(dirname, ".slow")) {
16571 res = LEVEL_SLOW;
16572 }
16573 else if (boost::algorithm::ends_with(dirname, ".wal")) {
16574 res = LEVEL_WAL;
16575 }
16576 }
16577 return reinterpret_cast<void*>(res);
16578}
16579
16580void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
16581 auto max_x = per_level_per_dev_usage.get_max_x();
16582 auto max_y = per_level_per_dev_usage.get_max_y();
16583 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
16584 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
16585 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
16586 << ", db_avail:" << db_avail4slow << std::endl
16587 << "Usage matrix:" << std::endl;
f6b5b4d7 16588 constexpr std::array<const char*, 8> names{ {
9f95a23c
TL
16589 "DEV/LEV",
16590 "WAL",
16591 "DB",
16592 "SLOW",
16593 "*",
16594 "*",
f6b5b4d7
TL
16595 "REAL",
16596 "FILES",
9f95a23c
TL
16597 } };
16598 const size_t width = 12;
16599 for (size_t i = 0; i < names.size(); ++i) {
16600 sout.setf(std::ios::left, std::ios::adjustfield);
16601 sout.width(width);
16602 sout << names[i];
16603 }
16604 sout << std::endl;
16605 for (size_t l = 0; l < max_y; l++) {
16606 sout.setf(std::ios::left, std::ios::adjustfield);
16607 sout.width(width);
16608 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
16609 case LEVEL_LOG:
16610 sout << "LOG"; break;
9f95a23c
TL
16611 case LEVEL_WAL:
16612 sout << "WAL"; break;
16613 case LEVEL_DB:
16614 sout << "DB"; break;
16615 case LEVEL_SLOW:
16616 sout << "SLOW"; break;
16617 case LEVEL_MAX:
16618 sout << "TOTALS"; break;
16619 }
f6b5b4d7 16620 for (size_t d = 0; d < max_x; d++) {
9f95a23c
TL
16621 sout.setf(std::ios::left, std::ios::adjustfield);
16622 sout.width(width);
16623 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
16624 }
16625 sout.setf(std::ios::left, std::ios::adjustfield);
16626 sout.width(width);
f6b5b4d7 16627 sout << stringify(per_level_files[l]) << std::endl;
9f95a23c
TL
16628 }
16629 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
16630 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
16631 sout << "MAXIMUMS:" << std::endl;
16632 for (size_t l = 0; l < max_y; l++) {
16633 sout.setf(std::ios::left, std::ios::adjustfield);
16634 sout.width(width);
16635 switch (l + LEVEL_FIRST) {
f6b5b4d7
TL
16636 case LEVEL_LOG:
16637 sout << "LOG"; break;
9f95a23c
TL
16638 case LEVEL_WAL:
16639 sout << "WAL"; break;
16640 case LEVEL_DB:
16641 sout << "DB"; break;
16642 case LEVEL_SLOW:
16643 sout << "SLOW"; break;
16644 case LEVEL_MAX:
16645 sout << "TOTALS"; break;
16646 }
16647 for (size_t d = 0; d < max_x - 1; d++) {
16648 sout.setf(std::ios::left, std::ios::adjustfield);
16649 sout.width(width);
16650 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
16651 }
16652 sout.setf(std::ios::left, std::ios::adjustfield);
16653 sout.width(width);
16654 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
16655 if (l < max_y - 1) {
16656 sout << std::endl;
16657 }
16658 }
16659}
11fdf7f2 16660
9f95a23c 16661// =======================================================