// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::list;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4

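// For example, a spanning blob id is encoded roughly as
// (id << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING, with the other flag
// bits set as the scenarios above apply; see ExtentMap::encode_some()
// for the authoritative logic.
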
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

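// Example layout: an object "foo" with no separate key in an empty
// namespace encodes as
//   [shard+0x80][pool+2^63][bit-reversed hash] "!" (empty namespace)
//   "foo!" "=" [snap][generation] 'o'
// (the '!' is the escaped-string terminator); see get_object_key() below
// for the exact construction.
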
/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
 */
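// For example, "p#q~r" escapes to "p#23q~7er!" ('#' is 0x23, '~' is 0x7e,
// and '!' terminates the string).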
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end)
{
  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    end->hobj.set_bitwise_key_u32(end_hash);
    temp_end->hobj.set_bitwise_key_u32(end_hash);
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(0xffffffff);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}

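// For a pg collection the [*start, *end) range above spans exactly the
// objects whose bit-reversed hash falls in
// [_reverse_bits(pgid.ps()), _reverse_bits(pgid.ps()) + 2^(32 - bits)),
// while [*temp_start, *temp_end) covers the same hash span in the pool's
// temp namespace (pool id -2 - pgid.pool()).
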
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)

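// ENCODED_KEY_PREFIX_LEN is the 1-byte shard + 8-byte pool + 4-byte hash
// written by _key_encode_prefix() above.
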
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // anything other than a null terminator here means
    // the key is malformed.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "  csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "  0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in the key string encoding (see the comment above
 * append_escaped) the KeyValueDB iterator does not sort lexicographically
 * the same way that ghobject_t does: objects with the same hash may come
 * back in the wrong order.
 *
 * This is the iterator wrapper that fixes the key order.
 */
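// Of the two implementations below, SimpleCollectionListIterator trusts
// the raw key order, while SortedCollectionListIterator re-sorts one
// shard/pool/hash chunk of onode keys at a time through a std::map keyed
// by ghobject_t, trading memory for correct ordering.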

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          // don't need to allocate a new AU for compressed data since
          // another collocated uncompressed blob already exists
          --blob_info_counted->expected_allocations;
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}

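// Worked example for the heuristic above: if a compressed blob still
// occupies 4 allocation units on disk (expected4release = 4) while
// rewriting the data that still references it should consume only 1 new
// AU (expected_allocations = 1), the estimated benefit is 3 AUs, and the
// blob is collected once that reaches bluestore_gc_enable_blob_threshold.
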
// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };
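
  // 2Q in brief, as implemented here: new buffers enter warm_in; trimming
  // warm_in drops a buffer's data and moves the empty shell to warm_out;
  // re-adding data over a warm_out shell promotes it to hot (the "hint
  // from discard" path in _add() below).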

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on the average buffer size, which is
      // typically stable (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}
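
// The type string is the value of the bluestore_cache_type config option
// ("lru" or "2q").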

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
                                        b->flags),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
                                    b->flags),
                  0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
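
// The loop above handles four overlap cases for a cached buffer, e.g. one
// spanning [10, 20): discarding [12, 15) splits it (the head is truncated
// and the tail re-added at 15), [15, 25) truncates the tail, [5, 25) drops
// the buffer entirely, and [5, 15) re-adds only the surviving tail.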

void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, bl, p->second->flags),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, right, p->second->flags),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data, p->second->flags),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length, p->second->flags),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  cache->_trim();
  return o;
}

f6b5b4d7
TL
1844void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
1845{
 1846 ldout(cache->cct, 20) << __func__ << " " << oid << dendl;
1847 onode_map.erase(oid);
1848}
1849
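// Look up an onode by oid under the cache lock; a hit hands back a pinning
// reference. Hit/miss perf counters are bumped after the lock is dropped to
// keep the critical section short.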
7c673cae
FG
1850BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1851{
7c673cae 1852 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b
WB
1853 OnodeRef o;
1854 bool hit = false;
1855
1856 {
11fdf7f2 1857 std::lock_guard l(cache->lock);
224ce89b
WB
1858 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1859 if (p == onode_map.end()) {
1860 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1861 } else {
1862 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
f6b5b4d7
TL
1863 << " " << p->second->nref
1864 << " " << p->second->cached
1865 << " " << p->second->pinned
224ce89b 1866 << dendl;
f6b5b4d7
TL
 1867 // This will pin the onode and implicitly touch the cache when the
 1868 // Onode eventually becomes unpinned.
224ce89b 1869 o = p->second;
f6b5b4d7
TL
1870 ceph_assert(!o->cached || o->pinned);
1871
1872 hit = true;
224ce89b
WB
1873 }
1874 }
1875
1876 if (hit) {
1877 cache->logger->inc(l_bluestore_onode_hits);
1878 } else {
7c673cae 1879 cache->logger->inc(l_bluestore_onode_misses);
7c673cae 1880 }
224ce89b 1881 return o;
7c673cae
FG
1882}
1883
1884void BlueStore::OnodeSpace::clear()
1885{
11fdf7f2 1886 std::lock_guard l(cache->lock);
f6b5b4d7 1887 ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
7c673cae 1888 for (auto &p : onode_map) {
f6b5b4d7 1889 cache->_rm(p.second.get());
7c673cae
FG
1890 }
1891 onode_map.clear();
1892}
1893
1894bool BlueStore::OnodeSpace::empty()
1895{
11fdf7f2 1896 std::lock_guard l(cache->lock);
7c673cae
FG
1897 return onode_map.empty();
1898}
1899
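// Rename moves 'o' to new_oid (evicting any existing target entry) and
// installs a fresh non-existent stub Onode back at old_oid, repointing
// 'oldo' at that stub so the old key still resolves.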
1900void BlueStore::OnodeSpace::rename(
1901 OnodeRef& oldo,
1902 const ghobject_t& old_oid,
1903 const ghobject_t& new_oid,
f91f0fd5 1904 const mempool::bluestore_cache_meta::string& new_okey)
7c673cae 1905{
11fdf7f2 1906 std::lock_guard l(cache->lock);
7c673cae
FG
1907 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1908 << dendl;
1909 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1910 po = onode_map.find(old_oid);
1911 pn = onode_map.find(new_oid);
11fdf7f2 1912 ceph_assert(po != pn);
7c673cae 1913
11fdf7f2 1914 ceph_assert(po != onode_map.end());
7c673cae
FG
1915 if (pn != onode_map.end()) {
1916 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1917 << dendl;
f6b5b4d7 1918 cache->_rm(pn->second.get());
7c673cae
FG
1919 onode_map.erase(pn);
1920 }
1921 OnodeRef o = po->second;
1922
1923 // install a non-existent onode at old location
1924 oldo.reset(new Onode(o->c, old_oid, o->key));
1925 po->second = oldo;
f6b5b4d7
TL
1926 cache->_add(oldo.get(), 1);
 1927 // add at the new position and fix oid, key.
 1928 // This will pin 'o' and implicitly touch the cache
 1929 // when it eventually becomes unpinned.
7c673cae 1930 onode_map.insert(make_pair(new_oid, o));
f6b5b4d7
TL
1931 ceph_assert(o->pinned);
1932
7c673cae
FG
1933 o->oid = new_oid;
1934 o->key = new_okey;
9f95a23c 1935 cache->_trim();
7c673cae
FG
1936}
1937
adb31ebb 1938bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
7c673cae 1939{
11fdf7f2 1940 std::lock_guard l(cache->lock);
7c673cae
FG
1941 ldout(cache->cct, 20) << __func__ << dendl;
1942 for (auto& i : onode_map) {
adb31ebb 1943 if (f(i.second.get())) {
7c673cae
FG
1944 return true;
1945 }
1946 }
1947 return false;
1948}
1949
11fdf7f2
TL
1950template <int LogLevelV = 30>
1951void BlueStore::OnodeSpace::dump(CephContext *cct)
3efd9988
FG
1952{
1953 for (auto& i : onode_map) {
f6b5b4d7
TL
1954 ldout(cct, LogLevelV) << i.first << " : " << i.second
1955 << " " << i.second->nref
1956 << " " << i.second->cached
1957 << " " << i.second->pinned
1958 << dendl;
3efd9988
FG
1959 }
1960}
7c673cae
FG
1961
1962// SharedBlob
1963
1964#undef dout_prefix
1965#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
9f95a23c
TL
1966#undef dout_context
1967#define dout_context coll->store->cct
7c673cae 1968
9f95a23c 1969void BlueStore::SharedBlob::dump(Formatter* f) const
7c673cae 1970{
9f95a23c
TL
1971 f->dump_bool("loaded", loaded);
1972 if (loaded) {
1973 persistent->dump(f);
1974 } else {
1975 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
1976 }
1977}
1978
1979ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1980{
1981 out << "SharedBlob(" << &sb;
1982
7c673cae
FG
1983 if (sb.loaded) {
1984 out << " loaded " << *sb.persistent;
1985 } else {
1986 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1987 }
1988 return out << ")";
1989}
1990
1991BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1992 : coll(_coll), sbid_unloaded(i)
1993{
11fdf7f2 1994 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
1995 if (get_cache()) {
1996 get_cache()->add_blob();
1997 }
1998}
1999
2000BlueStore::SharedBlob::~SharedBlob()
2001{
7c673cae
FG
2002 if (loaded && persistent) {
2003 delete persistent;
2004 }
2005}
2006
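// On the last reference we must remove ourselves from the collection's
// shared_blob_set. The 'again' loop re-reads 'coll' after taking the cache
// lock because split_cache() may retarget this SharedBlob to another
// collection while we wait; a failed remove() means a concurrent lookup
// revived us, so deletion is abandoned.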
2007void BlueStore::SharedBlob::put()
2008{
2009 if (--nref == 0) {
9f95a23c
TL
2010 dout(20) << __func__ << " " << this
2011 << " removing self from set " << get_parent()
2012 << dendl;
1adf2230
AA
2013 again:
2014 auto coll_snap = coll;
2015 if (coll_snap) {
11fdf7f2 2016 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
2017 if (coll_snap != coll) {
2018 goto again;
2019 }
91327a77
AA
2020 if (!coll_snap->shared_blob_set.remove(this, true)) {
2021 // race with lookup
2022 return;
2023 }
1adf2230
AA
2024 bc._clear(coll_snap->cache);
2025 coll_snap->cache->rm_blob();
7c673cae 2026 }
28e407b8 2027 delete this;
7c673cae
FG
2028 }
2029}
2030
2031void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
2032{
11fdf7f2 2033 ceph_assert(persistent);
7c673cae
FG
2034 persistent->ref_map.get(offset, length);
2035}
2036
2037void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 2038 PExtentVector *r,
11fdf7f2 2039 bool *unshare)
7c673cae 2040{
11fdf7f2
TL
2041 ceph_assert(persistent);
2042 persistent->ref_map.put(offset, length, r,
2043 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
2044}
2045
f64942e4
AA
2046void BlueStore::SharedBlob::finish_write(uint64_t seq)
2047{
2048 while (true) {
9f95a23c 2049 BufferCacheShard *cache = coll->cache;
11fdf7f2 2050 std::lock_guard l(cache->lock);
f64942e4 2051 if (coll->cache != cache) {
9f95a23c
TL
2052 dout(20) << __func__
2053 << " raced with sb cache update, was " << cache
2054 << ", now " << coll->cache << ", retrying"
2055 << dendl;
f64942e4
AA
2056 continue;
2057 }
2058 bc._finish_write(cache, seq);
2059 break;
2060 }
2061}
2062
3efd9988
FG
2063// SharedBlobSet
2064
2065#undef dout_prefix
2066#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2067
11fdf7f2
TL
2068template <int LogLevelV = 30>
2069void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 2070{
11fdf7f2 2071 std::lock_guard l(lock);
3efd9988 2072 for (auto& i : sb_map) {
11fdf7f2 2073 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
2074 }
2075}
2076
7c673cae
FG
2077// Blob
2078
2079#undef dout_prefix
2080#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2081
9f95a23c
TL
2082void BlueStore::Blob::dump(Formatter* f) const
2083{
2084 if (is_spanning()) {
2085 f->dump_unsigned("spanning_id ", id);
2086 }
2087 blob.dump(f);
2088 if (shared_blob) {
2089 f->dump_object("shared", *shared_blob);
2090 }
2091}
2092
7c673cae
FG
2093ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2094{
2095 out << "Blob(" << &b;
2096 if (b.is_spanning()) {
2097 out << " spanning " << b.id;
2098 }
35e4c445
FG
2099 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2100 if (b.shared_blob) {
2101 out << " " << *b.shared_blob;
2102 } else {
2103 out << " (shared_blob=NULL)";
2104 }
2105 out << ")";
7c673cae
FG
2106 return out;
2107}
2108
2109void BlueStore::Blob::discard_unallocated(Collection *coll)
2110{
224ce89b 2111 if (get_blob().is_shared()) {
7c673cae
FG
2112 return;
2113 }
224ce89b 2114 if (get_blob().is_compressed()) {
7c673cae
FG
2115 bool discard = false;
2116 bool all_invalid = true;
224ce89b 2117 for (auto e : get_blob().get_extents()) {
7c673cae
FG
2118 if (!e.is_valid()) {
2119 discard = true;
2120 } else {
2121 all_invalid = false;
2122 }
2123 }
11fdf7f2 2124 ceph_assert(discard == all_invalid); // for a compressed blob, either
7c673cae
FG
 2125 // all pextents are invalid or none are.
2126 if (discard) {
224ce89b
WB
2127 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2128 get_blob().get_logical_length());
7c673cae
FG
2129 }
2130 } else {
2131 size_t pos = 0;
224ce89b 2132 for (auto e : get_blob().get_extents()) {
7c673cae 2133 if (!e.is_valid()) {
9f95a23c
TL
2134 dout(20) << __func__ << " 0x" << std::hex << pos
2135 << "~" << e.length
2136 << std::dec << dendl;
7c673cae
FG
2137 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2138 }
2139 pos += e.length;
2140 }
224ce89b
WB
2141 if (get_blob().can_prune_tail()) {
2142 dirty_blob().prune_tail();
2143 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 2144 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
2145 }
2146 }
2147}
2148
2149void BlueStore::Blob::get_ref(
2150 Collection *coll,
2151 uint32_t offset,
2152 uint32_t length)
2153{
 2154 // The caller has to initialize the Blob's logical length before
 2155 // incrementing references. Otherwise one can neither determine the
 2156 // required number of counters for per-au tracking nor obtain
 2157 // min_release_size for single-counter mode.
11fdf7f2 2158 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
2159 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2160 << std::dec << " " << *this << dendl;
2161
2162 if (used_in_blob.is_empty()) {
2163 uint32_t min_release_size =
224ce89b
WB
2164 get_blob().get_release_size(coll->store->min_alloc_size);
2165 uint64_t l = get_blob().get_logical_length();
2166 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
2167 << min_release_size << std::dec << dendl;
7c673cae
FG
2168 used_in_blob.init(l, min_release_size);
2169 }
2170 used_in_blob.get(
2171 offset,
2172 length);
2173}
2174
2175bool BlueStore::Blob::put_ref(
2176 Collection *coll,
2177 uint32_t offset,
2178 uint32_t length,
2179 PExtentVector *r)
2180{
2181 PExtentVector logical;
2182
7c673cae
FG
2183 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
2184 << std::dec << " " << *this << dendl;
2185
2186 bool empty = used_in_blob.put(
2187 offset,
2188 length,
2189 &logical);
2190 r->clear();
2191 // nothing to release
2192 if (!empty && logical.empty()) {
2193 return false;
2194 }
2195
2196 bluestore_blob_t& b = dirty_blob();
2197 return b.release_extents(empty, logical, r);
2198}
2199
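// Decide whether new data at [b_offset, b_offset + *length0) may be written
// into this blob, possibly growing its tail up to target_blob_size. Reuse
// is refused for immutable blobs, csum-misaligned ranges, or overlap with
// already-allocated space; *length0 may be trimmed so the grown blob stays
// within the target size.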
224ce89b 2200bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
2201 uint32_t target_blob_size,
2202 uint32_t b_offset,
2203 uint32_t *length0) {
11fdf7f2
TL
2204 ceph_assert(min_alloc_size);
2205 ceph_assert(target_blob_size);
7c673cae
FG
2206 if (!get_blob().is_mutable()) {
2207 return false;
2208 }
2209
2210 uint32_t length = *length0;
2211 uint32_t end = b_offset + length;
2212
2213 // Currently for the sake of simplicity we omit blob reuse if data is
2214 // unaligned with csum chunk. Later we can perform padding if needed.
2215 if (get_blob().has_csum() &&
2216 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2217 (end % get_blob().get_csum_chunk_size()) != 0)) {
2218 return false;
2219 }
2220
2221 auto blen = get_blob().get_logical_length();
2222 uint32_t new_blen = blen;
2223
2224 // make sure target_blob_size isn't less than current blob len
11fdf7f2 2225 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
2226
2227 if (b_offset >= blen) {
224ce89b
WB
2228 // new data totally stands out of the existing blob
2229 new_blen = end;
7c673cae 2230 } else {
224ce89b 2231 // new data overlaps with the existing blob
11fdf7f2 2232 new_blen = std::max(blen, end);
224ce89b
WB
2233
2234 uint32_t overlap = 0;
2235 if (new_blen > blen) {
2236 overlap = blen - b_offset;
2237 } else {
2238 overlap = length;
2239 }
2240
2241 if (!get_blob().is_unallocated(b_offset, overlap)) {
2242 // abort if any piece of the overlap has already been allocated
2243 return false;
7c673cae
FG
2244 }
2245 }
224ce89b 2246
7c673cae
FG
2247 if (new_blen > blen) {
2248 int64_t overflow = int64_t(new_blen) - target_blob_size;
 2249 // unable to decrease the provided length enough to fit into target_blob_size
2250 if (overflow >= length) {
2251 return false;
2252 }
2253
2254 // FIXME: in some cases we could reduce unused resolution
2255 if (get_blob().has_unused()) {
2256 return false;
2257 }
2258
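 // Worked example (hypothetical numbers): blen=0x10000, b_offset=0x10000,
 // length=0x18000, target_blob_size=0x20000 -> new_blen=0x28000 and
 // overflow=0x8000 < length, so length is trimmed to 0x10000 and the tail
 // is extended to exactly 0x20000.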
2259 if (overflow > 0) {
2260 new_blen -= overflow;
2261 length -= overflow;
2262 *length0 = length;
2263 }
224ce89b 2264
7c673cae
FG
2265 if (new_blen > blen) {
2266 dirty_blob().add_tail(new_blen);
2267 used_in_blob.add_tail(new_blen,
224ce89b 2268 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
2269 }
2270 }
2271 return true;
2272}
2273
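// Split this blob's state at 'blob_offset' into 'r': the use tracker, the
// on-disk blob metadata, and the cached buffers are each split so 'r' owns
// everything from 'blob_offset' onward.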
2274void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2275{
7c673cae
FG
2276 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2277 << " start " << *this << dendl;
11fdf7f2
TL
2278 ceph_assert(blob.can_split());
2279 ceph_assert(used_in_blob.can_split());
7c673cae
FG
2280 bluestore_blob_t &lb = dirty_blob();
2281 bluestore_blob_t &rb = r->dirty_blob();
2282
2283 used_in_blob.split(
2284 blob_offset,
2285 &(r->used_in_blob));
2286
2287 lb.split(blob_offset, rb);
2288 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2289
2290 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2291 << " finish " << *this << dendl;
2292 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2293 << " and " << *r << dendl;
2294}
2295
2296#ifndef CACHE_BLOB_BL
2297void BlueStore::Blob::decode(
2298 Collection *coll,
11fdf7f2 2299 bufferptr::const_iterator& p,
7c673cae
FG
2300 uint64_t struct_v,
2301 uint64_t* sbid,
2302 bool include_ref_map)
2303{
2304 denc(blob, p, struct_v);
2305 if (blob.is_shared()) {
2306 denc(*sbid, p);
2307 }
2308 if (include_ref_map) {
2309 if (struct_v > 1) {
2310 used_in_blob.decode(p);
2311 } else {
2312 used_in_blob.clear();
2313 bluestore_extent_ref_map_t legacy_ref_map;
2314 legacy_ref_map.decode(p);
2315 for (auto r : legacy_ref_map.ref_map) {
2316 get_ref(
2317 coll,
2318 r.first,
2319 r.second.refs * r.second.length);
2320 }
2321 }
2322 }
2323}
2324#endif
2325
2326// Extent
2327
9f95a23c
TL
2328void BlueStore::Extent::dump(Formatter* f) const
2329{
2330 f->dump_unsigned("logical_offset", logical_offset);
2331 f->dump_unsigned("length", length);
2332 f->dump_unsigned("blob_offset", blob_offset);
2333 f->dump_object("blob", *blob);
2334}
2335
7c673cae
FG
2336ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2337{
2338 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2339 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2340 << " " << *e.blob;
2341}
2342
2343// OldExtent
2344BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2345 uint32_t lo,
2346 uint32_t o,
2347 uint32_t l,
2348 BlobRef& b) {
2349 OldExtent* oe = new OldExtent(lo, o, l, b);
2350 b->put_ref(c.get(), o, l, &(oe->r));
adb31ebb 2351 oe->blob_empty = !b->is_referenced();
7c673cae
FG
2352 return oe;
2353}
2354
2355// ExtentMap
2356
2357#undef dout_prefix
2358#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
9f95a23c
TL
2359#undef dout_context
2360#define dout_context onode->c->store->cct
7c673cae
FG
2361
2362BlueStore::ExtentMap::ExtentMap(Onode *o)
2363 : onode(o),
2364 inline_bl(
2365 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2366}
2367
9f95a23c
TL
2368void BlueStore::ExtentMap::dump(Formatter* f) const
2369{
2370 f->open_array_section("extents");
2371
2372 for (auto& e : extent_map) {
2373 f->dump_object("extent", e);
2374 }
2375 f->close_section();
2376}
2377
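// Clone the extents overlapping [srcoff, srcoff + length) from oldo into
// newo at dstoff. Blob data is not copied: each source blob is marked
// shared on first touch (dirtying oldo's metadata) and the new extents just
// bump the shared refs, deduplicating the data on disk.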
11fdf7f2
TL
2378void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2379 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2380 uint64_t& length, uint64_t& dstoff) {
2381
2382 auto cct = onode->c->store->cct;
2383 bool inject_21040 =
2384 cct->_conf->bluestore_debug_inject_bug21040;
2385 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2386 for (auto& e : oldo->extent_map.extent_map) {
2387 e.blob->last_encoded_id = -1;
2388 }
2389
2390 int n = 0;
2391 uint64_t end = srcoff + length;
2392 uint32_t dirty_range_begin = 0;
2393 uint32_t dirty_range_end = 0;
2394 bool src_dirty = false;
2395 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2396 ep != oldo->extent_map.extent_map.end();
2397 ++ep) {
2398 auto& e = *ep;
2399 if (e.logical_offset >= end) {
2400 break;
2401 }
2402 dout(20) << __func__ << " src " << e << dendl;
2403 BlobRef cb;
2404 bool blob_duped = true;
2405 if (e.blob->last_encoded_id >= 0) {
2406 cb = id_to_blob[e.blob->last_encoded_id];
2407 blob_duped = false;
2408 } else {
2409 // dup the blob
2410 const bluestore_blob_t& blob = e.blob->get_blob();
2411 // make sure it is shared
2412 if (!blob.is_shared()) {
2413 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2414 if (!inject_21040 && !src_dirty) {
2415 src_dirty = true;
2416 dirty_range_begin = e.logical_offset;
2417 } else if (inject_21040 &&
2418 dirty_range_begin == 0 && dirty_range_end == 0) {
2419 dirty_range_begin = e.logical_offset;
2420 }
2421 ceph_assert(e.logical_end() > 0);
2422 // -1 to exclude next potential shard
2423 dirty_range_end = e.logical_end() - 1;
2424 } else {
2425 c->load_shared_blob(e.blob->shared_blob);
2426 }
2427 cb = new Blob();
2428 e.blob->last_encoded_id = n;
2429 id_to_blob[n] = cb;
2430 e.blob->dup(*cb);
2431 // bump the extent refs on the copied blob's extents
2432 for (auto p : blob.get_extents()) {
2433 if (p.is_valid()) {
2434 e.blob->shared_blob->get_ref(p.offset, p.length);
2435 }
2436 }
2437 txc->write_shared_blob(e.blob->shared_blob);
2438 dout(20) << __func__ << " new " << *cb << dendl;
2439 }
2440
2441 int skip_front, skip_back;
2442 if (e.logical_offset < srcoff) {
2443 skip_front = srcoff - e.logical_offset;
2444 } else {
2445 skip_front = 0;
2446 }
2447 if (e.logical_end() > end) {
2448 skip_back = e.logical_end() - end;
2449 } else {
2450 skip_back = 0;
2451 }
2452
2453 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2454 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2455 newo->extent_map.extent_map.insert(*ne);
2456 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2457 // fixme: we may leave parts of new blob unreferenced that could
2458 // be freed (relative to the shared_blob).
2459 txc->statfs_delta.stored() += ne->length;
2460 if (e.blob->get_blob().is_compressed()) {
2461 txc->statfs_delta.compressed_original() += ne->length;
2462 if (blob_duped) {
2463 txc->statfs_delta.compressed() +=
2464 cb->get_blob().get_compressed_payload_length();
2465 }
2466 }
2467 dout(20) << __func__ << " dst " << *ne << dendl;
2468 ++n;
2469 }
2470 if ((!inject_21040 && src_dirty) ||
2471 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2472 oldo->extent_map.dirty_range(dirty_range_begin,
2473 dirty_range_end - dirty_range_begin);
2474 txc->write_onode(oldo);
2475 }
2476 txc->write_onode(newo);
2477
2478 if (dstoff + length > newo->onode.size) {
2479 newo->onode.size = dstoff + length;
2480 }
2481 newo->extent_map.dirty_range(dstoff, length);
2482}
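// Persist whatever is dirty: either re-encode the single inline shard kept
// in the onode key, or write each dirty extent-map shard as its own kv
// entry. A shard grown past the max size, or shrunk below the min (unless
// it is the trailing shard), schedules a reshard instead.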
7c673cae
FG
2483void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2484 bool force)
2485{
2486 auto cct = onode->c->store->cct; //used by dout
2487 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2488 if (onode->onode.extent_map_shards.empty()) {
2489 if (inline_bl.length() == 0) {
2490 unsigned n;
2491 // we need to encode inline_bl to measure encoded length
2492 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
f91f0fd5 2493 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
11fdf7f2 2494 ceph_assert(!never_happen);
7c673cae
FG
2495 size_t len = inline_bl.length();
2496 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2497 << " extents" << dendl;
2498 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2499 request_reshard(0, OBJECT_MAX_SIZE);
2500 return;
2501 }
2502 }
2503 // will persist in the onode key.
2504 } else {
2505 // pending shard update
2506 struct dirty_shard_t {
2507 Shard *shard;
2508 bufferlist bl;
2509 dirty_shard_t(Shard *s) : shard(s) {}
2510 };
2511 vector<dirty_shard_t> encoded_shards;
2512 // allocate slots for all shards in a single call instead of
2513 // doing multiple allocations - one per each dirty shard
2514 encoded_shards.reserve(shards.size());
2515
2516 auto p = shards.begin();
2517 auto prev_p = p;
2518 while (p != shards.end()) {
11fdf7f2 2519 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2520 auto n = p;
2521 ++n;
2522 if (p->dirty) {
2523 uint32_t endoff;
2524 if (n == shards.end()) {
2525 endoff = OBJECT_MAX_SIZE;
2526 } else {
2527 endoff = n->shard_info->offset;
2528 }
2529 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2530 bufferlist& bl = encoded_shards.back().bl;
2531 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2532 bl, &p->extents)) {
2533 if (force) {
2534 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2535 ceph_assert(!force);
7c673cae
FG
2536 }
2537 }
2538 size_t len = bl.length();
2539
2540 dout(20) << __func__ << " shard 0x" << std::hex
2541 << p->shard_info->offset << std::dec << " is " << len
2542 << " bytes (was " << p->shard_info->bytes << ") from "
2543 << p->extents << " extents" << dendl;
2544
2545 if (!force) {
2546 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2547 // we are big; reshard ourselves
2548 request_reshard(p->shard_info->offset, endoff);
2549 }
2550 // avoid resharding the trailing shard, even if it is small
2551 else if (n != shards.end() &&
11fdf7f2
TL
2552 len < g_conf()->bluestore_extent_map_shard_min_size) {
2553 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2554 if (p == shards.begin()) {
2555 // we are the first shard, combine with next shard
7c673cae 2556 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2557 } else {
31f18b77
FG
2558 // combine either with the previous shard or the next,
2559 // whichever is smaller
7c673cae
FG
2560 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2561 request_reshard(p->shard_info->offset, endoff + 1);
2562 } else {
2563 request_reshard(prev_p->shard_info->offset, endoff);
2564 }
2565 }
2566 }
2567 }
2568 }
2569 prev_p = p;
2570 p = n;
2571 }
2572 if (needs_reshard()) {
2573 return;
2574 }
2575
2576 // schedule DB update for dirty shards
2577 string key;
2578 for (auto& it : encoded_shards) {
2579 it.shard->dirty = false;
2580 it.shard->shard_info->bytes = it.bl.length();
2581 generate_extent_shard_key_and_apply(
2582 onode->key,
2583 it.shard->shard_info->offset,
2584 &key,
2585 [&](const string& final_key) {
2586 t->set(PREFIX_OBJ, final_key, it.bl);
2587 }
2588 );
2589 }
2590 }
2591}
2592
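// Pick an id for a newly spanning blob: normally highest-in-use + 1. If the
// signed id space has wrapped, probe from a random starting point for an
// unused id, aborting only if every id is taken.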
31f18b77
FG
2593bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2594{
2595 if (spanning_blob_map.empty())
2596 return 0;
2597 bid_t bid = spanning_blob_map.rbegin()->first + 1;
 2598 // if bid did not overflow, it is valid and available.
2599 if (bid >= 0)
2600 return bid;
 2601 // Find the next unused bid.
2602 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2603 const auto begin_bid = bid;
2604 do {
2605 if (!spanning_blob_map.count(bid))
2606 return bid;
2607 else {
2608 bid++;
2609 if (bid < 0) bid = 0;
2610 }
2611 } while (bid != begin_bid);
81eedcae
TL
2612 auto cct = onode->c->store->cct; // used by dout
2613 _dump_onode<0>(cct, *onode);
11fdf7f2 2614 ceph_abort_msg("no available blob id");
31f18b77
FG
2615}
2616
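// Rewrite the shard layout over [needs_reshard_begin, needs_reshard_end):
// delete the old shard keys, choose new boundaries so each shard's encoded
// size approaches the configured target (with slop to avoid cutting blobs),
// then split or mark as spanning any blob that still crosses a boundary.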
7c673cae
FG
2617void BlueStore::ExtentMap::reshard(
2618 KeyValueDB *db,
2619 KeyValueDB::Transaction t)
2620{
2621 auto cct = onode->c->store->cct; // used by dout
2622
2623 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2624 << needs_reshard_end << ")" << std::dec
2625 << " of " << onode->onode.extent_map_shards.size()
2626 << " shards on " << onode->oid << dendl;
2627 for (auto& p : spanning_blob_map) {
2628 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2629 << dendl;
2630 }
2631 // determine shard index range
2632 unsigned si_begin = 0, si_end = 0;
2633 if (!shards.empty()) {
2634 while (si_begin + 1 < shards.size() &&
2635 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2636 ++si_begin;
2637 }
2638 needs_reshard_begin = shards[si_begin].shard_info->offset;
2639 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2640 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2641 needs_reshard_end = shards[si_end].shard_info->offset;
2642 break;
2643 }
2644 }
2645 if (si_end == shards.size()) {
2646 needs_reshard_end = OBJECT_MAX_SIZE;
2647 }
2648 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2649 << " over 0x[" << std::hex << needs_reshard_begin << ","
2650 << needs_reshard_end << ")" << std::dec << dendl;
2651 }
2652
181888fb 2653 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2654
 2655 // we may need to fault in a larger interval later: we must have all
 2656 // referring extents for spanning blobs loaded in order to have
 2657 // accurate use_tracker values.
2658 uint32_t spanning_scan_begin = needs_reshard_begin;
2659 uint32_t spanning_scan_end = needs_reshard_end;
2660
2661 // remove old keys
2662 string key;
2663 for (unsigned i = si_begin; i < si_end; ++i) {
2664 generate_extent_shard_key_and_apply(
2665 onode->key, shards[i].shard_info->offset, &key,
2666 [&](const string& final_key) {
2667 t->rmkey(PREFIX_OBJ, final_key);
2668 }
2669 );
2670 }
2671
2672 // calculate average extent size
2673 unsigned bytes = 0;
2674 unsigned extents = 0;
2675 if (onode->onode.extent_map_shards.empty()) {
2676 bytes = inline_bl.length();
2677 extents = extent_map.size();
2678 } else {
2679 for (unsigned i = si_begin; i < si_end; ++i) {
2680 bytes += shards[i].shard_info->bytes;
2681 extents += shards[i].extents;
2682 }
2683 }
2684 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2685 unsigned slop = target *
2686 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2687 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2688 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2689 << ", slop " << slop << dendl;
2690
2691 // reshard
2692 unsigned estimate = 0;
31f18b77 2693 unsigned offset = needs_reshard_begin;
7c673cae
FG
2694 vector<bluestore_onode_t::shard_info> new_shard_info;
2695 unsigned max_blob_end = 0;
2696 Extent dummy(needs_reshard_begin);
2697 for (auto e = extent_map.lower_bound(dummy);
2698 e != extent_map.end();
2699 ++e) {
2700 if (e->logical_offset >= needs_reshard_end) {
2701 break;
2702 }
2703 dout(30) << " extent " << *e << dendl;
2704
2705 // disfavor shard boundaries that span a blob
2706 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2707 if (estimate &&
2708 estimate + extent_avg > target + (would_span ? slop : 0)) {
2709 // new shard
31f18b77 2710 if (offset == needs_reshard_begin) {
7c673cae
FG
2711 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2712 new_shard_info.back().offset = offset;
2713 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2714 << std::dec << dendl;
7c673cae
FG
2715 }
2716 offset = e->logical_offset;
2717 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2718 new_shard_info.back().offset = offset;
2719 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2720 << std::dec << dendl;
2721 estimate = 0;
2722 }
2723 estimate += extent_avg;
31f18b77
FG
2724 unsigned bs = e->blob_start();
2725 if (bs < spanning_scan_begin) {
2726 spanning_scan_begin = bs;
7c673cae
FG
2727 }
2728 uint32_t be = e->blob_end();
2729 if (be > max_blob_end) {
2730 max_blob_end = be;
2731 }
2732 if (be > spanning_scan_end) {
2733 spanning_scan_end = be;
2734 }
2735 }
2736 if (new_shard_info.empty() && (si_begin > 0 ||
2737 si_end < shards.size())) {
2738 // we resharded a partial range; we must produce at least one output
2739 // shard
2740 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2741 new_shard_info.back().offset = needs_reshard_begin;
2742 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2743 << std::dec << " (singleton degenerate case)" << dendl;
2744 }
2745
2746 auto& sv = onode->onode.extent_map_shards;
2747 dout(20) << __func__ << " new " << new_shard_info << dendl;
2748 dout(20) << __func__ << " old " << sv << dendl;
2749 if (sv.empty()) {
2750 // no old shards to keep
2751 sv.swap(new_shard_info);
2752 init_shards(true, true);
2753 } else {
2754 // splice in new shards
2755 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2756 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2757 sv.insert(
2758 sv.begin() + si_begin,
2759 new_shard_info.begin(),
2760 new_shard_info.end());
2761 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2762 si_end = si_begin + new_shard_info.size();
31f18b77 2763
11fdf7f2 2764 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2765
2766 // note that we need to update every shard_info of shards here,
2767 // as sv might have been totally re-allocated above
2768 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2769 shards[i].shard_info = &sv[i];
31f18b77
FG
2770 }
2771
2772 // mark newly added shards as dirty
2773 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2774 shards[i].loaded = true;
2775 shards[i].dirty = true;
2776 }
7c673cae
FG
2777 }
2778 dout(20) << __func__ << " fin " << sv << dendl;
2779 inline_bl.clear();
2780
2781 if (sv.empty()) {
2782 // no more shards; unspan all previously spanning blobs
2783 auto p = spanning_blob_map.begin();
2784 while (p != spanning_blob_map.end()) {
2785 p->second->id = -1;
2786 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2787 p = spanning_blob_map.erase(p);
2788 }
2789 } else {
2790 // identify new spanning blobs
2791 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2792 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2793 if (spanning_scan_begin < needs_reshard_begin) {
2794 fault_range(db, spanning_scan_begin,
2795 needs_reshard_begin - spanning_scan_begin);
2796 }
2797 if (spanning_scan_end > needs_reshard_end) {
2798 fault_range(db, needs_reshard_end,
31f18b77 2799 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2800 }
2801 auto sp = sv.begin() + si_begin;
2802 auto esp = sv.end();
2803 unsigned shard_start = sp->offset;
2804 unsigned shard_end;
2805 ++sp;
2806 if (sp == esp) {
2807 shard_end = OBJECT_MAX_SIZE;
2808 } else {
2809 shard_end = sp->offset;
2810 }
7c673cae 2811 Extent dummy(needs_reshard_begin);
9f95a23c
TL
2812
2813 bool was_too_many_blobs_check = false;
2814 auto too_many_blobs_threshold =
2815 g_conf()->bluestore_debug_too_many_blobs_threshold;
2816 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2817 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2818 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2819
7c673cae
FG
2820 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2821 if (e->logical_offset >= needs_reshard_end) {
2822 break;
2823 }
2824 dout(30) << " extent " << *e << dendl;
2825 while (e->logical_offset >= shard_end) {
2826 shard_start = shard_end;
11fdf7f2 2827 ceph_assert(sp != esp);
7c673cae
FG
2828 ++sp;
2829 if (sp == esp) {
2830 shard_end = OBJECT_MAX_SIZE;
2831 } else {
2832 shard_end = sp->offset;
2833 }
2834 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2835 << " to 0x" << shard_end << std::dec << dendl;
2836 }
9f95a23c 2837
7c673cae
FG
2838 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2839 if (!e->blob->is_spanning()) {
2840 // We have two options: (1) split the blob into pieces at the
2841 // shard boundaries (and adjust extents accordingly), or (2)
2842 // mark it spanning. We prefer to cut the blob if we can. Note that
2843 // we may have to split it multiple times--potentially at every
2844 // shard boundary.
2845 bool must_span = false;
2846 BlobRef b = e->blob;
2847 if (b->can_split()) {
2848 uint32_t bstart = e->blob_start();
2849 uint32_t bend = e->blob_end();
2850 for (const auto& sh : shards) {
2851 if (bstart < sh.shard_info->offset &&
2852 bend > sh.shard_info->offset) {
2853 uint32_t blob_offset = sh.shard_info->offset - bstart;
2854 if (b->can_split_at(blob_offset)) {
2855 dout(20) << __func__ << " splitting blob, bstart 0x"
2856 << std::hex << bstart << " blob_offset 0x"
2857 << blob_offset << std::dec << " " << *b << dendl;
2858 b = split_blob(b, blob_offset, sh.shard_info->offset);
2859 // switch b to the new right-hand side, in case it
2860 // *also* has to get split.
2861 bstart += blob_offset;
2862 onode->c->store->logger->inc(l_bluestore_blob_split);
2863 } else {
2864 must_span = true;
2865 break;
2866 }
2867 }
2868 }
2869 } else {
2870 must_span = true;
2871 }
2872 if (must_span) {
31f18b77
FG
2873 auto bid = allocate_spanning_blob_id();
2874 b->id = bid;
7c673cae
FG
2875 spanning_blob_map[b->id] = b;
2876 dout(20) << __func__ << " adding spanning " << *b << dendl;
9f95a23c
TL
2877 if (!was_too_many_blobs_check &&
2878 too_many_blobs_threshold &&
2879 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2880
2881 was_too_many_blobs_check = true;
2882 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2883 if (dumped_onodes[i].first == onode->oid) {
2884 oid_slot = &dumped_onodes[i];
2885 break;
2886 }
2887 if (!oldest_slot || (oldest_slot &&
2888 dumped_onodes[i].second < oldest_slot->second)) {
2889 oldest_slot = &dumped_onodes[i];
2890 }
2891 }
2892 }
7c673cae
FG
2893 }
2894 }
2895 } else {
2896 if (e->blob->is_spanning()) {
2897 spanning_blob_map.erase(e->blob->id);
2898 e->blob->id = -1;
2899 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2900 }
2901 }
2902 }
9f95a23c
TL
2903 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2904 (oid_slot &&
2905 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2906 if (do_dump) {
2907 dout(0) << __func__
2908 << " spanning blob count exceeds threshold, "
2909 << spanning_blob_map.size() << " spanning blobs"
2910 << dendl;
2911 _dump_onode<0>(cct, *onode);
2912 if (oid_slot) {
2913 oid_slot->second = mono_clock::now();
2914 } else {
2915 ceph_assert(oldest_slot);
2916 oldest_slot->first = onode->oid;
2917 oldest_slot->second = mono_clock::now();
2918 }
2919 }
7c673cae
FG
2920 }
2921
2922 clear_needs_reshard();
2923}
2924
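// Extents are delta-encoded against their predecessor. The low bits of the
// per-extent 'blobid' varint carry flags -- SPANNING (blob stored with the
// onode), CONTIGUOUS (no gap from the previous extent), ZEROOFFSET
// (blob_offset == 0), SAMELENGTH (length unchanged) -- and each set flag
// lets the matching field be omitted from the stream. Returns true when a
// blob escapes the range and a reshard is needed instead.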
2925bool BlueStore::ExtentMap::encode_some(
2926 uint32_t offset,
2927 uint32_t length,
2928 bufferlist& bl,
2929 unsigned *pn)
2930{
7c673cae
FG
2931 Extent dummy(offset);
2932 auto start = extent_map.lower_bound(dummy);
2933 uint32_t end = offset + length;
2934
2935 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2936 // serialization only. Hence there is no specific
2937 // handling at ExtentMap level.
2938
2939 unsigned n = 0;
2940 size_t bound = 0;
7c673cae
FG
2941 bool must_reshard = false;
2942 for (auto p = start;
2943 p != extent_map.end() && p->logical_offset < end;
2944 ++p, ++n) {
11fdf7f2 2945 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2946 p->blob->last_encoded_id = -1;
2947 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2948 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2949 << std::dec << " hit new spanning blob " << *p << dendl;
2950 request_reshard(p->blob_start(), p->blob_end());
2951 must_reshard = true;
2952 }
31f18b77
FG
2953 if (!must_reshard) {
2954 denc_varint(0, bound); // blobid
2955 denc_varint(0, bound); // logical_offset
2956 denc_varint(0, bound); // len
2957 denc_varint(0, bound); // blob_offset
7c673cae 2958
31f18b77
FG
2959 p->blob->bound_encode(
2960 bound,
2961 struct_v,
2962 p->blob->shared_blob->get_sbid(),
2963 false);
2964 }
7c673cae
FG
2965 }
2966 if (must_reshard) {
2967 return true;
2968 }
2969
31f18b77
FG
2970 denc(struct_v, bound);
2971 denc_varint(0, bound); // number of extents
2972
7c673cae
FG
2973 {
2974 auto app = bl.get_contiguous_appender(bound);
2975 denc(struct_v, app);
2976 denc_varint(n, app);
2977 if (pn) {
2978 *pn = n;
2979 }
2980
2981 n = 0;
2982 uint64_t pos = 0;
2983 uint64_t prev_len = 0;
2984 for (auto p = start;
2985 p != extent_map.end() && p->logical_offset < end;
2986 ++p, ++n) {
2987 unsigned blobid;
2988 bool include_blob = false;
2989 if (p->blob->is_spanning()) {
2990 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2991 blobid |= BLOBID_FLAG_SPANNING;
2992 } else if (p->blob->last_encoded_id < 0) {
2993 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2994 include_blob = true;
2995 blobid = 0; // the decoder will infer the id from n
2996 } else {
2997 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2998 }
2999 if (p->logical_offset == pos) {
3000 blobid |= BLOBID_FLAG_CONTIGUOUS;
3001 }
3002 if (p->blob_offset == 0) {
3003 blobid |= BLOBID_FLAG_ZEROOFFSET;
3004 }
3005 if (p->length == prev_len) {
3006 blobid |= BLOBID_FLAG_SAMELENGTH;
3007 } else {
3008 prev_len = p->length;
3009 }
3010 denc_varint(blobid, app);
3011 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3012 denc_varint_lowz(p->logical_offset - pos, app);
3013 }
3014 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3015 denc_varint_lowz(p->blob_offset, app);
3016 }
3017 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3018 denc_varint_lowz(p->length, app);
3019 }
3020 pos = p->logical_end();
3021 if (include_blob) {
3022 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
3023 }
3024 }
3025 }
3026 /*derr << __func__ << bl << dendl;
3027 derr << __func__ << ":";
3028 bl.hexdump(*_dout);
3029 *_dout << dendl;
3030 */
3031 return false;
3032}
3033
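// Inverse of encode_some: rebuild extents from the delta-encoded stream,
// resolving spanning blobs by id and re-creating local blobs in encounter
// order; ref maps for non-spanning blobs are reconstructed via get_ref().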
3034unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
3035{
7c673cae
FG
3036 /*
3037 derr << __func__ << ":";
3038 bl.hexdump(*_dout);
3039 *_dout << dendl;
3040 */
3041
11fdf7f2 3042 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
3043 auto p = bl.front().begin_deep();
3044 __u8 struct_v;
3045 denc(struct_v, p);
3046 // Version 2 differs from v1 in blob's ref_map
3047 // serialization only. Hence there is no specific
3048 // handling at ExtentMap level below.
11fdf7f2 3049 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
3050
3051 uint32_t num;
3052 denc_varint(num, p);
3053 vector<BlobRef> blobs(num);
3054 uint64_t pos = 0;
3055 uint64_t prev_len = 0;
3056 unsigned n = 0;
3057
3058 while (!p.end()) {
3059 Extent *le = new Extent();
3060 uint64_t blobid;
3061 denc_varint(blobid, p);
3062 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
3063 uint64_t gap;
3064 denc_varint_lowz(gap, p);
3065 pos += gap;
3066 }
3067 le->logical_offset = pos;
3068 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
3069 denc_varint_lowz(le->blob_offset, p);
3070 } else {
3071 le->blob_offset = 0;
3072 }
3073 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
3074 denc_varint_lowz(prev_len, p);
3075 }
3076 le->length = prev_len;
3077
3078 if (blobid & BLOBID_FLAG_SPANNING) {
3079 dout(30) << __func__ << " getting spanning blob "
3080 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
3081 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
3082 } else {
3083 blobid >>= BLOBID_SHIFT_BITS;
3084 if (blobid) {
3085 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 3086 ceph_assert(le->blob);
7c673cae
FG
3087 } else {
3088 Blob *b = new Blob();
3089 uint64_t sbid = 0;
3090 b->decode(onode->c, p, struct_v, &sbid, false);
3091 blobs[n] = b;
3092 onode->c->open_shared_blob(sbid, b);
3093 le->assign_blob(b);
3094 }
3095 // we build ref_map dynamically for non-spanning blobs
3096 le->blob->get_ref(
3097 onode->c,
3098 le->blob_offset,
3099 le->length);
3100 }
3101 pos += prev_len;
3102 ++n;
3103 extent_map.insert(*le);
3104 }
3105
11fdf7f2 3106 ceph_assert(n == num);
7c673cae
FG
3107 return num;
3108}
3109
3110void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
3111{
3112 // Version 2 differs from v1 in blob's ref_map
3113 // serialization only. Hence there is no specific
3114 // handling at ExtentMap level.
3115 __u8 struct_v = 2;
3116
3117 denc(struct_v, p);
3118 denc_varint((uint32_t)0, p);
3119 size_t key_size = 0;
3120 denc_varint((uint32_t)0, key_size);
3121 p += spanning_blob_map.size() * key_size;
3122 for (const auto& i : spanning_blob_map) {
3123 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3124 }
3125}
3126
3127void BlueStore::ExtentMap::encode_spanning_blobs(
3128 bufferlist::contiguous_appender& p)
3129{
3130 // Version 2 differs from v1 in blob's ref_map
3131 // serialization only. Hence there is no specific
3132 // handling at ExtentMap level.
3133 __u8 struct_v = 2;
3134
3135 denc(struct_v, p);
3136 denc_varint(spanning_blob_map.size(), p);
3137 for (auto& i : spanning_blob_map) {
3138 denc_varint(i.second->id, p);
3139 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
3140 }
3141}
3142
3143void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 3144 bufferptr::const_iterator& p)
7c673cae
FG
3145{
3146 __u8 struct_v;
3147 denc(struct_v, p);
3148 // Version 2 differs from v1 in blob's ref_map
3149 // serialization only. Hence there is no specific
3150 // handling at ExtentMap level.
11fdf7f2 3151 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
3152
3153 unsigned n;
3154 denc_varint(n, p);
3155 while (n--) {
3156 BlobRef b(new Blob());
3157 denc_varint(b->id, p);
3158 spanning_blob_map[b->id] = b;
3159 uint64_t sbid = 0;
3160 b->decode(onode->c, p, struct_v, &sbid, true);
3161 onode->c->open_shared_blob(sbid, b);
3162 }
3163}
3164
3165void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
3166{
3167 shards.resize(onode->onode.extent_map_shards.size());
3168 unsigned i = 0;
3169 for (auto &s : onode->onode.extent_map_shards) {
3170 shards[i].shard_info = &s;
3171 shards[i].loaded = loaded;
3172 shards[i].dirty = dirty;
3173 ++i;
3174 }
3175}
3176
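// Ensure every extent-map shard overlapping [offset, offset+length) is
// loaded: missing shards are fetched from the kv store, decoded, and
// sanity-checked against the recorded shard byte count.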
3177void BlueStore::ExtentMap::fault_range(
3178 KeyValueDB *db,
3179 uint32_t offset,
3180 uint32_t length)
3181{
7c673cae
FG
3182 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3183 << std::dec << dendl;
3184 auto start = seek_shard(offset);
3185 auto last = seek_shard(offset + length);
3186
3187 if (start < 0)
3188 return;
3189
11fdf7f2 3190 ceph_assert(last >= start);
7c673cae
FG
3191 string key;
3192 while (start <= last) {
11fdf7f2 3193 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3194 auto p = &shards[start];
3195 if (!p->loaded) {
3196 dout(30) << __func__ << " opening shard 0x" << std::hex
3197 << p->shard_info->offset << std::dec << dendl;
3198 bufferlist v;
3199 generate_extent_shard_key_and_apply(
3200 onode->key, p->shard_info->offset, &key,
3201 [&](const string& final_key) {
3202 int r = db->get(PREFIX_OBJ, final_key, &v);
3203 if (r < 0) {
3204 derr << __func__ << " missing shard 0x" << std::hex
3205 << p->shard_info->offset << std::dec << " for " << onode->oid
3206 << dendl;
11fdf7f2 3207 ceph_assert(r >= 0);
7c673cae
FG
3208 }
3209 }
3210 );
3211 p->extents = decode_some(v);
3212 p->loaded = true;
3213 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
3214 << p->shard_info->offset
3215 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 3216 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
3217 ceph_assert(p->dirty == false);
3218 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
3219 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3220 } else {
3221 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3222 }
3223 ++start;
3224 }
3225}
3226
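// Mark all shards overlapping [offset, offset+length) dirty so update()
// re-encodes them; touching a shard that was never faulted in is a fatal
// logic error.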
3227void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
3228 uint32_t offset,
3229 uint32_t length)
3230{
7c673cae
FG
3231 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3232 << std::dec << dendl;
3233 if (shards.empty()) {
3234 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3235 inline_bl.clear();
3236 return;
3237 }
3238 auto start = seek_shard(offset);
11fdf7f2
TL
3239 if (length == 0) {
3240 length = 1;
3241 }
3242 auto last = seek_shard(offset + length - 1);
7c673cae
FG
3243 if (start < 0)
3244 return;
3245
11fdf7f2 3246 ceph_assert(last >= start);
7c673cae 3247 while (start <= last) {
11fdf7f2 3248 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3249 auto p = &shards[start];
3250 if (!p->loaded) {
11fdf7f2
TL
 3251 derr << __func__ << " on write 0x" << std::hex << offset
3252 << "~" << length << " shard 0x" << p->shard_info->offset
3253 << std::dec << " is not loaded, can't mark dirty" << dendl;
3254 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
3255 }
3256 if (!p->dirty) {
3257 dout(20) << __func__ << " mark shard 0x" << std::hex
3258 << p->shard_info->offset << std::dec << " dirty" << dendl;
3259 p->dirty = true;
3260 }
3261 ++start;
3262 }
3263}
3264
3265BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3266 uint64_t offset)
3267{
3268 Extent dummy(offset);
3269 return extent_map.find(dummy);
3270}
3271
7c673cae
FG
3272BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3273 uint64_t offset)
3274{
3275 Extent dummy(offset);
3276 auto fp = extent_map.lower_bound(dummy);
3277 if (fp != extent_map.begin()) {
3278 --fp;
3279 if (fp->logical_end() <= offset) {
3280 ++fp;
3281 }
3282 }
3283 return fp;
3284}
3285
3286BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3287 uint64_t offset) const
3288{
3289 Extent dummy(offset);
3290 auto fp = extent_map.lower_bound(dummy);
3291 if (fp != extent_map.begin()) {
3292 --fp;
3293 if (fp->logical_end() <= offset) {
3294 ++fp;
3295 }
3296 }
3297 return fp;
3298}
3299
3300bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3301{
3302 auto fp = seek_lextent(offset);
3303 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3304 return false;
3305 }
3306 return true;
3307}
3308
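// Merge runs of adjacent extents around [offset, offset+length) that
// reference the same blob contiguously, both logically and within the blob,
// never merging across a shard boundary. Returns the number of extents
// removed.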
3309int BlueStore::ExtentMap::compress_extent_map(
3310 uint64_t offset,
3311 uint64_t length)
3312{
7c673cae
FG
3313 if (extent_map.empty())
3314 return 0;
3315 int removed = 0;
3316 auto p = seek_lextent(offset);
3317 if (p != extent_map.begin()) {
3318 --p; // start to the left of offset
3319 }
3320 // the caller should have just written to this region
11fdf7f2 3321 ceph_assert(p != extent_map.end());
7c673cae
FG
3322
3323 // identify the *next* shard
3324 auto pshard = shards.begin();
3325 while (pshard != shards.end() &&
3326 p->logical_offset >= pshard->shard_info->offset) {
3327 ++pshard;
3328 }
3329 uint64_t shard_end;
3330 if (pshard != shards.end()) {
3331 shard_end = pshard->shard_info->offset;
3332 } else {
3333 shard_end = OBJECT_MAX_SIZE;
3334 }
3335
3336 auto n = p;
3337 for (++n; n != extent_map.end(); p = n++) {
3338 if (n->logical_offset > offset + length) {
3339 break; // stop after end
3340 }
3341 while (n != extent_map.end() &&
3342 p->logical_end() == n->logical_offset &&
3343 p->blob == n->blob &&
3344 p->blob_offset + p->length == n->blob_offset &&
3345 n->logical_offset < shard_end) {
3346 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3347 << " next shard 0x" << shard_end << std::dec
3348 << " merging " << *p << " and " << *n << dendl;
3349 p->length += n->length;
3350 rm(n++);
3351 ++removed;
3352 }
3353 if (n == extent_map.end()) {
3354 break;
3355 }
3356 if (n->logical_offset >= shard_end) {
11fdf7f2 3357 ceph_assert(pshard != shards.end());
7c673cae
FG
3358 ++pshard;
3359 if (pshard != shards.end()) {
3360 shard_end = pshard->shard_info->offset;
3361 } else {
3362 shard_end = OBJECT_MAX_SIZE;
3363 }
3364 }
3365 }
11fdf7f2 3366 if (removed) {
7c673cae
FG
3367 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3368 }
3369 return removed;
3370}
3371
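// Remove the logical range [offset, offset+length). An extent straddling
// both ends is split and its middle dereferenced; one straddling only the
// start loses its tail, one fully covered is dropped whole, and one
// straddling only the end loses its head. Each dereferenced piece is queued
// on old_extents for later cleanup.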
3372void BlueStore::ExtentMap::punch_hole(
3373 CollectionRef &c,
3374 uint64_t offset,
3375 uint64_t length,
3376 old_extent_map_t *old_extents)
3377{
3378 auto p = seek_lextent(offset);
3379 uint64_t end = offset + length;
3380 while (p != extent_map.end()) {
3381 if (p->logical_offset >= end) {
3382 break;
3383 }
3384 if (p->logical_offset < offset) {
3385 if (p->logical_end() > end) {
3386 // split and deref middle
3387 uint64_t front = offset - p->logical_offset;
3388 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3389 length, p->blob);
3390 old_extents->push_back(*oe);
3391 add(end,
3392 p->blob_offset + front + length,
3393 p->length - front - length,
3394 p->blob);
3395 p->length = front;
3396 break;
3397 } else {
3398 // deref tail
11fdf7f2 3399 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3400 uint64_t keep = offset - p->logical_offset;
3401 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3402 p->length - keep, p->blob);
3403 old_extents->push_back(*oe);
3404 p->length = keep;
3405 ++p;
3406 continue;
3407 }
3408 }
3409 if (p->logical_offset + p->length <= end) {
3410 // deref whole lextent
3411 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3412 p->length, p->blob);
3413 old_extents->push_back(*oe);
3414 rm(p++);
3415 continue;
3416 }
3417 // deref head
3418 uint64_t keep = p->logical_end() - end;
3419 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3420 p->length - keep, p->blob);
3421 old_extents->push_back(*oe);
3422
3423 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3424 rm(p);
3425 break;
3426 }
3427}
3428
3429BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3430 CollectionRef &c,
3431 uint64_t logical_offset,
3432 uint64_t blob_offset, uint64_t length, BlobRef b,
3433 old_extent_map_t *old_extents)
3434{
 3435 // We need a completely initialized Blob to increment its ref counters.
11fdf7f2 3436 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3437
 3438 // Do get_ref prior to punch_hole to prevent putting a reused blob into
 3439 // the old_extents list if we overwrite the blob totally.
 3440 // This might happen during a WAL overwrite.
3441 b->get_ref(onode->c, blob_offset, length);
3442
3443 if (old_extents) {
3444 punch_hole(c, logical_offset, length, old_extents);
3445 }
3446
3447 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3448 extent_map.insert(*le);
3449 if (spans_shard(logical_offset, length)) {
3450 request_reshard(logical_offset, logical_offset + length);
3451 }
3452 return le;
3453}
3454
3455BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3456 BlobRef lb,
3457 uint32_t blob_offset,
3458 uint32_t pos)
3459{
7c673cae
FG
3460 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3461 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3462 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3463 << dendl;
3464 BlobRef rb = onode->c->new_blob();
3465 lb->split(onode->c, blob_offset, rb.get());
3466
3467 for (auto ep = seek_lextent(pos);
3468 ep != extent_map.end() && ep->logical_offset < end_pos;
3469 ++ep) {
3470 if (ep->blob != lb) {
3471 continue;
3472 }
3473 if (ep->logical_offset < pos) {
3474 // split extent
3475 size_t left = pos - ep->logical_offset;
3476 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3477 extent_map.insert(*ne);
3478 ep->length = left;
3479 dout(30) << __func__ << " split " << *ep << dendl;
3480 dout(30) << __func__ << " to " << *ne << dendl;
3481 } else {
3482 // switch blob
11fdf7f2 3483 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3484
3485 ep->blob = rb;
3486 ep->blob_offset -= blob_offset;
3487 dout(30) << __func__ << " adjusted " << *ep << dendl;
3488 }
3489 }
3490 return rb;
3491}
3492
3493// Onode
3494
3495#undef dout_prefix
3496#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3497
f6b5b4d7
TL
3498//
3499// A tricky thing about the Onode's ref counter is that we do an extra
3500// increment when a newly pinned instance is detected, and -1 on unpin.
3501// This prevents a conflict with a delete call (when nref == 0). The
3502// latter might happen while a thread is still in the unpin path (e.g.
3503// waiting for lock acquisition) after nref has already been decremented,
3504// and another 'putting' thread then releases the instance.
3505//
3506void BlueStore::Onode::get() {
adb31ebb
TL
3507 if (++nref >= 2 && !pinned) {
3508 OnodeCacheShard* ocs = c->get_onode_cache();
f67539c2
TL
3509 ocs->lock.lock();
3510 // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
3511 while (ocs != c->get_onode_cache()) {
3512 ocs->lock.unlock();
3513 ocs = c->get_onode_cache();
3514 ocs->lock.lock();
3515 }
adb31ebb
TL
3516 bool was_pinned = pinned;
3517 pinned = nref >= 2;
3518 // additional increment for newly pinned instance
3519 bool r = !was_pinned && pinned;
3520 if (r) {
3521 ++nref;
3522 }
3523 if (cached && r) {
3524 ocs->_pin(this);
3525 }
f67539c2 3526 ocs->lock.unlock();
f6b5b4d7
TL
3527 }
3528}
3529void BlueStore::Onode::put() {
adb31ebb
TL
3530 int n = --nref;
3531 if (n == 2) {
3532 OnodeCacheShard* ocs = c->get_onode_cache();
f67539c2
TL
3533 ocs->lock.lock();
3534 // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
3535 while (ocs != c->get_onode_cache()) {
3536 ocs->lock.unlock();
3537 ocs = c->get_onode_cache();
3538 ocs->lock.lock();
3539 }
adb31ebb
TL
3540 bool need_unpin = pinned;
3541 pinned = pinned && nref > 2; // intentionally use > not >= as we have
3542 // +1 due to pinned state
3543 need_unpin = need_unpin && !pinned;
3544 if (cached && need_unpin) {
3545 if (exists) {
3546 ocs->_unpin(this);
3547 } else {
3548 ocs->_unpin_and_rm(this);
3549 // remove will also decrement nref and delete Onode
3550 c->onode_map._remove(oid);
3551 }
3552 }
3553 // additional decrement for newly unpinned instance
3554 // should be the last action since Onode can be released
3555 // at any point after this decrement
3556 if (need_unpin) {
3557 n = --nref;
3558 }
f67539c2 3559 ocs->lock.unlock();
f6b5b4d7 3560 }
adb31ebb 3561 if (n == 0) {
f6b5b4d7
TL
3562 delete this;
3563 }
3564}
3565
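// Materialize an Onode from its kv value: attrs are rehomed into the cache
// mempool, spanning blobs are decoded first, then either the inline extent
// map is decoded eagerly or per-shard stubs are set up for lazy fault-in.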
eafe8130
TL
3566BlueStore::Onode* BlueStore::Onode::decode(
3567 CollectionRef c,
3568 const ghobject_t& oid,
3569 const string& key,
3570 const bufferlist& v)
3571{
3572 Onode* on = new Onode(c.get(), oid, key);
3573 on->exists = true;
3574 auto p = v.front().begin_deep();
3575 on->onode.decode(p);
3576 for (auto& i : on->onode.attrs) {
f91f0fd5 3577 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
eafe8130
TL
3578 }
3579
3580 // initialize extent_map
3581 on->extent_map.decode_spanning_blobs(p);
3582 if (on->onode.extent_map_shards.empty()) {
3583 denc(on->extent_map.inline_bl, p);
3584 on->extent_map.decode_some(on->extent_map.inline_bl);
3585 on->extent_map.inline_bl.reassign_to_mempool(
f91f0fd5 3586 mempool::mempool_bluestore_cache_data);
eafe8130
TL
3587 }
3588 else {
3589 on->extent_map.init_shards(false, false);
3590 }
3591 return on;
3592}
3593
7c673cae
FG
3594void BlueStore::Onode::flush()
3595{
3596 if (flushing_count.load()) {
3597 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3598 waiting_count++;
11fdf7f2 3599 std::unique_lock l(flush_lock);
7c673cae
FG
3600 while (flushing_count.load()) {
3601 flush_cond.wait(l);
3602 }
9f95a23c 3603 waiting_count--;
7c673cae
FG
3604 }
3605 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3606}
3607
9f95a23c
TL
3608void BlueStore::Onode::dump(Formatter* f) const
3609{
3610 onode.dump(f);
3611 extent_map.dump(f);
3612}


const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpg_omap()) {
    return PREFIX_PERPG_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

// '-' < '.' < '~'

void BlueStore::Onode::get_omap_header(string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::get_omap_key(const string& key, string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

void BlueStore::Onode::get_omap_tail(string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('~');
}

void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  size_t pos = sizeof(uint64_t) + 1;
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      pos += sizeof(uint64_t) + sizeof(uint32_t);
    } else if (onode.is_perpool_omap()) {
      pos += sizeof(uint64_t);
    }
  }
  *user_key = key.substr(pos);
}
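
// Illustrative sketch of the resulting key layout (example values only):
// for a per-pg omap object in pool 3 with hash H and nid 0x12, the KV key
// for user key "foo" is, conceptually,
//   <u64 pool=3><u32 H><u64 nid=0x12> '.' "foo"
// while the header and tail keys end in '-' and '~' respectively. Since
// '-' < '.' < '~' in ASCII, iterating [header, tail) visits exactly this
// object's omap entries in user-key order.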


// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
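
// Worked example (illustrative values only): with min_alloc_size 0x1000,
// an existing write at logical_offset 0x1800 of length 0x400 rounds out to
// [0x1000, 0x2000). A new write covering [0x1000, 0x2000) on the same blob
// satisfies loffs <= loffs2 && loffs_end > loffs2 and is reported as a
// conflict, while [0x2000, 0x3000) does not overlap and is allowed.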

// =======================================================

// DeferredBatch
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}

void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << " keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << " keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << " truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << " drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
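
// Worked example (illustrative offsets only): suppose iomap holds a single
// 0x3000-byte entry at 0x1000 and _discard(0x2000, 0x1000) is called. The
// predecessor check keeps [0x1000, 0x2000) as the head, re-inserts
// [0x3000, 0x4000) as a tail entry under the same seq, and seq_bytes for
// that seq drops by the 0x1000 bytes cut out of the middle.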

void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}


// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}

void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}

void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}

BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
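
// Lookup flow in brief (informal summary, not new behavior): a cache hit in
// onode_map returns immediately; otherwise the object key is built and, for
// anything but a create op, fetched from the kv store under PREFIX_OBJ. A
// miss with create == true yields a fresh in-memory Onode; a hit decodes
// the stored one. Either way, onode_map.add() registers the result so
// concurrent callers converge on a single instance.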

void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  auto *ocache = get_onode_cache();
  auto *ocache_dest = dest->get_onode_cache();

  // lock cache shards
  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
  std::lock_guard l(ocache->lock, std::adopt_lock);
  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
  std::lock_guard l3(cache->lock, std::adopt_lock);
  std::lock_guard l4(dest->cache->lock, std::adopt_lock);
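  // Note: std::lock acquires all four shard locks with a deadlock-avoidance
  // algorithm, and the adopt_lock guards merely take ownership for scoped
  // release; in C++17 a single std::scoped_lock over the four mutexes would
  // be an equivalent formulation.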

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensure that nref stays >= 2, so the onode remains pinned and
      // physically out of the cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (o->cached) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers.  cover shared blobs from
      // both the extent map and the spanning blob map (the full extent
      // map may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}

// =======================================================

// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }
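  // Numeric sketch (illustrative values only): with osd_memory_target 4 GiB,
  // expected fragmentation 0.15, and osd_memory_base 768 MiB, ltarget is
  // 0.85 * 4 GiB = 3.4 GiB, so max becomes 3.4 GiB - 768 MiB ~= 2.65 GiB;
  // if ltarget did not clear base + min, max would stay at the cache min.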

  binned_kv_cache = store->db->get_priority_cache();
  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true, "bluestore-pricache");
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
    if (binned_kv_onode_cache != nullptr) {
      pcm->insert("kv_onode", binned_kv_onode_cache, true);
    }
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when a balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval / 3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do the final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}

void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  if (binned_kv_onode_cache != nullptr) {
    binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t kv_onode_alloc =
    static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
    if (binned_kv_onode_cache != nullptr) {
      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
    }
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " kv_onode_alloc: " << kv_onode_alloc
            << " kv_onode_used: " << kv_onode_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " kv_onode_alloc: " << kv_onode_alloc
             << " kv_onode_used: " << kv_onode_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
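
// Shard sizing sketch (illustrative values only): with meta_alloc 1 GiB,
// 8 onode shards, and an estimated 4 KiB per onode, each shard would be
// capped at (1 GiB / 8) / 4 KiB = 32768 onodes; buffer shards are simply
// capped at data_alloc / buffer_shards bytes.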

void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct, 20) << __func__ << " after " << after << " key "
                             << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct, 20) << __func__ << " to " << to << " key "
                             << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct, 20) << __func__ << " is at "
                             << pretty_binary_string(it->raw_key().second)
                             << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
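
// Typical driving loop (hedged sketch; callers normally obtain this through
// ObjectStore::get_omap_iterator rather than constructing the impl
// directly):
//
//   it->seek_to_first();
//   while (it->valid()) {
//     std::string k = it->key();        // user key, prefix stripped
//     ceph::bufferlist v = it->value();
//     // ... consume k/v ...
//     it->next();
//   }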


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct


static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(shared_alloc.a);
  shared_alloc.a->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    zoned_cleaner_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
      changed.count("bluestore_warn_on_no_per_pg_omap")) {
    _check_no_per_pg_or_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
4734
92f5a8d4
TL
4735void BlueStore::_update_osd_memory_options()
4736{
4737 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4738 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4739 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4740 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4741 config_changed++;
4742 dout(10) << __func__
4743 << " osd_memory_target " << osd_memory_target
4744 << " osd_memory_base " << osd_memory_base
4745 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4746 << " osd_memory_cache_min " << osd_memory_cache_min
4747 << dendl;
4748}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio = (double)1.0 -
                     (double)cache_meta_ratio -
                     (double)cache_kv_ratio -
                     (double)cache_kv_onode_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
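
// Ratio sketch (illustrative values only): with cache_size 3 GiB,
// meta_ratio 0.45, kv_ratio 0.45, and kv_onode_ratio 0.04, the data ratio
// works out to 1 - 0.45 - 0.45 - 0.04 = 0.06, i.e. roughly 184 MiB for
// data buffers; when autotuning is active these static splits only seed
// the PriorityCache manager, which then rebalances them at runtime.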

int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_committing state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
            "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_big_deferred,
                    "bluestore_write_big_deferred",
                    "Big overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_deferred,
                    "bluestore_write_deferred",
                    "Overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required us to read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
                    "Write into new blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
                 "Average omap get_keys call latency");
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
                 "Average omap get_values call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
                 "Average removal latency");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (ceph::buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	       << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	   << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
	   << std::dec << " order " << (int)min_alloc_size_order
	   << " max_alloc_size 0x" << std::hex << max_alloc_size
	   << " prefer_deferred_size 0x" << prefer_deferred_size
	   << std::dec
	   << " deferred_batch_ops " << deferred_batch_ops
	   << dendl;
}
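
// Rough illustration of what these knobs mean (the option defaults are
// assumptions; check your release's config reference): with
// bluestore_prefer_deferred_size_hdd = 32768, a 16 KiB overwrite on an
// HDD takes the deferred path (the data rides along in the kv
// transaction and is applied to the block device later), while a 1 MiB
// write goes straight to the device; deferred_batch_ops is how many
// deferred IOs are accumulated before a batch is submitted.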

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }

  if (bdev->is_smr()) {
    freelist_type = "zoned";
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}
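
// Example of the block parameters derived above: for a 4096-byte device
// block, block_size_order = ctz(4096) = 12 and block_mask = ~0xfff, so
// (offset & block_mask) rounds an offset down to a block boundary; the
// ceph_assert simply checks that block_size is a power of two
// (1u << 12 == 4096).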

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  uint64_t dev_size = bdev->get_size();
  ceph_assert(dev_size > _get_ondisk_reserved());
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);

    uint64_t alloc_size = min_alloc_size;
    if (bdev->is_smr()) {
      alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
    }

    fm->create(bdev->get_size(), alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs doing that itself.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
	      << cct->_conf->bluestore_debug_prefill << " with max free extent "
	      << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
	uint64_t l = (rand() % max_b + 1) * min_alloc_size;
	if (start + l > end) {
	  l = end - start;
	  l = p2align(l, min_alloc_size);
	}
	ceph_assert(start + l <= end);

	uint64_t u = 1 + (uint64_t)(r * (double)l);
	u = p2roundup(u, min_alloc_size);
	if (start + l + u > end) {
	  u = end - (start + l);
	  // trim to align so we don't overflow again
	  u = p2align(u, min_alloc_size);
	  stop = true;
	}
	ceph_assert(start + l + u <= end);

	dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
		 << " use 0x" << u << std::dec << dendl;

	if (u == 0) {
	  // break if u has been trimmed to nothing
	  break;
	}

	fm->allocate(start + l, u, t);
	start += l + u;
      }
    }
    r = _write_out_fm_meta(0);
    ceph_assert(r == 0);
  } else {
    r = fm->init(db, read_only,
      [&](const std::string& key, std::string* result) {
	return read_meta(key, result);
      });
    if (r < 0) {
      derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  // If the size tracked by the freelist manager is higher than the actual
  // device size, allocations can land past the end of the device, which
  // results in data loss and/or assertions. The user has probably altered
  // the device size somehow; the only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
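
// A note on the debug-prefill math above (editorial, not upstream): to
// converge on a used fraction p, the loop leaves a random free extent of
// length l and then allocates u right after it with
//   u ~= l * p / (1 - p),  since u / (l + u) == p  <=>  u == l * p / (1 - p)
// which is exactly what `r = p / (1.0 - p); u = 1 + r * l` computes. For
// bluestore_debug_prefill = 0.2, every free extent is therefore followed
// by an allocated chunk of roughly l / 4.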

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_write_out_fm_meta(uint64_t target_size)
{
  int r = 0;
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  for (auto& m : fm_meta) {
    r = write_meta(m.first, m.second);
    ceph_assert(r == 0);
  }
  return r;
}

int BlueStore::_create_alloc()
{
  ceph_assert(shared_alloc.a == NULL);
  ceph_assert(bdev->get_size());

  uint64_t alloc_size = min_alloc_size;
  if (bdev->is_smr()) {
    int r = _zoned_check_config_settings();
    if (r < 0)
      return r;
    alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
  }

  shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
				     bdev->get_size(),
				     alloc_size, "block"));

  if (!shared_alloc.a) {
    lderr(cct) << __func__ << " failed to create allocator: "
	       << cct->_conf->bluestore_allocator
	       << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_init_alloc()
{
  int r = _create_alloc();
  if (r < 0) {
    return r;
  }
  ceph_assert(shared_alloc.a != NULL);

  if (bdev->is_smr()) {
    shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    shared_alloc.a->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  dout(1) << __func__
	  << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
	  << std::hex
	  << ", allocator type " << shared_alloc.a->get_type()
	  << ", capacity 0x" << shared_alloc.a->get_capacity()
	  << ", block size 0x" << shared_alloc.a->get_block_size()
	  << ", free 0x" << shared_alloc.a->get_free()
	  << ", fragmentation " << shared_alloc.a->get_fragmentation()
	  << std::dec << dendl;

  return 0;
}

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(shared_alloc.a);
  shared_alloc.a->shutdown();
  delete shared_alloc.a;
  shared_alloc.reset();
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}

int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
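
// Note on the lock above: after the memset, l_start == l_len == 0 with
// l_whence == SEEK_SET, so F_SETLK requests a write lock over the whole
// fsid file. POSIX record locks are advisory and are released
// automatically when the owning process closes the fd or exits, which is
// what makes this a reliable "is another ceph-osd using this dir?" probe.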

bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
	    << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist). only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}

int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard,
      SUPER_RESERVED);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_DB),
	"bluefs db", create);
      if (r < 0) {
	derr << __func__
	     << " check block device(" << bfn << ") label returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
			       0, // no need to provide valid 'reserved' for shared dev
			       &shared_alloc);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
	 << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
				 create && cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_WAL),
	"bluefs wal", create);
      if (r < 0) {
	derr << __func__ << " check block device(" << bfn
	     << ") label returned: " << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }

    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
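
// Summary of the BlueFS layout produced by the branches above:
//
//   block.db present  -> dedicated BDEV_DB, and the main "block" device
//                        is registered as the shared/slow bdev
//                        (bluefs_layout.shared_bdev = BDEV_SLOW)
//   block.db absent   -> the main "block" device doubles as BDEV_DB
//                        (shared_bdev = BDEV_DB)
//   block.wal present -> BDEV_WAL added and dedicated_wal = true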

int BlueStore::_open_bluefs(bool create, bool read_only)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  BlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;
    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    rocksdb::Options rocks_opts;
    r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }
    if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
      vselector = new FitToFastVolumeSelector(
	bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
    } else {
      double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
      vselector =
	new RocksDBBlueFSVolumeSelector(
	  bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
	  bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
	  bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
	  1024 * 1024 * 1024, //FIXME: set expected l0 size here
	  rocks_opts.max_bytes_for_level_base,
	  rocks_opts.max_bytes_for_level_multiplier,
	  reserved_factor,
	  cct->_conf->bluestore_volume_selection_reserved,
	  cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
    }
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}
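
// The 95/100 factors above deliberately under-report each BlueFS device
// to the volume selector, keeping ~5% headroom. The intended effect (an
// interpretation, not a documented guarantee): with, say, a 10 GiB
// block.db the selector budgets ~9.5 GiB for RocksDB levels, so data
// starts spilling to the slow device slightly before the fast device is
// truly full instead of failing allocation right at the edge.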

void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}

int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
	   << dendl;
      return -EIO;
    }
  }
  return 0;
}

/*
* opens the DB and the dependent super_meta, FreelistManager and allocator
* in the proper order
*/
int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
{
  dout(0) << __func__ << " read-only:" << read_only
	  << " repair:" << to_repair << dendl;
  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  // open in read-only first to read FM list and init allocator
  // as they might be needed for some BlueFS procedures
  r = _open_db(false, false, true);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0) {
    goto out_db;
  }

  r = _open_fm(nullptr, true);
  if (r < 0)
    goto out_db;

  r = _init_alloc();
  if (r < 0)
    goto out_fm;

  // Re-open in the proper mode(s).
  //
  // We can't simply bypass the second open for read-only mode: we still
  // need to load the extents allocated to bluefs into the allocator.
  _close_db(true);

  r = _open_db(false, to_repair, read_only);
  if (r < 0) {
    goto out_alloc;
  }
  return 0;

 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  _close_db(read_only);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}

void BlueStore::_close_db_and_around(bool read_only)
{
  _close_db(read_only);
  _close_fm();
  _close_alloc();
  _close_bdev();
  _close_fsid();
  _close_path();
}

int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
{
  _kv_only = true;
  int r = _open_db_and_around(false, to_repair);
  if (r == 0) {
    *pdb = db;
  } else {
    *pdb = nullptr;
  }
  return r;
}

int BlueStore::close_db_environment()
{
  _close_db_and_around(false);
  return 0;
}

int BlueStore::_prepare_db_environment(bool create, bool read_only,
				       std::string* _fn, std::string* _kv_backend)
{
  int r;
  ceph_assert(!db);
  std::string& fn = *_fn;
  std::string& kv_backend = *_kv_backend;
  fn = path + "/db";
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  // (only the key's presence matters; it is erased again below when no
  // wal dir exists)
  kv_options["separate_wal_dir"] = "1";
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create, read_only);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
	string cmd = "rm -rf " + path + "/db " +
	  path + "/db.slow " +
	  path + "/db.wal";
	int r = system(cmd.c_str());
	(void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
	if (!first) {
	  db_paths << " ";
	}
	first = false;
	db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
	env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
	kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
	kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  kv_options,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db, freelist_type);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);
  return 0;
}
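
// Sketch of what RocksDB ends up seeing via kv_options["db_paths"] when
// both block.db and block exist (the sizes are invented for illustration;
// real values come from the volume selector):
//
//   db_paths = "db,96635920384 db.slow,3840755982336"
//
// RocksDB fills SST files into the first path up to its stated budget and
// overflows into db.slow, which lives on the main device.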

int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!(create && read_only));
  string options;
  string options_annex;
  stringstream err;
  string kv_dir_fn;
  string kv_backend;
  std::string sharding_def;
  r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
  if (r < 0) {
    derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
    return -EIO;
  }
  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;
    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    r = db->create_and_open(err, sharding_def);
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, sharding_def) :
      db->open(err, sharding_def);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << kv_dir_fn << " options " << options << dendl;
  return 0;
}

void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}

void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    shared_alloc.a->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}

int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
	this,
	onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
	buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
	cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
	decode(c->cnode, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " failed to decode cnode, key:"
	     << pretty_binary_string(it->key()) << dendl;
	return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
	       << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;

    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}

void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
	derr << __func__ << " unrecognized collection " << it->key() << dendl;
	if (errors) {
	  (*errors)++;
	}
      }
    }
  }
}

void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = OMAP_BULK;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    auto s = bl.to_str();
    if (s == stringify(OMAP_PER_POOL)) {
      per_pool_omap = OMAP_PER_POOL;
    } else {
      ceph_assert(s == stringify(OMAP_PER_PG));
      per_pool_omap = OMAP_PER_PG;
    }
    dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pg_or_pool_omap_alert();
}
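
// The three omap encodings distinguished above, oldest to newest:
// OMAP_BULK (all objects share one omap key space), OMAP_PER_POOL (keys
// are segregated by pool id) and OMAP_PER_PG (keys also carry pg
// information). mkfs() below stamps new stores with OMAP_PER_PG via the
// "per_pool_omap" super key; older stores keep their mode until a
// quick-fix/repair fsck upgrades them, hence the alert check.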

void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {

      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl;
      bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
	st.decode(p);
	vstatfs += st;

	dout(30) << __func__ << " pool " << pool_id
		 << " statfs " << st << dendl;
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " failed to decode pool stats, key:"
	     << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}

int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
	   << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
	   << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
	r = -errno;
	derr << __func__ << " failed to open " << epath << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
	      << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
	  S_ISREG(st.st_mode) && // if it is a regular file
	  st.st_size == 0) { // and is 0 bytes
	r = ::ftruncate(fd, size);
	if (r < 0) {
	  r = -errno;
	  derr << __func__ << " failed to resize " << name << " file to "
	       << size << ": " << cpp_strerror(r) << dendl;
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	  return r;
	}

	if (cct->_conf->bluestore_block_preallocate_file) {
	  // ceph_posix_fallocate returns a positive errno on failure,
	  // hence the r > 0 check and the negation below
	  r = ::ceph_posix_fallocate(fd, 0, size);
	  if (r > 0) {
	    derr << __func__ << " failed to prefallocate " << name << " file to "
		 << size << ": " << cpp_strerror(r) << dendl;
	    VOID_TEMP_FAILURE_RETRY(::close(fd));
	    return -r;
	  }
	}
	dout(1) << __func__ << " resized " << name << " file to "
		<< byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
	derr << __func__ << " failed to open " << name << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
    }
  }
  return 0;
}

int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;
  uint64_t reserved;
  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
	r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
	if (r < 0) {
	  derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
	       << dendl;
	  return r;
	}
	if (r > 0) {
	  derr << __func__ << " fsck found " << r << " errors" << dendl;
	  r = -EIO;
	}
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
	return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
				     cct->_conf->bluestore_block_wal_size,
				     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
				     cct->_conf->bluestore_block_db_size,
				     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }
  _validate_bdev();

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
	 << std::hex << min_alloc_size << std::dec
	 << " is not power of 2 aligned!"
	 << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _create_alloc();
  if (r < 0) {
    goto out_close_bdev;
  }

  reserved = _get_ondisk_reserved();
  shared_alloc.a->init_add_free(reserved,
    p2align(bdev->get_size(), min_alloc_size) - reserved);

  r = _open_db(true);
  if (r < 0)
    goto out_close_alloc;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }

    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append(stringify(OMAP_PER_PG));
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_alloc:
  _close_alloc();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
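
// mkfs() sequencing recap (mirrors the code above): check/write "type"
// meta -> create+lock fsid -> set up block/block.db/block.wal symlinks ->
// open bdev -> choose min_alloc_size (hdd vs ssd) -> create allocator and
// seed it with [reserved, end of device) -> create kv db -> initialize
// freelist and super keys (nid_max, blobid_max, min_alloc_size,
// per_pool_omap) -> write kv_backend/bluefs meta and fsid -> optional
// fsck -> stamp "mkfs_done".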

int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _open_db_and_around(true);

  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
				     cct->_conf->bluestore_block_wal_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }

    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
				     cct->_conf->bluestore_block_db_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  bluefs->umount();
  bluefs->mount();

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _close_db_and_around(true);
  return r;
}

int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
						 int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _open_db_and_around(true);

  uint64_t used_space = 0;
  for (auto src_id : devs_source) {
    used_space += bluefs->get_used(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (target_free < used_space) {
    derr << __func__
	 << " can't migrate, free space at target: " << target_free
	 << " is less than required space: " << used_space
	 << dendl;
    r = -ENOSPC;
    goto shutdown;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (devs_source.count(BlueFS::BDEV_DB)) {
    r = unlink(string(path + "/block.db").c_str());
    ceph_assert(r == 0);
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    r = unlink(string(path + "/block.wal").c_str());
    ceph_assert(r == 0);
  }

shutdown:
  _close_db_and_around(true);
  return r;
}

int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
					    int id,
					    const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _open_db_and_around(true);

  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  size_t target_size;
  string target_name;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
  }

  bluefs->umount();
  bluefs->mount();

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);

  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (!link_db.empty()) {
    r = unlink(link_db.c_str());
    ceph_assert(r == 0);
  }
  if (!link_wal.empty()) {
    r = unlink(link_wal.c_str());
    ceph_assert(r == 0);
  }
  r = _setup_block_symlink_or_file(
    target_name,
    dev_path,
    target_size,
    true);
  ceph_assert(r == 0);
  dout(0) << __func__ << " success" << dendl;

shutdown:
  _close_db_and_around(true);

  return r;
}
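
// These add/migrate entry points are normally driven by
// ceph-bluestore-tool (e.g. its bluefs-bdev-new-db, bluefs-bdev-new-wal
// and bluefs-bdev-migrate commands) rather than called directly: the tool
// opens the store offline and invokes the matching method here.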

string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
	res = path + "/block";
      } else {
	res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}

int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0) {
    derr << "unable to read label for " << path << ": "
	 << cpp_strerror(r) << dendl;
  } else {
    label.size = size;
    r = _write_bdev_label(cct, path, label);
    if (r < 0) {
      derr << "unable to write label for " << path << ": "
	   << cpp_strerror(r) << dendl;
    }
  }
  return r;
}

int BlueStore::expand_devices(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    out << devid
	<< " : expanding to 0x" << std::hex << size << std::dec << std::endl;
    string p = get_device_path(devid);
    const char* path = p.c_str();
    if (path == nullptr) {
      derr << devid
	   << ": can't find device path " << dendl;
      continue;
    }
    if (bluefs->bdev_support_label(devid)) {
      if (_set_bdev_label_size(p, size) >= 0) {
	out << devid
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
	<< " : expanding from 0x" << std::hex
	<< size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size);
    if (bdev->supported_bdev_label()) {
      if (_set_bdev_label_size(path, size) >= 0) {
	out << bluefs_layout.shared_bdev
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
    _close_db_and_around(true);

    // mount in read/write to sync expansion changes
    r = _mount();
    ceph_assert(r == 0);
    umount();
  } else {
    _close_db_and_around(true);
  }
  return r;
}

int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  _close_db_and_around(true);
  return r;
}

void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			      logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			       logger);
  }
}
6971
f67539c2 6972int BlueStore::_mount()
7c673cae
FG
6973{
6974 dout(1) << __func__ << " path " << path << dendl;
6975
f67539c2 6976 _kv_only = false;
7c673cae
FG
6977 if (cct->_conf->bluestore_fsck_on_mount) {
6978 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6979 if (rc < 0)
6980 return rc;
6981 if (rc > 0) {
6982 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6983 return -EIO;
6984 }
6985 }
6986
eafe8130
TL
6987 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6988 derr << __func__ << " osd_max_object_size "
6989 << cct->_conf->osd_max_object_size << " > bluestore max "
6990 << OBJECT_MAX_SIZE << dendl;
6991 return -EINVAL;
6992 }
6993
f67539c2 6994 int r = _open_db_and_around(false);
9f95a23c 6995 if (r < 0) {
f67539c2 6996 return r;
11fdf7f2 6997 }
7c673cae 6998
11fdf7f2
TL
6999 r = _upgrade_super();
7000 if (r < 0) {
7c673cae 7001 goto out_db;
11fdf7f2 7002 }
7c673cae
FG
7003
7004 r = _open_collections();
7005 if (r < 0)
11fdf7f2 7006 goto out_db;
7c673cae
FG
7007
7008 r = _reload_logger();
7009 if (r < 0)
7010 goto out_coll;
7011
31f18b77 7012 _kv_start();
7c673cae 7013
f67539c2
TL
7014 if (bdev->is_smr()) {
7015 _zoned_cleaner_start();
7016 }
7017
7c673cae
FG
7018 r = _deferred_replay();
7019 if (r < 0)
7020 goto out_stop;
7021
7022 mempool_thread.init();
7023
f67539c2 7024 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
eafe8130 7025 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c 7026
f67539c2 7027 auto was_per_pool_omap = per_pool_omap;
9f95a23c 7028
eafe8130
TL
7029 dout(1) << __func__ << " quick-fix on mount" << dendl;
7030 _fsck_on_open(FSCK_SHALLOW, true);
7031
7032 // reread statfs
7033 // FIXME minor: replace with actual open/close?
7034 _open_statfs();
eafe8130 7035 _check_legacy_statfs_alert();
9f95a23c
TL
7036
7037 // set again, as hopefully it has been fixed by the quick-fix above
f67539c2 7038 if (was_per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
7039 _set_per_pool_omap();
7040 }
eafe8130
TL
7041 }
7042
7c673cae
FG
7043 mounted = true;
7044 return 0;
7045
7046 out_stop:
f67539c2
TL
7047 if (bdev->is_smr()) {
7048 _zoned_cleaner_stop();
7049 }
7c673cae 7050 _kv_stop();
7c673cae 7051 out_coll:
f6b5b4d7 7052 _shutdown_cache();
7c673cae 7053 out_db:
1911f103 7054 _close_db_and_around(false);
7c673cae
FG
7055 return r;
7056}
7057
7058int BlueStore::umount()
7059{
11fdf7f2 7060 ceph_assert(_kv_only || mounted);
7c673cae
FG
7061 dout(1) << __func__ << dendl;
7062
7063 _osr_drain_all();
7c673cae 7064
7c673cae 7065 mounted = false;
3efd9988
FG
7066 if (!_kv_only) {
7067 mempool_thread.shutdown();
f67539c2
TL
7068 if (bdev->is_smr()) {
7069 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7070 _zoned_cleaner_stop();
7071 }
3efd9988
FG
7072 dout(20) << __func__ << " stopping kv thread" << dendl;
7073 _kv_stop();
f6b5b4d7 7074 _shutdown_cache();
3efd9988
FG
7075 dout(20) << __func__ << " closing" << dendl;
7076
3efd9988 7077 }
1911f103 7078 _close_db_and_around(false);
7c673cae
FG
7079
7080 if (cct->_conf->bluestore_fsck_on_umount) {
7081 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7082 if (rc < 0)
7083 return rc;
7084 if (rc > 0) {
7085 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7086 return -EIO;
7087 }
7088 }
7089 return 0;
7090}
7091
eafe8130
TL
7092int BlueStore::cold_open()
7093{
f67539c2 7094 return _open_db_and_around(true);
eafe8130 7095}
f67539c2 7096
eafe8130
TL
7097int BlueStore::cold_close()
7098{
1911f103 7099 _close_db_and_around(true);
eafe8130
TL
7100 return 0;
7101}
7102
9f95a23c
TL
7103// derr wrapper to limit enormous output and avoid log flooding.
7104// For now it is applied only where such output is expected.
7105#define fsck_derr(err_cnt, threshold) \
7106 if (err_cnt <= threshold) { \
7107 bool need_skip_print = err_cnt == threshold; \
7108 derr
7109
7110#define fsck_dendl \
7111 dendl; \
7112 if (need_skip_print) \
7113 derr << "more error lines skipped..." << dendl; \
7c673cae 7114 }
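// Usage sketch (mirrors the call sites later in this file): the pair
// brackets a single derr expression, so output is capped at roughly
// `threshold` lines followed by one "more error lines skipped..." notice:
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: " << o->oid
//     << " has omap that is not per-pool or pgmeta" << fsck_dendl;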
7c673cae 7115
eafe8130
TL
7116int _fsck_sum_extents(
7117 const PExtentVector& extents,
7118 bool compressed,
7119 store_statfs_t& expected_statfs)
7120{
7121 for (auto e : extents) {
7122 if (!e.is_valid())
7123 continue;
7124 expected_statfs.allocated += e.length;
7125 if (compressed) {
7126 expected_statfs.data_compressed_allocated += e.length;
7127 }
7128 }
7129 return 0;
7130}
7131
7c673cae 7132int BlueStore::_fsck_check_extents(
11fdf7f2 7133 const coll_t& cid,
7c673cae
FG
7134 const ghobject_t& oid,
7135 const PExtentVector& extents,
7136 bool compressed,
7137 mempool_dynamic_bitset &used_blocks,
b32b8144 7138 uint64_t granularity,
11fdf7f2 7139 BlueStoreRepairer* repairer,
eafe8130
TL
7140 store_statfs_t& expected_statfs,
7141 FSCKDepth depth)
7c673cae
FG
7142{
7143 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7144 int errors = 0;
7145 for (auto e : extents) {
7146 if (!e.is_valid())
7147 continue;
7148 expected_statfs.allocated += e.length;
7149 if (compressed) {
11fdf7f2 7150 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7151 }
eafe8130
TL
7152 if (depth != FSCK_SHALLOW) {
7153 bool already = false;
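// Mark every allocation unit covered by this pextent in used_blocks; a bit
// that is already set means two references resolve to the same unit, i.e.
// a misreference the repairer can later fix by rewriting the blob.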
9f95a23c 7154 apply_for_bitset_range(
eafe8130
TL
7155 e.offset, e.length, granularity, used_blocks,
7156 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7157 if (bs.test(pos)) {
7158 if (repairer) {
7159 repairer->note_misreference(
7160 pos * min_alloc_size, min_alloc_size, !already);
7161 }
7162 if (!already) {
7163 derr << "fsck error: " << oid << " extent " << e
7164 << " or a subset is already allocated (misreferenced)" << dendl;
7165 ++errors;
7166 already = true;
7167 }
11fdf7f2 7168 }
eafe8130
TL
7169 else
7170 bs.set(pos);
7171 });
7172 if (repairer) {
b3b6e05e 7173 repairer->set_space_used(e.offset, e.length, cid, oid);
eafe8130 7174 }
11fdf7f2 7175
eafe8130
TL
7176 if (e.end() > bdev->get_size()) {
7177 derr << "fsck error: " << oid << " extent " << e
7178 << " past end of block device" << dendl;
7179 ++errors;
7180 }
7c673cae
FG
7181 }
7182 }
7183 return errors;
7184}
7185
11fdf7f2
TL
7186void BlueStore::_fsck_check_pool_statfs(
7187 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7188 int64_t& errors,
7189 int64_t& warnings,
11fdf7f2
TL
7190 BlueStoreRepairer* repairer)
7191{
f67539c2 7192 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
7193 if (it) {
7194 for (it->lower_bound(string()); it->valid(); it->next()) {
7195 string key = it->key();
7196 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7197 if (repairer) {
eafe8130
TL
7198 ++errors;
7199 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7200 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7201 << dendl;
7202 }
7203 continue;
7204 }
11fdf7f2
TL
7205 uint64_t pool_id;
7206 if (get_key_pool_stat(key, &pool_id) < 0) {
7207 derr << "fsck error: bad key " << key
7208 << "in statfs namespece" << dendl;
7209 if (repairer) {
7210 repairer->remove_key(db, PREFIX_STAT, key);
7211 }
7212 ++errors;
7213 continue;
7214 }
7215
7216 volatile_statfs vstatfs;
7217 bufferlist bl = it->value();
7218 auto blp = bl.cbegin();
7219 try {
7220 vstatfs.decode(blp);
f67539c2 7221 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
7222 derr << "fsck error: failed to decode Pool StatFS record"
7223 << pretty_binary_string(key) << dendl;
7224 if (repairer) {
7225 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7226 << pretty_binary_string(key)
7227 << "', removing" << dendl;
7228 repairer->remove_key(db, PREFIX_STAT, key);
7229 }
7230 ++errors;
7231 vstatfs.reset();
7232 }
7233 auto stat_it = expected_pool_statfs.find(pool_id);
7234 if (stat_it == expected_pool_statfs.end()) {
7235 if (vstatfs.is_empty()) {
7236 // we don't consider that as an error since empty pool statfs
7237 // are left in DB for now
7238 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7239 << std::hex << pool_id << std::dec << dendl;
7240 if (repairer) {
7241 // but we need to increment error count in case of repair
7242 // to have proper counters at the end
7243 // (as repairer increments recovery counter anyway).
7244 ++errors;
7245 }
7246 } else {
7247 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7248 << std::hex << pool_id << std::dec << dendl;
7249 ++errors;
7250 }
7251 if (repairer) {
7252 repairer->remove_key(db, PREFIX_STAT, key);
7253 }
7254 continue;
7255 }
7256 store_statfs_t statfs;
7257 vstatfs.publish(&statfs);
7258 if (!(stat_it->second == statfs)) {
7259 derr << "fsck error: actual " << statfs
7260 << " != expected " << stat_it->second
7261 << " for pool "
7262 << std::hex << pool_id << std::dec << dendl;
7263 if (repairer) {
7264 repairer->fix_statfs(db, key, stat_it->second);
7265 }
7266 ++errors;
7267 }
7268 expected_pool_statfs.erase(stat_it);
7269 }
7270 } // if (it)
eafe8130
TL
7271 for (auto& s : expected_pool_statfs) {
7272 if (s.second.is_zero()) {
11fdf7f2
TL
7273 // we might lack empty statfs recs in DB
7274 continue;
7275 }
7276 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7277 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7278 if (repairer) {
7279 string key;
eafe8130
TL
7280 get_pool_stat_key(s.first, &key);
7281 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7282 }
7283 ++errors;
7284 }
eafe8130 7285 if (!per_pool_stat_collection &&
eafe8130
TL
7286 repairer) {
7287 // by virtue of running this method, we correct the top-level
7288 // error of having global stats
7289 repairer->inc_repaired();
7290 }
11fdf7f2
TL
7291}
7292
eafe8130
TL
7293BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7294 BlueStore::FSCKDepth depth,
7295 int64_t pool_id,
7296 BlueStore::CollectionRef c,
7297 const ghobject_t& oid,
7298 const string& key,
7299 const bufferlist& value,
9f95a23c 7300 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
7301 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7302 const BlueStore::FSCK_ObjectCtx& ctx)
7303{
7304 auto& errors = ctx.errors;
7305 auto& num_objects = ctx.num_objects;
7306 auto& num_extents = ctx.num_extents;
7307 auto& num_blobs = ctx.num_blobs;
7308 auto& num_sharded_objects = ctx.num_sharded_objects;
7309 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7310 auto used_blocks = ctx.used_blocks;
7311 auto sb_info_lock = ctx.sb_info_lock;
7312 auto& sb_info = ctx.sb_info;
7313 auto repairer = ctx.repairer;
7314
7315 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7316 &ctx.expected_pool_statfs[pool_id] :
7317 &ctx.expected_store_statfs;
7318
7319 dout(10) << __func__ << " " << oid << dendl;
7320 OnodeRef o;
7321 o.reset(Onode::decode(c, oid, key, value));
7322 ++num_objects;
7c673cae 7323
eafe8130 7324 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 7325
eafe8130
TL
7326 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7327 _dump_onode<30>(cct, *o);
7328 // shards
7329 if (!o->extent_map.shards.empty()) {
7330 ++num_sharded_objects;
7331 if (depth != FSCK_SHALLOW) {
9f95a23c 7332 ceph_assert(expecting_shards);
eafe8130
TL
7333 for (auto& s : o->extent_map.shards) {
7334 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 7335 expecting_shards->push_back(string());
eafe8130 7336 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 7337 &expecting_shards->back());
eafe8130
TL
7338 if (s.shard_info->offset >= o->onode.size) {
7339 derr << "fsck error: " << oid << " shard 0x" << std::hex
7340 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7341 << std::dec << dendl;
7342 ++errors;
7343 }
7344 }
7345 }
7346 }
7c673cae 7347
eafe8130
TL
7348 // lextents
7349 uint64_t pos = 0;
7350 mempool::bluestore_fsck::map<BlobRef,
7351 bluestore_blob_use_tracker_t> ref_map;
7352 for (auto& l : o->extent_map.extent_map) {
7353 dout(20) << __func__ << " " << l << dendl;
7354 if (l.logical_offset < pos) {
7355 derr << "fsck error: " << oid << " lextent at 0x"
7356 << std::hex << l.logical_offset
7357 << " overlaps with the previous, which ends at 0x" << pos
7358 << std::dec << dendl;
7359 ++errors;
7360 }
7361 if (depth != FSCK_SHALLOW &&
7362 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7363 derr << "fsck error: " << oid << " lextent at 0x"
7364 << std::hex << l.logical_offset << "~" << l.length
7365 << " spans a shard boundary"
7366 << std::dec << dendl;
7367 ++errors;
7368 }
7369 pos = l.logical_offset + l.length;
7370 res_statfs->data_stored += l.length;
7371 ceph_assert(l.blob);
7372 const bluestore_blob_t& blob = l.blob->get_blob();
7373
7374 auto& ref = ref_map[l.blob];
7375 if (ref.is_empty()) {
7376 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7377 uint32_t l = blob.get_logical_length();
7378 ref.init(l, min_release_size);
7379 }
7380 ref.get(
7381 l.blob_offset,
7382 l.length);
7383 ++num_extents;
7384 if (depth != FSCK_SHALLOW &&
7385 blob.has_unused()) {
7386 ceph_assert(referenced);
7387 auto p = referenced->find(l.blob);
7388 bluestore_blob_t::unused_t* pu;
7389 if (p == referenced->end()) {
7390 pu = &(*referenced)[l.blob];
7391 }
7392 else {
7393 pu = &p->second;
7394 }
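// Translate the lextent into bits of the blob's 'unused' bitmap: each of
// the (sizeof(unused_t) * 8) bits covers blob_len / nbits bytes. Worked
// example (assuming a 16-bit unused_t): for a 0x10000-byte blob
// chunk_size = 0x1000, so an lextent at blob_offset 0x3800 of length
// 0x1000 ends at 0x4800 and sets bits 3 and 4 (start = 3, end = 5).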
7395 uint64_t blob_len = blob.get_logical_length();
7396 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7397 ceph_assert(l.blob_offset + l.length <= blob_len);
7398 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7399 uint64_t start = l.blob_offset / chunk_size;
7400 uint64_t end =
7401 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7402 for (auto i = start; i < end; ++i) {
7403 (*pu) |= (1u << i);
7404 }
7405 }
7406 } //for (auto& l : o->extent_map.extent_map)
7407
7408 for (auto& i : ref_map) {
7409 ++num_blobs;
7410 const bluestore_blob_t& blob = i.first->get_blob();
7411 bool equal =
7412 depth == FSCK_SHALLOW ? true :
7413 i.first->get_blob_use_tracker().equal(i.second);
7414 if (!equal) {
7415 derr << "fsck error: " << oid << " blob " << *i.first
7416 << " doesn't match expected ref_map " << i.second << dendl;
7417 ++errors;
7418 }
7419 if (blob.is_compressed()) {
7420 res_statfs->data_compressed += blob.get_compressed_payload_length();
7421 res_statfs->data_compressed_original +=
7422 i.first->get_referenced_bytes();
7423 }
7424 if (blob.is_shared()) {
7425 if (i.first->shared_blob->get_sbid() > blobid_max) {
7426 derr << "fsck error: " << oid << " blob " << blob
7427 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7428 << blobid_max << dendl;
7429 ++errors;
7430 }
7431 else if (i.first->shared_blob->get_sbid() == 0) {
7432 derr << "fsck error: " << oid << " blob " << blob
7433 << " marked as shared but has uninitialized sbid"
7434 << dendl;
7435 ++errors;
7436 }
7437 // the below lock is optional and provided in multithreading mode only
7438 if (sb_info_lock) {
7439 sb_info_lock->lock();
7440 }
7441 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7442 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7443 ceph_assert(sbi.pool_id == INT64_MIN ||
7444 sbi.pool_id == oid.hobj.get_logical_pool());
7445 sbi.cid = c->cid;
7446 sbi.pool_id = oid.hobj.get_logical_pool();
7447 sbi.sb = i.first->shared_blob;
7448 sbi.oids.push_back(oid);
7449 sbi.compressed = blob.is_compressed();
7450 for (auto e : blob.get_extents()) {
7451 if (e.is_valid()) {
7452 sbi.ref_map.get(e.offset, e.length);
7453 }
7454 }
7455 if (sb_info_lock) {
7456 sb_info_lock->unlock();
7457 }
7458 } else if (depth != FSCK_SHALLOW) {
7459 ceph_assert(used_blocks);
7460 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7461 blob.is_compressed(),
7462 *used_blocks,
7463 fm->get_alloc_size(),
7464 repairer,
7465 *res_statfs,
7466 depth);
7467 } else {
7468 errors += _fsck_sum_extents(
7469 blob.get_extents(),
7470 blob.is_compressed(),
7471 *res_statfs);
7472 }
7473 } // for (auto& i : ref_map)
9f95a23c 7474
adb31ebb
TL
7475 {
7476 auto &sbm = o->extent_map.spanning_blob_map;
7477 size_t broken = 0;
7478 BlobRef first_broken;
7479 for (auto it = sbm.begin(); it != sbm.end();) {
7480 auto it1 = it++;
7481 if (ref_map.count(it1->second) == 0) {
7482 if (!broken) {
7483 first_broken = it1->second;
7484 ++errors;
7485 }
7486 broken++;
7487 if (repairer) {
7488 sbm.erase(it1);
7489 }
7490 }
7491 }
7492 if (broken) {
7493 derr << "fsck error: " << oid << " - " << broken
7494 << " zombie spanning blob(s) found, the first one: "
7495 << *first_broken << dendl;
7496 if(repairer) {
b3b6e05e
TL
7497 repairer->fix_spanning_blobs(
7498 db,
7499 [&](KeyValueDB::Transaction txn) {
7500 _record_onode(o, txn);
7501 });
adb31ebb
TL
7502 }
7503 }
7504 }
7505
9f95a23c
TL
7506 if (o->onode.has_omap()) {
7507 _fsck_check_object_omap(depth, o, ctx);
7508 }
7509
eafe8130
TL
7510 return o;
7511}
7512
7513#include "common/WorkQueue.h"
7514
7515class ShallowFSCKThreadPool : public ThreadPool
7516{
7517public:
7518 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7519 ThreadPool(cct_, nm, tn, n) {
7520 }
7521 void worker(ThreadPool::WorkThread* wt) override {
7522 int next_wq = 0;
7523 while (!_stop) {
7524 next_wq %= work_queues.size();
7525 WorkQueue_ *wq = work_queues[next_wq++];
7526
7527 void* item = wq->_void_dequeue();
7528 if (item) {
7529 processing++;
7530 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7531 wq->_void_process(item, tp_handle);
7532 processing--;
7533 }
7534 }
7535 }
7536 template <size_t BatchLen>
7537 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7538 {
7539 struct Entry {
7540 int64_t pool_id;
7541 BlueStore::CollectionRef c;
7542 ghobject_t oid;
7543 string key;
7544 bufferlist value;
7545 };
7546 struct Batch {
7547 std::atomic<size_t> running = { 0 };
7548 size_t entry_count = 0;
7549 std::array<Entry, BatchLen> entries;
7550
7551 int64_t errors = 0;
7552 int64_t warnings = 0;
7553 uint64_t num_objects = 0;
7554 uint64_t num_extents = 0;
7555 uint64_t num_blobs = 0;
7556 uint64_t num_sharded_objects = 0;
7557 uint64_t num_spanning_blobs = 0;
7558 store_statfs_t expected_store_statfs;
7559 BlueStore::per_pool_statfs expected_pool_statfs;
7560 };
7561
7562 size_t batchCount;
7563 BlueStore* store = nullptr;
7564
eafe8130
TL
7565 ceph::mutex* sb_info_lock = nullptr;
7566 BlueStore::sb_info_map_t* sb_info = nullptr;
7567 BlueStoreRepairer* repairer = nullptr;
7568
7569 Batch* batches = nullptr;
7570 size_t last_batch_pos = 0;
7571 bool batch_acquired = false;
7572
7573 FSCKWorkQueue(std::string n,
7574 size_t _batchCount,
7575 BlueStore* _store,
eafe8130
TL
7576 ceph::mutex* _sb_info_lock,
7577 BlueStore::sb_info_map_t& _sb_info,
7578 BlueStoreRepairer* _repairer) :
f67539c2 7579 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
7580 batchCount(_batchCount),
7581 store(_store),
eafe8130
TL
7582 sb_info_lock(_sb_info_lock),
7583 sb_info(&_sb_info),
7584 repairer(_repairer)
7585 {
7586 batches = new Batch[batchCount];
7587 }
7588 ~FSCKWorkQueue() {
7589 delete[] batches;
7590 }
7591
7592 /// Remove all work items from the queue.
7593 void _clear() override {
7594 //do nothing
7595 }
7596 /// Check whether there is anything to do.
7597 bool _empty() override {
7598 ceph_assert(false);
7599 }
7600
7601 /// Get the next work item to process.
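// A lock-free claim protocol (sketch): probe batches starting at a random
// slot; running.fetch_add(1) returning 0 means this worker now owns the
// batch exclusively. Losers undo their increment and try the next slot,
// so dequeueing never blocks concurrent queue() producers.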
7602 void* _void_dequeue() override {
7603 size_t pos = rand() % batchCount;
7604 size_t pos0 = pos;
7605 do {
7606 auto& batch = batches[pos];
7607 if (batch.running.fetch_add(1) == 0) {
7608 if (batch.entry_count) {
7609 return &batch;
7610 }
7611 }
7612 batch.running--;
7613 pos++;
7614 pos %= batchCount;
7615 } while (pos != pos0);
7616 return nullptr;
7617 }
7618 /** @brief Process the work item.
7619 * This function will be called several times in parallel
7620 * and must therefore be thread-safe. */
7621 void _void_process(void* item, TPHandle& handle) override {
7622 Batch* batch = (Batch*)item;
7623
7624 BlueStore::FSCK_ObjectCtx ctx(
7625 batch->errors,
7626 batch->warnings,
7627 batch->num_objects,
7628 batch->num_extents,
7629 batch->num_blobs,
7630 batch->num_sharded_objects,
7631 batch->num_spanning_blobs,
7632 nullptr, // used_blocks
9f95a23c 7633 nullptr, //used_omap_head
eafe8130
TL
7634 sb_info_lock,
7635 *sb_info,
7636 batch->expected_store_statfs,
7637 batch->expected_pool_statfs,
7638 repairer);
7639
7640 for (size_t i = 0; i < batch->entry_count; i++) {
7641 auto& entry = batch->entries[i];
7642
7643 store->fsck_check_objects_shallow(
7644 BlueStore::FSCK_SHALLOW,
7645 entry.pool_id,
7646 entry.c,
7647 entry.oid,
7648 entry.key,
7649 entry.value,
9f95a23c 7650 nullptr, // expecting_shards - this will need a protection if passed
eafe8130
TL
7651 nullptr, // referenced
7652 ctx);
7653 }
7654 //std::cout << "processed " << batch << std::endl;
7655 batch->entry_count = 0;
7656 batch->running--;
7657 }
7658 /** @brief Synchronously finish processing a work item.
7659 * This function is called after _void_process with the global thread pool lock held,
7660 * so at most one copy will execute simultaneously for a given thread pool.
7661 * It can be used for non-thread-safe finalization. */
7662 void _void_process_finish(void*) override {
7663 ceph_assert(false);
7664 }
7665
7666 bool queue(
7667 int64_t pool_id,
7668 BlueStore::CollectionRef c,
7669 const ghobject_t& oid,
7670 const string& key,
7671 const bufferlist& value) {
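// Producer side of the batch protocol: keep appending to a batch we have
// already claimed; once it holds BatchLen entries release it (running--)
// so a worker may take it, then hunt for the next free batch on demand.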
7672 bool res = false;
7673 size_t pos0 = last_batch_pos;
7674 if (!batch_acquired) {
7675 do {
7676 auto& batch = batches[last_batch_pos];
7677 if (batch.running.fetch_add(1) == 0) {
7678 if (batch.entry_count < BatchLen) {
7679 batch_acquired = true;
7680 break;
7681 }
7682 }
7683 batch.running.fetch_sub(1);
7684 last_batch_pos++;
7685 last_batch_pos %= batchCount;
7686 } while (last_batch_pos != pos0);
7687 }
7688 if (batch_acquired) {
7689 auto& batch = batches[last_batch_pos];
7690 ceph_assert(batch.running);
7691 ceph_assert(batch.entry_count < BatchLen);
7692
7693 auto& entry = batch.entries[batch.entry_count];
7694 entry.pool_id = pool_id;
7695 entry.c = c;
7696 entry.oid = oid;
7697 entry.key = key;
7698 entry.value = value;
7699
7700 ++batch.entry_count;
7701 if (batch.entry_count == BatchLen) {
7702 batch_acquired = false;
7703 batch.running.fetch_sub(1);
7704 last_batch_pos++;
7705 last_batch_pos %= batchCount;
7706 }
7707 res = true;
7708 }
7709 return res;
7710 }
7711
7712 void finalize(ThreadPool& tp,
7713 BlueStore::FSCK_ObjectCtx& ctx) {
7714 if (batch_acquired) {
7715 auto& batch = batches[last_batch_pos];
7716 ceph_assert(batch.running);
7717 batch.running.fetch_sub(1);
7718 }
7719 tp.stop();
7720
7721 for (size_t i = 0; i < batchCount; i++) {
7722 auto& batch = batches[i];
7723
7724 //process leftovers if any
7725 if (batch.entry_count) {
7726 TPHandle tp_handle(store->cct,
7727 nullptr,
7728 timeout_interval,
7729 suicide_interval);
7730 ceph_assert(batch.running == 0);
7731
7732 batch.running++; // just to be on-par with the regular call
7733 _void_process(&batch, tp_handle);
7734 }
7735 ceph_assert(batch.entry_count == 0);
7736
7737 ctx.errors += batch.errors;
7738 ctx.warnings += batch.warnings;
7739 ctx.num_objects += batch.num_objects;
7740 ctx.num_extents += batch.num_extents;
7741 ctx.num_blobs += batch.num_blobs;
7742 ctx.num_sharded_objects += batch.num_sharded_objects;
7743 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 7744
eafe8130
TL
7745 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7746
7747 for (auto it = batch.expected_pool_statfs.begin();
7748 it != batch.expected_pool_statfs.end();
7749 it++) {
7750 ctx.expected_pool_statfs[it->first].add(it->second);
7751 }
7752 }
7753 }
7754 };
7755};
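// Wiring sketch (mirroring _fsck_check_objects() below; names are the
// actual ones used there):
//
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   std::unique_ptr<WQ> wq(new WQ("FSCKWorkQueue", (n ? n : 1) * 32, this,
//                                 &sb_info_lock, sb_info, repairer));
//   tp.add_work_queue(wq.get());
//   tp.start();
//   // ... wq->queue(pool_id, c, oid, key, value) per onode ...
//   wq->finalize(tp, ctx); // drain leftovers, merge per-batch counters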
7756
9f95a23c
TL
7757void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
7758 OnodeRef& o,
7759 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 7760{
9f95a23c
TL
7761 auto& errors = ctx.errors;
7762 auto& warnings = ctx.warnings;
7763 auto repairer = ctx.repairer;
7764
7765 ceph_assert(o->onode.has_omap());
7766 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 7767 if (per_pool_omap == OMAP_PER_POOL) {
9f95a23c
TL
7768 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7769 << "fsck error: " << o->oid
7770 << " has omap that is not per-pool or pgmeta"
7771 << fsck_dendl;
7772 ++errors;
7773 } else {
7774 const char* w;
7775 int64_t num;
7776 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
7777 ++errors;
7778 num = errors;
7779 w = "error";
7780 } else {
7781 ++warnings;
7782 num = warnings;
7783 w = "warning";
7784 }
7785 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7786 << "fsck " << w << ": " << o->oid
7787 << " has omap that is not per-pool or pgmeta"
7788 << fsck_dendl;
7789 }
f67539c2
TL
7790 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
7791 if (per_pool_omap == OMAP_PER_PG) {
7792 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7793 << "fsck error: " << o->oid
7794 << " has omap that is not per-pg or pgmeta"
7795 << fsck_dendl;
7796 ++errors;
7797 } else {
7798 const char* w;
7799 int64_t num;
7800 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
7801 ++errors;
7802 num = errors;
7803 w = "error";
7804 } else {
7805 ++warnings;
7806 num = warnings;
7807 w = "warning";
7808 }
7809 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7810 << "fsck " << w << ": " << o->oid
7811 << " has omap that is not per-pg or pgmeta"
7812 << fsck_dendl;
7813 }
9f95a23c
TL
7814 }
7815 if (repairer &&
f67539c2 7816 !o->onode.is_perpg_omap() &&
9f95a23c 7817 !o->onode.is_pgmeta_omap()) {
f67539c2 7818 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
9f95a23c
TL
7819 bufferlist h;
7820 map<string, bufferlist> kv;
7821 int r = _onode_omap_get(o, &h, &kv);
7822 if (r < 0) {
7823 derr << " got " << r << " " << cpp_strerror(r) << dendl;
7824 } else {
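// Re-home this object's omap in a single transaction: drop the old key
// range, flip the onode flags so get_omap_prefix()/get_omap_key() start
// generating per-pg keys, then rewrite the header, tail and all values
// under the new prefix and commit synchronously.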
7825 KeyValueDB::Transaction txn = db->get_transaction();
7826 // remove old keys
7827 const string& old_omap_prefix = o->get_omap_prefix();
7828 string old_head, old_tail;
7829 o->get_omap_header(&old_head);
7830 o->get_omap_tail(&old_tail);
7831 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
7832 txn->rmkey(old_omap_prefix, old_tail);
7833 // set flag
f67539c2 7834 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c
TL
7835 _record_onode(o, txn);
7836 const string& new_omap_prefix = o->get_omap_prefix();
7837 // head
7838 if (h.length()) {
7839 string new_head;
7840 o->get_omap_header(&new_head);
7841 txn->set(new_omap_prefix, new_head, h);
7842 }
7843 // tail
7844 string new_tail;
7845 o->get_omap_tail(&new_tail);
7846 bufferlist empty;
7847 txn->set(new_omap_prefix, new_tail, empty);
7848 // values
7849 string final_key;
7850 o->get_omap_key(string(), &final_key);
7851 size_t base_key_len = final_key.size();
7852 for (auto& i : kv) {
7853 final_key.resize(base_key_len);
7854 final_key += i.first;
7855 txn->set(new_omap_prefix, final_key, i.second);
7856 }
7857 db->submit_transaction_sync(txn);
7858 repairer->inc_repaired();
7859 }
eafe8130 7860 }
9f95a23c 7861}
eafe8130 7862
9f95a23c
TL
7863void BlueStore::_fsck_check_objects(FSCKDepth depth,
7864 BlueStore::FSCK_ObjectCtx& ctx)
7865{
eafe8130 7866 auto& errors = ctx.errors;
eafe8130
TL
7867 auto sb_info_lock = ctx.sb_info_lock;
7868 auto& sb_info = ctx.sb_info;
7869 auto repairer = ctx.repairer;
7870
7871 uint64_t_btree_t used_nids;
7872
7873 size_t processed_myself = 0;
7874
f67539c2 7875 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
7876 mempool::bluestore_fsck::list<string> expecting_shards;
7877 if (it) {
7878 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7879 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7880 std::unique_ptr<WQ> wq(
7881 new WQ(
7882 "FSCKWorkQueue",
7883 (thread_count ? : 1) * 32,
7884 this,
eafe8130
TL
7885 sb_info_lock,
7886 sb_info,
7887 repairer));
7888
7889 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7890
7891 thread_pool.add_work_queue(wq.get());
7892 if (depth == FSCK_SHALLOW && thread_count > 0) {
7893 //not the best place but let's check anyway
7894 ceph_assert(sb_info_lock);
7895 thread_pool.start();
7896 }
7897
7898 // fill global if not overridden below
7899 CollectionRef c;
7900 int64_t pool_id = -1;
7901 spg_t pgid;
7902 for (it->lower_bound(string()); it->valid(); it->next()) {
7903 dout(30) << __func__ << " key "
7904 << pretty_binary_string(it->key()) << dendl;
7905 if (is_extent_shard_key(it->key())) {
7906 if (depth == FSCK_SHALLOW) {
7907 continue;
7908 }
7909 while (!expecting_shards.empty() &&
7910 expecting_shards.front() < it->key()) {
7911 derr << "fsck error: missing shard key "
7912 << pretty_binary_string(expecting_shards.front())
7913 << dendl;
7914 ++errors;
7915 expecting_shards.pop_front();
7916 }
7917 if (!expecting_shards.empty() &&
7918 expecting_shards.front() == it->key()) {
7919 // all good
7920 expecting_shards.pop_front();
7921 continue;
7922 }
7923
7924 uint32_t offset;
7925 string okey;
7926 get_key_extent_shard(it->key(), &okey, &offset);
7927 derr << "fsck error: stray shard 0x" << std::hex << offset
7928 << std::dec << dendl;
7929 if (expecting_shards.empty()) {
7930 derr << "fsck error: " << pretty_binary_string(it->key())
7931 << " is unexpected" << dendl;
7932 ++errors;
7933 continue;
7934 }
7935 while (expecting_shards.front() > it->key()) {
7936 derr << "fsck error: saw " << pretty_binary_string(it->key())
7937 << dendl;
7938 derr << "fsck error: exp "
7939 << pretty_binary_string(expecting_shards.front()) << dendl;
7940 ++errors;
7941 expecting_shards.pop_front();
7942 if (expecting_shards.empty()) {
7943 break;
7944 }
7945 }
7946 continue;
7947 }
7948
7949 ghobject_t oid;
7950 int r = get_key_object(it->key(), &oid);
7951 if (r < 0) {
7952 derr << "fsck error: bad object key "
7953 << pretty_binary_string(it->key()) << dendl;
7954 ++errors;
7955 continue;
7956 }
7957 if (!c ||
7958 oid.shard_id != pgid.shard ||
7959 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7960 !c->contains(oid)) {
7961 c = nullptr;
7962 for (auto& p : coll_map) {
7963 if (p.second->contains(oid)) {
7964 c = p.second;
7965 break;
7966 }
7967 }
7968 if (!c) {
7969 derr << "fsck error: stray object " << oid
7970 << " not owned by any collection" << dendl;
7971 ++errors;
7972 continue;
7973 }
7974 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7975 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7976 << dendl;
7977 }
7978
7979 if (depth != FSCK_SHALLOW &&
7980 !expecting_shards.empty()) {
7981 for (auto& k : expecting_shards) {
7982 derr << "fsck error: missing shard key "
7983 << pretty_binary_string(k) << dendl;
7984 }
7985 ++errors;
7986 expecting_shards.clear();
7987 }
7988
7989 bool queued = false;
7990 if (depth == FSCK_SHALLOW && thread_count > 0) {
7991 queued = wq->queue(
7992 pool_id,
7993 c,
7994 oid,
7995 it->key(),
7996 it->value());
7997 }
7998 OnodeRef o;
7999 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8000
8001 if (!queued) {
8002 ++processed_myself;
8003
8004 o = fsck_check_objects_shallow(
8005 depth,
8006 pool_id,
8007 c,
8008 oid,
8009 it->key(),
8010 it->value(),
9f95a23c 8011 &expecting_shards,
eafe8130
TL
8012 &referenced,
8013 ctx);
8014 }
8015
8016 if (depth != FSCK_SHALLOW) {
8017 ceph_assert(o != nullptr);
8018 if (o->onode.nid) {
8019 if (o->onode.nid > nid_max) {
8020 derr << "fsck error: " << oid << " nid " << o->onode.nid
8021 << " > nid_max " << nid_max << dendl;
8022 ++errors;
8023 }
8024 if (used_nids.count(o->onode.nid)) {
8025 derr << "fsck error: " << oid << " nid " << o->onode.nid
8026 << " already in use" << dendl;
8027 ++errors;
8028 continue; // go for next object
8029 }
8030 used_nids.insert(o->onode.nid);
8031 }
8032 for (auto& i : referenced) {
8033 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8034 << std::dec << " for " << *i.first << dendl;
8035 const bluestore_blob_t& blob = i.first->get_blob();
8036 if (i.second & blob.unused) {
8037 derr << "fsck error: " << oid << " blob claims unused 0x"
8038 << std::hex << blob.unused
8039 << " but extents reference 0x" << i.second << std::dec
8040 << " on blob " << *i.first << dendl;
8041 ++errors;
8042 }
8043 if (blob.has_csum()) {
8044 uint64_t blob_len = blob.get_logical_length();
8045 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8046 unsigned csum_count = blob.get_csum_count();
8047 unsigned csum_chunk_size = blob.get_csum_chunk_size();
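// Cross-check csum vs. unused: for each csum chunk compute the range of
// 'unused' bits covering it; if every covering bit is set the chunk was
// never written, so a non-zero checksum there is inconsistent. E.g. with
// csum_chunk_size == unused_chunk_size chunk p maps to exactly bit p;
// smaller csum chunks make several of them share one unused bit.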
8048 for (unsigned p = 0; p < csum_count; ++p) {
8049 unsigned pos = p * csum_chunk_size;
8050 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8051 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8052 unsigned mask = 1u << firstbit;
8053 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8054 mask |= 1u << b;
8055 }
8056 if ((blob.unused & mask) == mask) {
8057 // this csum chunk region is marked unused
8058 if (blob.get_csum_item(p) != 0) {
8059 derr << "fsck error: " << oid
8060 << " blob claims csum chunk 0x" << std::hex << pos
8061 << "~" << csum_chunk_size
8062 << " is unused (mask 0x" << mask << " of unused 0x"
8063 << blob.unused << ") but csum is non-zero 0x"
8064 << blob.get_csum_item(p) << std::dec << " on blob "
8065 << *i.first << dendl;
8066 ++errors;
8067 }
8068 }
8069 }
8070 }
8071 }
8072 // omap
8073 if (o->onode.has_omap()) {
9f95a23c
TL
8074 ceph_assert(ctx.used_omap_head);
8075 if (ctx.used_omap_head->count(o->onode.nid)) {
8076 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8077 << " already in use" << dendl;
eafe8130
TL
8078 ++errors;
8079 } else {
9f95a23c 8080 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8081 }
9f95a23c 8082 } // if (o->onode.has_omap())
eafe8130
TL
8083 if (depth == FSCK_DEEP) {
8084 bufferlist bl;
8085 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8086 uint64_t offset = 0;
8087 do {
8088 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8089 int r = _do_read(c.get(), o, offset, l, bl,
8090 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8091 if (r < 0) {
8092 ++errors;
8093 derr << "fsck error: " << oid << std::hex
8094 << " error during read: "
8095 << " " << offset << "~" << l
8096 << " " << cpp_strerror(r) << std::dec
8097 << dendl;
8098 break;
8099 }
8100 offset += l;
8101 } while (offset < o->onode.size);
8102 } // deep
8103 } //if (depth != FSCK_SHALLOW)
8104 } // for (it->lower_bound(string()); it->valid(); it->next())
8105 if (depth == FSCK_SHALLOW && thread_count > 0) {
8106 wq->finalize(thread_pool, ctx);
8107 if (processed_myself) {
8108 // maybe more threads are needed?
8109 dout(0) << __func__ << " partial offload"
8110 << ", done myself " << processed_myself
8111 << " of " << ctx.num_objects
8112 << "objects, threads " << thread_count
8113 << dendl;
8114 }
8115 }
8116 } // if (it)
8117}
8118/**
8119An overview of the currently implemented repair logic,
8120performed in fsck in two stages: detection (+ preparation) and commit.
8121Detection stage (in processing order):
8122 (Issue -> Repair action to schedule)
8123 - Detect undecodable keys for Shared Blobs -> Remove
8124 - Detect undecodable records for Shared Blobs -> Remove
8125 (might trigger missed Shared Blob detection below)
8126 - Detect stray records for Shared Blobs -> Remove
8127 - Detect misreferenced pextents -> Fix
8128 Prepare Bloom-like filter to track cid/oid -> pextent
8129 Prepare list of extents that are improperly referenced
8130 Enumerate Onode records that might use 'misreferenced' pextents
8131 (Bloom-like filter applied to reduce computation)
8132 For each questionable Onode enumerate all blobs and identify broken ones
8133 (i.e. blobs having 'misreferences')
8134 Rewrite each broken blob data by allocating another extents and
8135 copying data there
8136 If blob is shared - unshare it and mark corresponding Shared Blob
8137 for removal
8138 Release previously allocated space
8139 Update Extent Map
8140 - Detect missed Shared Blobs -> Recreate
8141 - Detect undecodable deferred transaction -> Remove
8142 - Detect Freelist Manager's 'false free' entries -> Mark as used
8143 - Detect Freelist Manager's leaked entries -> Mark as free
8144 - Detect statfs inconsistency - Update
8145 Commit stage (separate DB commit per each step):
8146 - Apply leaked FM entries fix
8147 - Apply 'false free' FM entries fix
8148 - Apply 'Remove' actions
8149 - Apply fix for misreferenced pextents
8150 - Apply Shared Blob recreate
8151 (can be merged with the step above if misreferences were detected)
8152 - Apply StatFS update
8153*/
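// How these stages are reached (a sketch; the public entry points are the
// thin wrappers declared in BlueStore.h):
//
//   store.fsck(deep);   // -> _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false)
//   store.repair(deep); // -> _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true)
//   store.quick_fix();  // -> _fsck(FSCK_SHALLOW, true)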
8154int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8155{
8156 dout(1) << __func__
8157 << (repair ? " repair" : " check")
8158 << (depth == FSCK_DEEP ? " (deep)" :
8159 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8160 << dendl;
8161
8162 // in deep mode we need R/W write access to be able to replay deferred ops
8163 bool read_only = !(repair || depth == FSCK_DEEP);
8164
f67539c2 8165 int r = _open_db_and_around(read_only);
eafe8130
TL
8166 if (r < 0)
8167 return r;
7c673cae 8168
11fdf7f2
TL
8169 if (!read_only) {
8170 r = _upgrade_super();
8171 if (r < 0) {
8172 goto out_db;
8173 }
8174 }
7c673cae 8175
eafe8130 8176 r = _open_collections();
7c673cae 8177 if (r < 0)
11fdf7f2 8178 goto out_db;
7c673cae
FG
8179
8180 mempool_thread.init();
8181
11fdf7f2
TL
8182 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8183 // enable in repair or deep modes only
8184 if (!read_only) {
8185 _kv_start();
8186 r = _deferred_replay();
8187 _kv_stop();
8188 }
7c673cae
FG
8189 if (r < 0)
8190 goto out_scan;
8191
eafe8130
TL
8192 r = _fsck_on_open(depth, repair);
8193
8194out_scan:
8195 mempool_thread.shutdown();
f6b5b4d7 8196 _shutdown_cache();
eafe8130 8197out_db:
1911f103 8198 _close_db_and_around(false);
eafe8130
TL
8199
8200 return r;
8201}
8202
8203int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8204{
8205 dout(1) << __func__
8206 << " <<<START>>>"
8207 << (repair ? " repair" : " check")
8208 << (depth == FSCK_DEEP ? " (deep)" :
8209 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8210 << " start" << dendl;
8211 int64_t errors = 0;
8212 int64_t warnings = 0;
8213 unsigned repaired = 0;
8214
8215 uint64_t_btree_t used_omap_head;
eafe8130
TL
8216 uint64_t_btree_t used_sbids;
8217
f67539c2 8218 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130
TL
8219 KeyValueDB::Iterator it;
8220 store_statfs_t expected_store_statfs, actual_statfs;
8221 per_pool_statfs expected_pool_statfs;
8222
8223 sb_info_map_t sb_info;
8224
8225 uint64_t num_objects = 0;
8226 uint64_t num_extents = 0;
8227 uint64_t num_blobs = 0;
8228 uint64_t num_spanning_blobs = 0;
8229 uint64_t num_shared_blobs = 0;
8230 uint64_t num_sharded_objects = 0;
8231 BlueStoreRepairer repairer;
8232
f67539c2
TL
8233 auto alloc_size = fm->get_alloc_size();
8234
eafe8130
TL
8235 utime_t start = ceph_clock_now();
8236
8237 _fsck_collections(&errors);
b32b8144 8238 used_blocks.resize(fm->get_alloc_units());
7c673cae
FG
8239
8240 if (bluefs) {
f67539c2 8241 interval_set<uint64_t> bluefs_extents;
11fdf7f2 8242
f67539c2
TL
8243 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
8244 ceph_assert(r == 0);
8245 for (auto [start, len] : bluefs_extents) {
8246 apply_for_bitset_range(start, len, alloc_size, used_blocks,
8247 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
8248 ceph_assert(pos < bs.size());
7c673cae 8249 bs.set(pos);
f67539c2
TL
8250 }
8251 );
8252 }
8253 }
8254
8255 bluefs_used_blocks = used_blocks;
8256
8257 apply_for_bitset_range(
8258 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
8259 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8260 bs.set(pos);
7c673cae 8261 }
f67539c2
TL
8262 );
8263
8264
8265 if (repair) {
b3b6e05e 8266 repairer.init_space_usage_tracker(
f67539c2
TL
8267 bdev->get_size(),
8268 min_alloc_size);
8269 }
8270
8271 if (bluefs) {
eafe8130 8272 int r = bluefs->fsck();
7c673cae 8273 if (r < 0) {
eafe8130 8274 return r;
7c673cae
FG
8275 }
8276 if (r > 0)
8277 errors += r;
8278 }
8279
eafe8130
TL
8280 if (!per_pool_stat_collection) {
8281 const char *w;
8282 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8283 w = "error";
8284 ++errors;
8285 } else {
8286 w = "warning";
8287 ++warnings;
8288 }
8289 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8290 << dendl;
8291 }
f67539c2 8292 if (per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
8293 const char *w;
8294 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8295 w = "error";
8296 ++errors;
8297 } else {
8298 w = "warning";
8299 ++warnings;
8300 }
f67539c2 8301 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9f95a23c
TL
8302 << dendl;
8303 }
8304
11fdf7f2 8305 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
8306 // structs
8307 statfs(&actual_statfs);
11fdf7f2
TL
8308 actual_statfs.total = 0;
8309 actual_statfs.internally_reserved = 0;
8310 actual_statfs.available = 0;
8311 actual_statfs.internal_metadata = 0;
8312 actual_statfs.omap_allocated = 0;
8313
eafe8130
TL
8314 if (g_conf()->bluestore_debug_fsck_abort) {
8315 dout(1) << __func__ << " debug abort" << dendl;
8316 goto out_scan;
8317 }
7c673cae 8318 // walk PREFIX_OBJ
eafe8130
TL
8319 {
8320 dout(1) << __func__ << " walking object keyspace" << dendl;
8321 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8322 BlueStore::FSCK_ObjectCtx ctx(
8323 errors,
8324 warnings,
8325 num_objects,
8326 num_extents,
8327 num_blobs,
8328 num_sharded_objects,
8329 num_spanning_blobs,
8330 &used_blocks,
8331 &used_omap_head,
9f95a23c
TL
8332 //no need for the below lock when in non-shallow mode as
8333 // there is no multithreading in this case
8334 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130
TL
8335 sb_info,
8336 expected_store_statfs,
8337 expected_pool_statfs,
8338 repair ? &repairer : nullptr);
9f95a23c
TL
8339
8340 _fsck_check_objects(depth, ctx);
eafe8130 8341 }
11fdf7f2 8342
7c673cae 8343 dout(1) << __func__ << " checking shared_blobs" << dendl;
f67539c2 8344 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 8345 if (it) {
eafe8130
TL
8346 // FIXME minor: perhaps simplify for shallow mode?
8347 // fill global if not overridden below
8348 auto expected_statfs = &expected_store_statfs;
11fdf7f2 8349
7c673cae
FG
8350 for (it->lower_bound(string()); it->valid(); it->next()) {
8351 string key = it->key();
8352 uint64_t sbid;
8353 if (get_key_shared_blob(key, &sbid)) {
3efd9988 8354 derr << "fsck error: bad key '" << key
7c673cae 8355 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
8356 if (repair) {
8357 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8358 }
7c673cae
FG
8359 ++errors;
8360 continue;
8361 }
8362 auto p = sb_info.find(sbid);
8363 if (p == sb_info.end()) {
3efd9988 8364 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 8365 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
8366 if (repair) {
8367 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8368 }
7c673cae
FG
8369 ++errors;
8370 } else {
8371 ++num_shared_blobs;
8372 sb_info_t& sbi = p->second;
8373 bluestore_shared_blob_t shared_blob(sbid);
8374 bufferlist bl = it->value();
11fdf7f2
TL
8375 auto blp = bl.cbegin();
8376 try {
8377 decode(shared_blob, blp);
f67539c2 8378 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
8379 ++errors;
8380 // Force update and don't report as missing
8381 sbi.updated = sbi.passed = true;
8382
8383 derr << "fsck error: failed to decode Shared Blob"
8384 << pretty_binary_string(it->key()) << dendl;
8385 if (repair) {
8386 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8387 << pretty_binary_string(it->key())
8388 << "', removing" << dendl;
8389 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8390 }
8391 continue;
8392 }
7c673cae
FG
8393 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8394 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 8395 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
8396 << std::dec << " ref_map " << shared_blob.ref_map
8397 << " != expected " << sbi.ref_map << dendl;
8398 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
8399 ++errors;
8400 }
8401 PExtentVector extents;
8402 for (auto &r : shared_blob.ref_map.ref_map) {
8403 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8404 }
eafe8130 8405 if (per_pool_stat_collection || repair) {
11fdf7f2
TL
8406 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8407 }
8408 errors += _fsck_check_extents(sbi.cid,
8409 p->second.oids.front(),
7c673cae
FG
8410 extents,
8411 p->second.compressed,
b32b8144
FG
8412 used_blocks,
8413 fm->get_alloc_size(),
11fdf7f2 8414 repair ? &repairer : nullptr,
eafe8130
TL
8415 *expected_statfs,
8416 depth);
11fdf7f2
TL
8417 sbi.passed = true;
8418 }
8419 }
8420 } // if (it)
8421
8422 if (repair && repairer.preprocess_misreference(db)) {
8423
8424 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
11fdf7f2
TL
8425 auto& misref_extents = repairer.get_misreferences();
8426 interval_set<uint64_t> to_release;
f67539c2 8427 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 8428 if (it) {
eafe8130
TL
8429 // fill global if not overridden below
8430 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
8431
8432 CollectionRef c;
8433 spg_t pgid;
8434 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8435 bool bypass_rest = false;
8436 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8437 it->next()) {
8438 dout(30) << __func__ << " key "
8439 << pretty_binary_string(it->key()) << dendl;
8440 if (is_extent_shard_key(it->key())) {
8441 continue;
8442 }
8443
8444 ghobject_t oid;
8445 int r = get_key_object(it->key(), &oid);
b3b6e05e 8446 if (r < 0 || !repairer.is_used(oid)) {
11fdf7f2
TL
8447 continue;
8448 }
8449
8450 if (!c ||
8451 oid.shard_id != pgid.shard ||
8452 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8453 !c->contains(oid)) {
8454 c = nullptr;
8455 for (auto& p : coll_map) {
8456 if (p.second->contains(oid)) {
8457 c = p.second;
8458 break;
8459 }
8460 }
8461 if (!c) {
8462 continue;
8463 }
eafe8130
TL
8464 if (per_pool_stat_collection || repair) {
8465 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
8466 expected_statfs = &expected_pool_statfs[pool_id];
8467 }
8468 }
b3b6e05e 8469 if (!repairer.is_used(c->cid)) {
11fdf7f2
TL
8470 continue;
8471 }
8472
8473 dout(20) << __func__ << " check misreference for col:" << c->cid
8474 << " obj:" << oid << dendl;
8475
eafe8130
TL
8476 OnodeRef o;
8477 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
8478 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8479 mempool::bluestore_fsck::set<BlobRef> blobs;
8480
8481 for (auto& e : o->extent_map.extent_map) {
8482 blobs.insert(e.blob);
8483 }
8484 bool need_onode_update = false;
8485 bool first_dump = true;
8486 for(auto b : blobs) {
8487 bool broken_blob = false;
8488 auto& pextents = b->dirty_blob().dirty_extents();
8489 for (auto& e : pextents) {
8490 if (!e.is_valid()) {
8491 continue;
8492 }
8493 // for the sake of simplicity and proper shared blob handling
8494 // always rewrite the whole blob even when it's partially
8495 // misreferenced.
8496 if (misref_extents.intersects(e.offset, e.length)) {
8497 if (first_dump) {
8498 first_dump = false;
81eedcae 8499 _dump_onode<10>(cct, *o);
11fdf7f2
TL
8500 }
8501 broken_blob = true;
8502 break;
8503 }
8504 }
8505 if (!broken_blob)
8506 continue;
8507 bool compressed = b->get_blob().is_compressed();
8508 need_onode_update = true;
8509 dout(10) << __func__
8510 << " fix misreferences in oid:" << oid
8511 << " " << *b << dendl;
8512 uint64_t b_off = 0;
8513 PExtentVector pext_to_release;
8514 pext_to_release.reserve(pextents.size());
8515 // rewriting all valid pextents
8516 for (auto e = pextents.begin(); e != pextents.end();
8517 b_off += e->length, e++) {
8518 if (!e->is_valid()) {
8519 continue;
8520 }
8521 PExtentVector exts;
f67539c2
TL
8522 int64_t alloc_len =
8523 shared_alloc.a->allocate(e->length, min_alloc_size,
8524 0, 0, &exts);
eafe8130 8525 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
8526 derr << __func__
8527 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8528 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 8529 << " min_alloc_size 0x" << min_alloc_size
f67539c2 8530 << " available 0x " << shared_alloc.a->get_free()
11fdf7f2
TL
8531 << std::dec << dendl;
8532 if (alloc_len > 0) {
f67539c2 8533 shared_alloc.a->release(exts);
11fdf7f2
TL
8534 }
8535 bypass_rest = true;
8536 break;
8537 }
8538 expected_statfs->allocated += e->length;
8539 if (compressed) {
8540 expected_statfs->data_compressed_allocated += e->length;
8541 }
8542
8543 bufferlist bl;
8544 IOContext ioc(cct, NULL, true); // allow EIO
8545 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8546 if (r < 0) {
8547 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8548 <<"~" << e->length << std::dec << dendl;
8549 ceph_abort_msg("read failed, wtf");
8550 }
8551 pext_to_release.push_back(*e);
8552 e = pextents.erase(e);
8553 e = pextents.insert(e, exts.begin(), exts.end());
8554 b->get_blob().map_bl(
8555 b_off, bl,
8556 [&](uint64_t offset, bufferlist& t) {
8557 int r = bdev->write(offset, t, false);
8558 ceph_assert(r == 0);
8559 });
8560 e += exts.size() - 1;
8561 for (auto& p : exts) {
8562 fm->allocate(p.offset, p.length, txn);
8563 }
8564 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8565
8566 if (b->get_blob().is_shared()) {
8567 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8568
8569 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8570 ceph_assert(sb_it != sb_info.end());
8571 sb_info_t& sbi = sb_it->second;
8572
8573 for (auto& r : sbi.ref_map.ref_map) {
8574 expected_statfs->allocated -= r.second.length;
8575 if (sbi.compressed) {
8576 // NB: it's crucial to use compressed flag from sb_info_t
8577 // as we originally used that value while accumulating
8578 // expected_statfs
8579 expected_statfs->data_compressed_allocated -= r.second.length;
8580 }
8581 }
8582 sbi.updated = sbi.passed = true;
8583 sbi.ref_map.clear();
8584
8585 // relying on blob's pextents to decide what to release.
8586 for (auto& p : pext_to_release) {
8587 to_release.union_insert(p.offset, p.length);
8588 }
8589 } else {
8590 for (auto& p : pext_to_release) {
8591 expected_statfs->allocated -= p.length;
8592 if (compressed) {
8593 expected_statfs->data_compressed_allocated -= p.length;
8594 }
8595 to_release.union_insert(p.offset, p.length);
8596 }
8597 }
8598 if (bypass_rest) {
8599 break;
8600 }
8601 } // for(auto b : blobs)
8602 if (need_onode_update) {
8603 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8604 _record_onode(o, txn);
8605 }
8606 } // for (it->lower_bound(string()); it->valid(); it->next())
8607
8608 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8609 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8610 << "~" << it.get_len() << std::dec << dendl;
8611 fm->release(it.get_start(), it.get_len(), txn);
8612 }
f67539c2 8613 shared_alloc.a->release(to_release);
11fdf7f2
TL
8614 to_release.clear();
8615 } // if (it) {
8616 } //if (repair && repairer.preprocess_misreference()) {
8617
eafe8130
TL
8618 if (depth != FSCK_SHALLOW) {
8619 for (auto &p : sb_info) {
8620 sb_info_t& sbi = p.second;
8621 if (!sbi.passed) {
8622 derr << "fsck error: missing " << *sbi.sb << dendl;
8623 ++errors;
8624 }
8625 if (repair && (!sbi.passed || sbi.updated)) {
8626 auto sbid = p.first;
8627 if (sbi.ref_map.empty()) {
8628 ceph_assert(sbi.passed);
8629 dout(20) << __func__ << " " << *sbi.sb
8630 << " is empty, removing" << dendl;
8631 repairer.fix_shared_blob(db, sbid, nullptr);
8632 } else {
8633 bufferlist bl;
8634 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8635 encode(persistent, bl);
8636 dout(20) << __func__ << " " << *sbi.sb
8637 << " is " << bl.length() << " bytes, updating" << dendl;
11fdf7f2 8638
eafe8130
TL
8639 repairer.fix_shared_blob(db, sbid, &bl);
8640 }
7c673cae
FG
8641 }
8642 }
8643 }
11fdf7f2
TL
8644 sb_info.clear();
8645
eafe8130
TL
8646 // check global stats only if fscking (not repairing) w/o per-pool stats
8647 if (!per_pool_stat_collection &&
8648 !repair &&
8649 !(actual_statfs == expected_store_statfs)) {
8650 derr << "fsck error: actual " << actual_statfs
8651 << " != expected " << expected_store_statfs << dendl;
eafe8130 8656 ++errors;
7c673cae
FG
8657 }
8658
eafe8130
TL
8659 dout(1) << __func__ << " checking pool_statfs" << dendl;
8660 _fsck_check_pool_statfs(expected_pool_statfs,
8661 errors, warnings, repair ? &repairer : nullptr);
8662
8663 if (depth != FSCK_SHALLOW) {
9f95a23c 8664 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 8665 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8666 if (it) {
9f95a23c 8667 uint64_t last_omap_head = 0;
eafe8130
TL
8668 for (it->lower_bound(string()); it->valid(); it->next()) {
8669 uint64_t omap_head;
f67539c2 8670
eafe8130 8671 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 8672
9f95a23c 8673 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8674 omap_head != last_omap_head) {
9f95a23c
TL
8675 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8676 << "fsck error: found stray omap data on omap_head "
f67539c2
TL
8677 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8678 ++errors;
8679 last_omap_head = omap_head;
eafe8130 8680 }
7c673cae
FG
8681 }
8682 }
f67539c2 8683 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8684 if (it) {
9f95a23c 8685 uint64_t last_omap_head = 0;
eafe8130
TL
8686 for (it->lower_bound(string()); it->valid(); it->next()) {
8687 uint64_t omap_head;
8688 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
8689 if (used_omap_head.count(omap_head) == 0 &&
8690 omap_head != last_omap_head) {
8691 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8692 << "fsck error: found stray (pgmeta) omap data on omap_head "
8693 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8694 last_omap_head = omap_head;
eafe8130
TL
8695 ++errors;
8696 }
11fdf7f2
TL
8697 }
8698 }
f67539c2 8699 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9f95a23c
TL
8700 if (it) {
8701 uint64_t last_omap_head = 0;
8702 for (it->lower_bound(string()); it->valid(); it->next()) {
8703 uint64_t pool;
8704 uint64_t omap_head;
8705 string k = it->key();
8706 const char *c = k.c_str();
8707 c = _key_decode_u64(c, &pool);
8708 c = _key_decode_u64(c, &omap_head);
8709 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8710 omap_head != last_omap_head) {
9f95a23c
TL
8711 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8712 << "fsck error: found stray (per-pool) omap data on omap_head "
8713 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8714 ++errors;
f67539c2
TL
8715 last_omap_head = omap_head;
8716 }
8717 }
8718 }
8719 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8720 if (it) {
8721 uint64_t last_omap_head = 0;
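// PERPG omap keys are prefixed with (u64 pool, u32 hash, u64 omap_head);
// peel the fixed-width fields off the front to recover the owning head.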
8722 for (it->lower_bound(string()); it->valid(); it->next()) {
8723 uint64_t pool;
8724 uint32_t hash;
8725 uint64_t omap_head;
8726 string k = it->key();
8727 const char* c = k.c_str();
8728 c = _key_decode_u64(c, &pool);
8729 c = _key_decode_u32(c, &hash);
8730 c = _key_decode_u64(c, &omap_head);
8731 if (used_omap_head.count(omap_head) == 0 &&
8732 omap_head != last_omap_head) {
8733 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8734 << "fsck error: found stray (per-pg) omap data on omap_head "
8735 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8736 ++errors;
8737 last_omap_head = omap_head;
9f95a23c
TL
8738 }
8739 }
8740 }
eafe8130 8741 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 8742 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
8743 if (it) {
8744 for (it->lower_bound(string()); it->valid(); it->next()) {
8745 bufferlist bl = it->value();
8746 auto p = bl.cbegin();
8747 bluestore_deferred_transaction_t wt;
8748 try {
8749 decode(wt, p);
f67539c2 8750 } catch (ceph::buffer::error& e) {
eafe8130
TL
8751 derr << "fsck error: failed to decode deferred txn "
8752 << pretty_binary_string(it->key()) << dendl;
8753 if (repair) {
8754 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8755 << pretty_binary_string(it->key())
8756 << "', removing" << dendl;
8757 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8758 }
8759 continue;
8760 }
8761 dout(20) << __func__ << " deferred " << wt.seq
8762 << " ops " << wt.ops.size()
8763 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8764 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 8765 apply_for_bitset_range(
f67539c2 8766 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 8767 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
8768 bs.set(pos);
8769 }
8770 );
8771 }
7c673cae 8772 }
eafe8130
TL
8773 }
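 // Worked example (illustrative): a deferred event that released
 // 0x10000~0x8000 with alloc_size 0x1000 sets used_blocks positions
 // 0x10..0x17, i.e. those blocks are still owned by the pending
 // deferred write until it is replayed.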
8774
8775 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8776 {
eafe8130
TL
8777 fm->enumerate_reset();
8778 uint64_t offset, length;
8779 while (fm->enumerate_next(db, &offset, &length)) {
8780 bool intersects = false;
9f95a23c 8781 apply_for_bitset_range(
f67539c2 8782 offset, length, alloc_size, used_blocks,
eafe8130 8783 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
f67539c2
TL
8784 ceph_assert(pos < bs.size());
8785 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
eafe8130
TL
8786 if (offset == SUPER_RESERVED &&
8787 length == min_alloc_size - SUPER_RESERVED) {
 8788 // this is due to the change just after luminous to min_alloc_size
 8789 // granularity allocations, and our baked-in assumption at the top
 8790 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
 8791 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). Harmless,
 8792 // since we will never allocate this region below min_alloc_size.
8793 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8794 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8795 << length << std::dec << dendl;
8796 } else {
8797 intersects = true;
8798 if (repair) {
8799 repairer.fix_false_free(db, fm,
8800 pos * min_alloc_size,
8801 min_alloc_size);
8802 }
11fdf7f2 8803 }
eafe8130
TL
8804 } else {
8805 bs.set(pos);
8806 }
7c673cae 8807 }
eafe8130
TL
8808 );
8809 if (intersects) {
8810 derr << "fsck error: free extent 0x" << std::hex << offset
8811 << "~" << length << std::dec
8812 << " intersects allocated blocks" << dendl;
8813 ++errors;
7c673cae 8814 }
b5b8bbf5 8815 }
eafe8130
TL
8816 fm->enumerate_reset();
8817 size_t count = used_blocks.count();
8818 if (used_blocks.size() != count) {
8819 ceph_assert(used_blocks.size() > count);
8820 used_blocks.flip();
8821 size_t start = used_blocks.find_first();
8822 while (start != decltype(used_blocks)::npos) {
8823 size_t cur = start;
8824 while (true) {
8825 size_t next = used_blocks.find_next(cur);
8826 if (next != cur + 1) {
8827 ++errors;
8828 derr << "fsck error: leaked extent 0x" << std::hex
8829 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8830 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8831 << dendl;
8832 if (repair) {
8833 repairer.fix_leaked(db,
8834 fm,
8835 start * min_alloc_size,
8836 (cur + 1 - start) * min_alloc_size);
8837 }
8838 start = next;
8839 break;
11fdf7f2 8840 }
eafe8130 8841 cur = next;
b5b8bbf5 8842 }
eafe8130
TL
8843 }
8844 used_blocks.flip();
b5b8bbf5 8845 }
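 // Worked example (illustrative): flipping used_blocks turns leaked
 // (allocated-but-unreferenced) blocks into the set bits; a
 // find_first/find_next scan over set bits {4,5,6,9} then reports two
 // leaks, 4~3 and 9~1, in units of fm->get_alloc_size().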
7c673cae
FG
8846 }
8847 }
11fdf7f2 8848 if (repair) {
f67539c2
TL
8849 if (per_pool_omap != OMAP_PER_PG) {
8850 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
8851 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
9f95a23c
TL
8852 }
8853
11fdf7f2
TL
8854 dout(5) << __func__ << " applying repair results" << dendl;
8855 repaired = repairer.apply(db);
8856 dout(5) << __func__ << " repair applied" << dendl;
8857 }
7c673cae 8858
eafe8130 8859out_scan:
7c673cae
FG
8860 dout(2) << __func__ << " " << num_objects << " objects, "
8861 << num_sharded_objects << " of them sharded. "
8862 << dendl;
8863 dout(2) << __func__ << " " << num_extents << " extents to "
8864 << num_blobs << " blobs, "
8865 << num_spanning_blobs << " spanning, "
8866 << num_shared_blobs << " shared."
8867 << dendl;
8868
8869 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
8870 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8871 << warnings << " warnings, "
8872 << repaired << " repaired, "
8873 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 8874 << duration << " seconds" << dendl;
9f95a23c
TL
8875
 8876 // In non-repair mode, return only the error count, since it alone
 8877 // indicates whether the store status is OK.
 8878 // In repair mode, both errors and warnings are taken into account,
 8879 // since the repaired counter relates to both.
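 // Example (illustrative, not in the original source): with errors=3,
 // warnings=2 and repaired=4, repair mode returns 3 + 2 - 4 = 1
 // remaining issue, while a check-only run returns 3.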
8880 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
8881}
8882
8883/// methods to inject various errors fsck can repair
8884void BlueStore::inject_broken_shared_blob_key(const string& key,
8885 const bufferlist& bl)
8886{
8887 KeyValueDB::Transaction txn;
8888 txn = db->get_transaction();
8889 txn->set(PREFIX_SHARED_BLOB, key, bl);
8890 db->submit_transaction_sync(txn);
8891};
8892
8893void BlueStore::inject_leaked(uint64_t len)
8894{
8895 KeyValueDB::Transaction txn;
8896 txn = db->get_transaction();
8897
8898 PExtentVector exts;
f67539c2 8899 int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
11fdf7f2
TL
8900 min_alloc_size * 256, 0, &exts);
8901 ceph_assert(alloc_len >= (int64_t)len);
8902 for (auto& p : exts) {
8903 fm->allocate(p.offset, p.length, txn);
8904 }
8905 db->submit_transaction_sync(txn);
8906}
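 // Usage sketch (illustrative, test-only expectation): after
 // inject_leaked(1 << 20), a subsequent repair fsck should flag the
 // allocated-but-unreferenced space as leaked extents and hand them
 // back to the freelist via fix_leaked().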
8907
8908void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8909{
8910 KeyValueDB::Transaction txn;
8911 OnodeRef o;
8912 CollectionRef c = _get_collection(cid);
8913 ceph_assert(c);
8914 {
9f95a23c 8915 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
8916 o = c->get_onode(oid, false);
8917 ceph_assert(o);
8918 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8919 }
8920
8921 bool injected = false;
8922 txn = db->get_transaction();
8923 auto& em = o->extent_map.extent_map;
8924 std::vector<const PExtentVector*> v;
8925 if (em.size()) {
8926 v.push_back(&em.begin()->blob->get_blob().get_extents());
8927 }
8928 if (em.size() > 1) {
8929 auto it = em.end();
8930 --it;
8931 v.push_back(&(it->blob->get_blob().get_extents()));
8932 }
8933 for (auto pext : v) {
8934 if (pext->size()) {
8935 auto p = pext->begin();
8936 while (p != pext->end()) {
8937 if (p->is_valid()) {
8938 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8939 << "~" << p->length << std::dec << dendl;
8940 fm->release(p->offset, p->length, txn);
8941 injected = true;
8942 break;
8943 }
8944 ++p;
8945 }
8946 }
8947 }
8948 ceph_assert(injected);
8949 db->submit_transaction_sync(txn);
8950}
8951
9f95a23c
TL
8952void BlueStore::inject_legacy_omap()
8953{
8954 dout(1) << __func__ << dendl;
f67539c2 8955 per_pool_omap = OMAP_BULK;
9f95a23c
TL
8956 KeyValueDB::Transaction txn;
8957 txn = db->get_transaction();
8958 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
8959 db->submit_transaction_sync(txn);
8960}
8961
8962void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
8963{
8964 dout(1) << __func__ << " "
8965 << cid << " " << oid
 8966 << dendl;
8967 KeyValueDB::Transaction txn;
8968 OnodeRef o;
8969 CollectionRef c = _get_collection(cid);
8970 ceph_assert(c);
8971 {
8972 std::unique_lock l{ c->lock }; // just to avoid internal asserts
8973 o = c->get_onode(oid, false);
8974 ceph_assert(o);
8975 }
f67539c2
TL
8976 o->onode.clear_flag(
8977 bluestore_onode_t::FLAG_PERPG_OMAP |
8978 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8979 bluestore_onode_t::FLAG_PGMETA_OMAP);
9f95a23c
TL
8980 txn = db->get_transaction();
8981 _record_onode(o, txn);
8982 db->submit_transaction_sync(txn);
8983}
8984
8985
11fdf7f2
TL
8986void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
8987{
8988 BlueStoreRepairer repairer;
8989 repairer.fix_statfs(db, key, new_statfs);
8990 repairer.apply(db);
8991}
8992
eafe8130
TL
8993void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
8994{
8995 KeyValueDB::Transaction t = db->get_transaction();
8996 volatile_statfs v;
8997 v = new_statfs;
8998 bufferlist bl;
8999 v.encode(bl);
9000 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9001 db->submit_transaction_sync(t);
9002}
9003
11fdf7f2
TL
9004void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9005 coll_t cid2, ghobject_t oid2,
9006 uint64_t offset)
9007{
9008 OnodeRef o1;
9009 CollectionRef c1 = _get_collection(cid1);
9010 ceph_assert(c1);
9011 {
9f95a23c 9012 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
9013 o1 = c1->get_onode(oid1, false);
9014 ceph_assert(o1);
9015 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9016 }
9017 OnodeRef o2;
9018 CollectionRef c2 = _get_collection(cid2);
9019 ceph_assert(c2);
9020 {
9f95a23c 9021 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
9022 o2 = c2->get_onode(oid2, false);
9023 ceph_assert(o2);
9024 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9025 }
9026 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9027 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9028
9029 // require onode/extent layout to be the same (and simple)
9030 // to make things easier
9031 ceph_assert(o1->onode.extent_map_shards.empty());
9032 ceph_assert(o2->onode.extent_map_shards.empty());
9033 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9034 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9035 ceph_assert(e1.logical_offset == e2.logical_offset);
9036 ceph_assert(e1.length == e2.length);
9037 ceph_assert(e1.blob_offset == e2.blob_offset);
9038
9039 KeyValueDB::Transaction txn;
9040 txn = db->get_transaction();
9041
 9042 // along with the misreference error this will also create space leak errors
9043 e2.blob->dirty_blob() = e1.blob->get_blob();
9044 o2->extent_map.dirty_range(offset, e2.length);
9045 o2->extent_map.update(txn, false);
9046
9047 _record_onode(o2, txn);
9048 db->submit_transaction_sync(txn);
7c673cae
FG
9049}
9050
adb31ebb
TL
9051void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9052 int16_t blob_id)
9053{
9054 OnodeRef o;
9055 CollectionRef c = _get_collection(cid);
9056 ceph_assert(c);
9057 {
9058 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9059 o = c->get_onode(oid, false);
9060 ceph_assert(o);
9061 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9062 }
9063
9064 BlobRef b = c->new_blob();
9065 b->id = blob_id;
9066 o->extent_map.spanning_blob_map[blob_id] = b;
9067
9068 KeyValueDB::Transaction txn;
9069 txn = db->get_transaction();
9070
9071 _record_onode(o, txn);
9072 db->submit_transaction_sync(txn);
9073}
9074
7c673cae
FG
9075void BlueStore::collect_metadata(map<string,string> *pm)
9076{
9077 dout(10) << __func__ << dendl;
9078 bdev->collect_metadata("bluestore_bdev_", pm);
9079 if (bluefs) {
9080 (*pm)["bluefs"] = "1";
9f95a23c
TL
9081 // this value is for backward compatibility only
9082 (*pm)["bluefs_single_shared_device"] = \
9083 stringify((int)bluefs_layout.single_shared_device());
9084 (*pm)["bluefs_dedicated_db"] = \
9085 stringify((int)bluefs_layout.dedicated_db);
9086 (*pm)["bluefs_dedicated_wal"] = \
9087 stringify((int)bluefs_layout.dedicated_wal);
9088 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
9089 } else {
9090 (*pm)["bluefs"] = "0";
9091 }
11fdf7f2
TL
9092
9093 // report numa mapping for underlying devices
9094 int node = -1;
9095 set<int> nodes;
9096 set<string> failed;
9097 int r = get_numa_node(&node, &nodes, &failed);
9098 if (r >= 0) {
9099 if (!failed.empty()) {
9100 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9101 }
9102 if (!nodes.empty()) {
9103 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9104 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9105 }
9106 if (node >= 0) {
9107 (*pm)["objectstore_numa_node"] = stringify(node);
9108 }
9109 }
9110}
9111
9112int BlueStore::get_numa_node(
9113 int *final_node,
9114 set<int> *out_nodes,
9115 set<string> *out_failed)
9116{
9117 int node = -1;
9118 set<string> devices;
9119 get_devices(&devices);
9120 set<int> nodes;
9121 set<string> failed;
9122 for (auto& devname : devices) {
9123 int n;
9124 BlkDev bdev(devname);
9125 int r = bdev.get_numa_node(&n);
9126 if (r < 0) {
9127 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9128 << dendl;
9129 failed.insert(devname);
9130 continue;
9131 }
9132 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9133 << dendl;
9134 nodes.insert(n);
9135 if (node < 0) {
9136 node = n;
9137 }
9138 }
9139 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9140 *final_node = node;
9141 }
9142 if (out_nodes) {
9143 *out_nodes = nodes;
9144 }
9145 if (out_failed) {
9146 *out_failed = failed;
9147 }
9148 return 0;
9149}
9150
9151int BlueStore::get_devices(set<string> *ls)
9152{
9153 if (bdev) {
9154 bdev->get_devices(ls);
9155 if (bluefs) {
9156 bluefs->get_devices(ls);
9157 }
9158 return 0;
9159 }
9160
9161 // grumble, we haven't started up yet.
9162 int r = _open_path();
9163 if (r < 0)
9164 goto out;
9165 r = _open_fsid(false);
9166 if (r < 0)
9167 goto out_path;
9168 r = _read_fsid(&fsid);
9169 if (r < 0)
9170 goto out_fsid;
9171 r = _lock_fsid();
9172 if (r < 0)
9173 goto out_fsid;
9174 r = _open_bdev(false);
9175 if (r < 0)
9176 goto out_fsid;
9177 r = _minimal_open_bluefs(false);
9178 if (r < 0)
9179 goto out_bdev;
9180 bdev->get_devices(ls);
9181 if (bluefs) {
9182 bluefs->get_devices(ls);
9183 }
9184 r = 0;
9185 _minimal_close_bluefs();
9186 out_bdev:
9187 _close_bdev();
9188 out_fsid:
9189 _close_fsid();
9190 out_path:
9191 _close_path();
9192 out:
9193 return r;
7c673cae
FG
9194}
9195
11fdf7f2 9196void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
9197{
9198 buf->reset();
11fdf7f2 9199
f67539c2
TL
9200 auto prefix = per_pool_omap == OMAP_BULK ?
9201 PREFIX_OMAP :
9202 per_pool_omap == OMAP_PER_POOL ?
9203 PREFIX_PERPOOL_OMAP :
9204 PREFIX_PERPG_OMAP;
9f95a23c 9205 buf->omap_allocated =
f67539c2 9206 db->estimate_prefix_size(prefix, string());
11fdf7f2 9207
f67539c2 9208 uint64_t bfree = shared_alloc.a->get_free();
7c673cae
FG
9209
9210 if (bluefs) {
f67539c2 9211 buf->internally_reserved = 0;
11fdf7f2 9212 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9213 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9214 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9215 }
11fdf7f2
TL
9216 // call any non-omap bluefs space "internal metadata"
9217 buf->internal_metadata =
f67539c2 9218 bluefs->get_used()
11fdf7f2 9219 - buf->omap_allocated;
7c673cae
FG
9220 }
9221
11fdf7f2
TL
9222 uint64_t thin_total, thin_avail;
9223 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9224 buf->total += thin_total;
9225
9226 // we are limited by both the size of the virtual device and the
9227 // underlying physical device.
9228 bfree = std::min(bfree, thin_avail);
9229
9230 buf->allocated = thin_total - thin_avail;
9231 } else {
9232 buf->total += bdev->get_size();
9233 }
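 // Worked example (illustrative): on a thin-provisioned device with
 // thin_total = 1 TiB and thin_avail = 200 GiB, allocated becomes
 // 824 GiB and bfree is capped at min(allocator free, 200 GiB) before
 // being published as buf->available below.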
9234 buf->available = bfree;
9235}
9236
9237int BlueStore::statfs(struct store_statfs_t *buf,
9238 osd_alert_list_t* alerts)
9239{
9240 if (alerts) {
9241 alerts->clear();
9242 _log_alerts(*alerts);
9243 }
9244 _get_statfs_overall(buf);
31f18b77 9245 {
11fdf7f2 9246 std::lock_guard l(vstatfs_lock);
31f18b77 9247 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
9248 buf->data_stored = vstatfs.stored();
9249 buf->data_compressed = vstatfs.compressed();
9250 buf->data_compressed_original = vstatfs.compressed_original();
9251 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9252 }
9253
9254 dout(20) << __func__ << " " << *buf << dendl;
9255 return 0;
9256}
9257
9f95a23c
TL
9258int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9259 bool *out_per_pool_omap)
11fdf7f2
TL
9260{
 9261 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 9262
11fdf7f2
TL
9263 if (!per_pool_stat_collection) {
9264 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9265 return -ENOTSUP;
7c673cae 9266 }
11fdf7f2 9267 buf->reset();
7c673cae 9268
11fdf7f2
TL
9269 {
9270 std::lock_guard l(vstatfs_lock);
9271 osd_pools[pool_id].publish(buf);
9272 }
9f95a23c
TL
9273
9274 string key_prefix;
9275 _key_encode_u64(pool_id, &key_prefix);
f67539c2
TL
9276 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
9277 if (*out_per_pool_omap) {
9278 auto prefix = per_pool_omap == OMAP_PER_POOL ?
9279 PREFIX_PERPOOL_OMAP :
9280 PREFIX_PERPG_OMAP;
9281 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
9282 }
9f95a23c 9283
11fdf7f2 9284 dout(10) << __func__ << " " << *buf << dendl;
7c673cae
FG
9285 return 0;
9286}
9287
81eedcae
TL
9288void BlueStore::_check_legacy_statfs_alert()
9289{
9290 string s;
9291 if (!per_pool_stat_collection &&
eafe8130 9292 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
9293 s = "legacy statfs reporting detected, "
9294 "suggest to run store repair to get consistent statistic reports";
9295 }
9296 std::lock_guard l(qlock);
9297 legacy_statfs_alert = s;
9298}
9299
f67539c2 9300void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9f95a23c 9301{
f67539c2
TL
9302 string per_pg, per_pool;
9303 if (per_pool_omap != OMAP_PER_PG) {
9304 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
9305 per_pg = "legacy (not per-pg) omap detected, "
9306 "suggest to run store repair to benefit from faster PG removal";
9307 }
9308 if (per_pool_omap != OMAP_PER_POOL) {
9309 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9310 per_pool = "legacy (not per-pool) omap detected, "
9311 "suggest to run store repair to benefit from per-pool omap usage statistics";
9312 }
9313 }
9f95a23c
TL
9314 }
9315 std::lock_guard l(qlock);
f67539c2
TL
9316 no_per_pg_omap_alert = per_pg;
9317 no_per_pool_omap_alert = per_pool;
9f95a23c
TL
9318}
9319
7c673cae
FG
9320// ---------------
9321// cache
9322
9323BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9324{
9f95a23c 9325 std::shared_lock l(coll_lock);
7c673cae
FG
9326 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9327 if (cp == coll_map.end())
9328 return CollectionRef();
9329 return cp->second;
9330}
9331
9332void BlueStore::_queue_reap_collection(CollectionRef& c)
9333{
9334 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
 9335 // _reap_collections and this run in the same thread,
 9336 // so no lock is needed.
7c673cae
FG
9337 removed_collections.push_back(c);
9338}
9339
9340void BlueStore::_reap_collections()
9341{
94b18763 9342
7c673cae
FG
9343 list<CollectionRef> removed_colls;
9344 {
94b18763
FG
 9345 // _queue_reap_collection and this run in the same thread,
 9346 // so no lock is needed.
9347 if (!removed_collections.empty())
9348 removed_colls.swap(removed_collections);
9349 else
9350 return;
7c673cae
FG
9351 }
9352
94b18763
FG
9353 list<CollectionRef>::iterator p = removed_colls.begin();
9354 while (p != removed_colls.end()) {
7c673cae
FG
9355 CollectionRef c = *p;
9356 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
adb31ebb 9357 if (c->onode_map.map_any([&](Onode* o) {
11fdf7f2 9358 ceph_assert(!o->exists);
7c673cae
FG
9359 if (o->flushing_count.load()) {
9360 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9361 << " flush_txns " << o->flushing_count << dendl;
94b18763 9362 return true;
7c673cae 9363 }
94b18763 9364 return false;
7c673cae 9365 })) {
94b18763 9366 ++p;
7c673cae
FG
9367 continue;
9368 }
9369 c->onode_map.clear();
94b18763 9370 p = removed_colls.erase(p);
7c673cae
FG
9371 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9372 }
94b18763 9373 if (removed_colls.empty()) {
7c673cae 9374 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
9375 } else {
9376 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
9377 }
9378}
9379
9380void BlueStore::_update_cache_logger()
9381{
9382 uint64_t num_onodes = 0;
9f95a23c 9383 uint64_t num_pinned_onodes = 0;
7c673cae
FG
9384 uint64_t num_extents = 0;
9385 uint64_t num_blobs = 0;
9386 uint64_t num_buffers = 0;
9387 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
9388 for (auto c : onode_cache_shards) {
9389 c->add_stats(&num_onodes, &num_pinned_onodes);
9390 }
9391 for (auto c : buffer_cache_shards) {
9392 c->add_stats(&num_extents, &num_blobs,
9393 &num_buffers, &num_buffer_bytes);
7c673cae
FG
9394 }
9395 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9396 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
9397 logger->set(l_bluestore_extents, num_extents);
9398 logger->set(l_bluestore_blobs, num_blobs);
9399 logger->set(l_bluestore_buffers, num_buffers);
9400 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9401}
9402
9403// ---------------
9404// read operations
9405
9406ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9407{
9408 return _get_collection(cid);
9409}
9410
11fdf7f2
TL
9411ObjectStore::CollectionHandle BlueStore::create_new_collection(
9412 const coll_t& cid)
7c673cae 9413{
9f95a23c
TL
9414 std::unique_lock l{coll_lock};
9415 auto c = ceph::make_ref<Collection>(
11fdf7f2 9416 this,
9f95a23c
TL
9417 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9418 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
9419 cid);
9420 new_coll_map[cid] = c;
9f95a23c 9421 _osr_attach(c.get());
11fdf7f2
TL
9422 return c;
9423}
9424
9425void BlueStore::set_collection_commit_queue(
9426 const coll_t& cid,
9427 ContextQueue *commit_queue)
9428{
9429 if (commit_queue) {
9f95a23c 9430 std::shared_lock l(coll_lock);
11fdf7f2
TL
9431 if (coll_map.count(cid)) {
9432 coll_map[cid]->commit_queue = commit_queue;
9433 } else if (new_coll_map.count(cid)) {
9434 new_coll_map[cid]->commit_queue = commit_queue;
9435 }
9436 }
7c673cae
FG
9437}
9438
11fdf7f2 9439
7c673cae
FG
9440bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9441{
9442 Collection *c = static_cast<Collection *>(c_.get());
9443 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9444 if (!c->exists)
9445 return false;
9446
9447 bool r = true;
9448
9449 {
9f95a23c 9450 std::shared_lock l(c->lock);
7c673cae
FG
9451 OnodeRef o = c->get_onode(oid, false);
9452 if (!o || !o->exists)
9453 r = false;
9454 }
9455
7c673cae
FG
9456 return r;
9457}
9458
7c673cae
FG
9459int BlueStore::stat(
9460 CollectionHandle &c_,
9461 const ghobject_t& oid,
9462 struct stat *st,
9463 bool allow_eio)
9464{
9465 Collection *c = static_cast<Collection *>(c_.get());
9466 if (!c->exists)
9467 return -ENOENT;
9468 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9469
9470 {
9f95a23c 9471 std::shared_lock l(c->lock);
7c673cae
FG
9472 OnodeRef o = c->get_onode(oid, false);
9473 if (!o || !o->exists)
9474 return -ENOENT;
9475 st->st_size = o->onode.size;
9476 st->st_blksize = 4096;
9477 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9478 st->st_nlink = 1;
9479 }
9480
7c673cae
FG
9481 int r = 0;
9482 if (_debug_mdata_eio(oid)) {
9483 r = -EIO;
9484 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9485 }
9486 return r;
9487}
9488int BlueStore::set_collection_opts(
11fdf7f2 9489 CollectionHandle& ch,
7c673cae
FG
9490 const pool_opts_t& opts)
9491{
7c673cae 9492 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9493 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
9494 if (!c->exists)
9495 return -ENOENT;
9f95a23c 9496 std::unique_lock l{c->lock};
7c673cae
FG
9497 c->pool_opts = opts;
9498 return 0;
9499}
9500
7c673cae
FG
9501int BlueStore::read(
9502 CollectionHandle &c_,
9503 const ghobject_t& oid,
9504 uint64_t offset,
9505 size_t length,
9506 bufferlist& bl,
224ce89b 9507 uint32_t op_flags)
7c673cae 9508{
11fdf7f2 9509 auto start = mono_clock::now();
7c673cae
FG
9510 Collection *c = static_cast<Collection *>(c_.get());
9511 const coll_t &cid = c->get_cid();
9512 dout(15) << __func__ << " " << cid << " " << oid
9513 << " 0x" << std::hex << offset << "~" << length << std::dec
9514 << dendl;
9515 if (!c->exists)
9516 return -ENOENT;
9517
9518 bl.clear();
9519 int r;
9520 {
9f95a23c 9521 std::shared_lock l(c->lock);
11fdf7f2 9522 auto start1 = mono_clock::now();
7c673cae 9523 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
9524 log_latency("get_onode@read",
9525 l_bluestore_read_onode_meta_lat,
9526 mono_clock::now() - start1,
9527 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9528 if (!o || !o->exists) {
9529 r = -ENOENT;
9530 goto out;
9531 }
9532
9533 if (offset == length && offset == 0)
9534 length = o->onode.size;
9535
9536 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
9537 if (r == -EIO) {
9538 logger->inc(l_bluestore_read_eio);
9539 }
7c673cae
FG
9540 }
9541
9542 out:
28e407b8 9543 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
9544 r = -EIO;
9545 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
9546 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9547 cct->_conf->bluestore_debug_random_read_err &&
9548 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9549 100.0)) == 0) {
224ce89b
WB
9550 dout(0) << __func__ << ": inject random EIO" << dendl;
9551 r = -EIO;
7c673cae
FG
9552 }
9553 dout(10) << __func__ << " " << cid << " " << oid
9554 << " 0x" << std::hex << offset << "~" << length << std::dec
9555 << " = " << r << dendl;
494da23a
TL
9556 log_latency(__func__,
9557 l_bluestore_read_lat,
9558 mono_clock::now() - start,
9559 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9560 return r;
9561}
9562
9f95a23c 9563void BlueStore::_read_cache(
7c673cae
FG
9564 OnodeRef o,
9565 uint64_t offset,
9566 size_t length,
9f95a23c
TL
9567 int read_cache_policy,
9568 ready_regions_t& ready_regions,
9569 blobs2read_t& blobs2read)
7c673cae 9570{
7c673cae 9571 // build a blob-wise list of the data to read (that isn't cached)
7c673cae
FG
9572 unsigned left = length;
9573 uint64_t pos = offset;
7c673cae
FG
9574 auto lp = o->extent_map.seek_lextent(offset);
9575 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9576 if (pos < lp->logical_offset) {
9577 unsigned hole = lp->logical_offset - pos;
9578 if (hole >= left) {
9f95a23c 9579 break;
7c673cae
FG
9580 }
9581 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9582 << std::dec << dendl;
7c673cae
FG
9583 pos += hole;
9584 left -= hole;
9585 }
94b18763 9586 BlobRef& bptr = lp->blob;
7c673cae
FG
9587 unsigned l_off = pos - lp->logical_offset;
9588 unsigned b_off = l_off + lp->blob_offset;
9589 unsigned b_len = std::min(left, lp->length - l_off);
9590
9591 ready_regions_t cache_res;
9592 interval_set<uint32_t> cache_interval;
9593 bptr->shared_blob->bc.read(
91327a77
AA
9594 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9595 read_cache_policy);
7c673cae 9596 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
9597 << " need 0x" << b_off << "~" << b_len
9598 << " cache has 0x" << cache_interval
9599 << std::dec << dendl;
7c673cae
FG
9600
9601 auto pc = cache_res.begin();
11fdf7f2 9602 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
9603 while (b_len > 0) {
9604 unsigned l;
9605 if (pc != cache_res.end() &&
9f95a23c
TL
9606 pc->first == b_off) {
9607 l = pc->second.length();
f67539c2 9608 ready_regions[pos] = std::move(pc->second);
9f95a23c
TL
9609 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9610 << b_off << "~" << l << std::dec << dendl;
9611 ++pc;
7c673cae 9612 } else {
9f95a23c
TL
9613 l = b_len;
9614 if (pc != cache_res.end()) {
9615 ceph_assert(pc->first > b_off);
9616 l = pc->first - b_off;
9617 }
9618 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9619 << b_off << "~" << l << std::dec << dendl;
9620 // merge regions
9621 {
9622 uint64_t r_off = b_off;
9623 uint64_t r_len = l;
9624 uint64_t front = r_off % chunk_size;
9625 if (front) {
9626 r_off -= front;
9627 r_len += front;
9628 }
9629 unsigned tail = r_len % chunk_size;
9630 if (tail) {
9631 r_len += chunk_size - tail;
9632 }
9633 bool merged = false;
9634 regions2read_t& r2r = blobs2read[bptr];
9635 if (r2r.size()) {
9636 read_req_t& pre = r2r.back();
9637 if (r_off <= (pre.r_off + pre.r_len)) {
9638 front += (r_off - pre.r_off);
9639 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9640 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9641 merged = true;
9642 }
9643 }
9644 if (!merged) {
9645 read_req_t req(r_off, r_len);
9646 req.regs.emplace_back(region_t(pos, b_off, l, front));
9647 r2r.emplace_back(std::move(req));
9648 }
9649 }
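 // Worked example (illustrative): with chunk_size = 0x1000, a miss at
 // b_off = 0x1400 of length l = 0x800 widens to r_off = 0x1000,
 // r_len = 0x1000 (front = 0x400), keeping the device read
 // chunk-aligned while region_t records how to carve the caller's
 // bytes back out.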
7c673cae
FG
9650 }
9651 pos += l;
9652 b_off += l;
9653 left -= l;
9654 b_len -= l;
9655 }
9656 ++lp;
9657 }
9f95a23c 9658}
7c673cae 9659
9f95a23c
TL
9660int BlueStore::_prepare_read_ioc(
9661 blobs2read_t& blobs2read,
9662 vector<bufferlist>* compressed_blob_bls,
9663 IOContext* ioc)
9664{
7c673cae 9665 for (auto& p : blobs2read) {
94b18763 9666 const BlobRef& bptr = p.first;
11fdf7f2 9667 regions2read_t& r2r = p.second;
7c673cae 9668 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9669 << " need " << r2r << std::dec << dendl;
7c673cae
FG
9670 if (bptr->get_blob().is_compressed()) {
9671 // read the whole thing
9f95a23c
TL
9672 if (compressed_blob_bls->empty()) {
9673 // ensure we avoid any reallocation on subsequent blobs
9674 compressed_blob_bls->reserve(blobs2read.size());
9675 }
9676 compressed_blob_bls->push_back(bufferlist());
9677 bufferlist& bl = compressed_blob_bls->back();
9678 auto r = bptr->get_blob().map(
9679 0, bptr->get_blob().get_ondisk_length(),
9680 [&](uint64_t offset, uint64_t length) {
9681 int r = bdev->aio_read(offset, length, &bl, ioc);
9682 if (r < 0)
7c673cae
FG
9683 return r;
9684 return 0;
9f95a23c 9685 });
b32b8144
FG
9686 if (r < 0) {
9687 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9688 if (r == -EIO) {
9689 // propagate EIO to caller
9690 return r;
9691 }
11fdf7f2 9692 ceph_assert(r == 0);
b32b8144 9693 }
7c673cae
FG
9694 } else {
9695 // read the pieces
11fdf7f2 9696 for (auto& req : r2r) {
9f95a23c
TL
9697 dout(20) << __func__ << " region 0x" << std::hex
9698 << req.regs.front().logical_offset
9699 << ": 0x" << req.regs.front().blob_xoffset
9700 << " reading 0x" << req.r_off
9701 << "~" << req.r_len << std::dec
9702 << dendl;
7c673cae 9703
9f95a23c
TL
9704 // read it
9705 auto r = bptr->get_blob().map(
9706 req.r_off, req.r_len,
9707 [&](uint64_t offset, uint64_t length) {
9708 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9709 if (r < 0)
7c673cae
FG
9710 return r;
9711 return 0;
9f95a23c 9712 });
b32b8144
FG
9713 if (r < 0) {
9714 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9715 << dendl;
9716 if (r == -EIO) {
9717 // propagate EIO to caller
9718 return r;
9719 }
11fdf7f2 9720 ceph_assert(r == 0);
b32b8144 9721 }
9f95a23c 9722 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
9723 }
9724 }
9725 }
9f95a23c
TL
9726 return 0;
9727}
11fdf7f2 9728
9f95a23c
TL
9729int BlueStore::_generate_read_result_bl(
9730 OnodeRef o,
9731 uint64_t offset,
9732 size_t length,
9733 ready_regions_t& ready_regions,
9734 vector<bufferlist>& compressed_blob_bls,
9735 blobs2read_t& blobs2read,
9736 bool buffered,
9737 bool* csum_error,
9738 bufferlist& bl)
9739{
9740 // enumerate and decompress desired blobs
7c673cae
FG
9741 auto p = compressed_blob_bls.begin();
9742 blobs2read_t::iterator b2r_it = blobs2read.begin();
9743 while (b2r_it != blobs2read.end()) {
94b18763 9744 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9745 regions2read_t& r2r = b2r_it->second;
7c673cae 9746 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9747 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9748 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9749 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
9750 bufferlist& compressed_bl = *p++;
9751 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
9752 r2r.front().regs.front().logical_offset) < 0) {
9753 *csum_error = true;
9754 return -EIO;
7c673cae
FG
9755 }
9756 bufferlist raw_bl;
9f95a23c 9757 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9758 if (r < 0)
9f95a23c 9759 return r;
7c673cae 9760 if (buffered) {
9f95a23c
TL
9761 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9762 raw_bl);
7c673cae 9763 }
11fdf7f2
TL
9764 for (auto& req : r2r) {
9765 for (auto& r : req.regs) {
9766 ready_regions[r.logical_offset].substr_of(
9767 raw_bl, r.blob_xoffset, r.length);
9768 }
7c673cae
FG
9769 }
9770 } else {
11fdf7f2 9771 for (auto& req : r2r) {
9f95a23c
TL
9772 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9773 req.regs.front().logical_offset) < 0) {
9774 *csum_error = true;
9775 return -EIO;
9776 }
9777 if (buffered) {
9778 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9779 req.r_off, req.bl);
9780 }
7c673cae 9781
9f95a23c
TL
9782 // prune and keep result
9783 for (const auto& r : req.regs) {
9784 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 9785 }
7c673cae
FG
9786 }
9787 }
9788 ++b2r_it;
9789 }
9790
9791 // generate a resulting buffer
9792 auto pr = ready_regions.begin();
9793 auto pr_end = ready_regions.end();
9f95a23c 9794 uint64_t pos = 0;
7c673cae
FG
9795 while (pos < length) {
9796 if (pr != pr_end && pr->first == pos + offset) {
9797 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9798 << ": data from 0x" << pr->first << "~" << pr->second.length()
9799 << std::dec << dendl;
7c673cae
FG
9800 pos += pr->second.length();
9801 bl.claim_append(pr->second);
9802 ++pr;
9803 } else {
9804 uint64_t l = length - pos;
9805 if (pr != pr_end) {
11fdf7f2 9806 ceph_assert(pr->first > pos + offset);
9f95a23c 9807 l = pr->first - (pos + offset);
7c673cae
FG
9808 }
9809 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9810 << ": zeros for 0x" << (pos + offset) << "~" << l
9811 << std::dec << dendl;
7c673cae
FG
9812 bl.append_zero(l);
9813 pos += l;
9814 }
9815 }
11fdf7f2
TL
9816 ceph_assert(bl.length() == length);
9817 ceph_assert(pos == length);
9818 ceph_assert(pr == pr_end);
9f95a23c
TL
9819 return 0;
9820}
9821
9822int BlueStore::_do_read(
9823 Collection *c,
9824 OnodeRef o,
9825 uint64_t offset,
9826 size_t length,
9827 bufferlist& bl,
9828 uint32_t op_flags,
9829 uint64_t retry_count)
9830{
9831 FUNCTRACE(cct);
9832 int r = 0;
9833 int read_cache_policy = 0; // do not bypass clean or dirty cache
9834
9835 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9836 << " size 0x" << o->onode.size << " (" << std::dec
9837 << o->onode.size << ")" << dendl;
9838 bl.clear();
9839
9840 if (offset >= o->onode.size) {
9841 return r;
9842 }
9843
9844 // generally, don't buffer anything, unless the client explicitly requests
9845 // it.
9846 bool buffered = false;
9847 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9848 dout(20) << __func__ << " will do buffered read" << dendl;
9849 buffered = true;
9850 } else if (cct->_conf->bluestore_default_buffered_read &&
9851 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9852 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9853 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9854 buffered = true;
9855 }
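 // Equivalent predicate (illustrative restatement of the branch above,
 // using the CEPH_OSD_OP_FLAG_FADVISE_* flag names):
 //   buffered = WILLNEED ||
 //              (bluestore_default_buffered_read && !DONTNEED && !NOCACHE);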
9856
9857 if (offset + length > o->onode.size) {
9858 length = o->onode.size - offset;
9859 }
9860
9861 auto start = mono_clock::now();
9862 o->extent_map.fault_range(db, offset, length);
9863 log_latency(__func__,
9864 l_bluestore_read_onode_meta_lat,
9865 mono_clock::now() - start,
9866 cct->_conf->bluestore_log_op_age);
9867 _dump_onode<30>(cct, *o);
9868
9869 // for deep-scrub, we only read dirty cache and bypass clean cache in
9870 // order to read underlying block device in case there are silent disk errors.
9871 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9872 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9873 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9874 }
9875
 9876 // build a blob-wise list of the data to read (that isn't cached)
9877 ready_regions_t ready_regions;
9878 blobs2read_t blobs2read;
9879 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
9880
9881
9882 // read raw blob data.
 9883 start = mono_clock::now(); // for simplicity, measure the whole
 9884 // block below as a single latency sample;
 9885 // the measurement error is negligible.
9886 vector<bufferlist> compressed_blob_bls;
9887 IOContext ioc(cct, NULL, true); // allow EIO
9888 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
9889 // we always issue aio for reading, so errors other than EIO are not allowed
9890 if (r < 0)
9891 return r;
9892
f67539c2 9893 int64_t num_ios = blobs2read.size();
9f95a23c 9894 if (ioc.has_pending_aios()) {
f67539c2 9895 num_ios = ioc.get_num_ios();
9f95a23c
TL
9896 bdev->aio_submit(&ioc);
9897 dout(20) << __func__ << " waiting for aio" << dendl;
9898 ioc.aio_wait();
9899 r = ioc.get_return_value();
9900 if (r < 0) {
9901 ceph_assert(r == -EIO); // no other errors allowed
9902 return -EIO;
9903 }
9904 }
9905 log_latency_fn(__func__,
9906 l_bluestore_read_wait_aio_lat,
9907 mono_clock::now() - start,
9908 cct->_conf->bluestore_log_op_age,
9909 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9910 );
9911
9912 bool csum_error = false;
9913 r = _generate_read_result_bl(o, offset, length, ready_regions,
9914 compressed_blob_bls, blobs2read,
9915 buffered, &csum_error, bl);
9916 if (csum_error) {
9917 // Handles spurious read errors caused by a kernel bug.
9918 // We sometimes get all-zero pages as a result of the read under
9919 // high memory pressure. Retrying the failing read succeeds in most
9920 // cases.
9921 // See also: http://tracker.ceph.com/issues/22464
9922 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9923 return -EIO;
9924 }
9925 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9926 }
7c673cae 9927 r = bl.length();
f64942e4
AA
9928 if (retry_count) {
9929 logger->inc(l_bluestore_reads_with_retries);
9930 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9931 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
f67539c2
TL
9932 stringstream s;
9933 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
9934 _set_spurious_read_errors_alert(s.str());
f64942e4 9935 }
7c673cae
FG
9936 return r;
9937}
9938
9939int BlueStore::_verify_csum(OnodeRef& o,
9940 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9941 const bufferlist& bl,
9942 uint64_t logical_offset) const
9943{
9944 int bad;
9945 uint64_t bad_csum;
11fdf7f2 9946 auto start = mono_clock::now();
7c673cae 9947 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
9948 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9949 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
 9950 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
9951 bad = blob_xoffset;
9952 r = -1;
9953 bad_csum = 0xDEADBEEF;
9954 }
7c673cae
FG
9955 if (r < 0) {
9956 if (r == -1) {
9957 PExtentVector pex;
9958 blob->map(
9959 bad,
9960 blob->get_csum_chunk_size(),
9961 [&](uint64_t offset, uint64_t length) {
9962 pex.emplace_back(bluestore_pextent_t(offset, length));
9963 return 0;
9964 });
9965 derr << __func__ << " bad "
9966 << Checksummer::get_csum_type_string(blob->csum_type)
9967 << "/0x" << std::hex << blob->get_csum_chunk_size()
9968 << " checksum at blob offset 0x" << bad
9969 << ", got 0x" << bad_csum << ", expected 0x"
9970 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
9971 << ", device location " << pex
9972 << ", logical extent 0x" << std::hex
9973 << (logical_offset + bad - blob_xoffset) << "~"
9974 << blob->get_csum_chunk_size() << std::dec
9975 << ", object " << o->oid
9976 << dendl;
9977 } else {
9978 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
9979 }
9980 }
494da23a
TL
9981 log_latency(__func__,
9982 l_bluestore_csum_lat,
9983 mono_clock::now() - start,
9984 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
9985 if (cct->_conf->bluestore_ignore_data_csum) {
9986 return 0;
9987 }
7c673cae
FG
9988 return r;
9989}
9990
9991int BlueStore::_decompress(bufferlist& source, bufferlist* result)
9992{
9993 int r = 0;
11fdf7f2
TL
9994 auto start = mono_clock::now();
9995 auto i = source.cbegin();
7c673cae 9996 bluestore_compression_header_t chdr;
11fdf7f2 9997 decode(chdr, i);
7c673cae
FG
9998 int alg = int(chdr.type);
9999 CompressorRef cp = compressor;
10000 if (!cp || (int)cp->get_type() != alg) {
10001 cp = Compressor::create(cct, alg);
10002 }
10003
10004 if (!cp.get()) {
 10005 // if the compressor isn't available, fail with an error: we
 10006 // cannot return the decompressed data.
11fdf7f2
TL
10007
10008 const char* alg_name = Compressor::get_comp_alg_name(alg);
10009 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10010 _set_compression_alert(false, alg_name);
7c673cae
FG
10011 r = -EIO;
10012 } else {
f67539c2 10013 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
7c673cae
FG
10014 if (r < 0) {
10015 derr << __func__ << " decompression failed with exit code " << r << dendl;
10016 r = -EIO;
10017 }
10018 }
494da23a
TL
10019 log_latency(__func__,
10020 l_bluestore_decompress_lat,
10021 mono_clock::now() - start,
10022 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10023 return r;
10024}
10025
10026// this stores fiemap into interval_set, other variations
10027// use it internally
10028int BlueStore::_fiemap(
10029 CollectionHandle &c_,
10030 const ghobject_t& oid,
10031 uint64_t offset,
10032 size_t length,
10033 interval_set<uint64_t>& destset)
10034{
10035 Collection *c = static_cast<Collection *>(c_.get());
10036 if (!c->exists)
10037 return -ENOENT;
10038 {
9f95a23c 10039 std::shared_lock l(c->lock);
7c673cae
FG
10040
10041 OnodeRef o = c->get_onode(oid, false);
10042 if (!o || !o->exists) {
10043 return -ENOENT;
10044 }
81eedcae 10045 _dump_onode<30>(cct, *o);
7c673cae
FG
10046
10047 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10048 << " size 0x" << o->onode.size << std::dec << dendl;
10049
10050 boost::intrusive::set<Extent>::iterator ep, eend;
10051 if (offset >= o->onode.size)
10052 goto out;
10053
10054 if (offset + length > o->onode.size) {
10055 length = o->onode.size - offset;
10056 }
10057
10058 o->extent_map.fault_range(db, offset, length);
10059 eend = o->extent_map.extent_map.end();
10060 ep = o->extent_map.seek_lextent(offset);
10061 while (length > 0) {
10062 dout(20) << __func__ << " offset " << offset << dendl;
10063 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10064 ++ep;
10065 continue;
10066 }
10067
10068 uint64_t x_len = length;
10069 if (ep != eend && ep->logical_offset <= offset) {
10070 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10071 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
10072 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10073 << x_len << std::dec << " blob " << ep->blob << dendl;
10074 destset.insert(offset, x_len);
10075 length -= x_len;
10076 offset += x_len;
10077 if (x_off + x_len == ep->length)
10078 ++ep;
10079 continue;
10080 }
10081 if (ep != eend &&
10082 ep->logical_offset > offset &&
10083 ep->logical_offset - offset < x_len) {
10084 x_len = ep->logical_offset - offset;
10085 }
10086 offset += x_len;
10087 length -= x_len;
10088 }
10089 }
9f95a23c
TL
10090
10091 out:
10092 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10093 << " size = 0x(" << destset << ")" << std::dec << dendl;
10094 return 0;
10095}
10096
10097int BlueStore::fiemap(
10098 CollectionHandle &c_,
10099 const ghobject_t& oid,
10100 uint64_t offset,
10101 size_t length,
10102 bufferlist& bl)
10103{
10104 interval_set<uint64_t> m;
10105 int r = _fiemap(c_, oid, offset, length, m);
10106 if (r >= 0) {
10107 encode(m, bl);
10108 }
10109 return r;
10110}
10111
10112int BlueStore::fiemap(
10113 CollectionHandle &c_,
10114 const ghobject_t& oid,
10115 uint64_t offset,
10116 size_t length,
10117 map<uint64_t, uint64_t>& destmap)
10118{
10119 interval_set<uint64_t> m;
10120 int r = _fiemap(c_, oid, offset, length, m);
10121 if (r >= 0) {
10122 destmap = std::move(m).detach();
10123 }
10124 return r;
10125}
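// Minimal caller sketch (illustrative; ch, oid and obj_size stand in
// for a valid collection handle, object and its logical size):
//
//   std::map<uint64_t, uint64_t> extents;
//   if (store->fiemap(ch, oid, 0, obj_size, extents) >= 0) {
//     for (const auto& [off, len] : extents) {
//       // [off, off + len) is an allocated logical range
//     }
//   }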
10126
10127int BlueStore::readv(
10128 CollectionHandle &c_,
10129 const ghobject_t& oid,
10130 interval_set<uint64_t>& m,
10131 bufferlist& bl,
10132 uint32_t op_flags)
10133{
10134 auto start = mono_clock::now();
10135 Collection *c = static_cast<Collection *>(c_.get());
10136 const coll_t &cid = c->get_cid();
10137 dout(15) << __func__ << " " << cid << " " << oid
10138 << " fiemap " << m
10139 << dendl;
10140 if (!c->exists)
10141 return -ENOENT;
10142
10143 bl.clear();
10144 int r;
10145 {
10146 std::shared_lock l(c->lock);
10147 auto start1 = mono_clock::now();
10148 OnodeRef o = c->get_onode(oid, false);
10149 log_latency("get_onode@read",
10150 l_bluestore_read_onode_meta_lat,
10151 mono_clock::now() - start1,
10152 cct->_conf->bluestore_log_op_age);
10153 if (!o || !o->exists) {
10154 r = -ENOENT;
10155 goto out;
10156 }
10157
10158 if (m.empty()) {
10159 r = 0;
10160 goto out;
10161 }
10162
10163 r = _do_readv(c, o, m, bl, op_flags);
10164 if (r == -EIO) {
10165 logger->inc(l_bluestore_read_eio);
10166 }
10167 }
10168
10169 out:
10170 if (r >= 0 && _debug_data_eio(oid)) {
10171 r = -EIO;
10172 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10173 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10174 cct->_conf->bluestore_debug_random_read_err &&
10175 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10176 100.0)) == 0) {
10177 dout(0) << __func__ << ": inject random EIO" << dendl;
10178 r = -EIO;
10179 }
10180 dout(10) << __func__ << " " << cid << " " << oid
10181 << " fiemap " << m << std::dec
10182 << " = " << r << dendl;
10183 log_latency(__func__,
10184 l_bluestore_read_lat,
10185 mono_clock::now() - start,
10186 cct->_conf->bluestore_log_op_age);
10187 return r;
10188}
10189
10190int BlueStore::_do_readv(
10191 Collection *c,
10192 OnodeRef o,
10193 const interval_set<uint64_t>& m,
10194 bufferlist& bl,
10195 uint32_t op_flags,
10196 uint64_t retry_count)
10197{
10198 FUNCTRACE(cct);
10199 int r = 0;
10200 int read_cache_policy = 0; // do not bypass clean or dirty cache
10201
10202 dout(20) << __func__ << " fiemap " << m << std::hex
10203 << " size 0x" << o->onode.size << " (" << std::dec
10204 << o->onode.size << ")" << dendl;
10205
10206 // generally, don't buffer anything, unless the client explicitly requests
10207 // it.
10208 bool buffered = false;
10209 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10210 dout(20) << __func__ << " will do buffered read" << dendl;
10211 buffered = true;
10212 } else if (cct->_conf->bluestore_default_buffered_read &&
10213 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10214 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10215 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10216 buffered = true;
10217 }
10218 // this method must be idempotent since we may call it several times
10219 // before we finally read the expected result.
10220 bl.clear();
10221
10222 // call fiemap first!
10223 ceph_assert(m.range_start() <= o->onode.size);
10224 ceph_assert(m.range_end() <= o->onode.size);
10225 auto start = mono_clock::now();
10226 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10227 log_latency(__func__,
10228 l_bluestore_read_onode_meta_lat,
10229 mono_clock::now() - start,
10230 cct->_conf->bluestore_log_op_age);
10231 _dump_onode<30>(cct, *o);
10232
10233 IOContext ioc(cct, NULL, true); // allow EIO
10234 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10235 raw_results.reserve(m.num_intervals());
10236 int i = 0;
10237 for (auto p = m.begin(); p != m.end(); p++, i++) {
10238 raw_results.push_back({});
10239 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10240 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10241 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10242 // we always issue aio for reading, so errors other than EIO are not allowed
10243 if (r < 0)
10244 return r;
10245 }
10246
10247 auto num_ios = m.size();
10248 if (ioc.has_pending_aios()) {
10249 num_ios = ioc.get_num_ios();
10250 bdev->aio_submit(&ioc);
10251 dout(20) << __func__ << " waiting for aio" << dendl;
10252 ioc.aio_wait();
10253 r = ioc.get_return_value();
10254 if (r < 0) {
10255 ceph_assert(r == -EIO); // no other errors allowed
10256 return -EIO;
10257 }
10258 }
10259 log_latency_fn(__func__,
10260 l_bluestore_read_wait_aio_lat,
10261 mono_clock::now() - start,
10262 cct->_conf->bluestore_log_op_age,
10263 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10264 );
10265
10266 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10267 i = 0;
10268 for (auto p = m.begin(); p != m.end(); p++, i++) {
10269 bool csum_error = false;
10270 bufferlist t;
10271 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10272 std::get<0>(raw_results[i]),
10273 std::get<1>(raw_results[i]),
10274 std::get<2>(raw_results[i]),
10275 buffered, &csum_error, t);
10276 if (csum_error) {
10277 // Handles spurious read errors caused by a kernel bug.
10278 // We sometimes get all-zero pages as a result of the read under
10279 // high memory pressure. Retrying the failing read succeeds in most
10280 // cases.
10281 // See also: http://tracker.ceph.com/issues/22464
10282 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10283 return -EIO;
10284 }
10285 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10286 }
10287 bl.claim_append(t);
10288 }
10289 if (retry_count) {
10290 logger->inc(l_bluestore_reads_with_retries);
10291 dout(5) << __func__ << " read fiemap " << m
10292 << " failed " << retry_count << " times before succeeding"
10293 << dendl;
10294 }
10295 return bl.length();
7c673cae
FG
10296}
10297
9f95a23c 10298int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10299 const ghobject_t& oid,
9f95a23c
TL
10300 const string& section_name,
10301 Formatter *f)
7c673cae 10302{
9f95a23c
TL
10303 Collection *c = static_cast<Collection *>(c_.get());
10304 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10305 if (!c->exists)
10306 return -ENOENT;
7c673cae 10307
9f95a23c
TL
10308 int r;
10309 {
10310 std::shared_lock l(c->lock);
10311
10312 OnodeRef o = c->get_onode(oid, false);
10313 if (!o || !o->exists) {
10314 r = -ENOENT;
10315 goto out;
10316 }
 10317 // FIXME minor: actually the next line isn't enough to
 10318 // load shared blobs. Leaving as-is for now.
10319 //
10320 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10321
10322 _dump_onode<0>(cct, *o);
10323 f->open_object_section(section_name.c_str());
10324 o->dump(f);
10325 f->close_section();
10326 r = 0;
7c673cae 10327 }
9f95a23c
TL
10328 out:
10329 dout(10) << __func__ << " " << c->cid << " " << oid
10330 << " = " << r << dendl;
7c673cae
FG
10331 return r;
10332}
10333
7c673cae
FG
10334int BlueStore::getattr(
10335 CollectionHandle &c_,
10336 const ghobject_t& oid,
10337 const char *name,
10338 bufferptr& value)
10339{
10340 Collection *c = static_cast<Collection *>(c_.get());
10341 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10342 if (!c->exists)
10343 return -ENOENT;
10344
10345 int r;
10346 {
9f95a23c 10347 std::shared_lock l(c->lock);
f91f0fd5 10348 mempool::bluestore_cache_meta::string k(name);
7c673cae
FG
10349
10350 OnodeRef o = c->get_onode(oid, false);
10351 if (!o || !o->exists) {
10352 r = -ENOENT;
10353 goto out;
10354 }
10355
10356 if (!o->onode.attrs.count(k)) {
10357 r = -ENODATA;
10358 goto out;
10359 }
10360 value = o->onode.attrs[k];
10361 r = 0;
10362 }
10363 out:
7c673cae
FG
10364 if (r == 0 && _debug_mdata_eio(oid)) {
10365 r = -EIO;
10366 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10367 }
10368 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10369 << " = " << r << dendl;
10370 return r;
10371}
10372
7c673cae
FG
10373int BlueStore::getattrs(
10374 CollectionHandle &c_,
10375 const ghobject_t& oid,
10376 map<string,bufferptr>& aset)
10377{
10378 Collection *c = static_cast<Collection *>(c_.get());
10379 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10380 if (!c->exists)
10381 return -ENOENT;
10382
10383 int r;
10384 {
9f95a23c 10385 std::shared_lock l(c->lock);
7c673cae
FG
10386
10387 OnodeRef o = c->get_onode(oid, false);
10388 if (!o || !o->exists) {
10389 r = -ENOENT;
10390 goto out;
10391 }
10392 for (auto& i : o->onode.attrs) {
10393 aset.emplace(i.first.c_str(), i.second);
10394 }
10395 r = 0;
10396 }
10397
10398 out:
7c673cae
FG
10399 if (r == 0 && _debug_mdata_eio(oid)) {
10400 r = -EIO;
10401 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10402 }
10403 dout(10) << __func__ << " " << c->cid << " " << oid
10404 << " = " << r << dendl;
10405 return r;
10406}
10407
10408int BlueStore::list_collections(vector<coll_t>& ls)
10409{
9f95a23c 10410 std::shared_lock l(coll_lock);
11fdf7f2 10411 ls.reserve(coll_map.size());
7c673cae
FG
10412 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10413 p != coll_map.end();
10414 ++p)
10415 ls.push_back(p->first);
10416 return 0;
10417}
10418
10419bool BlueStore::collection_exists(const coll_t& c)
10420{
9f95a23c 10421 std::shared_lock l(coll_lock);
7c673cae
FG
10422 return coll_map.count(c);
10423}
10424
11fdf7f2 10425int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 10426{
11fdf7f2 10427 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
10428 vector<ghobject_t> ls;
10429 ghobject_t next;
11fdf7f2 10430 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
10431 &ls, &next);
10432 if (r < 0) {
10433 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10434 << dendl;
10435 return r;
10436 }
10437 *empty = ls.empty();
11fdf7f2 10438 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
10439 return 0;
10440}
10441
11fdf7f2 10442int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 10443{
11fdf7f2
TL
10444 dout(15) << __func__ << " " << ch->cid << dendl;
10445 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 10446 std::shared_lock l(c->lock);
11fdf7f2 10447 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
10448 return c->cnode.bits;
10449}
10450
7c673cae
FG
10451int BlueStore::collection_list(
10452 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10453 vector<ghobject_t> *ls, ghobject_t *pnext)
10454{
10455 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 10456 c->flush();
7c673cae
FG
10457 dout(15) << __func__ << " " << c->cid
10458 << " start " << start << " end " << end << " max " << max << dendl;
10459 int r;
10460 {
9f95a23c 10461 std::shared_lock l(c->lock);
f91f0fd5
TL
10462 r = _collection_list(c, start, end, max, false, ls, pnext);
10463 }
10464
10465 dout(10) << __func__ << " " << c->cid
10466 << " start " << start << " end " << end << " max " << max
10467 << " = " << r << ", ls.size() = " << ls->size()
10468 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10469 return r;
10470}
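// Paging sketch (illustrative; ch is assumed to be a valid
// CollectionHandle on this store):
//
//   ghobject_t next;
//   do {
//     std::vector<ghobject_t> batch;
//     store->collection_list(ch, next, ghobject_t::get_max(), 100,
//                            &batch, &next);
//     // process batch ...
//   } while (!next.is_max());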
10471
10472int BlueStore::collection_list_legacy(
10473 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10474 vector<ghobject_t> *ls, ghobject_t *pnext)
10475{
10476 Collection *c = static_cast<Collection *>(c_.get());
10477 c->flush();
10478 dout(15) << __func__ << " " << c->cid
10479 << " start " << start << " end " << end << " max " << max << dendl;
10480 int r;
10481 {
10482 std::shared_lock l(c->lock);
10483 r = _collection_list(c, start, end, max, true, ls, pnext);
7c673cae
FG
10484 }
10485
7c673cae
FG
10486 dout(10) << __func__ << " " << c->cid
10487 << " start " << start << " end " << end << " max " << max
10488 << " = " << r << ", ls.size() = " << ls->size()
10489 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10490 return r;
10491}
10492
10493int BlueStore::_collection_list(
10494 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
f91f0fd5 10495 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
7c673cae
FG
10496{
10497
10498 if (!c->exists)
10499 return -ENOENT;
10500
494da23a 10501 auto start_time = mono_clock::now();
7c673cae
FG
10502 int r = 0;
10503 ghobject_t static_next;
f91f0fd5
TL
10504 std::unique_ptr<CollectionListIterator> it;
10505 ghobject_t coll_range_temp_start, coll_range_temp_end;
10506 ghobject_t coll_range_start, coll_range_end;
7c673cae 10507 bool set_next = false;
f91f0fd5 10508 ghobject_t pend;
7c673cae
FG
10509 bool temp;
10510
10511 if (!pnext)
10512 pnext = &static_next;
10513
11fdf7f2 10514 if (start.is_max() || start.hobj.is_max()) {
7c673cae
FG
10515 goto out;
10516 }
f91f0fd5
TL
10517 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
10518 &coll_range_temp_end, &coll_range_start, &coll_range_end);
7c673cae 10519 dout(20) << __func__
f91f0fd5
TL
10520 << " range " << coll_range_temp_start
10521 << " to " << coll_range_temp_end
10522 << " and " << coll_range_start
10523 << " to " << coll_range_end
7c673cae 10524 << " start " << start << dendl;
10525 if (legacy) {
10526 it = std::make_unique<SimpleCollectionListIterator>(
10527 cct, db->get_iterator(PREFIX_OBJ));
10528 } else {
10529 it = std::make_unique<SortedCollectionListIterator>(
10530 db->get_iterator(PREFIX_OBJ));
10531 }
10532 if (start == ghobject_t() ||
10533 start.hobj == hobject_t() ||
10534 start == c->cid.get_min_hobj()) {
f91f0fd5 10535 it->upper_bound(coll_range_temp_start);
10536 temp = true;
10537 } else {
10538 if (start.hobj.is_temp()) {
10539 temp = true;
f91f0fd5 10540 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
10541 } else {
10542 temp = false;
f91f0fd5 10543 ceph_assert(start >= coll_range_start && start < coll_range_end);
7c673cae 10544 }
10545 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10546 it->lower_bound(start);
10547 }
10548 if (end.hobj.is_max()) {
f91f0fd5 10549 pend = temp ? coll_range_temp_end : coll_range_end;
7c673cae 10550 } else {
10551 if (end.hobj.is_temp()) {
10552 if (temp)
f91f0fd5 10553 pend = end;
7c673cae 10554 else
f91f0fd5 10555 goto out;
7c673cae 10556 } else {
f91f0fd5 10557 pend = temp ? coll_range_temp_end : end;
10558 }
10559 }
f91f0fd5 10560 dout(20) << __func__ << " pend " << pend << dendl;
7c673cae 10561 while (true) {
adb31ebb 10562 if (!it->valid() || it->is_ge(pend)) {
10563 if (!it->valid())
10564 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10565 else
f91f0fd5 10566 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
10567 if (temp) {
10568 if (end.hobj.is_temp()) {
adb31ebb 10569 if (it->valid() && it->is_lt(coll_range_temp_end)) {
10570 *pnext = it->oid();
10571 set_next = true;
10572 }
10573 break;
10574 }
10575 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10576 temp = false;
10577 it->upper_bound(coll_range_start);
10578 if (end.hobj.is_max())
10579 pend = coll_range_end;
10580 else
10581 pend = end;
10582 dout(30) << __func__ << " pend " << pend << dendl;
10583 continue;
10584 }
adb31ebb 10585 if (it->valid() && it->is_lt(coll_range_end)) {
10586 *pnext = it->oid();
10587 set_next = true;
10588 }
10589 break;
10590 }
f91f0fd5 10591 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
10592 if (ls->size() >= (unsigned)max) {
10593 dout(20) << __func__ << " reached max " << max << dendl;
f91f0fd5 10594 *pnext = it->oid();
10595 set_next = true;
10596 break;
10597 }
f91f0fd5 10598 ls->push_back(it->oid());
10599 it->next();
10600 }
10601out:
10602 if (!set_next) {
10603 *pnext = ghobject_t::get_max();
10604 }
10605 log_latency_fn(
10606 __func__,
10607 l_bluestore_clist_lat,
10608 mono_clock::now() - start_time,
10609 cct->_conf->bluestore_log_collection_list_age,
10610 [&] (const ceph::timespan& lat) {
10611 ostringstream ostr;
10612 ostr << ", lat = " << timespan_str(lat)
10613 << " cid =" << c->cid
10614 << " start " << start << " end " << end
10615 << " max " << max;
10616 return ostr.str();
10617 }
10618 );
10619 return r;
10620}
10621
10622int BlueStore::omap_get(
10623 CollectionHandle &c_, ///< [in] Collection containing oid
10624 const ghobject_t &oid, ///< [in] Object containing omap
10625 bufferlist *header, ///< [out] omap header
10626 map<string, bufferlist> *out ///< [out] Key to value map
10627 )
10628{
10629 Collection *c = static_cast<Collection *>(c_.get());
10630 return _omap_get(c, oid, header, out);
10631}
10632
10633int BlueStore::_omap_get(
10634 Collection *c, ///< [in] Collection containing oid
10635 const ghobject_t &oid, ///< [in] Object containing omap
10636 bufferlist *header, ///< [out] omap header
10637 map<string, bufferlist> *out ///< [out] Key to value map
10638 )
10639{
10640 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10641 if (!c->exists)
10642 return -ENOENT;
9f95a23c 10643 std::shared_lock l(c->lock);
10644 int r = 0;
10645 OnodeRef o = c->get_onode(oid, false);
10646 if (!o || !o->exists) {
10647 r = -ENOENT;
10648 goto out;
10649 }
10650 r = _onode_omap_get(o, header, out);
10651 out:
10652 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10653 << dendl;
10654 return r;
10655}
10656
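// Omap rows for an onode occupy a contiguous key range within its omap
// prefix: a header key, then one encoded key per user key, bounded by a tail
// sentinel. Scanning [header, tail) below therefore yields the header
// followed by every user key/value pair.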
10657int BlueStore::_onode_omap_get(
10658 const OnodeRef &o, ///< [in] Object containing omap
10659 bufferlist *header, ///< [out] omap header
10660 map<string, bufferlist> *out ///< [out] Key to value map
10661)
10662{
10663 int r = 0;
10664 if (!o || !o->exists) {
10665 r = -ENOENT;
10666 goto out;
10667 }
10668 if (!o->onode.has_omap())
10669 goto out;
10670 o->flush();
10671 {
9f95a23c 10672 const string& prefix = o->get_omap_prefix();
11fdf7f2 10673 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10674 string head, tail;
10675 o->get_omap_header(&head);
10676 o->get_omap_tail(&tail);
10677 it->lower_bound(head);
10678 while (it->valid()) {
10679 if (it->key() == head) {
10680 dout(30) << __func__ << " got header" << dendl;
10681 *header = it->value();
7c673cae 10682 } else if (it->key() >= tail) {
10683 dout(30) << __func__ << " reached tail" << dendl;
10684 break;
7c673cae 10685 } else {
10686 string user_key;
10687 o->decode_omap_key(it->key(), &user_key);
10688 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10689 << " -> " << user_key << dendl;
10690 (*out)[user_key] = it->value();
10691 }
10692 it->next();
10693 }
10694 }
9f95a23c 10695out:
10696 return r;
10697}
10698
10699int BlueStore::omap_get_header(
10700 CollectionHandle &c_, ///< [in] Collection containing oid
10701 const ghobject_t &oid, ///< [in] Object containing omap
10702 bufferlist *header, ///< [out] omap header
10703 bool allow_eio ///< [in] don't assert on eio
10704 )
10705{
10706 Collection *c = static_cast<Collection *>(c_.get());
10707 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10708 if (!c->exists)
10709 return -ENOENT;
9f95a23c 10710 std::shared_lock l(c->lock);
10711 int r = 0;
10712 OnodeRef o = c->get_onode(oid, false);
10713 if (!o || !o->exists) {
10714 r = -ENOENT;
10715 goto out;
10716 }
10717 if (!o->onode.has_omap())
10718 goto out;
10719 o->flush();
10720 {
10721 string head;
10722 o->get_omap_header(&head);
10723 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
10724 dout(30) << __func__ << " got header" << dendl;
10725 } else {
10726 dout(30) << __func__ << " no header" << dendl;
10727 }
10728 }
10729 out:
10730 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10731 << dendl;
10732 return r;
10733}
10734
10735int BlueStore::omap_get_keys(
10736 CollectionHandle &c_, ///< [in] Collection containing oid
10737 const ghobject_t &oid, ///< [in] Object containing omap
10738 set<string> *keys ///< [out] Keys defined on oid
10739 )
10740{
10741 Collection *c = static_cast<Collection *>(c_.get());
10742 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10743 if (!c->exists)
10744 return -ENOENT;
adb31ebb 10745 auto start1 = mono_clock::now();
9f95a23c 10746 std::shared_lock l(c->lock);
10747 int r = 0;
10748 OnodeRef o = c->get_onode(oid, false);
10749 if (!o || !o->exists) {
10750 r = -ENOENT;
10751 goto out;
10752 }
10753 if (!o->onode.has_omap())
10754 goto out;
10755 o->flush();
10756 {
9f95a23c 10757 const string& prefix = o->get_omap_prefix();
11fdf7f2 10758 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10759 string head, tail;
10760 o->get_omap_key(string(), &head);
10761 o->get_omap_tail(&tail);
10762 it->lower_bound(head);
10763 while (it->valid()) {
10764 if (it->key() >= tail) {
10765 dout(30) << __func__ << " reached tail" << dendl;
10766 break;
10767 }
10768 string user_key;
9f95a23c 10769 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 10770 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10771 << " -> " << user_key << dendl;
10772 keys->insert(user_key);
10773 it->next();
10774 }
10775 }
10776 out:
10777 c->store->log_latency(
10778 __func__,
10779 l_bluestore_omap_get_keys_lat,
10780 mono_clock::now() - start1,
10781 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10782
10783 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10784 << dendl;
10785 return r;
10786}
10787
10788int BlueStore::omap_get_values(
10789 CollectionHandle &c_, ///< [in] Collection containing oid
10790 const ghobject_t &oid, ///< [in] Object containing omap
10791 const set<string> &keys, ///< [in] Keys to get
10792 map<string, bufferlist> *out ///< [out] Returned keys and values
10793 )
10794{
10795 Collection *c = static_cast<Collection *>(c_.get());
10796 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10797 if (!c->exists)
10798 return -ENOENT;
9f95a23c 10799 std::shared_lock l(c->lock);
adb31ebb 10800 auto start1 = mono_clock::now();
10801 int r = 0;
10802 string final_key;
10803 OnodeRef o = c->get_onode(oid, false);
10804 if (!o || !o->exists) {
10805 r = -ENOENT;
10806 goto out;
10807 }
9f95a23c 10808 if (!o->onode.has_omap()) {
7c673cae 10809 goto out;
10810 }
10811 o->flush();
11fdf7f2 10812 {
10813 const string& prefix = o->get_omap_prefix();
10814 o->get_omap_key(string(), &final_key);
10815 size_t base_key_len = final_key.size();
11fdf7f2 10816 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10817 final_key.resize(base_key_len); // keep prefix
10818 final_key += *p;
10819 bufferlist val;
10820 if (db->get(prefix, final_key, &val) >= 0) {
10821 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10822 << " -> " << *p << dendl;
10823 out->insert(make_pair(*p, val));
10824 }
10825 }
10826 }
10827 out:
10828 c->store->log_latency(
10829 __func__,
10830 l_bluestore_omap_get_values_lat,
10831 mono_clock::now() - start1,
10832 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10833
10834 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10835 << dendl;
10836 return r;
10837}
10838
10839#ifdef WITH_SEASTAR
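// Variant compiled only for the seastar (crimson) build: returns every key
// strictly greater than *start_after together with its value (upper_bound
// makes the bound exclusive).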
10840int BlueStore::omap_get_values(
10841 CollectionHandle &c_, ///< [in] Collection containing oid
10842 const ghobject_t &oid, ///< [in] Object containing omap
10843 const std::optional<string> &start_after, ///< [in] return keys after this one
10844 map<string, bufferlist> *output ///< [out] Returned keys and values
10845 )
10846{
10847 Collection *c = static_cast<Collection *>(c_.get());
10848 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10849 if (!c->exists)
10850 return -ENOENT;
10851 std::shared_lock l(c->lock);
10852 int r = 0;
10853 OnodeRef o = c->get_onode(oid, false);
10854 if (!o || !o->exists) {
10855 r = -ENOENT;
10856 goto out;
10857 }
10858 if (!o->onode.has_omap()) {
10859 goto out;
10860 }
10861 o->flush();
10862 {
10863 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10864 if (!iter) {
10865 r = -ENOENT;
10866 goto out;
10867 }
10868 iter->upper_bound(*start_after);
10869 for (; iter->valid(); iter->next()) {
10870 output->insert(make_pair(iter->key(), iter->value()));
10871 }
10872 }
10873
10874out:
10875 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10876 << dendl;
10877 return r;
10878}
10879#endif
10880
10881int BlueStore::omap_check_keys(
10882 CollectionHandle &c_, ///< [in] Collection containing oid
10883 const ghobject_t &oid, ///< [in] Object containing omap
10884 const set<string> &keys, ///< [in] Keys to check
10885 set<string> *out ///< [out] Subset of keys defined on oid
10886 )
10887{
10888 Collection *c = static_cast<Collection *>(c_.get());
10889 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10890 if (!c->exists)
10891 return -ENOENT;
9f95a23c 10892 std::shared_lock l(c->lock);
10893 int r = 0;
10894 string final_key;
10895 OnodeRef o = c->get_onode(oid, false);
10896 if (!o || !o->exists) {
10897 r = -ENOENT;
10898 goto out;
10899 }
9f95a23c 10900 if (!o->onode.has_omap()) {
7c673cae 10901 goto out;
10902 }
10903 o->flush();
11fdf7f2 10904 {
10905 const string& prefix = o->get_omap_prefix();
10906 o->get_omap_key(string(), &final_key);
10907 size_t base_key_len = final_key.size();
11fdf7f2 10908 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10909 final_key.resize(base_key_len); // keep prefix
10910 final_key += *p;
10911 bufferlist val;
10912 if (db->get(prefix, final_key, &val) >= 0) {
10913 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10914 << " -> " << *p << dendl;
10915 out->insert(*p);
10916 } else {
10917 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10918 << " -> " << *p << dendl;
10919 }
10920 }
10921 }
10922 out:
10923 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10924 << dendl;
10925 return r;
10926}
10927
10928ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10929 CollectionHandle &c_, ///< [in] collection
10930 const ghobject_t &oid ///< [in] object
10931 )
10932{
10933 Collection *c = static_cast<Collection *>(c_.get());
10934 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10935 if (!c->exists) {
10936 return ObjectMap::ObjectMapIterator();
10937 }
9f95a23c 10938 std::shared_lock l(c->lock);
10939 OnodeRef o = c->get_onode(oid, false);
10940 if (!o || !o->exists) {
10941 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
10942 return ObjectMap::ObjectMapIterator();
10943 }
10944 o->flush();
10945 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
9f95a23c 10946 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
10947 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10948}
10949
10950// -----------------
10951// write helpers
10952
11fdf7f2 10953uint64_t BlueStore::_get_ondisk_reserved() const {
f67539c2 10954 ceph_assert(min_alloc_size);
10955 return round_up_to(
10956 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
10957}
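// Illustrative arithmetic (SUPER_RESERVED is 8 KiB at the time of writing):
// with a 4 KiB min_alloc_size the reserved region stays at 8 KiB, while a
// 64 KiB min_alloc_size rounds it up to 64 KiB.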
10958
10959void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
10960{
10961 dout(10) << __func__ << " ondisk_format " << ondisk_format
10962 << " min_compat_ondisk_format " << min_compat_ondisk_format
10963 << dendl;
11fdf7f2 10964 ceph_assert(ondisk_format == latest_ondisk_format);
10965 {
10966 bufferlist bl;
11fdf7f2 10967 encode(ondisk_format, bl);
10968 t->set(PREFIX_SUPER, "ondisk_format", bl);
10969 }
10970 {
10971 bufferlist bl;
11fdf7f2 10972 encode(min_compat_ondisk_format, bl);
10973 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
10974 }
10975}
10976
10977int BlueStore::_open_super_meta()
10978{
10979 // nid
10980 {
10981 nid_max = 0;
10982 bufferlist bl;
10983 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 10984 auto p = bl.cbegin();
10985 try {
10986 uint64_t v;
11fdf7f2 10987 decode(v, p);
7c673cae 10988 nid_max = v;
f67539c2 10989 } catch (ceph::buffer::error& e) {
10990 derr << __func__ << " unable to read nid_max" << dendl;
10991 return -EIO;
10992 }
f67539c2 10993 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
10994 nid_last = nid_max.load();
10995 }
10996
10997 // blobid
10998 {
10999 blobid_max = 0;
11000 bufferlist bl;
11001 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 11002 auto p = bl.cbegin();
11003 try {
11004 uint64_t v;
11fdf7f2 11005 decode(v, p);
7c673cae 11006 blobid_max = v;
f67539c2 11007 } catch (ceph::buffer::error& e) {
11008 derr << __func__ << " unable to read blobid_max" << dendl;
11009 return -EIO;
11010 }
f67539c2 11011 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
11012 blobid_last = blobid_max.load();
11013 }
11014
11015 // freelist
11016 {
11017 bufferlist bl;
11018 db->get(PREFIX_SUPER, "freelist_type", &bl);
11019 if (bl.length()) {
11020 freelist_type = std::string(bl.c_str(), bl.length());
f67539c2 11021 dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
7c673cae 11022 } else {
11fdf7f2 11023 ceph_abort_msg("unsupported freelist manager type");
7c673cae 11024 }
11025 }
11026
11027 // ondisk format
11028 int32_t compat_ondisk_format = 0;
11029 {
11030 bufferlist bl;
11031 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11032 if (r < 0) {
11033 // base case: kraken bluestore is v1 and readable by v1
11034 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11035 << dendl;
11036 ondisk_format = 1;
11037 compat_ondisk_format = 1;
11038 } else {
11fdf7f2 11039 auto p = bl.cbegin();
7c673cae 11040 try {
11fdf7f2 11041 decode(ondisk_format, p);
f67539c2 11042 } catch (ceph::buffer::error& e) {
11043 derr << __func__ << " unable to read ondisk_format" << dendl;
11044 return -EIO;
11045 }
11046 bl.clear();
11047 {
11048 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11049 ceph_assert(!r);
11050 auto p = bl.cbegin();
7c673cae 11051 try {
11fdf7f2 11052 decode(compat_ondisk_format, p);
f67539c2 11053 } catch (ceph::buffer::error& e) {
11054 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11055 return -EIO;
11056 }
11057 }
11058 }
f67539c2 11059 dout(1) << __func__ << " ondisk_format " << ondisk_format
11060 << " compat_ondisk_format " << compat_ondisk_format
11061 << dendl;
11062 }
11063
11064 if (latest_ondisk_format < compat_ondisk_format) {
11065 derr << __func__ << " compat_ondisk_format is "
11066 << compat_ondisk_format << " but we only understand version "
11067 << latest_ondisk_format << dendl;
11068 return -EPERM;
11069 }
11070
11071 {
11072 bufferlist bl;
11073 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 11074 auto p = bl.cbegin();
11075 try {
11076 uint64_t val;
11fdf7f2 11077 decode(val, p);
7c673cae 11078 min_alloc_size = val;
224ce89b 11079 min_alloc_size_order = ctz(val);
11fdf7f2 11080 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
f67539c2 11081 } catch (ceph::buffer::error& e) {
11082 derr << __func__ << " unable to read min_alloc_size" << dendl;
11083 return -EIO;
11084 }
f67539c2 11085 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11086 << std::dec << dendl;
11087 }
11088
11089 _set_per_pool_omap();
11090
224ce89b 11091 _open_statfs();
11092 _set_alloc_sizes();
11093 _set_throttle_params();
11094
11095 _set_csum();
11096 _set_compression();
11097 _set_blob_size();
11098
11fdf7f2 11099 _validate_bdev();
11100 return 0;
11101}
11102
11103int BlueStore::_upgrade_super()
11104{
11105 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11106 << latest_ondisk_format << dendl;
11107 if (ondisk_format < latest_ondisk_format) {
11108 ceph_assert(ondisk_format > 0);
11109 ceph_assert(ondisk_format < latest_ondisk_format);
11110
1911f103 11111 KeyValueDB::Transaction t = db->get_transaction();
11112 if (ondisk_format == 1) {
11113 // changes:
11114 // - super: added ondisk_format
11115 // - super: added min_readable_ondisk_format
11116 // - super: added min_compat_ondisk_format
11117 // - super: added min_alloc_size
11118 // - super: removed min_min_alloc_size
11119 {
11120 bufferlist bl;
11121 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11122 auto p = bl.cbegin();
11123 try {
11124 uint64_t val;
11125 decode(val, p);
11126 min_alloc_size = val;
f67539c2 11127 } catch (ceph::buffer::error& e) {
11128 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11129 return -EIO;
11130 }
11131 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11132 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 11133 }
11fdf7f2 11134 ondisk_format = 2;
7c673cae 11135 }
11136 if (ondisk_format == 2) {
11137 // changes:
11138 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
11139 // onodes are using the per-pool prefix until a repair is run; at that
11140 // point the per_pool_omap=1 key will be set.
11141 // - super: added per_pool_omap key, which indicates that *all* objects
11142 // are using the new prefix and key format
11143 ondisk_format = 3;
11144 }
11145 if (ondisk_format == 3) {
11146 // changes:
11147 // - FreelistManager keeps meta within bdev label
11148 int r = _write_out_fm_meta(0);
9f95a23c 11149 ceph_assert(r == 0);
1911f103 11150 ondisk_format = 4;
9f95a23c 11151 }
11152 // This must be the last operation
11153 _prepare_ondisk_format_super(t);
11154 int r = db->submit_transaction_sync(t);
11155 ceph_assert(r == 0);
7c673cae 11156 }
11157 // done
11158 dout(1) << __func__ << " done" << dendl;
11159 return 0;
11160}
11161
11162void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11163{
224ce89b 11164 if (o->onode.nid) {
11fdf7f2 11165 ceph_assert(o->exists);
7c673cae 11166 return;
224ce89b 11167 }
11168 uint64_t nid = ++nid_last;
11169 dout(20) << __func__ << " " << nid << dendl;
11170 o->onode.nid = nid;
11171 txc->last_nid = nid;
224ce89b 11172 o->exists = true;
11173}
11174
11175uint64_t BlueStore::_assign_blobid(TransContext *txc)
11176{
11177 uint64_t bid = ++blobid_last;
11178 dout(20) << __func__ << " " << bid << dendl;
11179 txc->last_blobid = bid;
11180 return bid;
11181}
11182
11183void BlueStore::get_db_statistics(Formatter *f)
11184{
11185 db->get_statistics(f);
11186}
11187
11188BlueStore::TransContext *BlueStore::_txc_create(
11189 Collection *c, OpSequencer *osr,
11190 list<Context*> *on_commits,
11191 TrackedOpRef osd_op)
7c673cae 11192{
11fdf7f2 11193 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae 11194 txc->t = db->get_transaction();
11195
11196#ifdef WITH_BLKIN
11197 if (osd_op && osd_op->pg_trace) {
11198 txc->trace.init("TransContext", &trace_endpoint,
11199 &osd_op->pg_trace);
11200 txc->trace.event("txc create");
11201 txc->trace.keyval("txc seq", txc->seq);
11202 }
11203#endif
11204
11205 osr->queue_new(txc);
11206 dout(20) << __func__ << " osr " << osr << " = " << txc
11207 << " seq " << txc->seq << dendl;
11208 return txc;
11209}
11210
11211void BlueStore::_txc_calc_cost(TransContext *txc)
11212{
11213 // one "io" for the kv commit
11214 auto ios = 1 + txc->ioc.get_num_ios();
11215 auto cost = throttle_cost_per_io.load();
11216 txc->cost = ios * cost + txc->bytes;
9f95a23c 11217 txc->ios = ios;
11218 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11219 << ios << " ios * " << cost << " + " << txc->bytes
11220 << " bytes)" << dendl;
11221}
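// Example: a txc carrying 2 data aios and 8 KiB of payload costs
// (1 + 2) * throttle_cost_per_io + 8192; the kv commit is charged as one
// extra io on top of the data ios.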
11222
11223void BlueStore::_txc_update_store_statfs(TransContext *txc)
11224{
11225 if (txc->statfs_delta.is_empty())
11226 return;
11227
11228 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11229 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11230 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11231 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11232 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11233
11234 bufferlist bl;
11235 txc->statfs_delta.encode(bl);
11236 if (per_pool_stat_collection) {
11237 string key;
11238 get_pool_stat_key(txc->osd_pool_id, &key);
11239 txc->t->merge(PREFIX_STAT, key, bl);
11240
11241 std::lock_guard l(vstatfs_lock);
11242 auto& stats = osd_pools[txc->osd_pool_id];
11243 stats += txc->statfs_delta;
11244
11245 vstatfs += txc->statfs_delta; //non-persistent in this mode
11246
11247 } else {
11248 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 11249
11250 std::lock_guard l(vstatfs_lock);
11251 vstatfs += txc->statfs_delta;
11252 }
11253 txc->statfs_delta.reset();
11254}
11255
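// Transaction state machine. A txc normally advances
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//   -> FINISHING -> DONE
// with a detour through DEFERRED_QUEUED/DEFERRED_CLEANUP when it carries a
// deferred write. This drives a txc through as many states as possible
// without blocking, returning when it must wait on io or the kv thread.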
11256void BlueStore::_txc_state_proc(TransContext *txc)
11257{
11258 while (true) {
11259 dout(10) << __func__ << " txc " << txc
11260 << " " << txc->get_state_name() << dendl;
f67539c2 11261 switch (txc->get_state()) {
7c673cae 11262 case TransContext::STATE_PREPARE:
9f95a23c 11263 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae 11264 if (txc->ioc.has_pending_aios()) {
11265 txc->set_state(TransContext::STATE_AIO_WAIT);
11266#ifdef WITH_BLKIN
11267 if (txc->trace) {
11268 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
11269 }
11270#endif
11271 txc->had_ios = true;
11272 _txc_aio_submit(txc);
11273 return;
11274 }
11275 // ** fall-thru **
11276
11277 case TransContext::STATE_AIO_WAIT:
11fdf7f2 11278 {
11279 mono_clock::duration lat = throttle.log_state_latency(
11280 *txc, logger, l_bluestore_state_aio_wait_lat);
11281 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11282 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11283 << ", latency = " << lat
11284 << dendl;
11285 }
11286 }
11287
11288 _txc_finish_io(txc); // may trigger blocked txc's too
11289 return;
11290
11291 case TransContext::STATE_IO_DONE:
11fdf7f2 11292 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
11293 if (txc->had_ios) {
11294 ++txc->osr->txc_with_unstable_io;
11295 }
9f95a23c 11296 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
f67539c2 11297 txc->set_state(TransContext::STATE_KV_QUEUED);
11298 if (cct->_conf->bluestore_sync_submit_transaction) {
11299 if (txc->last_nid >= nid_max ||
11300 txc->last_blobid >= blobid_max) {
11301 dout(20) << __func__
11302 << " last_{nid,blobid} exceeds max, submit via kv thread"
11303 << dendl;
11304 } else if (txc->osr->kv_committing_serially) {
11305 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11306 << dendl;
11307 // note: this is starvation-prone. once we have a txc in a busy
11308 // sequencer that is committing serially it is possible to keep
11309 // submitting new transactions fast enough that we get stuck doing
11310 // so. the alternative is to block here... fixme?
11311 } else if (txc->osr->txc_with_unstable_io) {
11312 dout(20) << __func__ << " prior txc(s) with unstable ios "
11313 << txc->osr->txc_with_unstable_io.load() << dendl;
11314 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11315 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11316 == 0) {
11317 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11318 << dendl;
11319 } else {
9f95a23c 11320 _txc_apply_kv(txc, true);
11321 }
11322 }
11323 {
11fdf7f2 11324 std::lock_guard l(kv_lock);
7c673cae 11325 kv_queue.push_back(txc);
11326 if (!kv_sync_in_progress) {
11327 kv_sync_in_progress = true;
11328 kv_cond.notify_one();
11329 }
f67539c2 11330 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
11331 kv_queue_unsubmitted.push_back(txc);
11332 ++txc->osr->kv_committing_serially;
11333 }
11334 if (txc->had_ios)
11335 kv_ios++;
11336 kv_throttle_costs += txc->cost;
11337 }
11338 return;
11339 case TransContext::STATE_KV_SUBMITTED:
11340 _txc_committed_kv(txc);
11341 // ** fall-thru **
11342
11343 case TransContext::STATE_KV_DONE:
9f95a23c 11344 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae 11345 if (txc->deferred_txn) {
f67539c2 11346 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
11347 _deferred_queue(txc);
11348 return;
11349 }
f67539c2 11350 txc->set_state(TransContext::STATE_FINISHING);
11351 break;
11352
11353 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 11354 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
f67539c2 11355 txc->set_state(TransContext::STATE_FINISHING);
11356 // ** fall-thru **
11357
11358 case TransContext::STATE_FINISHING:
9f95a23c 11359 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
11360 _txc_finish(txc);
11361 return;
11362
11363 default:
11364 derr << __func__ << " unexpected txc " << txc
11365 << " state " << txc->get_state_name() << dendl;
11fdf7f2 11366 ceph_abort_msg("unexpected txc state");
11367 return;
11368 }
11369 }
11370}
11371
11372void BlueStore::_txc_finish_io(TransContext *txc)
11373{
11374 dout(20) << __func__ << " " << txc << dendl;
11375
11376 /*
11377 * we need to preserve the order of kv transactions,
11378 * even though aio will complete in any order.
11379 */
11380
11381 OpSequencer *osr = txc->osr.get();
11fdf7f2 11382 std::lock_guard l(osr->qlock);
f67539c2 11383 txc->set_state(TransContext::STATE_IO_DONE);
11fdf7f2 11384 txc->ioc.release_running_aios();
11385 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11386 while (p != osr->q.begin()) {
11387 --p;
f67539c2 11388 if (p->get_state() < TransContext::STATE_IO_DONE) {
11389 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11390 << p->get_state_name() << dendl;
11391 return;
11392 }
f67539c2 11393 if (p->get_state() > TransContext::STATE_IO_DONE) {
11394 ++p;
11395 break;
11396 }
11397 }
11398 do {
11399 _txc_state_proc(&*p++);
11400 } while (p != osr->q.end() &&
f67539c2 11401 p->get_state() == TransContext::STATE_IO_DONE);
7c673cae 11402
11fdf7f2 11403 if (osr->kv_submitted_waiters) {
11404 osr->qcond.notify_all();
11405 }
11406}
11407
11408void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11409{
11410 dout(20) << __func__ << " txc " << txc
11411 << " onodes " << txc->onodes
11412 << " shared_blobs " << txc->shared_blobs
11413 << dendl;
11414
11415 // finalize onodes
11416 for (auto o : txc->onodes) {
11fdf7f2 11417 _record_onode(o, t);
11418 o->flushing_count++;
11419 }
11420
11421 // objects we modified but didn't affect the onode
11422 auto p = txc->modified_objects.begin();
11423 while (p != txc->modified_objects.end()) {
11424 if (txc->onodes.count(*p) == 0) {
11425 (*p)->flushing_count++;
11426 ++p;
11427 } else {
11428 // remove dups with onodes list to avoid problems in _txc_finish
11429 p = txc->modified_objects.erase(p);
11430 }
11431 }
11432
11433 // finalize shared_blobs
11434 for (auto sb : txc->shared_blobs) {
11435 string key;
11436 auto sbid = sb->get_sbid();
11437 get_shared_blob_key(sbid, &key);
11438 if (sb->persistent->empty()) {
11439 dout(20) << __func__ << " shared_blob 0x"
11440 << std::hex << sbid << std::dec
11441 << " is empty" << dendl;
11442 t->rmkey(PREFIX_SHARED_BLOB, key);
11443 } else {
11444 bufferlist bl;
11445 encode(*(sb->persistent), bl);
11446 dout(20) << __func__ << " shared_blob 0x"
11447 << std::hex << sbid << std::dec
31f18b77 11448 << " is " << bl.length() << " " << *sb << dendl;
11449 t->set(PREFIX_SHARED_BLOB, key, bl);
11450 }
11451 }
11452}
11453
11454void BlueStore::BSPerfTracker::update_from_perfcounters(
11455 PerfCounters &logger)
11456{
11457 os_commit_latency_ns.consume_next(
11458 logger.get_tavg_ns(
7c673cae 11459 l_bluestore_commit_lat));
11460 os_apply_latency_ns.consume_next(
11461 logger.get_tavg_ns(
11462 l_bluestore_commit_lat));
11463}
11464
11465// For every object we maintain a <zone_num+oid, offset> tuple in the key-value
11466// store. When a new object is written to a zone, we insert the corresponding
11467// tuple to the database. When an object is truncated, we remove the
11468// corresponding tuple. When an object is overwritten, we remove the old tuple
11469// and insert a new tuple corresponding to the new location of the object. The
11470// cleaner can now identify live objects within the zone <zone_num> by
11471// enumerating all the keys starting with <zone_num> prefix.
11472void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
11473 for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
11474 std::string key;
11475 get_object_key(cct, o->oid, &key);
11476 for (auto offset : offsets) {
11477 if (offset > 0) {
11478 bufferlist offset_bl;
11479 encode(offset, offset_bl);
11480 txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
11481 } else {
11482 txc->t->rmkey(_zoned_get_prefix(-offset), key);
11483 }
11484 }
11485 }
11486}
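// Note the sign convention above: a positive offset records the object's new
// location, while a negative offset removes the tuple for the zone containing
// -offset.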
11487
11488std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
11489 uint64_t zone_num = offset / bdev->get_zone_size();
11490 std::string zone_key;
11491 _key_encode_u64(zone_num, &zone_key);
11492 return PREFIX_ZONED_CL_INFO + zone_key;
11493}
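// A cleaning-info key is therefore PREFIX_ZONED_CL_INFO + the zone number
// (fixed-width big-endian via _key_encode_u64, so keys sort by zone) + the
// object key appended by the caller.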
11494
11495// For now, to avoid interface changes we piggyback zone_size (in MiB) and the
11496// first sequential zone number onto min_alloc_size and pass it to functions
11497// Allocator::create and FreelistManager::create.
11498uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
11499 uint64_t zone_size = bdev->get_zone_size();
11500 uint64_t zone_size_mb = zone_size / (1024 * 1024);
11501 uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
11502 min_alloc_size |= (zone_size_mb << 32);
11503 min_alloc_size |= (first_seq_zone << 48);
11504 return min_alloc_size;
11505}
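// Resulting bit layout of the returned value, as packed above:
//   bits 0..31  min_alloc_size
//   bits 32..47 zone size in MiB
//   bits 48..63 first sequential zone number
// e.g. a 256 MiB zone size contributes 0x100ull << 32.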
11506
11507int BlueStore::_zoned_check_config_settings() {
11508 if (cct->_conf->bluestore_allocator != "zoned") {
11509 dout(1) << __func__ << " The drive is HM-SMR but "
11510 << cct->_conf->bluestore_allocator << " allocator is specified. "
11511 << "Only zoned allocator can be used with HM-SMR drive." << dendl;
11512 return -EINVAL;
11513 }
11514
11515 // At least for now we want to use large min_alloc_size with HM-SMR drives.
11516 // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
11517 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
11518 if (min_alloc_size < 64 * 1024) {
11519 dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
11520 << min_alloc_size << ". "
11521 << "Please set to at least 64 KiB." << dendl;
11522 return -EINVAL;
11523 }
11524
11525 // We don't want to defer writes with HM-SMR because it violates sequential
11526 // write requirement.
11527 if (prefer_deferred_size) {
11528 dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
11529 << prefer_deferred_size << ". "
11530 << "Please set to 0." << dendl;
11531 return -EINVAL;
11532 }
11533 return 0;
11534}
11535
11536void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11537{
11538 dout(20) << __func__ << " txc " << txc << std::hex
11539 << " allocated 0x" << txc->allocated
11540 << " released 0x" << txc->released
11541 << std::dec << dendl;
11542
11543 // We have to handle the case where we allocate *and* deallocate the
11544 // same region in this transaction. The freelist doesn't like that.
11545 // (Actually, the only thing that cares is the BitmapFreelistManager
11546 // debug check. But that's important.)
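// Example: a txc that allocates extent 0x10000~0x1000 and then releases the
// same extent would otherwise feed the freelist both an allocate and a
// release of one range; subtracting the overlap below makes both a no-op
// for that range.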
11547 interval_set<uint64_t> tmp_allocated, tmp_released;
11548 interval_set<uint64_t> *pallocated = &txc->allocated;
11549 interval_set<uint64_t> *preleased = &txc->released;
11550 if (!txc->allocated.empty() && !txc->released.empty()) {
11551 interval_set<uint64_t> overlap;
11552 overlap.intersection_of(txc->allocated, txc->released);
11553 if (!overlap.empty()) {
11554 tmp_allocated = txc->allocated;
11555 tmp_allocated.subtract(overlap);
11556 tmp_released = txc->released;
11557 tmp_released.subtract(overlap);
11558 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11559 << ", new allocated 0x" << tmp_allocated
11560 << " released 0x" << tmp_released << std::dec
11561 << dendl;
11562 pallocated = &tmp_allocated;
11563 preleased = &tmp_released;
11564 }
11565 }
11566
11567 // update freelist with non-overlap sets
11568 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11569 p != pallocated->end();
11570 ++p) {
11571 fm->allocate(p.get_start(), p.get_len(), t);
11572 }
11573 for (interval_set<uint64_t>::iterator p = preleased->begin();
11574 p != preleased->end();
11575 ++p) {
11576 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11577 << "~" << p.get_len() << std::dec << dendl;
11578 fm->release(p.get_start(), p.get_len(), t);
11579 }
11580
11581 if (bdev->is_smr()) {
11582 _zoned_update_cleaning_metadata(txc);
11583 }
11584
11585 _txc_update_store_statfs(txc);
11586}
11587
9f95a23c 11588void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 11589{
f67539c2 11590 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
11591 {
11592#if defined(WITH_LTTNG)
11593 auto start = mono_clock::now();
11594#endif
11595
11596#ifdef WITH_BLKIN
11597 if (txc->trace) {
11598 txc->trace.event("db async submit");
11599 }
11600#endif
11601
11602 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11603 ceph_assert(r == 0);
f67539c2 11604 txc->set_state(TransContext::STATE_KV_SUBMITTED);
11605 if (txc->osr->kv_submitted_waiters) {
11606 std::lock_guard l(txc->osr->qlock);
11607 txc->osr->qcond.notify_all();
11608 }
11609
11610#if defined(WITH_LTTNG)
11611 if (txc->tracing) {
11612 tracepoint(
11613 bluestore,
11614 transaction_kv_submit_latency,
11615 txc->osr->get_sequencer_id(),
11616 txc->seq,
11617 sync_submit_transaction,
11618 ceph::to_seconds<double>(mono_clock::now() - start));
11619 }
11620#endif
11621 }
11622
11623 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11624 for (auto& o : *ls) {
11625 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11626 << dendl;
9f95a23c 11627 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 11628 std::lock_guard l(o->flush_lock);
11629 o->flush_cond.notify_all();
11630 }
11631 }
11632 }
11633}
11634
11635void BlueStore::_txc_committed_kv(TransContext *txc)
11636{
11637 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 11638 throttle.complete_kv(*txc);
1adf2230 11639 {
11fdf7f2 11640 std::lock_guard l(txc->osr->qlock);
f67539c2 11641 txc->set_state(TransContext::STATE_KV_DONE);
11642 if (txc->ch->commit_queue) {
11643 txc->ch->commit_queue->queue(txc->oncommits);
11644 } else {
11645 finisher.queue(txc->oncommits);
1adf2230 11646 }
7c673cae 11647 }
9f95a23c 11648 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
11649 log_latency_fn(
11650 __func__,
11651 l_bluestore_commit_lat,
9f95a23c 11652 mono_clock::now() - txc->start,
11653 cct->_conf->bluestore_log_op_age,
11654 [&](auto lat) {
11655 return ", txc = " + stringify(txc);
11656 }
11fdf7f2 11657 );
11658}
11659
11660void BlueStore::_txc_finish(TransContext *txc)
11661{
11662 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
f67539c2 11663 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
11664
11665 for (auto& sb : txc->shared_blobs_written) {
f64942e4 11666 sb->finish_write(txc->seq);
11667 }
11668 txc->shared_blobs_written.clear();
11669
11670 while (!txc->removed_collections.empty()) {
11671 _queue_reap_collection(txc->removed_collections.front());
11672 txc->removed_collections.pop_front();
11673 }
11674
11675 OpSequencerRef osr = txc->osr;
7c673cae 11676 bool empty = false;
31f18b77 11677 bool submit_deferred = false;
11678 OpSequencer::q_list_t releasing_txc;
11679 {
11fdf7f2 11680 std::lock_guard l(osr->qlock);
f67539c2 11681 txc->set_state(TransContext::STATE_DONE);
11682 bool notify = false;
11683 while (!osr->q.empty()) {
11684 TransContext *txc = &osr->q.front();
11685 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11686 << dendl;
11687 if (txc->get_state() != TransContext::STATE_DONE) {
11688 if (txc->get_state() == TransContext::STATE_PREPARE &&
11689 deferred_aggressive) {
11690 // for _osr_drain_preceding()
11691 notify = true;
11692 }
f67539c2 11693 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 11694 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
11695 submit_deferred = true;
11696 }
11697 break;
11698 }
11699
11700 osr->q.pop_front();
11701 releasing_txc.push_back(*txc);
7c673cae 11702 }
9f95a23c 11703
11704 if (osr->q.empty()) {
11705 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11706 empty = true;
11707 }
11708
11709 // only drain()/drain_preceding() need wakeup,
11710 // other cases use kv_submitted_waiters
11711 if (notify || empty) {
11712 osr->qcond.notify_all();
11713 }
7c673cae 11714 }
9f95a23c 11715
11716 while (!releasing_txc.empty()) {
11717 // release to allocator only after all preceding txc's have also
11718 // finished any deferred writes that potentially land in these
11719 // blocks
11720 auto txc = &releasing_txc.front();
11721 _txc_release_alloc(txc);
11722 releasing_txc.pop_front();
11723 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11724 throttle.complete(*txc);
11725 delete txc;
11726 }
11727
11728 if (submit_deferred) {
11729 // we're pinning memory; flush! we could be more fine-grained here but
11730 // i'm not sure it's worth the bother.
11731 deferred_try_submit();
11732 }
11733
7c673cae 11734 if (empty && osr->zombie) {
11735 std::lock_guard l(zombie_osr_lock);
11736 if (zombie_osr_set.erase(osr->cid)) {
11737 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11738 } else {
11739 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11740 << dendl;
11741 }
7c673cae 11742 }
9f95a23c 11743}
11744
11745void BlueStore::_txc_release_alloc(TransContext *txc)
11746{
a8e16298 11747 // it's expected we're called with lazy_release_lock already taken!
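// Three paths below: async discard queues the released extents and returns
// early; sync discard trims each extent inline before releasing; and if the
// async queue attempt fails (r != 0) we fall through and release without
// discarding.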
11748 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11749 int r = 0;
11750 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11751 r = bdev->queue_discard(txc->released);
11752 if (r == 0) {
11753 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11754 << txc->released << std::dec << dendl;
11755 goto out;
11756 }
11757 } else if (cct->_conf->bdev_enable_discard) {
11758 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11759 bdev->discard(p.get_start(), p.get_len());
11760 }
11761 }
11762 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 11763 << txc->released << std::dec << dendl;
f67539c2 11764 shared_alloc.a->release(txc->released);
11765 }
11766
11fdf7f2 11767out:
11768 txc->allocated.clear();
11769 txc->released.clear();
11770}
11771
11772void BlueStore::_osr_attach(Collection *c)
11773{
11774 // note: caller has RWLock on coll_map
11775 auto q = coll_map.find(c->cid);
11776 if (q != coll_map.end()) {
11777 c->osr = q->second->osr;
11778 ldout(cct, 10) << __func__ << " " << c->cid
11779 << " reusing osr " << c->osr << " from existing coll "
11780 << q->second << dendl;
11781 } else {
11782 std::lock_guard l(zombie_osr_lock);
11783 auto p = zombie_osr_set.find(c->cid);
11784 if (p == zombie_osr_set.end()) {
9f95a23c 11785 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11786 ldout(cct, 10) << __func__ << " " << c->cid
11787 << " fresh osr " << c->osr << dendl;
11788 } else {
11789 c->osr = p->second;
11790 zombie_osr_set.erase(p);
11791 ldout(cct, 10) << __func__ << " " << c->cid
11792 << " resurrecting zombie osr " << c->osr << dendl;
11793 c->osr->zombie = false;
11794 }
11795 }
11796}
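// A zombie osr belongs to a removed collection whose queued transactions have
// not fully drained yet; reusing it above preserves op ordering if the
// collection is recreated in the meantime.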
11797
11798void BlueStore::_osr_register_zombie(OpSequencer *osr)
11799{
11800 std::lock_guard l(zombie_osr_lock);
11801 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11802 osr->zombie = true;
11803 auto i = zombie_osr_set.emplace(osr->cid, osr);
11804 // this is either a new insertion or the same osr is already there
11805 ceph_assert(i.second || i.first->second == osr);
11806}
11807
11808void BlueStore::_osr_drain_preceding(TransContext *txc)
11809{
11810 OpSequencer *osr = txc->osr.get();
11811 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11812 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11813 {
11814 // submit anything pending
f67539c2 11815 osr->deferred_lock.lock();
11fdf7f2 11816 if (osr->deferred_pending && !osr->deferred_running) {
11817 _deferred_submit_unlock(osr);
11818 } else {
f67539c2 11819 osr->deferred_lock.unlock();
11820 }
11821 }
11822 {
11823 // wake up any previously finished deferred events
11fdf7f2 11824 std::lock_guard l(kv_lock);
11825 if (!kv_sync_in_progress) {
11826 kv_sync_in_progress = true;
11827 kv_cond.notify_one();
11828 }
11829 }
11830 osr->drain_preceding(txc);
11831 --deferred_aggressive;
11832 dout(10) << __func__ << " " << osr << " done" << dendl;
11833}
11834
11835void BlueStore::_osr_drain(OpSequencer *osr)
11836{
11837 dout(10) << __func__ << " " << osr << dendl;
11838 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11839 {
11840 // submit anything pending
f67539c2 11841 osr->deferred_lock.lock();
11842 if (osr->deferred_pending && !osr->deferred_running) {
11843 _deferred_submit_unlock(osr);
11844 } else {
f67539c2 11845 osr->deferred_lock.unlock();
11846 }
11847 }
11848 {
11849 // wake up any previously finished deferred events
11850 std::lock_guard l(kv_lock);
9f95a23c
TL
11851 if (!kv_sync_in_progress) {
11852 kv_sync_in_progress = true;
11853 kv_cond.notify_one();
11854 }
11855 }
11856 osr->drain();
11857 --deferred_aggressive;
11858 dout(10) << __func__ << " " << osr << " done" << dendl;
11859}
11860
11861void BlueStore::_osr_drain_all()
11862{
11863 dout(10) << __func__ << dendl;
11864
11865 set<OpSequencerRef> s;
11866 vector<OpSequencerRef> zombies;
11867 {
9f95a23c 11868 std::shared_lock l(coll_lock);
11869 for (auto& i : coll_map) {
11870 s.insert(i.second->osr);
11871 }
11872 }
7c673cae 11873 {
11874 std::lock_guard l(zombie_osr_lock);
11875 for (auto& i : zombie_osr_set) {
11876 s.insert(i.second);
11877 zombies.push_back(i.second);
11878 }
11879 }
11880 dout(20) << __func__ << " osr_set " << s << dendl;
11881
11882 ++deferred_aggressive;
11883 {
11884 // submit anything pending
224ce89b 11885 deferred_try_submit();
11886 }
11887 {
11888 // wake up any previously finished deferred events
11fdf7f2 11889 std::lock_guard l(kv_lock);
11890 kv_cond.notify_one();
11891 }
31f18b77 11892 {
11fdf7f2 11893 std::lock_guard l(kv_finalize_lock);
11894 kv_finalize_cond.notify_one();
11895 }
7c673cae
FG
11896 for (auto osr : s) {
11897 dout(20) << __func__ << " drain " << osr << dendl;
11898 osr->drain();
11899 }
11900 --deferred_aggressive;
11901
7c673cae 11902 {
11903 std::lock_guard l(zombie_osr_lock);
11904 for (auto& osr : zombies) {
11905 if (zombie_osr_set.erase(osr->cid)) {
11906 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11907 ceph_assert(osr->q.empty());
11908 } else if (osr->zombie) {
11909 dout(10) << __func__ << " empty zombie osr " << osr
11910 << " already reaped" << dendl;
11911 ceph_assert(osr->q.empty());
11912 } else {
11913 dout(10) << __func__ << " empty zombie osr " << osr
11914 << " resurrected" << dendl;
11915 }
11916 }
11917 }
11918
11919 dout(10) << __func__ << " done" << dendl;
11920}
11921
11fdf7f2 11922
11923void BlueStore::_kv_start()
11924{
11925 dout(10) << __func__ << dendl;
11926
11fdf7f2 11927 finisher.start();
11928 kv_sync_thread.create("bstore_kv_sync");
11929 kv_finalize_thread.create("bstore_kv_final");
11930}
11931
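// Shutdown handshake: wait for each thread to have actually started (so the
// stop flag and wakeup cannot race with startup), set the stop flag and
// notify, join, then clear the flags so the store could be started again.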
11932void BlueStore::_kv_stop()
11933{
11934 dout(10) << __func__ << dendl;
11935 {
9f95a23c 11936 std::unique_lock l{kv_lock};
11937 while (!kv_sync_started) {
11938 kv_cond.wait(l);
11939 }
11940 kv_stop = true;
11941 kv_cond.notify_all();
11942 }
11943 {
9f95a23c 11944 std::unique_lock l{kv_finalize_lock};
11945 while (!kv_finalize_started) {
11946 kv_finalize_cond.wait(l);
11947 }
11948 kv_finalize_stop = true;
11949 kv_finalize_cond.notify_all();
11950 }
11951 kv_sync_thread.join();
11952 kv_finalize_thread.join();
11fdf7f2 11953 ceph_assert(removed_collections.empty());
31f18b77 11954 {
11fdf7f2 11955 std::lock_guard l(kv_lock);
11956 kv_stop = false;
11957 }
11958 {
11fdf7f2 11959 std::lock_guard l(kv_finalize_lock);
11960 kv_finalize_stop = false;
11961 }
11962 dout(10) << __func__ << " stopping finishers" << dendl;
11963 finisher.wait_for_empty();
11964 finisher.stop();
11965 dout(10) << __func__ << " stopped" << dendl;
11966}
11967
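// Each iteration of the sync loop: swap in the pending queues under kv_lock,
// flush the block device if any aios or deferred writes must become stable,
// grow the nid/blobid preallocation ranges when they run low, submit the
// final transaction synchronously, then hand completed txcs and stable
// deferred batches to the finalize thread.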
11968void BlueStore::_kv_sync_thread()
11969{
11970 dout(10) << __func__ << " start" << dendl;
11fdf7f2 11971 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 11972 std::unique_lock l{kv_lock};
11fdf7f2 11973 ceph_assert(!kv_sync_started);
11974 kv_sync_started = true;
11975 kv_cond.notify_all();
11976
11977 auto t0 = mono_clock::now();
11978 timespan twait = ceph::make_timespan(0);
11979 size_t kv_submitted = 0;
11980
7c673cae 11981 while (true) {
11982 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
11983 auto observation_period =
11984 ceph::make_timespan(period);
11985 auto elapsed = mono_clock::now() - t0;
11986 if (period && elapsed >= observation_period) {
11987 dout(5) << __func__ << " utilization: idle "
11988 << twait << " of " << elapsed
11989 << ", submitted: " << kv_submitted
11990 << dendl;
11991 t0 = mono_clock::now();
11992 twait = ceph::make_timespan(0);
11993 kv_submitted = 0;
11994 }
11fdf7f2 11995 ceph_assert(kv_committing.empty());
11996 if (kv_queue.empty() &&
11997 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 11998 !deferred_aggressive)) {
11999 if (kv_stop)
12000 break;
12001 dout(20) << __func__ << " sleep" << dendl;
adb31ebb 12002 auto t = mono_clock::now();
9f95a23c 12003 kv_sync_in_progress = false;
11fdf7f2 12004 kv_cond.wait(l);
12005 twait += mono_clock::now() - t;
12006
12007 dout(20) << __func__ << " wake" << dendl;
12008 } else {
12009 deque<TransContext*> kv_submitting;
12010 deque<DeferredBatch*> deferred_done, deferred_stable;
12011 uint64_t aios = 0, costs = 0;
12012
7c673cae
FG
12013 dout(20) << __func__ << " committing " << kv_queue.size()
12014 << " submitting " << kv_queue_unsubmitted.size()
12015 << " deferred done " << deferred_done_queue.size()
12016 << " stable " << deferred_stable_queue.size()
12017 << dendl;
12018 kv_committing.swap(kv_queue);
12019 kv_submitting.swap(kv_queue_unsubmitted);
12020 deferred_done.swap(deferred_done_queue);
12021 deferred_stable.swap(deferred_stable_queue);
12022 aios = kv_ios;
12023 costs = kv_throttle_costs;
12024 kv_ios = 0;
12025 kv_throttle_costs = 0;
12026 l.unlock();
12027
12028 dout(30) << __func__ << " committing " << kv_committing << dendl;
12029 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12030 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12031 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12032
12033 auto start = mono_clock::now();
12034
12035 bool force_flush = false;
12036 // if bluefs is sharing the same device as data (only), then we
12037 // can rely on the bluefs commit to flush the device and make
12038 // deferred aios stable. that means that if we do have done deferred
12039 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 12040 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 12041 if (aios) {
7c673cae 12042 force_flush = true;
11fdf7f2 12043 } else if (kv_committing.empty() && deferred_stable.empty()) {
12044 force_flush = true; // there's nothing else to commit!
12045 } else if (deferred_aggressive) {
12046 force_flush = true;
12047 }
11fdf7f2
TL
12048 } else {
12049 if (aios || !deferred_done.empty()) {
12050 force_flush = true;
12051 } else {
12052 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12053 }
12054 }
12055
12056 if (force_flush) {
31f18b77 12057 dout(20) << __func__ << " num_aios=" << aios
12058 << " force_flush=" << (int)force_flush
12059 << ", flushing, deferred done->stable" << dendl;
12060 // flush/barrier on block device
12061 bdev->flush();
12062
12063 // if we flush then deferred done are now deferred stable
12064 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12065 deferred_done.end());
12066 deferred_done.clear();
12067 }
11fdf7f2 12068 auto after_flush = mono_clock::now();
12069
12070 // we will use one final transaction to force a sync
12071 KeyValueDB::Transaction synct = db->get_transaction();
12072
12073 // increase {nid,blobid}_max? note that this covers both the
12074 // case where we are approaching the max and the case we passed
12075 // it. in either case, we increase the max in the earlier txn
12076 // we submit.
12077 uint64_t new_nid_max = 0, new_blobid_max = 0;
12078 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12079 KeyValueDB::Transaction t =
12080 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12081 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12082 bufferlist bl;
11fdf7f2 12083 encode(new_nid_max, bl);
12084 t->set(PREFIX_SUPER, "nid_max", bl);
12085 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12086 }
12087 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12088 KeyValueDB::Transaction t =
12089 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12090 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12091 bufferlist bl;
11fdf7f2 12092 encode(new_blobid_max, bl);
12093 t->set(PREFIX_SUPER, "blobid_max", bl);
12094 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12095 }
12096
12097 for (auto txc : kv_committing) {
9f95a23c 12098 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
f67539c2 12099 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
adb31ebb 12100 ++kv_submitted;
9f95a23c 12101 _txc_apply_kv(txc, false);
c07f9fc5 12102 --txc->osr->kv_committing_serially;
c07f9fc5 12103 } else {
f67539c2 12104 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 12105 }
12106 if (txc->had_ios) {
12107 --txc->osr->txc_with_unstable_io;
12108 }
12109 }
12110
12111 // release throttle *before* we commit. this allows new ops
12112 // to be prepared and enter pipeline while we are waiting on
12113 // the kv commit sync/flush. then hopefully on the next
12114 // iteration there will already be ops awake. otherwise, we
12115 // end up going to sleep, and then wake up when the very first
12116 // transaction is ready for commit.
9f95a23c 12117 throttle.release_kv_throttle(costs);
31f18b77 12118
12119 // cleanup sync deferred keys
12120 for (auto b : deferred_stable) {
12121 for (auto& txc : b->txcs) {
12122 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 12123 ceph_assert(wt.released.empty()); // only kraken did this
12124 string key;
12125 get_deferred_key(wt.seq, &key);
12126 synct->rm_single_key(PREFIX_DEFERRED, key);
12127 }
12128 }
12129
12130#if defined(WITH_LTTNG)
12131 auto sync_start = mono_clock::now();
12132#endif
7c673cae 12133 // submit synct synchronously (block and wait for it to commit)
31f18b77 12134 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
12135 ceph_assert(r == 0);
12136
f67539c2
TL
12137#ifdef WITH_BLKIN
12138 for (auto txc : kv_committing) {
12139 if (txc->trace) {
12140 txc->trace.event("db sync submit");
12141 txc->trace.keyval("kv_committing size", kv_committing.size());
12142 }
12143 }
12144#endif
12145
9f95a23c
TL
12146 int committing_size = kv_committing.size();
12147 int deferred_size = deferred_stable.size();
12148
12149#if defined(WITH_LTTNG)
12150 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12151 for (auto txc: kv_committing) {
12152 if (txc->tracing) {
12153 tracepoint(
12154 bluestore,
12155 transaction_kv_sync_latency,
12156 txc->osr->get_sequencer_id(),
12157 txc->seq,
12158 kv_committing.size(),
12159 deferred_done.size(),
12160 deferred_stable.size(),
12161 sync_latency);
12162 }
12163 }
12164#endif
12165
11fdf7f2 12166 {
9f95a23c 12167 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
12168 if (kv_committing_to_finalize.empty()) {
12169 kv_committing_to_finalize.swap(kv_committing);
12170 } else {
12171 kv_committing_to_finalize.insert(
12172 kv_committing_to_finalize.end(),
12173 kv_committing.begin(),
12174 kv_committing.end());
12175 kv_committing.clear();
12176 }
12177 if (deferred_stable_to_finalize.empty()) {
12178 deferred_stable_to_finalize.swap(deferred_stable);
12179 } else {
12180 deferred_stable_to_finalize.insert(
12181 deferred_stable_to_finalize.end(),
12182 deferred_stable.begin(),
12183 deferred_stable.end());
12184 deferred_stable.clear();
12185 }
9f95a23c
TL
12186 if (!kv_finalize_in_progress) {
12187 kv_finalize_in_progress = true;
12188 kv_finalize_cond.notify_one();
12189 }
11fdf7f2 12190 }
7c673cae
FG
12191
12192 if (new_nid_max) {
12193 nid_max = new_nid_max;
12194 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12195 }
12196 if (new_blobid_max) {
12197 blobid_max = new_blobid_max;
12198 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12199 }
12200
224ce89b 12201 {
11fdf7f2
TL
12202 auto finish = mono_clock::now();
12203 ceph::timespan dur_flush = after_flush - start;
12204 ceph::timespan dur_kv = finish - after_flush;
12205 ceph::timespan dur = finish - start;
9f95a23c
TL
12206 dout(20) << __func__ << " committed " << committing_size
12207 << " cleaned " << deferred_size
224ce89b
WB
12208 << " in " << dur
12209 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12210 << dendl;
494da23a
TL
12211 log_latency("kv_flush",
12212 l_bluestore_kv_flush_lat,
12213 dur_flush,
12214 cct->_conf->bluestore_log_op_age);
12215 log_latency("kv_commit",
12216 l_bluestore_kv_commit_lat,
12217 dur_kv,
12218 cct->_conf->bluestore_log_op_age);
12219 log_latency("kv_sync",
12220 l_bluestore_kv_sync_lat,
12221 dur,
12222 cct->_conf->bluestore_log_op_age);
7c673cae 12223 }
31f18b77 12224
31f18b77
FG
12225 l.lock();
12226 // previously deferred "done" are now "stable" by virtue of this
12227 // commit cycle.
12228 deferred_stable_queue.swap(deferred_done);
12229 }
12230 }
12231 dout(10) << __func__ << " finish" << dendl;
12232 kv_sync_started = false;
12233}
12234
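// Hand-off between the two kv threads: _kv_sync_thread (above) flushes the
// block device, submits the final synchronous kv transaction, and then moves
// committed txcs to kv_committing_to_finalize and stable deferred batches to
// deferred_stable_to_finalize.  _kv_finalize_thread (below) drains those
// queues and advances each txc's state machine via _txc_state_proc, so
// per-txc completion work never runs inside the sync thread itself.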
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
                  (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));

      log_latency("kv_final",
        l_bluestore_kv_final_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}

void BlueStore::_zoned_cleaner_start() {
  dout(10) << __func__ << dendl;

  zoned_cleaner_thread.create("bstore_zcleaner");
}

void BlueStore::_zoned_cleaner_stop() {
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{zoned_cleaner_lock};
    while (!zoned_cleaner_started) {
      zoned_cleaner_cond.wait(l);
    }
    zoned_cleaner_stop = true;
    zoned_cleaner_cond.notify_all();
  }
  zoned_cleaner_thread.join();
  {
    std::lock_guard l{zoned_cleaner_lock};
    zoned_cleaner_stop = false;
  }
  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_zoned_cleaner_thread() {
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  std::deque<uint64_t> zones_to_clean;
  while (true) {
    if (zoned_cleaner_queue.empty()) {
      if (zoned_cleaner_stop) {
        break;
      }
      dout(20) << __func__ << " sleep" << dendl;
      zoned_cleaner_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      zones_to_clean.swap(zoned_cleaner_queue);
      l.unlock();
      while (!zones_to_clean.empty()) {
        _zoned_clean_zone(zones_to_clean.front());
        zones_to_clean.pop_front();
      }
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}

void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
  dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
}
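
// Note: in this version _zoned_clean_zone only logs the zone number; the
// actual zone-cleaning logic is not implemented here yet, so the cleaner
// thread above is effectively a queue-draining skeleton.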

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}

void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;

  DeferredBatch *tmp;
  txc->osr->deferred_lock.lock();
  {
    if (!txc->osr->deferred_pending) {
      tmp = new DeferredBatch(cct, txc->osr.get());
    } else {
      tmp = txc->osr->deferred_pending;
    }
  }

  tmp->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
    }
  }

  {
    ++deferred_queue_size;
    txc->osr->deferred_pending = tmp;
    // the condition "tmp->txcs.size() == 1" means deferred_pending was
    // originally empty, so this osr must be added to deferred_queue.
    if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
      deferred_lock.lock();
      deferred_queue.push_back(*txc->osr);
      deferred_lock.unlock();
    }

    if (deferred_aggressive &&
        !txc->osr->deferred_running) {
      _deferred_submit_unlock(txc->osr.get());
    } else {
      txc->osr->deferred_lock.unlock();
    }
  }
}
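
// A DeferredBatch accumulates small overwrites for one OpSequencer.  The
// payload has already been journaled in RocksDB under PREFIX_DEFERRED by
// queue_transactions(), so the batch only has to replay that data onto the
// block device; prepare_write() indexes each extent by disk offset in the
// batch's iomap so the submit path can coalesce adjacent writes.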

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  vector<OpSequencerRef> osrs;

  {
    std::lock_guard l(deferred_lock);
    osrs.reserve(deferred_queue.size());
    for (auto& osr : deferred_queue) {
      osrs.push_back(&osr);
    }
  }

  for (auto& osr : osrs) {
    osr->deferred_lock.lock();
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
      } else {
        osr->deferred_lock.unlock();
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      osr->deferred_lock.unlock();
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  {
    std::lock_guard l(deferred_lock);
    deferred_last_submitted = ceph_clock_now();
  }
}

void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  osr->deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}
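
// The loop above walks iomap (keyed by disk offset) and coalesces runs of
// contiguous entries into one bufferlist per aio.  For example, entries
// {0x1000: 4K, 0x2000: 4K, 0x8000: 4K} are issued as two aio_writes:
// 0x1000~8K and 0x8000~4K.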

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
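
// When more deferred work arrives while a batch is in flight and we are in
// deferred_aggressive mode, _deferred_aio_finish (below) requeues submission
// through the finisher with this Context, presumably so that the next
// deferred_try_submit() runs from a finisher thread rather than from the aio
// completion context that invoked it.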

void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    osr->deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      {
        deferred_lock.lock();
        auto q = deferred_queue.iterator_to(*osr);
        deferred_queue.erase(q);
        deferred_lock.unlock();
      }
      osr->deferred_lock.unlock();
    } else {
      osr->deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
        costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}

int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->set_state(TransContext::STATE_KV_DONE);
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
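
// _deferred_replay runs at mount: every record still present under
// PREFIX_DEFERRED describes a deferred write that was committed to the kv
// store but may not have reached the data device.  Each record is decoded
// into a fresh txc that starts at STATE_KV_DONE, so only the deferred device
// writes (and the subsequent key cleanup) are re-executed.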

// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit, op);

  // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
  // submission to happen atomically because if I/O submission happens in a
  // different order than I/O allocation, we end up issuing non-sequential
  // writes to the drive.  This is a temporary solution until ZONE APPEND
  // support matures in the kernel.  For more information please see:
  // https://www.usenix.org/conference/vault20/presentation/bjorling
  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.lock();
  }
  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc encode finished");
  }
#endif

  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.unlock();
  }

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc applied");
  }
#endif

  log_latency("submit_transact",
    l_bluestore_submit_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
    l_bluestore_throttle_lat,
    tend - tstart,
    cct->_conf->bluestore_log_op_age);
  return 0;
}
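
// Minimal caller sketch (hypothetical callback name, for illustration only):
//
//   ObjectStore::Transaction t;
//   t.write(cid, oid, 0, bl.length(), bl);      // stage a write
//   t.register_on_commit(new C_MyCommitCb);     // fires once kv-committed
//   vector<ObjectStore::Transaction> tls;
//   tls.push_back(std::move(t));
//   store->queue_transactions(ch, tls);         // returns once queued
//
// on_applied/on_applied_sync contexts complete immediately (see "we're
// immediately readable" above), while on_commit contexts wait for the kv
// commit to become durable.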

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->hint);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}
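
// Error policy for _txc_add_transaction: ObjectStore transactions must apply
// atomically, so any error other than an expected -ENOENT/-ENODATA aborts
// the process rather than risk leaving a transaction partially applied.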

// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
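
// Worked example: with chunk_size 0x1000, a write of 0x10 bytes at
// *offset 0x1003 has front_pad = 3, so the buffer is rebuilt as
// [3 zero bytes][0x10 data bytes][zeros up to the 0x1000 boundary] and
// *offset is rounded down to 0x1000; pad_count feeds
// l_bluestore_write_pad_bytes.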

void BlueStore::_do_write_small(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  ceph_assert(length < min_alloc_size);

  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = p2align<uint64_t>(offset, alloc_len);

  bool any_change;

  // search for a suitable extent in both the forward and reverse direction
  // within the [offset - target_max_blob_size, offset + target_max_blob_size]
  // range, then check if the blob can be reused via can_reuse_blob, or apply
  // a direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);

  // On zoned devices, the first goal is to support non-overwrite workloads,
  // such as RGW, with large, aligned objects.  Therefore, for user writes
  // _do_write_small should not trigger.  OSDs, however, write and update a
  // tiny amount of metadata, such as OSD maps, to disk.  For those cases, we
  // temporarily just pad them to min_alloc_size and write them to a new
  // place on every update.
  if (bdev->is_smr()) {
    BlobRef b = c->new_blob();
    uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
    uint64_t b_off0 = b_off;
    _pad_zeros(&bl, &b_off0, min_alloc_size);
    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
    return;
  }

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = end;
  if (ep != begin) {
    prev_ep = ep;
    --prev_ep;
  }

  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
  // We don't want to inspect more blobs than there are min_alloc_size
  // units in two max-size blobs
  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
  bool above_blob_threshold = false;

  inspected_blobs.reserve(blob_threshold);

  uint64_t max_off = 0;
  auto start_ep = ep;
  auto end_ep = ep; // exclusively
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = p2phase(offset, chunk_size);
        tail_pad = p2nphase(end_offs, chunk_size);
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;

        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len <= prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred" << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc);
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << " lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
                                                 b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);

          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;

        } else {
          head_read = tail_read = 0;
        }

        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
                             head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
                             tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          b->dirty_blob().calc_csum(b_off, bl);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc);
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data = std::move(bl);
            dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
                     << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
                                                 b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_deferred);
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                    // fits into reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent.  The rationale is that during GC two
          // chunks from garbage blobs (compressed?) can share logical space
          // within the same AU.  That in turn might be caused by an unaligned
          // len in clone_range2.  Hence the second write would fail in an
          // attempt to reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existing extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                  // fits into reused blob
        // Need to check for pending writes desiring to
        // reuse the same pextent.  The rationale is that during GC two
        // chunks from garbage blobs (compressed?) can share logical space
        // within the same AU.  That in turn might be caused by an unaligned
        // len in clone_range2.  Hence the second write would fail in an
        // attempt to reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          _pad_zeros(&bl, &b_off0, chunk_size);

          dout(20) << __func__ << " reuse blob " << *b << std::hex
                   << " (0x" << b_off0 << "~" << bl.length() << ")"
                   << " (0x" << b_off << "~" << length << ")"
                   << std::dec << dendl;

          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
          wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                      false, false);
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }
  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
              min_alloc_size != block_size, // use the 'unused' bitmap only
                                            // when the alloc granularity
                                            // doesn't match the disk one
              true);

  return;
}
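
// Decision ladder used above for a sub-min_alloc_size write:
//   1. write into still-unused space of an existing mutable blob
//      (direct, or deferred when b_len <= prefer_deferred_size);
//   2. chunk-aligned read-modify-write of an existing blob, journaled as
//      a deferred write;
//   3. reuse spare capacity of a nearby blob via can_reuse_blob();
//   4. otherwise allocate a fresh blob, padding the data to block_size.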

bool BlueStore::BigDeferredWriteContext::can_defer(
    BlueStore::extent_map_t::iterator ep,
    uint64_t prefer_deferred_size,
    uint64_t block_size,
    uint64_t offset,
    uint64_t l)
{
  bool res = false;
  auto& blob = ep->blob->get_blob();
  if (offset >= ep->blob_start() &&
      blob.is_mutable()) {
    off = offset;
    b_off = offset - ep->blob_start();
    uint64_t chunk_size = blob.get_chunk_size(block_size);
    uint64_t ondisk = blob.get_ondisk_length();
    used = std::min(l, ondisk - b_off);

    // will read some data to fill out the chunk?
    head_read = p2phase<uint64_t>(b_off, chunk_size);
    tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
    b_off -= head_read;

    ceph_assert(b_off % chunk_size == 0);
    ceph_assert(blob_aligned_len() % chunk_size == 0);

    res = blob_aligned_len() <= prefer_deferred_size &&
      blob_aligned_len() <= ondisk &&
      blob.is_allocated(b_off, blob_aligned_len());
    if (res) {
      blob_ref = ep->blob;
      blob_start = ep->blob_start();
    }
  }
  return res;
}

bool BlueStore::BigDeferredWriteContext::apply_defer()
{
  int r = blob_ref->get_blob().map(
    b_off, blob_aligned_len(),
    [&](const bluestore_pextent_t& pext,
        uint64_t offset,
        uint64_t length) {
      // only apply deferred if the overwrite breaks blob continuity;
      // if it totally overlaps some pextent, fall back to a regular write
      if (pext.offset < offset ||
          pext.end() > offset + length) {
        res_extents.emplace_back(bluestore_pextent_t(offset, length));
        return 0;
      }
      return -1;
    });
  return r >= 0;
}
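
// can_defer() records the candidate blob and the chunk-aligned range in this
// context; apply_defer() then maps that range to physical extents and bails
// out (forcing a regular write) as soon as the overwrite would fully cover a
// pextent, since the deferred path is only meant for overwrites that break
// blob continuity.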

void BlueStore::_do_write_big_apply_deferred(
    TransContext* txc,
    CollectionRef& c,
    OnodeRef o,
    BlueStore::BigDeferredWriteContext& dctx,
    bufferlist::iterator& blp,
    WriteContext* wctx)
{
  bufferlist bl;
  dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
           << " and tail 0x" << dctx.tail_read << std::dec << dendl;
  if (dctx.head_read) {
    int r = _do_read(c.get(), o,
                     dctx.off - dctx.head_read,
                     dctx.head_read,
                     bl,
                     0);
    ceph_assert(r >= 0 && r <= (int)dctx.head_read);
    size_t zlen = dctx.head_read - r;
    if (zlen) {
      bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  blp.copy(dctx.used, bl);

  if (dctx.tail_read) {
    bufferlist tail_bl;
    int r = _do_read(c.get(), o,
                     dctx.off + dctx.used, dctx.tail_read,
                     tail_bl, 0);
    ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
    size_t zlen = dctx.tail_read - r;
    if (zlen) {
      tail_bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    bl.claim_append(tail_bl);
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  auto& b0 = dctx.blob_ref;
  _buffer_cache_write(txc, b0, dctx.b_off, bl,
                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

  b0->dirty_blob().calc_csum(dctx.b_off, bl);

  Extent* le = o->extent_map.set_lextent(c, dctx.off,
    dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);

  // in fact this is a no-op for big writes, but it is kept here to maintain
  // uniformity and so it is not missed after a future refactor.
  b0->dirty_blob().mark_used(le->blob_offset, le->length);
  txc->statfs_delta.stored() += le->length;

  if (!g_conf()->bluestore_debug_omit_block_device_write) {
    bluestore_deferred_op_t* op = _get_deferred_op(txc);
    op->op = bluestore_deferred_op_t::OP_WRITE;
    op->extents.swap(dctx.res_extents);
    op->data = std::move(bl);
  }
}
13697
7c673cae
FG
13698void BlueStore::_do_write_big(
13699 TransContext *txc,
13700 CollectionRef &c,
13701 OnodeRef o,
13702 uint64_t offset, uint64_t length,
13703 bufferlist::iterator& blp,
13704 WriteContext *wctx)
13705{
13706 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13707 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13708 << " compress " << (int)wctx->compress
13709 << dendl;
13710 logger->inc(l_bluestore_write_big);
13711 logger->inc(l_bluestore_write_big_bytes, length);
11fdf7f2 13712 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
f67539c2 13713 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
7c673cae
FG
13714 while (length > 0) {
13715 bool new_blob = false;
11fdf7f2 13716 uint32_t l = std::min(max_bsize, length);
7c673cae
FG
13717 BlobRef b;
13718 uint32_t b_off = 0;
13719
13720 //attempting to reuse existing blob
13721 if (!wctx->compress) {
7c673cae 13722 auto end = o->extent_map.extent_map.end();
f67539c2
TL
13723
13724 if (prefer_deferred_size_snapshot &&
13725 l <= prefer_deferred_size_snapshot * 2) {
13726 // Single write that spans two adjusted existing blobs can result
13727 // in up to two deferred blocks of 'prefer_deferred_size'
13728 // So we're trying to minimize the amount of resulting blobs
13729 // and preserve 2 blobs rather than inserting one more in between
13730 // E.g. write 0x10000~20000 over existing blobs
13731 // (0x0~20000 and 0x20000~20000) is better (from subsequent reading
13732 // performance point of view) to result in two deferred writes to
13733 // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
13734
13735 // look for an existing mutable blob we can write into
13736 auto ep = o->extent_map.seek_lextent(offset);
13737 auto ep_next = end;
13738 BigDeferredWriteContext head_info, tail_info;
13739
13740 bool will_defer = ep != end ?
13741 head_info.can_defer(ep,
13742 prefer_deferred_size_snapshot,
13743 block_size,
13744 offset,
13745 l) :
13746 false;
13747 auto offset_next = offset + head_info.used;
13748 auto remaining = l - head_info.used;
13749 if (will_defer && remaining) {
13750 will_defer = false;
13751 if (remaining <= prefer_deferred_size_snapshot) {
13752 ep_next = o->extent_map.seek_lextent(offset_next);
13753 // check if we can defer remaining totally
13754 will_defer = ep_next == end ?
13755 false :
13756 tail_info.can_defer(ep_next,
13757 prefer_deferred_size_snapshot,
13758 block_size,
13759 offset_next,
13760 remaining);
13761 will_defer = will_defer && remaining == tail_info.used;
13762 }
13763 }
13764 if (will_defer) {
13765 dout(20) << __func__ << " " << *(head_info.blob_ref)
13766 << " deferring big " << std::hex
13767 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
13768 << std::dec << " write via deferred"
13769 << dendl;
13770 if (remaining) {
13771 dout(20) << __func__ << " " << *(tail_info.blob_ref)
13772 << " deferring big " << std::hex
13773 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
13774 << std::dec << " write via deferred"
13775 << dendl;
13776 }
13777
13778 will_defer = head_info.apply_defer();
13779 if (!will_defer) {
13780 dout(20) << __func__
13781 << " deferring big fell back, head isn't continuous"
13782 << dendl;
13783 } else if (remaining) {
13784 will_defer = tail_info.apply_defer();
13785 if (!will_defer) {
13786 dout(20) << __func__
13787 << " deferring big fell back, tail isn't continuous"
13788 << dendl;
13789 }
13790 }
13791 }
13792 if (will_defer) {
13793 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
13794 if (remaining) {
13795 _do_write_big_apply_deferred(txc, c, o, tail_info,
13796 blp, wctx);
13797 }
13798 offset += l;
13799 length -= l;
13800 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
13801 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
13802 continue;
13803 }
13804 }
13805
13806 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13807
13808 // seek again as punch_hole could invalidate ep
7c673cae 13809 auto ep = o->extent_map.seek_lextent(offset);
f67539c2
TL
13810 auto begin = o->extent_map.extent_map.begin();
13811 auto prev_ep = end;
13812 if (ep != begin) {
13813 prev_ep = ep;
7c673cae 13814 --prev_ep;
7c673cae 13815 }
f67539c2
TL
13816 dout(20) << __func__ << " no deferred" << dendl;
13817
7c673cae
FG
13818 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
 13819 // search for a suitable extent in both the forward and reverse directions
 13820 // within the [offset - target_max_blob_size, offset + target_max_blob_size] range,
224ce89b 13821 // then check whether the blob can be reused via can_reuse_blob().
7c673cae
FG
13822 bool any_change;
13823 do {
13824 any_change = false;
13825 if (ep != end && ep->logical_offset < offset + max_bsize) {
f67539c2
TL
13826 dout(20) << __func__ << " considering " << *ep << dendl;
13827 dout(20) << __func__ << " considering " << *(ep->blob)
13828 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
13829
13830 if (offset >= ep->blob_start() &&
224ce89b 13831 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13832 offset - ep->blob_start(),
13833 &l)) {
13834 b = ep->blob;
f67539c2 13835 b_off = offset - ep->blob_start();
7c673cae
FG
13836 prev_ep = end; // to avoid check below
13837 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13838 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13839 } else {
13840 ++ep;
13841 any_change = true;
13842 }
13843 }
13844
13845 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
f67539c2
TL
13846 dout(20) << __func__ << " considering rev " << *prev_ep << dendl;
13847 dout(20) << __func__ << " considering reverse " << *(prev_ep->blob)
13848 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
13849 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13850 offset - prev_ep->blob_start(),
13851 &l)) {
13852 b = prev_ep->blob;
13853 b_off = offset - prev_ep->blob_start();
13854 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13855 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13856 } else if (prev_ep != begin) {
13857 --prev_ep;
13858 any_change = true;
13859 } else {
13860 prev_ep = end; // to avoid useless first extent re-check
13861 }
13862 }
13863 } while (b == nullptr && any_change);
f67539c2
TL
13864 } else {
13865 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13866 } // if (!wctx->compress)
13867
7c673cae
FG
13868 if (b == nullptr) {
13869 b = c->new_blob();
13870 b_off = 0;
13871 new_blob = true;
13872 }
7c673cae
FG
13873 bufferlist t;
13874 blp.copy(l, t);
13875 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13876 offset += l;
13877 length -= l;
13878 logger->inc(l_bluestore_write_big_blobs);
13879 }
13880}
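// Editorial sketch (not part of BlueStore): the two-blob deferred test above
// reduces to simple arithmetic once head_info/tail_info report how much of
// the write each existing blob can absorb. A standalone model with
// hypothetical inputs ('head_capacity' stands in for head_info.used);
// needs <algorithm> and <cstdint>.
static bool can_split_as_two_deferred(uint64_t len,
                                      uint64_t head_capacity,
                                      uint64_t prefer_deferred_size)
{
  if (prefer_deferred_size == 0 || len > prefer_deferred_size * 2) {
    return false;               // not eligible for the deferred fast path
  }
  uint64_t head = std::min(len, head_capacity);
  uint64_t tail = len - head;   // the remainder must fit into the next blob
  return tail == 0 || tail <= prefer_deferred_size;
}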
13881
13882int BlueStore::_do_alloc_write(
13883 TransContext *txc,
13884 CollectionRef coll,
13885 OnodeRef o,
13886 WriteContext *wctx)
13887{
13888 dout(20) << __func__ << " txc " << txc
13889 << " " << wctx->writes.size() << " blobs"
13890 << dendl;
3efd9988
FG
13891 if (wctx->writes.empty()) {
13892 return 0;
7c673cae
FG
13893 }
13894
7c673cae
FG
13895 CompressorRef c;
13896 double crr = 0;
13897 if (wctx->compress) {
13898 c = select_option(
13899 "compression_algorithm",
13900 compressor,
13901 [&]() {
13902 string val;
13903 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13904 CompressorRef cp = compressor;
13905 if (!cp || cp->get_type_name() != val) {
13906 cp = Compressor::create(cct, val);
11fdf7f2
TL
13907 if (!cp) {
13908 if (_set_compression_alert(false, val.c_str())) {
13909 derr << __func__ << " unable to initialize " << val.c_str()
13910 << " compressor" << dendl;
13911 }
13912 }
7c673cae
FG
13913 }
13914 return boost::optional<CompressorRef>(cp);
13915 }
13916 return boost::optional<CompressorRef>();
13917 }
13918 );
13919
13920 crr = select_option(
13921 "compression_required_ratio",
13922 cct->_conf->bluestore_compression_required_ratio,
13923 [&]() {
13924 double val;
3efd9988 13925 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
13926 return boost::optional<double>(val);
13927 }
13928 return boost::optional<double>();
13929 }
13930 );
13931 }
13932
13933 // checksum
11fdf7f2 13934 int64_t csum = csum_type.load();
7c673cae
FG
13935 csum = select_option(
13936 "csum_type",
13937 csum,
13938 [&]() {
11fdf7f2 13939 int64_t val;
3efd9988 13940 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 13941 return boost::optional<int64_t>(val);
7c673cae 13942 }
11fdf7f2 13943 return boost::optional<int64_t>();
7c673cae
FG
13944 }
13945 );
13946
3efd9988
FG
13947 // compress (as needed) and calc needed space
13948 uint64_t need = 0;
11fdf7f2 13949 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 13950 for (auto& wi : wctx->writes) {
3efd9988 13951 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 13952 auto start = mono_clock::now();
7c673cae
FG
13953
13954 // compress
11fdf7f2
TL
13955 ceph_assert(wi.b_off == 0);
13956 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 13957
7c673cae
FG
13958 // FIXME: memory alignment here is bad
13959 bufferlist t;
f67539c2
TL
13960 boost::optional<int32_t> compressor_message;
13961 int r = c->compress(wi.bl, t, compressor_message);
3efd9988 13962 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 13963 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
a8e16298
TL
13964 bool rejected = false;
13965 uint64_t compressed_len = t.length();
 13966 // do an approximate (fast) estimate of the resulting blob size
 13967 // that doesn't take header overhead into account
11fdf7f2 13968 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
13969 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13970 bluestore_compression_header_t chdr;
13971 chdr.type = c->get_type();
13972 chdr.length = t.length();
f67539c2 13973 chdr.compressor_message = compressor_message;
a8e16298
TL
13974 encode(chdr, wi.compressed_bl);
13975 wi.compressed_bl.claim_append(t);
13976
13977 compressed_len = wi.compressed_bl.length();
11fdf7f2 13978 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
13979 if (result_len <= want_len && result_len < wi.blob_length) {
13980 // Cool. We compressed at least as much as we were hoping to.
13981 // pad out to min_alloc_size
13982 wi.compressed_bl.append_zero(result_len - compressed_len);
13983 wi.compressed_len = compressed_len;
13984 wi.compressed = true;
13985 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13986 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13987 << " -> 0x" << compressed_len << " => 0x" << result_len
13988 << " with " << c->get_type()
13989 << std::dec << dendl;
13990 txc->statfs_delta.compressed() += compressed_len;
13991 txc->statfs_delta.compressed_original() += wi.blob_length;
13992 txc->statfs_delta.compressed_allocated() += result_len;
13993 logger->inc(l_bluestore_compress_success_count);
13994 need += result_len;
13995 } else {
13996 rejected = true;
13997 }
13998 } else if (r != 0) {
13999 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
14000 << " bytes compressed using " << c->get_type_name()
14001 << std::dec
14002 << " failed with errcode = " << r
14003 << ", leaving uncompressed"
14004 << dendl;
14005 logger->inc(l_bluestore_compress_rejected_count);
14006 need += wi.blob_length;
7c673cae 14007 } else {
a8e16298
TL
14008 rejected = true;
14009 }
14010
14011 if (rejected) {
3efd9988 14012 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 14013 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
14014 << " with " << c->get_type()
14015 << ", which is more than required 0x" << want_len_raw
7c673cae 14016 << " -> 0x" << want_len
3efd9988
FG
14017 << ", leaving uncompressed"
14018 << std::dec << dendl;
14019 logger->inc(l_bluestore_compress_rejected_count);
14020 need += wi.blob_length;
7c673cae 14021 }
494da23a
TL
14022 log_latency("compress@_do_alloc_write",
14023 l_bluestore_compress_lat,
14024 mono_clock::now() - start,
14025 cct->_conf->bluestore_log_op_age );
3efd9988
FG
14026 } else {
14027 need += wi.blob_length;
7c673cae 14028 }
3efd9988 14029 }
a8e16298 14030 PExtentVector prealloc;
3efd9988 14031 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 14032 int64_t prealloc_left = 0;
f67539c2 14033 prealloc_left = shared_alloc.a->allocate(
3efd9988
FG
14034 need, min_alloc_size, need,
14035 0, &prealloc);
eafe8130 14036 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 14037 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 14038 << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2 14039 << " min_alloc_size 0x" << min_alloc_size
f67539c2 14040 << " available 0x" << shared_alloc.a->get_free()
11fdf7f2
TL
14041 << std::dec << dendl;
14042 if (prealloc.size()) {
f67539c2 14043 shared_alloc.a->release(prealloc);
11fdf7f2 14044 }
a8e16298
TL
14045 return -ENOSPC;
14046 }
9f95a23c 14047 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
a8e16298 14048
f67539c2
TL
14049 if (bdev->is_smr()) {
14050 std::deque<uint64_t> zones_to_clean;
14051 if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
14052 std::lock_guard l{zoned_cleaner_lock};
14053 zoned_cleaner_queue.swap(zones_to_clean);
14054 zoned_cleaner_cond.notify_one();
14055 }
14056 }
14057
3efd9988
FG
14058 dout(20) << __func__ << " prealloc " << prealloc << dendl;
14059 auto prealloc_pos = prealloc.begin();
14060
14061 for (auto& wi : wctx->writes) {
14062 BlobRef b = wi.b;
14063 bluestore_blob_t& dblob = b->dirty_blob();
14064 uint64_t b_off = wi.b_off;
14065 bufferlist *l = &wi.bl;
14066 uint64_t final_length = wi.blob_length;
14067 uint64_t csum_length = wi.blob_length;
3efd9988
FG
14068 if (wi.compressed) {
14069 final_length = wi.compressed_bl.length();
14070 csum_length = final_length;
adb31ebb 14071 unsigned csum_order = ctz(csum_length);
3efd9988
FG
14072 l = &wi.compressed_bl;
14073 dblob.set_compressed(wi.blob_length, wi.compressed_len);
adb31ebb
TL
14074 if (csum != Checksummer::CSUM_NONE) {
14075 dout(20) << __func__ << " initialize csum setting for compressed blob " << *b
14076 << " csum_type " << Checksummer::get_csum_type_string(csum)
14077 << " csum_order " << csum_order
14078 << " csum_length 0x" << std::hex << csum_length
14079 << " blob_length 0x" << wi.blob_length
14080 << " compressed_length 0x" << wi.compressed_len << std::dec
14081 << dendl;
14082 dblob.init_csum(csum, csum_order, csum_length);
14083 }
3efd9988 14084 } else if (wi.new_blob) {
adb31ebb 14085 unsigned csum_order;
7c673cae 14086 // initialize newly created blob only
11fdf7f2 14087 ceph_assert(dblob.is_mutable());
7c673cae
FG
14088 if (l->length() != wi.blob_length) {
14089 // hrm, maybe we could do better here, but let's not bother.
14090 dout(20) << __func__ << " forcing csum_order to block_size_order "
14091 << block_size_order << dendl;
31f18b77 14092 csum_order = block_size_order;
7c673cae
FG
14093 } else {
14094 csum_order = std::min(wctx->csum_order, ctz(l->length()));
14095 }
14096 // try to align blob with max_blob_size to improve
14097 // its reuse ratio, e.g. in case of reverse write
14098 uint32_t suggested_boff =
14099 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14100 if ((suggested_boff % (1 << csum_order)) == 0 &&
14101 suggested_boff + final_length <= max_bsize &&
14102 suggested_boff > b_off) {
181888fb 14103 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 14104 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 14105 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
14106 csum_length += suggested_boff - b_off;
14107 b_off = suggested_boff;
14108 }
181888fb
FG
14109 if (csum != Checksummer::CSUM_NONE) {
14110 dout(20) << __func__ << " initialize csum setting for new blob " << *b
14111 << " csum_type " << Checksummer::get_csum_type_string(csum)
14112 << " csum_order " << csum_order
14113 << " csum_length 0x" << std::hex << csum_length << std::dec
14114 << dendl;
14115 dblob.init_csum(csum, csum_order, csum_length);
14116 }
7c673cae
FG
14117 }
14118
a8e16298 14119 PExtentVector extents;
3efd9988
FG
14120 int64_t left = final_length;
14121 while (left > 0) {
11fdf7f2 14122 ceph_assert(prealloc_left > 0);
3efd9988
FG
14123 if (prealloc_pos->length <= left) {
14124 prealloc_left -= prealloc_pos->length;
14125 left -= prealloc_pos->length;
14126 txc->statfs_delta.allocated() += prealloc_pos->length;
14127 extents.push_back(*prealloc_pos);
14128 ++prealloc_pos;
14129 } else {
14130 extents.emplace_back(prealloc_pos->offset, left);
14131 prealloc_pos->offset += left;
14132 prealloc_pos->length -= left;
14133 prealloc_left -= left;
14134 txc->statfs_delta.allocated() += left;
14135 left = 0;
14136 break;
14137 }
14138 }
7c673cae 14139 for (auto& p : extents) {
3efd9988 14140 txc->allocated.insert(p.offset, p.length);
7c673cae 14141 }
11fdf7f2 14142 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 14143
181888fb
FG
14144 dout(20) << __func__ << " blob " << *b << dendl;
14145 if (dblob.has_csum()) {
7c673cae
FG
14146 dblob.calc_csum(b_off, *l);
14147 }
181888fb 14148
7c673cae 14149 if (wi.mark_unused) {
1911f103 14150 ceph_assert(!dblob.is_compressed());
7c673cae
FG
14151 auto b_end = b_off + wi.bl.length();
14152 if (b_off) {
14153 dblob.add_unused(0, b_off);
14154 }
1911f103
TL
14155 uint64_t llen = dblob.get_logical_length();
14156 if (b_end < llen) {
14157 dblob.add_unused(b_end, llen - b_end);
7c673cae
FG
14158 }
14159 }
14160
14161 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14162 b_off + (wi.b_off0 - wi.b_off),
14163 wi.length0,
14164 wi.b,
14165 nullptr);
14166 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14167 txc->statfs_delta.stored() += le->length;
14168 dout(20) << __func__ << " lex " << *le << dendl;
14169 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14170 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14171
14172 // queue io
11fdf7f2 14173 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae 14174 if (l->length() <= prefer_deferred_size.load()) {
f67539c2 14175 dout(20) << __func__ << " deferring 0x" << std::hex
7c673cae 14176 << l->length() << std::dec << " write via deferred" << dendl;
9f95a23c 14177 bluestore_deferred_op_t *op = _get_deferred_op(txc);
7c673cae
FG
14178 op->op = bluestore_deferred_op_t::OP_WRITE;
14179 int r = b->get_blob().map(
14180 b_off, l->length(),
14181 [&](uint64_t offset, uint64_t length) {
14182 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14183 return 0;
14184 });
11fdf7f2 14185 ceph_assert(r == 0);
7c673cae 14186 op->data = *l;
f67539c2 14187 logger->inc(l_bluestore_write_deferred);
7c673cae
FG
14188 } else {
14189 b->get_blob().map_bl(
14190 b_off, *l,
14191 [&](uint64_t offset, bufferlist& t) {
14192 bdev->aio_write(offset, t, &txc->ioc, false);
14193 });
f67539c2 14194 logger->inc(l_bluestore_write_new);
7c673cae
FG
14195 }
14196 }
14197 }
11fdf7f2
TL
14198 ceph_assert(prealloc_pos == prealloc.end());
14199 ceph_assert(prealloc_left == 0);
7c673cae
FG
14200 return 0;
14201}
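// Editorial sketch (not part of BlueStore): the accept/reject test applied to
// each compressed blob above, as a standalone predicate. p2roundup() lives in
// include/intarith.h; restated inline so the snippet is self-contained
// (needs <cstdint>).
static inline uint64_t roundup_pow2(uint64_t v, uint64_t a) // 'a' power of two
{
  return (v + a - 1) & ~(a - 1);
}
static bool compression_pays_off(uint64_t raw_len, uint64_t compressed_len,
                                 double required_ratio,  // e.g. 0.875
                                 uint64_t min_alloc_size)
{
  uint64_t want = roundup_pow2(uint64_t(raw_len * required_ratio), min_alloc_size);
  uint64_t got  = roundup_pow2(compressed_len, min_alloc_size);
  // must meet the required ratio after rounding to allocation units, and
  // still be strictly smaller than the uncompressed blob
  return got <= want && got < raw_len;
}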
14202
14203void BlueStore::_wctx_finish(
14204 TransContext *txc,
14205 CollectionRef& c,
14206 OnodeRef o,
31f18b77
FG
14207 WriteContext *wctx,
14208 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
14209{
14210 auto oep = wctx->old_extents.begin();
14211 while (oep != wctx->old_extents.end()) {
14212 auto &lo = *oep;
14213 oep = wctx->old_extents.erase(oep);
14214 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14215 BlobRef b = lo.e.blob;
14216 const bluestore_blob_t& blob = b->get_blob();
14217 if (blob.is_compressed()) {
14218 if (lo.blob_empty) {
14219 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14220 }
14221 txc->statfs_delta.compressed_original() -= lo.e.length;
14222 }
14223 auto& r = lo.r;
14224 txc->statfs_delta.stored() -= lo.e.length;
14225 if (!r.empty()) {
f67539c2 14226 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
7c673cae
FG
14227 if (blob.is_shared()) {
14228 PExtentVector final;
14229 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
14230 bool unshare = false;
14231 bool* unshare_ptr =
14232 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 14233 for (auto e : r) {
31f18b77
FG
14234 b->shared_blob->put_ref(
14235 e.offset, e.length, &final,
11fdf7f2
TL
14236 unshare_ptr);
14237 }
14238 if (unshare) {
14239 ceph_assert(maybe_unshared_blobs);
14240 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
14241 }
14242 dout(20) << __func__ << " shared_blob release " << final
14243 << " from " << *b->shared_blob << dendl;
14244 txc->write_shared_blob(b->shared_blob);
14245 r.clear();
14246 r.swap(final);
14247 }
14248 }
14249 // we can't invalidate our logical extents as we drop them because
14250 // other lextents (either in our onode or others) may still
14251 // reference them. but we can throw out anything that is no
14252 // longer allocated. Note that this will leave behind edge bits
14253 // that are no longer referenced but not deallocated (until they
14254 // age out of the cache naturally).
14255 b->discard_unallocated(c.get());
14256 for (auto e : r) {
14257 dout(20) << __func__ << " release " << e << dendl;
14258 txc->released.insert(e.offset, e.length);
14259 txc->statfs_delta.allocated() -= e.length;
14260 if (blob.is_compressed()) {
14261 txc->statfs_delta.compressed_allocated() -= e.length;
14262 }
14263 }
9f95a23c
TL
14264
14265 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
7c673cae
FG
14266 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14267 << dendl;
14268 o->extent_map.spanning_blob_map.erase(b->id);
14269 }
9f95a23c 14270 delete &lo;
7c673cae
FG
14271 }
14272}
14273
14274void BlueStore::_do_write_data(
14275 TransContext *txc,
14276 CollectionRef& c,
14277 OnodeRef o,
14278 uint64_t offset,
14279 uint64_t length,
14280 bufferlist& bl,
14281 WriteContext *wctx)
14282{
14283 uint64_t end = offset + length;
14284 bufferlist::iterator p = bl.begin();
14285
14286 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14287 (length != min_alloc_size)) {
14288 // we fall within the same block
14289 _do_write_small(txc, c, o, offset, length, p, wctx);
14290 } else {
14291 uint64_t head_offset, head_length;
14292 uint64_t middle_offset, middle_length;
14293 uint64_t tail_offset, tail_length;
14294
14295 head_offset = offset;
11fdf7f2 14296 head_length = p2nphase(offset, min_alloc_size);
7c673cae 14297
11fdf7f2
TL
14298 tail_offset = p2align(end, min_alloc_size);
14299 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
14300
14301 middle_offset = head_offset + head_length;
14302 middle_length = length - head_length - tail_length;
14303
14304 if (head_length) {
14305 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14306 }
14307
f67539c2 14308 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
7c673cae
FG
14309
14310 if (tail_length) {
14311 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14312 }
14313 }
14314}
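// Editorial sketch (not part of BlueStore): the head/middle/tail split above
// in standalone form. p2nphase/p2phase/p2align come from include/intarith.h;
// spelled out here with plain modular arithmetic (needs <cstdint>).
struct write_split_t {
  uint64_t head_off, head_len;   // unaligned front -> _do_write_small
  uint64_t mid_off, mid_len;     // AU-aligned body -> _do_write_big
  uint64_t tail_off, tail_len;   // unaligned back  -> _do_write_small
};
static write_split_t split_write(uint64_t off, uint64_t len, uint64_t au)
{
  // the caller guarantees the write crosses an allocation-unit boundary;
  // single-AU writes take the small-write path above instead
  uint64_t end = off + len;
  write_split_t s;
  s.head_off = off;
  s.head_len = (au - off % au) % au;   // p2nphase(off, au)
  s.tail_len = end % au;               // p2phase(end, au)
  s.tail_off = end - s.tail_len;       // p2align(end, au)
  s.mid_off  = off + s.head_len;
  s.mid_len  = len - s.head_len - s.tail_len;
  return s;
}
// e.g. off=0x1800 len=0x3000 au=0x1000 -> head 0x1800~0x800,
// middle 0x2000~0x2000, tail 0x4000~0x800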
14315
31f18b77
FG
14316void BlueStore::_choose_write_options(
14317 CollectionRef& c,
14318 OnodeRef o,
14319 uint32_t fadvise_flags,
14320 WriteContext *wctx)
7c673cae 14321{
7c673cae
FG
14322 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14323 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 14324 wctx->buffered = true;
7c673cae
FG
14325 } else if (cct->_conf->bluestore_default_buffered_write &&
14326 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14327 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14328 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 14329 wctx->buffered = true;
7c673cae
FG
14330 }
14331
31f18b77
FG
14332 // apply basic csum block size
14333 wctx->csum_order = block_size_order;
7c673cae
FG
14334
14335 // compression parameters
14336 unsigned alloc_hints = o->onode.alloc_hint_flags;
14337 auto cm = select_option(
14338 "compression_mode",
31f18b77 14339 comp_mode.load(),
7c673cae
FG
14340 [&]() {
14341 string val;
11fdf7f2 14342 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
14343 return boost::optional<Compressor::CompressionMode>(
14344 Compressor::get_comp_mode_type(val));
7c673cae
FG
14345 }
14346 return boost::optional<Compressor::CompressionMode>();
14347 }
14348 );
31f18b77
FG
14349
14350 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
14351 ((cm == Compressor::COMP_FORCE) ||
14352 (cm == Compressor::COMP_AGGRESSIVE &&
14353 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14354 (cm == Compressor::COMP_PASSIVE &&
14355 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
14356
14357 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14358 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
14359 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14360 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 14361 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 14362
7c673cae 14363 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 14364
7c673cae 14365 if (o->onode.expected_write_size) {
224ce89b 14366 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 14367 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 14368 } else {
224ce89b 14369 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
14370 }
14371
31f18b77
FG
14372 if (wctx->compress) {
14373 wctx->target_blob_size = select_option(
7c673cae 14374 "compression_max_blob_size",
31f18b77 14375 comp_max_blob_size.load(),
7c673cae 14376 [&]() {
11fdf7f2
TL
14377 int64_t val;
14378 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
14379 return boost::optional<uint64_t>((uint64_t)val);
14380 }
14381 return boost::optional<uint64_t>();
14382 }
14383 );
14384 }
14385 } else {
31f18b77
FG
14386 if (wctx->compress) {
14387 wctx->target_blob_size = select_option(
7c673cae 14388 "compression_min_blob_size",
31f18b77 14389 comp_min_blob_size.load(),
7c673cae 14390 [&]() {
11fdf7f2
TL
14391 int64_t val;
14392 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
14393 return boost::optional<uint64_t>((uint64_t)val);
14394 }
14395 return boost::optional<uint64_t>();
14396 }
14397 );
14398 }
14399 }
31f18b77 14400
7c673cae 14401 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
14402 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14403 wctx->target_blob_size = max_bsize;
7c673cae 14404 }
31f18b77 14405
7c673cae
FG
14406 // set the min blob size floor at 2x the min_alloc_size, or else we
14407 // won't be able to allocate a smaller extent for the compressed
14408 // data.
31f18b77
FG
14409 if (wctx->compress &&
14410 wctx->target_blob_size < min_alloc_size * 2) {
14411 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 14412 }
31f18b77
FG
14413
14414 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14415 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
14416 << " compress=" << (int)wctx->compress
14417 << " buffered=" << (int)wctx->buffered
31f18b77
FG
14418 << std::dec << dendl;
14419}
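// Editorial sketch (not part of BlueStore): the compression decision above as
// a standalone predicate; mode values mirror Compressor::COMP_*.
enum sketch_comp_mode_t { SK_NONE, SK_PASSIVE, SK_AGGRESSIVE, SK_FORCE };
static bool should_compress(sketch_comp_mode_t mode,
                            bool hint_compressible,
                            bool hint_incompressible)
{
  switch (mode) {
  case SK_FORCE:      return true;                  // always compress
  case SK_AGGRESSIVE: return !hint_incompressible;  // unless hinted futile
  case SK_PASSIVE:    return hint_compressible;     // only when hinted
  default:            return false;                 // COMP_NONE
  }
}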
14420
14421int BlueStore::_do_gc(
14422 TransContext *txc,
14423 CollectionRef& c,
14424 OnodeRef o,
31f18b77
FG
14425 const WriteContext& wctx,
14426 uint64_t *dirty_start,
14427 uint64_t *dirty_end)
14428{
31f18b77 14429
1adf2230 14430 bool dirty_range_updated = false;
31f18b77 14431 WriteContext wctx_gc;
7c673cae 14432 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 14433
eafe8130 14434 auto & extents_to_collect = wctx.extents_to_gc;
31f18b77
FG
14435 for (auto it = extents_to_collect.begin();
14436 it != extents_to_collect.end();
14437 ++it) {
14438 bufferlist bl;
eafe8130
TL
14439 auto offset = (*it).first;
14440 auto length = (*it).second;
14441 dout(20) << __func__ << " processing " << std::hex
14442 << offset << "~" << length << std::dec
14443 << dendl;
14444 int r = _do_read(c.get(), o, offset, length, bl, 0);
14445 ceph_assert(r == (int)length);
31f18b77 14446
eafe8130
TL
14447 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14448 logger->inc(l_bluestore_gc_merged, length);
31f18b77 14449
eafe8130
TL
14450 if (*dirty_start > offset) {
14451 *dirty_start = offset;
1adf2230 14452 dirty_range_updated = true;
31f18b77
FG
14453 }
14454
eafe8130
TL
14455 if (*dirty_end < offset + length) {
14456 *dirty_end = offset + length;
1adf2230 14457 dirty_range_updated = true;
31f18b77
FG
14458 }
14459 }
1adf2230
AA
14460 if (dirty_range_updated) {
14461 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14462 }
31f18b77
FG
14463
14464 dout(30) << __func__ << " alloc write" << dendl;
14465 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14466 if (r < 0) {
14467 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14468 << dendl;
14469 return r;
14470 }
14471
14472 _wctx_finish(txc, c, o, &wctx_gc);
14473 return 0;
14474}
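// Editorial sketch (not part of BlueStore): the dirty-window bookkeeping in
// _do_gc() is an interval union over the rewritten extents; in isolation
// (needs <cstdint>):
static void widen_dirty_range(uint64_t off, uint64_t len,
                              uint64_t *dirty_start, uint64_t *dirty_end,
                              bool *updated)
{
  if (*dirty_start > off) {
    *dirty_start = off;
    *updated = true;    // widened to the left; the caller re-faults the range
  }
  if (*dirty_end < off + len) {
    *dirty_end = off + len;
    *updated = true;    // widened to the right
  }
}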
14475
14476int BlueStore::_do_write(
14477 TransContext *txc,
14478 CollectionRef& c,
14479 OnodeRef o,
14480 uint64_t offset,
14481 uint64_t length,
14482 bufferlist& bl,
14483 uint32_t fadvise_flags)
14484{
14485 int r = 0;
14486
14487 dout(20) << __func__
14488 << " " << o->oid
14489 << " 0x" << std::hex << offset << "~" << length
14490 << " - have 0x" << o->onode.size
14491 << " (" << std::dec << o->onode.size << ")"
f67539c2
TL
14492 << " bytes" << std::hex
14493 << " fadvise_flags 0x" << fadvise_flags
14494 << " alloc_hint 0x" << o->onode.alloc_hint_flags
14495 << " expected_object_size " << o->onode.expected_object_size
14496 << " expected_write_size " << o->onode.expected_write_size
14497 << std::dec
31f18b77 14498 << dendl;
81eedcae 14499 _dump_onode<30>(cct, *o);
31f18b77
FG
14500
14501 if (length == 0) {
14502 return 0;
14503 }
14504
14505 uint64_t end = offset + length;
14506
14507 GarbageCollector gc(c->store->cct);
eafe8130 14508 int64_t benefit = 0;
31f18b77
FG
14509 auto dirty_start = offset;
14510 auto dirty_end = end;
14511
14512 WriteContext wctx;
14513 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
14514 o->extent_map.fault_range(db, offset, length);
14515 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
14516 r = _do_alloc_write(txc, c, o, &wctx);
14517 if (r < 0) {
14518 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14519 << dendl;
14520 goto out;
14521 }
14522
eafe8130
TL
14523 if (wctx.extents_to_gc.empty() ||
14524 wctx.extents_to_gc.range_start() > offset ||
14525 wctx.extents_to_gc.range_end() < offset + length) {
14526 benefit = gc.estimate(offset,
14527 length,
14528 o->extent_map,
14529 wctx.old_extents,
14530 min_alloc_size);
14531 }
14532
f67539c2
TL
14533 if (bdev->is_smr()) {
14534 if (wctx.old_extents.empty()) {
14535 txc->zoned_note_new_object(o);
14536 } else {
14537 int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14538 txc->zoned_note_updated_object(o, old_ondisk_offset);
14539 }
14540 }
14541
31f18b77
FG
14542 // NB: _wctx_finish() will empty old_extents
14543 // so we must do gc estimation before that
7c673cae
FG
14544 _wctx_finish(txc, c, o, &wctx);
14545 if (end > o->onode.size) {
14546 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 14547 << std::dec << dendl;
7c673cae
FG
14548 o->onode.size = end;
14549 }
14550
11fdf7f2 14551 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
eafe8130
TL
14552 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14553 dout(20) << __func__
14554 << " perform garbage collection for compressed extents, "
14555 << "expected benefit = " << benefit << " AUs" << dendl;
14556 }
14557 if (!wctx.extents_to_gc.empty()) {
14558 dout(20) << __func__ << " perform garbage collection" << dendl;
14559
14560 r = _do_gc(txc, c, o,
14561 wctx,
14562 &dirty_start, &dirty_end);
14563 if (r < 0) {
14564 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14565 << dendl;
14566 goto out;
7c673cae 14567 }
eafe8130
TL
14568 dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
14569 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 14570 }
7c673cae 14571 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
14572 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14573
7c673cae
FG
14574 r = 0;
14575
14576 out:
14577 return r;
14578}
14579
14580int BlueStore::_write(TransContext *txc,
14581 CollectionRef& c,
14582 OnodeRef& o,
31f18b77
FG
14583 uint64_t offset, size_t length,
14584 bufferlist& bl,
14585 uint32_t fadvise_flags)
7c673cae
FG
14586{
14587 dout(15) << __func__ << " " << c->cid << " " << o->oid
14588 << " 0x" << std::hex << offset << "~" << length << std::dec
14589 << dendl;
35e4c445
FG
14590 int r = 0;
14591 if (offset + length >= OBJECT_MAX_SIZE) {
14592 r = -E2BIG;
14593 } else {
14594 _assign_nid(txc, o);
14595 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14596 txc->write_onode(o);
14597 }
7c673cae
FG
14598 dout(10) << __func__ << " " << c->cid << " " << o->oid
14599 << " 0x" << std::hex << offset << "~" << length << std::dec
14600 << " = " << r << dendl;
14601 return r;
14602}
14603
14604int BlueStore::_zero(TransContext *txc,
14605 CollectionRef& c,
14606 OnodeRef& o,
14607 uint64_t offset, size_t length)
14608{
14609 dout(15) << __func__ << " " << c->cid << " " << o->oid
14610 << " 0x" << std::hex << offset << "~" << length << std::dec
14611 << dendl;
35e4c445
FG
14612 int r = 0;
14613 if (offset + length >= OBJECT_MAX_SIZE) {
14614 r = -E2BIG;
14615 } else {
14616 _assign_nid(txc, o);
14617 r = _do_zero(txc, c, o, offset, length);
14618 }
7c673cae
FG
14619 dout(10) << __func__ << " " << c->cid << " " << o->oid
14620 << " 0x" << std::hex << offset << "~" << length << std::dec
14621 << " = " << r << dendl;
14622 return r;
14623}
14624
14625int BlueStore::_do_zero(TransContext *txc,
14626 CollectionRef& c,
14627 OnodeRef& o,
14628 uint64_t offset, size_t length)
14629{
14630 dout(15) << __func__ << " " << c->cid << " " << o->oid
14631 << " 0x" << std::hex << offset << "~" << length << std::dec
14632 << dendl;
14633 int r = 0;
14634
81eedcae 14635 _dump_onode<30>(cct, *o);
7c673cae
FG
14636
14637 WriteContext wctx;
14638 o->extent_map.fault_range(db, offset, length);
14639 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14640 o->extent_map.dirty_range(offset, length);
7c673cae
FG
14641 _wctx_finish(txc, c, o, &wctx);
14642
b32b8144 14643 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
14644 o->onode.size = offset + length;
14645 dout(20) << __func__ << " extending size to " << offset + length
14646 << dendl;
14647 }
14648 txc->write_onode(o);
14649
14650 dout(10) << __func__ << " " << c->cid << " " << o->oid
14651 << " 0x" << std::hex << offset << "~" << length << std::dec
14652 << " = " << r << dendl;
14653 return r;
14654}
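// Usage note (sketch): punch_hole() only releases existing extents and the
// size is bumped afterwards, so zeroing past EOF grows the logical object
// size without allocating anything, e.g. via the ObjectStore transaction
// interface (cid/oid hypothetical):
//   ObjectStore::Transaction t;
//   t.zero(cid, oid, 0x1000, 0x2000);  // if size < 0x3000, size becomes 0x3000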
14655
14656void BlueStore::_do_truncate(
31f18b77
FG
14657 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14658 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
14659{
14660 dout(15) << __func__ << " " << c->cid << " " << o->oid
14661 << " 0x" << std::hex << offset << std::dec << dendl;
14662
81eedcae 14663 _dump_onode<30>(cct, *o);
7c673cae
FG
14664
14665 if (offset == o->onode.size)
31f18b77 14666 return;
7c673cae 14667
f67539c2 14668 WriteContext wctx;
7c673cae 14669 if (offset < o->onode.size) {
7c673cae
FG
14670 uint64_t length = o->onode.size - offset;
14671 o->extent_map.fault_range(db, offset, length);
14672 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
14673 o->extent_map.dirty_range(offset, length);
14674 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
14675
14676 // if we have shards past EOF, ask for a reshard
14677 if (!o->onode.extent_map_shards.empty() &&
14678 o->onode.extent_map_shards.back().offset >= offset) {
14679 dout(10) << __func__ << " request reshard past EOF" << dendl;
14680 if (offset) {
14681 o->extent_map.request_reshard(offset - 1, offset + length);
14682 } else {
14683 o->extent_map.request_reshard(0, length);
14684 }
14685 }
14686 }
14687
14688 o->onode.size = offset;
14689
f67539c2
TL
14690 if (bdev->is_smr()) {
14691 // On zoned devices, we currently support only removing an object or
14692 // truncating it to zero size, both of which fall through this code path.
14693 ceph_assert(offset == 0 && !wctx.old_extents.empty());
14694 int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14695 txc->zoned_note_truncated_object(o, ondisk_offset);
14696 }
14697
7c673cae
FG
14698 txc->write_onode(o);
14699}
14700
35e4c445 14701int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
14702 CollectionRef& c,
14703 OnodeRef& o,
14704 uint64_t offset)
14705{
14706 dout(15) << __func__ << " " << c->cid << " " << o->oid
14707 << " 0x" << std::hex << offset << std::dec
14708 << dendl;
35e4c445
FG
14709 int r = 0;
14710 if (offset >= OBJECT_MAX_SIZE) {
14711 r = -E2BIG;
14712 } else {
14713 _do_truncate(txc, c, o, offset);
14714 }
14715 dout(10) << __func__ << " " << c->cid << " " << o->oid
14716 << " 0x" << std::hex << offset << std::dec
14717 << " = " << r << dendl;
14718 return r;
7c673cae
FG
14719}
14720
14721int BlueStore::_do_remove(
14722 TransContext *txc,
14723 CollectionRef& c,
14724 OnodeRef o)
14725{
31f18b77 14726 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
14727 bool is_gen = !o->oid.is_no_gen();
14728 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
14729 if (o->onode.has_omap()) {
14730 o->flush();
9f95a23c 14731 _do_omap_clear(txc, o);
7c673cae
FG
14732 }
14733 o->exists = false;
14734 string key;
14735 for (auto &s : o->extent_map.shards) {
14736 dout(20) << __func__ << " removing shard 0x" << std::hex
14737 << s.shard_info->offset << std::dec << dendl;
14738 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14739 [&](const string& final_key) {
14740 txc->t->rmkey(PREFIX_OBJ, final_key);
14741 }
14742 );
14743 }
14744 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14745 txc->note_removed_object(o);
7c673cae
FG
14746 o->extent_map.clear();
14747 o->onode = bluestore_onode_t();
14748 _debug_obj_on_delete(o->oid);
31f18b77 14749
224ce89b
WB
14750 if (!is_gen || maybe_unshared_blobs.empty()) {
14751 return 0;
14752 }
31f18b77 14753
224ce89b
WB
14754 // see if we can unshare blobs still referenced by the head
14755 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14756 << maybe_unshared_blobs << dendl;
14757 ghobject_t nogen = o->oid;
14758 nogen.generation = ghobject_t::NO_GEN;
f67539c2 14759 OnodeRef h = c->get_onode(nogen, false);
224ce89b
WB
14760
14761 if (!h || !h->exists) {
14762 return 0;
14763 }
14764
14765 dout(20) << __func__ << " checking for unshareable blobs on " << h
14766 << " " << h->oid << dendl;
14767 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14768 for (auto& e : h->extent_map.extent_map) {
14769 const bluestore_blob_t& b = e.blob->get_blob();
14770 SharedBlob *sb = e.blob->shared_blob.get();
14771 if (b.is_shared() &&
14772 sb->loaded &&
14773 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
14774 if (b.is_compressed()) {
14775 expect[sb].get(0, b.get_ondisk_length());
14776 } else {
14777 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14778 expect[sb].get(off, len);
14779 return 0;
14780 });
14781 }
224ce89b
WB
14782 }
14783 }
31f18b77 14784
224ce89b
WB
14785 vector<SharedBlob*> unshared_blobs;
14786 unshared_blobs.reserve(maybe_unshared_blobs.size());
14787 for (auto& p : expect) {
14788 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14789 if (p.first->persistent->ref_map == p.second) {
14790 SharedBlob *sb = p.first;
14791 dout(20) << __func__ << " unsharing " << *sb << dendl;
14792 unshared_blobs.push_back(sb);
14793 txc->unshare_blob(sb);
14794 uint64_t sbid = c->make_blob_unshared(sb);
14795 string key;
14796 get_shared_blob_key(sbid, &key);
14797 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14798 }
14799 }
14800
14801 if (unshared_blobs.empty()) {
14802 return 0;
14803 }
14804
224ce89b
WB
14805 for (auto& e : h->extent_map.extent_map) {
14806 const bluestore_blob_t& b = e.blob->get_blob();
14807 SharedBlob *sb = e.blob->shared_blob.get();
14808 if (b.is_shared() &&
14809 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14810 sb) != unshared_blobs.end()) {
14811 dout(20) << __func__ << " unsharing " << e << dendl;
14812 bluestore_blob_t& blob = e.blob->dirty_blob();
14813 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14814 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
14815 }
14816 }
224ce89b
WB
14817 txc->write_onode(h);
14818
7c673cae
FG
14819 return 0;
14820}
14821
14822int BlueStore::_remove(TransContext *txc,
14823 CollectionRef& c,
14824 OnodeRef &o)
14825{
11fdf7f2
TL
14826 dout(15) << __func__ << " " << c->cid << " " << o->oid
14827 << " onode " << o.get()
14828 << " txc "<< txc << dendl;
adb31ebb
TL
14829
14830 auto start_time = mono_clock::now();
7c673cae 14831 int r = _do_remove(txc, c, o);
adb31ebb
TL
14832 log_latency_fn(
14833 __func__,
14834 l_bluestore_remove_lat,
14835 mono_clock::now() - start_time,
14836 cct->_conf->bluestore_log_op_age,
14837 [&](const ceph::timespan& lat) {
14838 ostringstream ostr;
14839 ostr << ", lat = " << timespan_str(lat)
14840 << " cid =" << c->cid
14841 << " oid =" << o->oid;
14842 return ostr.str();
14843 }
14844 );
14845
7c673cae
FG
14846 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14847 return r;
14848}
14849
14850int BlueStore::_setattr(TransContext *txc,
14851 CollectionRef& c,
14852 OnodeRef& o,
14853 const string& name,
14854 bufferptr& val)
14855{
14856 dout(15) << __func__ << " " << c->cid << " " << o->oid
14857 << " " << name << " (" << val.length() << " bytes)"
14858 << dendl;
14859 int r = 0;
3efd9988
FG
14860 if (val.is_partial()) {
14861 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14862 val.length());
f91f0fd5 14863 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
14864 } else {
14865 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 14866 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14867 }
7c673cae
FG
14868 txc->write_onode(o);
14869 dout(10) << __func__ << " " << c->cid << " " << o->oid
14870 << " " << name << " (" << val.length() << " bytes)"
14871 << " = " << r << dendl;
14872 return r;
14873}
14874
14875int BlueStore::_setattrs(TransContext *txc,
14876 CollectionRef& c,
14877 OnodeRef& o,
14878 const map<string,bufferptr>& aset)
14879{
14880 dout(15) << __func__ << " " << c->cid << " " << o->oid
14881 << " " << aset.size() << " keys"
14882 << dendl;
14883 int r = 0;
14884 for (map<string,bufferptr>::const_iterator p = aset.begin();
14885 p != aset.end(); ++p) {
3efd9988
FG
14886 if (p->second.is_partial()) {
14887 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 14888 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 14889 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988
FG
14890 } else {
14891 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 14892 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14893 }
7c673cae
FG
14894 }
14895 txc->write_onode(o);
14896 dout(10) << __func__ << " " << c->cid << " " << o->oid
14897 << " " << aset.size() << " keys"
14898 << " = " << r << dendl;
14899 return r;
14900}
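// Editorial note: bufferptr::is_partial() means the ptr references a slice of
// a larger raw buffer (e.g. one xattr decoded out of a whole OSD message).
// Storing it as-is would pin the entire raw buffer in the onode cache for the
// life of the attr, so the code above first copies the slice into a
// right-sized allocation, roughly:
//   bufferptr tight(val.c_str(), val.length());  // copies just the slice
//   o->onode.attrs[name.c_str()] = tight;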
14901
14902
14903int BlueStore::_rmattr(TransContext *txc,
14904 CollectionRef& c,
14905 OnodeRef& o,
14906 const string& name)
14907{
14908 dout(15) << __func__ << " " << c->cid << " " << o->oid
14909 << " " << name << dendl;
14910 int r = 0;
14911 auto it = o->onode.attrs.find(name.c_str());
14912 if (it == o->onode.attrs.end())
14913 goto out;
14914
14915 o->onode.attrs.erase(it);
14916 txc->write_onode(o);
14917
14918 out:
14919 dout(10) << __func__ << " " << c->cid << " " << o->oid
14920 << " " << name << " = " << r << dendl;
14921 return r;
14922}
14923
14924int BlueStore::_rmattrs(TransContext *txc,
14925 CollectionRef& c,
14926 OnodeRef& o)
14927{
14928 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14929 int r = 0;
14930
14931 if (o->onode.attrs.empty())
14932 goto out;
14933
14934 o->onode.attrs.clear();
14935 txc->write_onode(o);
14936
14937 out:
14938 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14939 return r;
14940}
14941
9f95a23c 14942void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 14943{
9f95a23c 14944 const string& omap_prefix = o->get_omap_prefix();
7c673cae 14945 string prefix, tail;
9f95a23c
TL
14946 o->get_omap_header(&prefix);
14947 o->get_omap_tail(&tail);
11fdf7f2 14948 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 14949 txc->t->rmkey(omap_prefix, tail);
11fdf7f2
TL
14950 dout(20) << __func__ << " remove range start: "
14951 << pretty_binary_string(prefix) << " end: "
14952 << pretty_binary_string(tail) << dendl;
7c673cae
FG
14953}
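// Editorial note: an object's omap occupies a contiguous key range in the kv
// store, bounded below by its header key and above by its tail sentinel, so a
// clear is one range delete plus removing the sentinel itself. Schematically:
//   <omap_prefix> . <object id> . header        <- get_omap_header()
//   <omap_prefix> . <object id> . <user keys..>
//   <omap_prefix> . <object id> . tail          <- get_omap_tail(), upper bound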
14954
14955int BlueStore::_omap_clear(TransContext *txc,
14956 CollectionRef& c,
14957 OnodeRef& o)
14958{
14959 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14960 int r = 0;
14961 if (o->onode.has_omap()) {
14962 o->flush();
9f95a23c 14963 _do_omap_clear(txc, o);
7c673cae
FG
14964 o->onode.clear_omap_flag();
14965 txc->write_onode(o);
14966 }
14967 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14968 return r;
14969}
14970
14971int BlueStore::_omap_setkeys(TransContext *txc,
14972 CollectionRef& c,
14973 OnodeRef& o,
14974 bufferlist &bl)
14975{
14976 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14977 int r;
11fdf7f2 14978 auto p = bl.cbegin();
7c673cae
FG
14979 __u32 num;
14980 if (!o->onode.has_omap()) {
11fdf7f2 14981 if (o->oid.is_pgmeta()) {
9f95a23c
TL
14982 o->onode.set_omap_flags_pgmeta();
14983 } else {
14984 o->onode.set_omap_flags();
11fdf7f2 14985 }
7c673cae 14986 txc->write_onode(o);
494da23a 14987
9f95a23c 14988 const string& prefix = o->get_omap_prefix();
494da23a
TL
14989 string key_tail;
14990 bufferlist tail;
9f95a23c 14991 o->get_omap_tail(&key_tail);
494da23a 14992 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
14993 } else {
14994 txc->note_modified_object(o);
14995 }
9f95a23c 14996 const string& prefix = o->get_omap_prefix();
7c673cae 14997 string final_key;
9f95a23c
TL
14998 o->get_omap_key(string(), &final_key);
14999 size_t base_key_len = final_key.size();
11fdf7f2 15000 decode(num, p);
7c673cae
FG
15001 while (num--) {
15002 string key;
15003 bufferlist value;
11fdf7f2
TL
15004 decode(key, p);
15005 decode(value, p);
9f95a23c 15006 final_key.resize(base_key_len); // keep prefix
7c673cae 15007 final_key += key;
11fdf7f2 15008 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 15009 << " <- " << key << dendl;
11fdf7f2 15010 txc->t->set(prefix, final_key, value);
7c673cae
FG
15011 }
15012 r = 0;
15013 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15014 return r;
15015}
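// Editorial sketch (not part of BlueStore): the key-building pattern above
// avoids per-entry reallocation by building the object's base omap key once,
// then resizing back to it and appending each user key. Standalone shape,
// with std::vector standing in for the kv transaction (needs <string>,
// <utility>, <vector>):
static void build_omap_keys(std::string base,  // object's omap key prefix
                            const std::vector<std::pair<std::string, std::string>>& kvs,
                            std::vector<std::pair<std::string, std::string>>* out)
{
  const size_t base_len = base.size();
  for (const auto& [k, v] : kvs) {
    base.resize(base_len);       // keep the per-object prefix, drop the old key
    base += k;
    out->emplace_back(base, v);  // stand-in for txc->t->set(prefix, base, v)
  }
}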
15016
15017int BlueStore::_omap_setheader(TransContext *txc,
15018 CollectionRef& c,
15019 OnodeRef &o,
15020 bufferlist& bl)
15021{
15022 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15023 int r;
15024 string key;
15025 if (!o->onode.has_omap()) {
11fdf7f2 15026 if (o->oid.is_pgmeta()) {
9f95a23c
TL
15027 o->onode.set_omap_flags_pgmeta();
15028 } else {
15029 o->onode.set_omap_flags();
11fdf7f2 15030 }
7c673cae 15031 txc->write_onode(o);
494da23a 15032
9f95a23c 15033 const string& prefix = o->get_omap_prefix();
494da23a
TL
15034 string key_tail;
15035 bufferlist tail;
9f95a23c 15036 o->get_omap_tail(&key_tail);
494da23a 15037 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
15038 } else {
15039 txc->note_modified_object(o);
15040 }
9f95a23c
TL
15041 const string& prefix = o->get_omap_prefix();
15042 o->get_omap_header(&key);
11fdf7f2 15043 txc->t->set(prefix, key, bl);
7c673cae
FG
15044 r = 0;
15045 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15046 return r;
15047}
15048
15049int BlueStore::_omap_rmkeys(TransContext *txc,
15050 CollectionRef& c,
15051 OnodeRef& o,
15052 bufferlist& bl)
15053{
15054 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15055 int r = 0;
11fdf7f2 15056 auto p = bl.cbegin();
7c673cae
FG
15057 __u32 num;
15058 string final_key;
15059
15060 if (!o->onode.has_omap()) {
15061 goto out;
15062 }
11fdf7f2 15063 {
9f95a23c
TL
15064 const string& prefix = o->get_omap_prefix();
15065 o->get_omap_key(string(), &final_key);
15066 size_t base_key_len = final_key.size();
11fdf7f2
TL
15067 decode(num, p);
15068 while (num--) {
15069 string key;
15070 decode(key, p);
9f95a23c 15071 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
15072 final_key += key;
15073 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
15074 << " <- " << key << dendl;
15075 txc->t->rmkey(prefix, final_key);
15076 }
7c673cae
FG
15077 }
15078 txc->note_modified_object(o);
15079
15080 out:
15081 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15082 return r;
15083}
15084
15085int BlueStore::_omap_rmkey_range(TransContext *txc,
15086 CollectionRef& c,
15087 OnodeRef& o,
15088 const string& first, const string& last)
15089{
15090 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
15091 string key_first, key_last;
15092 int r = 0;
15093 if (!o->onode.has_omap()) {
15094 goto out;
15095 }
11fdf7f2 15096 {
9f95a23c 15097 const string& prefix = o->get_omap_prefix();
11fdf7f2 15098 o->flush();
9f95a23c
TL
15099 o->get_omap_key(first, &key_first);
15100 o->get_omap_key(last, &key_last);
11fdf7f2
TL
15101 txc->t->rm_range_keys(prefix, key_first, key_last);
15102 dout(20) << __func__ << " remove range start: "
15103 << pretty_binary_string(key_first) << " end: "
15104 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
15105 }
15106 txc->note_modified_object(o);
15107
15108 out:
15109 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15110 return r;
15111}
15112
15113int BlueStore::_set_alloc_hint(
15114 TransContext *txc,
15115 CollectionRef& c,
15116 OnodeRef& o,
15117 uint64_t expected_object_size,
15118 uint64_t expected_write_size,
15119 uint32_t flags)
15120{
15121 dout(15) << __func__ << " " << c->cid << " " << o->oid
15122 << " object_size " << expected_object_size
15123 << " write_size " << expected_write_size
15124 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15125 << dendl;
15126 int r = 0;
15127 o->onode.expected_object_size = expected_object_size;
15128 o->onode.expected_write_size = expected_write_size;
15129 o->onode.alloc_hint_flags = flags;
15130 txc->write_onode(o);
15131 dout(10) << __func__ << " " << c->cid << " " << o->oid
15132 << " object_size " << expected_object_size
15133 << " write_size " << expected_write_size
15134 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15135 << " = " << r << dendl;
15136 return r;
15137}
15138
15139int BlueStore::_clone(TransContext *txc,
15140 CollectionRef& c,
15141 OnodeRef& oldo,
15142 OnodeRef& newo)
15143{
15144 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15145 << newo->oid << dendl;
15146 int r = 0;
15147 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15148 derr << __func__ << " mismatched hash on " << oldo->oid
15149 << " and " << newo->oid << dendl;
15150 return -EINVAL;
15151 }
15152
7c673cae
FG
15153 _assign_nid(txc, newo);
15154
15155 // clone data
15156 oldo->flush();
15157 _do_truncate(txc, c, newo, 0);
15158 if (cct->_conf->bluestore_clone_cow) {
15159 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15160 } else {
15161 bufferlist bl;
15162 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15163 if (r < 0)
15164 goto out;
15165 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15166 if (r < 0)
15167 goto out;
15168 }
15169
15170 // clone attrs
15171 newo->onode.attrs = oldo->onode.attrs;
15172
15173 // clone omap
15174 if (newo->onode.has_omap()) {
15175 dout(20) << __func__ << " clearing old omap data" << dendl;
15176 newo->flush();
9f95a23c 15177 _do_omap_clear(txc, newo);
494da23a 15178 newo->onode.clear_omap_flag();
7c673cae
FG
15179 }
15180 if (oldo->onode.has_omap()) {
15181 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 15182 if (newo->oid.is_pgmeta()) {
9f95a23c
TL
15183 newo->onode.set_omap_flags_pgmeta();
15184 } else {
15185 newo->onode.set_omap_flags();
7c673cae 15186 }
9f95a23c 15187 const string& prefix = newo->get_omap_prefix();
11fdf7f2 15188 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 15189 string head, tail;
9f95a23c
TL
15190 oldo->get_omap_header(&head);
15191 oldo->get_omap_tail(&tail);
7c673cae
FG
15192 it->lower_bound(head);
15193 while (it->valid()) {
15194 if (it->key() >= tail) {
15195 dout(30) << __func__ << " reached tail" << dendl;
15196 break;
15197 } else {
15198 dout(30) << __func__ << " got header/data "
15199 << pretty_binary_string(it->key()) << dendl;
15200 string key;
9f95a23c 15201 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 15202 txc->t->set(prefix, key, it->value());
7c673cae
FG
15203 }
15204 it->next();
15205 }
494da23a
TL
15206 string new_tail;
15207 bufferlist new_tail_value;
9f95a23c 15208 newo->get_omap_tail(&new_tail);
494da23a 15209 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
15210 }
15211
15212 txc->write_onode(newo);
15213 r = 0;
15214
15215 out:
15216 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15217 << newo->oid << " = " << r << dendl;
15218 return r;
15219}
15220
15221int BlueStore::_do_clone_range(
15222 TransContext *txc,
15223 CollectionRef& c,
15224 OnodeRef& oldo,
15225 OnodeRef& newo,
224ce89b
WB
15226 uint64_t srcoff,
15227 uint64_t length,
15228 uint64_t dstoff)
7c673cae
FG
15229{
15230 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15231 << newo->oid
15232 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15233 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15234 oldo->extent_map.fault_range(db, srcoff, length);
15235 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
15236 _dump_onode<30>(cct, *oldo);
15237 _dump_onode<30>(cct, *newo);
7c673cae 15238
11fdf7f2 15239 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
81eedcae
TL
15240 _dump_onode<30>(cct, *oldo);
15241 _dump_onode<30>(cct, *newo);
7c673cae
FG
15242 return 0;
15243}
15244
15245int BlueStore::_clone_range(TransContext *txc,
15246 CollectionRef& c,
15247 OnodeRef& oldo,
15248 OnodeRef& newo,
15249 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15250{
15251 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15252 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15253 << " to offset 0x" << dstoff << std::dec << dendl;
15254 int r = 0;
15255
35e4c445
FG
15256 if (srcoff + length >= OBJECT_MAX_SIZE ||
15257 dstoff + length >= OBJECT_MAX_SIZE) {
15258 r = -E2BIG;
15259 goto out;
15260 }
7c673cae
FG
15261 if (srcoff + length > oldo->onode.size) {
15262 r = -EINVAL;
15263 goto out;
15264 }
15265
7c673cae
FG
15266 _assign_nid(txc, newo);
15267
15268 if (length > 0) {
15269 if (cct->_conf->bluestore_clone_cow) {
15270 _do_zero(txc, c, newo, dstoff, length);
15271 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15272 } else {
15273 bufferlist bl;
15274 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15275 if (r < 0)
15276 goto out;
15277 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15278 if (r < 0)
15279 goto out;
15280 }
15281 }
15282
15283 txc->write_onode(newo);
15284 r = 0;
15285
15286 out:
15287 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15288 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15289 << " to offset 0x" << dstoff << std::dec
15290 << " = " << r << dendl;
15291 return r;
15292}
15293
15294int BlueStore::_rename(TransContext *txc,
15295 CollectionRef& c,
15296 OnodeRef& oldo,
15297 OnodeRef& newo,
15298 const ghobject_t& new_oid)
15299{
15300 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15301 << new_oid << dendl;
15302 int r;
15303 ghobject_t old_oid = oldo->oid;
f91f0fd5 15304 mempool::bluestore_cache_meta::string new_okey;
7c673cae
FG
15305
15306 if (newo) {
15307 if (newo->exists) {
15308 r = -EEXIST;
15309 goto out;
15310 }
11fdf7f2 15311 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
15312 }
15313
15314 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15315
15316 // rewrite shards
15317 {
15318 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15319 get_object_key(cct, new_oid, &new_okey);
15320 string key;
15321 for (auto &s : oldo->extent_map.shards) {
15322 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15323 [&](const string& final_key) {
15324 txc->t->rmkey(PREFIX_OBJ, final_key);
15325 }
15326 );
15327 s.dirty = true;
15328 }
15329 }
15330
15331 newo = oldo;
15332 txc->write_onode(newo);
15333
 15334 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
15335 // Onode in the old slot
15336 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15337 r = 0;
15338
f64942e4
AA
 15339 // hold a ref to the new Onode in the old name position, to ensure we don't
 15340 // drop it from the cache before this txc commits (or else someone may come
 15341 // along and read newo's metadata via the old name).
15342 txc->note_modified_object(oldo);
15343
7c673cae
FG
15344 out:
15345 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15346 << new_oid << " = " << r << dendl;
15347 return r;
15348}
15349
15350// collections
15351
15352int BlueStore::_create_collection(
15353 TransContext *txc,
15354 const coll_t &cid,
15355 unsigned bits,
15356 CollectionRef *c)
15357{
15358 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15359 int r;
15360 bufferlist bl;
15361
15362 {
9f95a23c 15363 std::unique_lock l(coll_lock);
7c673cae
FG
15364 if (*c) {
15365 r = -EEXIST;
15366 goto out;
15367 }
11fdf7f2
TL
15368 auto p = new_coll_map.find(cid);
15369 ceph_assert(p != new_coll_map.end());
15370 *c = p->second;
7c673cae
FG
15371 (*c)->cnode.bits = bits;
15372 coll_map[cid] = *c;
11fdf7f2 15373 new_coll_map.erase(p);
7c673cae 15374 }
11fdf7f2 15375 encode((*c)->cnode, bl);
7c673cae
FG
15376 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15377 r = 0;
15378
15379 out:
15380 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15381 return r;
15382}
15383
15384int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15385 CollectionRef *c)
15386{
15387 dout(15) << __func__ << " " << cid << dendl;
15388 int r;
15389
11fdf7f2 15390 (*c)->flush_all_but_last();
7c673cae 15391 {
9f95a23c 15392 std::unique_lock l(coll_lock);
7c673cae
FG
15393 if (!*c) {
15394 r = -ENOENT;
15395 goto out;
15396 }
15397 size_t nonexistent_count = 0;
11fdf7f2 15398 ceph_assert((*c)->exists);
adb31ebb 15399 if ((*c)->onode_map.map_any([&](Onode* o) {
f67539c2
TL
15400 if (o->exists) {
15401 dout(1) << __func__ << " " << o->oid << " " << o
15402 << " exists in onode_map" << dendl;
7c673cae 15403 return true;
f67539c2
TL
15404 }
15405 ++nonexistent_count;
15406 return false;
15407 })) {
7c673cae
FG
15408 r = -ENOTEMPTY;
15409 goto out;
15410 }
7c673cae
FG
15411 vector<ghobject_t> ls;
15412 ghobject_t next;
 15413 // Enumerate onodes in the db, up to nonexistent_count + 1,
 15414 // then check whether all of them are marked as non-existent.
11fdf7f2 15415 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 15416 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 15417 nonexistent_count + 1, false, &ls, &next);
7c673cae 15418 if (r >= 0) {
11fdf7f2
TL
 15419 // If true, the collection has more objects than nonexistent_count,
 15420 // so bypass the check.
15421 bool exists = (!next.is_max());
7c673cae
FG
15422 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15423 dout(10) << __func__ << " oid " << *it << dendl;
15424 auto onode = (*c)->onode_map.lookup(*it);
15425 exists = !onode || onode->exists;
15426 if (exists) {
494da23a 15427 dout(1) << __func__ << " " << *it
f67539c2
TL
15428 << " exists in db, "
15429 << (!onode ? "not present in ram" : "present in ram")
15430 << dendl;
7c673cae
FG
15431 }
15432 }
15433 if (!exists) {
f67539c2 15434 _do_remove_collection(txc, c);
7c673cae
FG
15435 r = 0;
15436 } else {
15437 dout(10) << __func__ << " " << cid
15438 << " is non-empty" << dendl;
f67539c2 15439 r = -ENOTEMPTY;
7c673cae
FG
15440 }
15441 }
15442 }
f67539c2 15443out:
7c673cae
FG
15444 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15445 return r;
15446}
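
// A worked trace of the emptiness check above (illustrative numbers, not
// from the source): suppose onode_map caches three onodes, all with
// exists == false, so nonexistent_count == 3. We then list up to four
// onodes from the db. If next != ghobject_t::get_max(), the db holds more
// onodes than nonexistent_count + 1 could cover and removal is refused;
// otherwise every listed oid must resolve to a cached onode with
// exists == false before _do_remove_collection() runs.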

void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer. this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits. note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge, as the target
  // collection's sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the target collection post-merge. leave everything else
  // behind.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits. note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // split_cache's behavior depends on the target's (d) bits, so do this
  // after those are updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}

void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}
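
// A minimal usage sketch for log_latency_fn (hypothetical call site, not
// taken from this file): the formatter lambda defers building the extra
// detail string until the threshold is actually exceeded, so fast
// operations pay no string-formatting cost.
//
//   log_latency_fn(
//     "read",
//     l_bluestore_read_lat,
//     mono_clock::now() - start,
//     cct->_conf->bluestore_log_op_age,
//     [&](const ceph::timespan& lat) {
//       return ", oid = " + stringify(oid);
//     });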

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif

mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}

bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}
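
// Note on the throttle protocol: throttle_bytes.get() always blocks until
// capacity is available, while the deferred-bytes throttle is first tried
// non-blocking via get_or_fail() above. When that fails, the caller is
// expected to fall back to finish_start_transaction() below, which takes
// the deferred throttle with a blocking get(), so a deferred txn is
// admitted exactly once either way.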

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif

// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
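
// Worked example of the slab math above (illustrative sizes): a 75-byte
// key falls in key slab 75 / KEY_SLAB = 2, reported as "[64,96)"; a
// 200-byte value falls in value slab 200 / VALUE_SLAB = 3, reported as
// "[192,256)".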

void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
                  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
                     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}

// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPOOL_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPG_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}

void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For external callers. This is best-effort: we don't care if some pinned
// onodes/data remain in the cache after this command completes.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}
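
// The two-pass pattern above first computes an upper bound on the encoded
// size (denc into a size_t) so a single contiguous appender can be
// reserved, then encodes for real; onode_part/blob_part/extent_part are
// just logical-offset deltas used for the debug breakdown.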

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!spurious_read_errors_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_SPURIOUS_READ_ERRORS",
      spurious_read_errors_alert);
  }
  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pg_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_PG_OMAP",
      no_per_pg_omap_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}

void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}

void BlueStore::_record_allocation_stats()
{
  // we don't care about strict consistency here; fields may be partially
  // modified while the tuple is being built
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
    << probe_count << ":"
    << " cnt: " << std::get<0>(t0)
    << " frags: " << std::get<1>(t0)
    << " size: " << std::get<2>(t0)
    << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
      << base + (probe_count % base) << ": "
      << std::get<0>(t)
      << ", " << std::get<1>(t)
      << ", " << std::get<2>(t)
      << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  ++probe_count;

  for (ssize_t i = alloc_stats_history.size() - 1; i > 0; --i) {
    if ((probe_count % (1 << i)) == 0) {
      alloc_stats_history[i] = alloc_stats_history[i - 1];
    }
  }
  alloc_stats_history[0].swap(t0);
}
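
// How the history cascade above plays out (illustrative): slot 0 always
// holds the previous probe; slot i (i > 0) is refreshed from slot i-1 only
// when probe_count is a multiple of 2^i. So after probe 8, slot 3 has just
// been copied from slot 2, and the slots approximate the probes taken 1,
// 2, 4, 8 and 16 intervals ago, matching the "-1, -2, -4, -8, -16"
// sequence printed by the loop.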

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't be called a second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
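
// Bucketing note for the loop above (illustrative numbers): with
// granularity = 0x100000 (1 MiB), an extent at offset 0x180000 with length
// 0x100000 covers pos = offset / granularity = 1 up to (but excluding)
// end_pos = 1 + (offset + length - 1) / granularity = 3, i.e. buckets 1
// and 2; only buckets whose bloom filters saw at least one element are
// kept.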

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  std::lock_guard l(lock);
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
{
  std::lock_guard l(lock); // possibly redundant
  ceph_assert(fix_per_pool_omap_txn == nullptr);
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append(stringify(val));
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  std::lock_guard l(lock); // possibly redundant
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  std::lock_guard l(lock);
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

bool BlueStoreRepairer::fix_spanning_blobs(
  KeyValueDB* db,
  std::function<void(KeyValueDB::Transaction)> f)
{
  std::lock_guard l(lock);
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  f(fix_onode_txn);
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  // NB: not for use in multithreaded mode!
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  // NB: not for use in multithreaded mode!
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
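
// Note the submit order above: fix_misreferences_txn goes in before
// fix_shared_blob_txn, which matches fix_shared_blob()'s preference for
// piggybacking shared-blob updates onto the misreference txn when one
// exists.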

// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
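
// A numeric sketch of the SLOW-redirection math above (made-up sizes):
// with db_total = 30 GiB, observed maxima summing to max_db_use = 12 GiB,
// and db_avail4slow = 10 GiB, avail = min(10 GiB, 30 GiB - 12 GiB) =
// 10 GiB; SLOW data is steered to BDEV_DB only while its current usage on
// that device stays below this margin.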

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    } else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
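
// Mapping examples for the directory hint above: "db" -> LEVEL_DB (the
// default), "db.slow" -> LEVEL_SLOW, "db.wal" -> LEVEL_WAL. The
// length > 5 guard simply skips names too short to carry a ".slow" or
// ".wal" suffix.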

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
    << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
    << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
    << ", db_avail:" << db_avail4slow << std::endl
    << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================