// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);

using std::deque;
using std::min;
using std::make_pair;
using std::numeric_limits;
using std::pair;
using std::list;
using std::map;
using std::max;
using std::ostream;
using std::ostringstream;
using std::set;
using std::string;
using std::stringstream;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::decode;
using ceph::encode;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;

// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value(for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits

/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS      4

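// Worked example (illustrative only, not an additional format guarantee):
// a local blob id of 3 whose extent starts where the previous one ended
// and whose blob_offset is 0 would encode as
//   (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET == 0x33.
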
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
 */
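
// Worked example (illustrative): append_escaped("a#b", out) copies 'a',
// escapes '#' (0x23) as "#23", copies 'b', then appends the '!'
// terminator, yielding "a#23b!".  decode_escaped() reverses this,
// stopping at the unescaped '!'.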
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end)
{
  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    end->hobj.set_bitwise_key_u32(end_hash);
    temp_end->hobj.set_bitwise_key_u32(end_hash);
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(0xffffffff);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
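
// Illustration (assumed values): for a PG with ps() == 1 and bits == 6,
// reverse_hash = _reverse_bits(1) = 0x80000000 and
// end_hash = 0x80000000 + (1ull << 26) = 0x84000000, so the collection
// spans bitwise hash keys [0x80000000, 0x84000000).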

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
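// The encoded prefix is fixed width: 1 byte for the shard, 8 for the
// biased pool id, and 4 for the bit-reversed hash -- 13 bytes total.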

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
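
// So a full shard key is <onode key> + <u32 offset> + 'x'.  Since onode
// keys always end in 'o' (ONODE_KEY_SUFFIX), inspecting the final byte
// alone distinguishes the two key types; see is_extent_shard_key() below.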

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "  csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "  0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in key string encoding (see the comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may appear in the
 * wrong order.
 *
 * This is the iterator wrapper that fixes the key order.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};

class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace
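
// Typical use of either iterator (a sketch, assuming a half-open
// [start, end) listing):
//
//   it->lower_bound(start);
//   while (it->valid() && it->is_lt(end)) {
//     ls->push_back(it->oid());
//     it->next();
//   }
//
// SortedCollectionListIterator additionally buffers all keys sharing one
// bit-reversed hash in a std::map keyed by ghobject_t, which restores the
// ordering that the flawed key encoding can violate within a single hash.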

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
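
// Accounting intuition (numbers are illustrative only): if a compressed
// blob occupies 2 allocation units on disk (expected4release == 2) but
// rewriting the data that still references it is expected to consume only
// 1 new AU (expected_allocations == 1), collecting the blob nets 1 AU,
// which is then compared against bluestore_gc_enable_blob_threshold.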

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
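
// The value returned by estimate() is the expected net gain in allocation
// units: a positive result means collecting the affected compressed blobs
// should free more space than rewriting their still-referenced data would
// consume, so the caller may choose to trigger garbage collection.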

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };
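
  // 2Q in brief (background note): a buffer is first admitted to warm_in
  // (A1in).  When trimmed out of warm_in its data is dropped but an empty
  // "ghost" entry is kept in warm_out (A1out).  A later hit on a warm_out
  // ghost is what promotes a buffer to the hot (Am) list -- see the
  // BUFFER_WARM_OUT case in _add() below.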

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
                                        b->flags),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
                                    b->flags),
                  0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}

void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, bl, p->second->flags),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, right, p->second->flags),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data, p->second->flags),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length, p->second->flags),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}

1821// OnodeSpace
1822
1823#undef dout_prefix
1824#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1825
f6b5b4d7
TL
BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  cache->_trim();
  return o;
}

void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}

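// Look an oid up in the cache. On a hit the returned OnodeRef bumps nref,
// which pins the onode; hit/miss perf counters are updated outside the
// cache lock.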
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << " " << p->second->nref
                            << " " << p->second->cached
                            << " " << p->second->pinned
                            << dendl;
      // This will pin the onode and implicitly touch the cache when the
      // Onode eventually becomes unpinned
      o = p->second;
      ceph_assert(!o->cached || o->pinned);

      hit = true;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

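// Rename moves the cached Onode to its new oid and installs a fresh,
// non-existent placeholder at the old position so in-flight lookups of the
// old name do not see stale state. Any onode already cached under the
// target name is dropped first.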
void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // add at new position and fix oid, key.
  // This will pin 'o' and implicitly touch the cache when it eventually
  // becomes unpinned
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
  cache->_trim();
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second.get())) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}

// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

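// Drop a reference. On the last put we remove ourselves from the
// collection's shared_blob_set, but the owning collection (and thus the
// cache shard protecting it) can change concurrently, hence the retry via
// 'again'. A racing lookup may have revived us, in which case remove()
// fails and we must not delete.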
void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

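// Writeback completion for this shared blob's buffer cache. The collection's
// cache shard may be swapped underneath us, so lock it, re-check that
// coll->cache is still the shard we locked, and retry if it moved.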
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}

// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}

// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}

void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (get_blob().is_shared()) {
    return;
  }
  if (get_blob().is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        discard = true;
      } else {
        all_invalid = false;
      }
    }
    ceph_assert(discard == all_invalid); // in a compressed blob, either all
                                         // pextents are invalid or none are.
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0,
                              get_blob().get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        dout(20) << __func__ << " 0x" << std::hex << pos
                 << "~" << e.length
                 << std::dec << dendl;
        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (get_blob().can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(get_blob().get_ondisk_length());
      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
    }
  }
}

void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // The caller has to initialize the Blob's logical length prior to
  // incrementing references. Otherwise it is impossible to determine the
  // required number of counters for per-au tracking, or to obtain
  // min_release_size for single-counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}

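// Decide whether a write at [b_offset, b_offset + *length0) can reuse this
// blob rather than allocating a new one: the blob must be mutable, the write
// csum-chunk aligned (when csums are enabled), any overlap with the current
// length unallocated, and growing the blob must not exceed target_blob_size;
// *length0 may be trimmed so that the tail still fits.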
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }

    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}

void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif

// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}

// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}

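// Clone the extents of [srcoff, srcoff + length) from oldo into newo at
// dstoff. Every source blob touched is made shared (if it is not already),
// its physical extents gain shared-blob references, and a duplicated Blob
// pointing at the same pextents is installed in the destination map. The
// source onode is only rewritten for the range that became newly shared.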
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
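
// Persist dirty extent-map state. With no shards everything is encoded
// inline into the onode value; otherwise each dirty shard is re-encoded and
// written under its own key. A shard that grew past shard_max_size (or
// shrank below shard_min_size) schedules a reshard instead.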
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << " encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}

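// Pick an id for a new spanning blob. Ids normally grow monotonically from
// the largest one in use; once that overflows the signed bid range we probe
// from a random starting point, wrapping around until a free id turns up,
// and abort (after dumping the onode) only if the space is exhausted.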
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find the next unused bid.
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}

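// Re-split the serialized extent map over [needs_reshard_begin,
// needs_reshard_end): fault in the affected range, drop the old shard keys,
// size new shards from the average encoded extent size against
// shard_target_size, then walk the blobs and either split those that now
// straddle a shard boundary or mark them spanning so they are stored with
// the onode itself.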
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
      );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning. We prefer to cut the blob if we can. Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
        (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}

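// Encode the extents overlapping [offset, offset+length) into 'bl'. Each
// extent is led by a varint blobid whose low bits are flags (SPANNING,
// CONTIGUOUS, ZEROOFFSET, SAMELENGTH) that let the common deltas be omitted;
// a non-spanning blob is embedded inline the first time it is referenced.
// Returns true (encoding nothing) if a blob escapes the range, which forces
// a reshard first.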
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}

unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << " getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}

void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}

void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}

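// Demand-load every shard overlapping [offset, offset+length) from the DB.
// An already-resident shard only bumps the hit counter; a missing key is
// fatal, since the shard list stored in the onode is authoritative.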
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
        );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}

void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}

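// Merge adjacent extents in [offset, offset+length) that map consecutive
// ranges of the same blob, never merging across a shard boundary. Returns
// the number of extents removed.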
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}

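// Remove the logical range [offset, offset+length) from the map. Affected
// extents are dereferenced into old_extents for later cleanup; an extent
// straddling the hole is split (middle case) or trimmed (head/tail cases).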
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}

BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent putting a reused blob into
  // the old_extents list if we overwrite the blob completely.
  // This might happen during a WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}

BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}

// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about the Onode's ref counter is that we do an additional
// increment when a newly pinned instance is detected, and -1 on unpin.
// This prevents a conflict with a delete call (when nref == 0). The latter
// might happen while a thread is still inside unpin() (e.g. waiting for
// lock acquisition) after nref has already been decremented; another
// 'putting' thread on the instance will then release it.
//
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    if (cached && r) {
      ocs->_pin(this);
    }
    ocs->lock.unlock();
  }
}
void BlueStore::Onode::put() {
  int n = --nref;
  if (n == 2) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool need_unpin = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref and delete Onode
        c->onode_map._remove(oid);
      }
    }
    // additional decrement for newly unpinned instance
    // should be the last action since Onode can be released
    // at any point after this decrement
    if (need_unpin) {
      n = --nref;
    }
    ocs->lock.unlock();
  }
  if (n == 0) {
    delete this;
  }
}

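// Materialize an Onode from its serialized value: decode the onode struct
// and its attrs into cache mempools, then the spanning blobs, and finally
// either the inline extent-map shard or the (initially unloaded) shard
// table.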
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  }
  else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}

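// Omap keys live in one of four KV prefixes depending on the onode flags
// (pgmeta, per-pg, per-pool, or legacy). Within a prefix a key is the pool
// id plus hash (per-pg), or just the pool id (per-pool), then the onode's
// nid, then a separator: '-' for the header, '.' before user keys, and '~'
// for the tail, which sort in exactly that order.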
3615const string& BlueStore::Onode::get_omap_prefix()
3616{
3617 if (onode.is_pgmeta_omap()) {
3618 return PREFIX_PGMETA_OMAP;
3619 }
f67539c2
TL
3620 if (onode.is_perpg_omap()) {
3621 return PREFIX_PERPG_OMAP;
3622 }
9f95a23c
TL
3623 if (onode.is_perpool_omap()) {
3624 return PREFIX_PERPOOL_OMAP;
3625 }
3626 return PREFIX_OMAP;
3627}
3628
3629// '-' < '.' < '~'
3630
3631void BlueStore::Onode::get_omap_header(string *out)
3632{
f67539c2
TL
3633 if (!onode.is_pgmeta_omap()) {
3634 if (onode.is_perpg_omap()) {
3635 _key_encode_u64(c->pool(), out);
3636 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3637 } else if (onode.is_perpool_omap()) {
3638 _key_encode_u64(c->pool(), out);
3639 }
9f95a23c
TL
3640 }
3641 _key_encode_u64(onode.nid, out);
3642 out->push_back('-');
3643}
3644
3645void BlueStore::Onode::get_omap_key(const string& key, string *out)
3646{
f67539c2
TL
3647 if (!onode.is_pgmeta_omap()) {
3648 if (onode.is_perpg_omap()) {
3649 _key_encode_u64(c->pool(), out);
3650 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3651 } else if (onode.is_perpool_omap()) {
3652 _key_encode_u64(c->pool(), out);
3653 }
9f95a23c
TL
3654 }
3655 _key_encode_u64(onode.nid, out);
3656 out->push_back('.');
3657 out->append(key);
3658}
3659
3660void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3661{
f67539c2
TL
3662 if (!onode.is_pgmeta_omap()) {
3663 if (onode.is_perpg_omap()) {
3664 _key_encode_u64(c->pool(), out);
3665 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3666 } else if (onode.is_perpool_omap()) {
3667 _key_encode_u64(c->pool(), out);
3668 }
9f95a23c
TL
3669 }
3670 _key_encode_u64(onode.nid, out);
3671 out->append(old.c_str() + out->length(), old.size() - out->length());
3672}
3673
3674void BlueStore::Onode::get_omap_tail(string *out)
3675{
f67539c2
TL
3676 if (!onode.is_pgmeta_omap()) {
3677 if (onode.is_perpg_omap()) {
3678 _key_encode_u64(c->pool(), out);
3679 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
3680 } else if (onode.is_perpool_omap()) {
3681 _key_encode_u64(c->pool(), out);
3682 }
9f95a23c
TL
3683 }
3684 _key_encode_u64(onode.nid, out);
3685 out->push_back('~');
3686}
3687
3688void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3689{
f67539c2
TL
3690 size_t pos = sizeof(uint64_t) + 1;
3691 if (!onode.is_pgmeta_omap()) {
3692 if (onode.is_perpg_omap()) {
3693 pos += sizeof(uint64_t) + sizeof(uint32_t);
3694 } else if (onode.is_perpool_omap()) {
3695 pos += sizeof(uint64_t);
3696 }
9f95a23c 3697 }
f67539c2 3698 *user_key = key.substr(pos);
9f95a23c
TL
3699}
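
// Key layout sketch (illustrative, derived from the helpers above): for a
// per-pool omap object with pool 1, nid 0x1234 and user key "foo",
// get_omap_key() emits
//
//   _key_encode_u64(pool) + _key_encode_u64(nid) + '.' + "foo"
//
// while get_omap_header()/get_omap_tail() end in '-'/'~' instead of '.'.
// Because '-' < '.' < '~' in byte order, a KV-store scan from the header
// key up to (but not including) the tail key visits exactly this object's
// omap rows, and decode_omap_key() recovers "foo" by skipping the
// fixed-width prefix.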


// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
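
// Worked example (values invented for illustration): with min_alloc_size =
// 0x1000, a queued write at logical_offset 0x1800 with length0 0x400 rounds
// out to the allocation-unit range [0x1000, 0x2000).  A later aligned write
// to the same blob with loffs = 0x1000 and loffs_end = 0x2000 then matches
// the first condition (loffs <= loffs2 && loffs_end > loffs2) and is
// reported as a conflict, since both writes target the same allocation unit.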

// =======================================================

// DeferredBatch
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second); // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}

void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << " keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << " keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << " truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << " drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
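
// Illustrative trim (numbers invented for the example): with a single
// entry at offset 0x0 holding 0x4000 bytes, _discard(cct, 0x1000, 0x1000)
// keeps the head [0x0, 0x1000) in place, reinserts the surviving tail as a
// new entry at 0x2000 holding 0x2000 bytes, and debits the 0x1000 dropped
// bytes from that seq in seq_bytes -- which is exactly the invariant
// _audit() below verifies.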

void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0; // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}


// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}

void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {

    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}

void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}

BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}

void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  auto *ocache = get_onode_cache();
  auto *ocache_dest = dest->get_onode_cache();

  // lock cache shards
  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
  std::lock_guard l(ocache->lock, std::adopt_lock);
  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
  std::lock_guard l3(cache->lock, std::adopt_lock);
  std::lock_guard l4(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensure that nref is always >= 2 and hence the onode stays pinned
      // and physically out of the cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (o->cached) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}
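
// Net effect of split_cache() (summary of the code above): onodes whose
// oid hashes into the child PG move to `dest`, along with their shared
// blobs and any cached buffers that are not currently in the writing
// state (in-flight writes are deliberately left alone here).  The
// temporary OnodeRef o_pin keeps each moved onode at nref >= 2, i.e.
// pinned and out of the trimmable cache, for the duration of the move.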

// =======================================================

// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true, "bluestore-pricache");
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
    if (binned_kv_onode_cache != nullptr) {
      pcm->insert("kv_onode", binned_kv_onode_cache, true);
    }
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}
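
// Example of the sizing math at the top of entry() (numbers invented for
// illustration): with osd_memory_target = 4 GiB,
// osd_memory_expected_fragmentation = 0.15, osd_memory_base = 768 MiB and
// osd_memory_cache_min = 128 MiB, ltarget = 0.85 * 4 GiB ~= 3481 MiB,
// which exceeds base + min = 896 MiB, so the PriorityCache manager may
// grow the caches up to max = ltarget - base ~= 2713 MiB.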

void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  if (binned_kv_onode_cache != nullptr) {
    binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t kv_onode_alloc =
    static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
    if (binned_kv_onode_cache != nullptr) {
      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
    }
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " kv_onode_alloc: " << kv_onode_alloc
            << " kv_onode_used: " << kv_onode_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " kv_onode_alloc: " << kv_onode_alloc
             << " kv_onode_used: " << kv_onode_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
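
// Shard sizing example for _resize_shards() (illustrative numbers): with
// meta_alloc = 1 GiB spread across 8 onode shards and
// meta_cache->get_bytes_per_onode() reporting 4 KiB, each onode shard is
// capped at (1 GiB / 8) / 4 KiB = 32768 onodes; buffer shards are capped
// by bytes rather than by object count.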

void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
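
// Typical use of this iterator (sketch): callers obtain it via
// ObjectStore::get_omap_iterator() and drive it with the usual loop
//
//   for (it->seek_to_first(); it->valid(); it->next()) {
//     process(it->key(), it->value());
//   }
//
// Each call takes the collection lock on its own, so no extra locking is
// required from the caller; `process` above is a placeholder, not a real
// function.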


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct


static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(shared_alloc.a);
  shared_alloc.a->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
                     const string& path,
                     uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    zoned_cleaner_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
      changed.count("bluestore_warn_on_no_per_pg_omap")) {
    _check_no_per_pg_or_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}
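
// The observer wiring above makes these options runtime-tunable; as a
// sketch, an admin running
//
//   ceph config set osd.0 bluestore_compression_mode aggressive
//
// should cause handle_conf_change() to fire with
// "bluestore_compression_mode" in `changed`, which re-runs
// _set_compression() on a store with an open bdev.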

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio = (double)1.0 -
                     (double)cache_meta_ratio -
                     (double)cache_kv_ratio -
                     (double)cache_kv_onode_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
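
// Ratio arithmetic example (illustrative numbers, not defaults): with
// cache_size = 3 GiB, cache_meta_ratio = 0.45, cache_kv_ratio = 0.45 and
// cache_kv_onode_ratio = 0.04, the leftover
// cache_data_ratio = 1.0 - 0.45 - 0.45 - 0.04 = 0.06, i.e. roughly
// 184 MiB of buffer cache when autotuning is not in effect.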

int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
    l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
    "Average kv_thread flush latency",
    "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
    "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
    "Average kv_sync thread latency",
    "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
    "Average kv_finalize thread latency",
    "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
    "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
    "Average aio_wait state latency",
    "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
    "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
    "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
    "Average kv_committing state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
    "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
    "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
    "Average deferred aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
    "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
    "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
    "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
    "Average submit throttle latency",
    "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
    "Average submit latency",
    "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
    "Average commit latency",
    "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
    "Average read latency",
    "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
    "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
    "Average read latency spent waiting for aio completion");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
    "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
    "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
    "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
    "Sum for deferred write ops");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
    "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
    "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
    "Sum for stored compressed bytes",
    "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
    "Sum for bytes allocated for compressed data",
    "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
    "Sum for original bytes that were compressed",
    "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
    "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
    "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
    "bluestore_onode_shard_misses",
    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
    "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
    "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
    "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
    "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_big_deferred,
    "bluestore_write_big_deferred",
    "Big overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
    "bluestore_write_small_unused",
    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_deferred,
    "bluestore_write_deferred",
    "Overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
    "bluestore_write_small_pre_read",
    "Small writes that required we read some data (possibly "
    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
    "Write into new blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
    "Sum for extents that have been merged due to garbage "
    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
    "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
    "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
    "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
    "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
    "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
    "Average omap get_keys call latency");
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
    "Average omap get_values call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
    "Average collection listing latency");
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
    "Average removal latency");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (ceph::buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	       << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	   << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
	   << std::dec << " order " << (int)min_alloc_size_order
	   << " max_alloc_size 0x" << std::hex << max_alloc_size
	   << " prefer_deferred_size 0x" << prefer_deferred_size
	   << std::dec
	   << " deferred_batch_ops " << deferred_batch_ops
	   << dendl;
}
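
// prefer_deferred_size is the write-size threshold below which data is
// routed through the deferred (WAL) path rather than written directly to
// its final location, and deferred_batch_ops bounds how many deferred ops
// accumulate before they are submitted as a batch.  As the logic above
// shows, both fall back to the hdd/ssd tuned variants when the generic
// option is left at 0.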

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }

  if (bdev->is_smr()) {
    freelist_type = "zoned";
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}
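
// Worked example for the block parameters above (illustrative values):
// with a 4 KiB device block,
//   block_size       = 0x1000
//   block_size_order = ctz(0x1000) = 12
//   block_mask       = ~0xfff
// so (offset & block_mask) rounds an offset down to a block boundary and
// the assert simply checks 0x1000 == 1u << 12.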

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  uint64_t dev_size = bdev->get_size();
  ceph_assert(dev_size > _get_ondisk_reserved());
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);

    uint64_t alloc_size = min_alloc_size;
    if (bdev->is_smr()) {
      alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
    }

    fm->create(bdev->get_size(), alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs doing that itself.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
	      << cct->_conf->bluestore_debug_prefill << " with max free extent "
	      << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
	uint64_t l = (rand() % max_b + 1) * min_alloc_size;
	if (start + l > end) {
	  l = end - start;
	  l = p2align(l, min_alloc_size);
	}
	ceph_assert(start + l <= end);

	uint64_t u = 1 + (uint64_t)(r * (double)l);
	u = p2roundup(u, min_alloc_size);
	if (start + l + u > end) {
	  u = end - (start + l);
	  // trim to align so we don't overflow again
	  u = p2align(u, min_alloc_size);
	  stop = true;
	}
	ceph_assert(start + l + u <= end);

	dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
		 << " use 0x" << u << std::dec << dendl;

	if (u == 0) {
	  // break if u has been trimmed to nothing
	  break;
	}

	fm->allocate(start + l, u, t);
	start += l + u;
      }
    }
    r = _write_out_fm_meta(0);
    ceph_assert(r == 0);
  } else {
    r = fm->init(db, read_only,
      [&](const std::string& key, std::string* result) {
	return read_meta(key, result);
      });
    if (r < 0) {
      derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  // If the space size tracked by the freelist manager is higher than the
  // actual device size, one can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
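
// Worked example for the debug prefill above (illustrative values): with
// bluestore_debug_prefill = 0.2 the loop uses r = 0.2 / (1 - 0.2) = 0.25,
// i.e. after each random free extent of length l it marks u ~= 0.25 * l
// as allocated, giving used / (used + free) = 0.25 / 1.25 = 0.2, the
// requested fill ratio, with all extents aligned to min_alloc_size.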

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_write_out_fm_meta(uint64_t target_size)
{
  int r = 0;
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  for (auto& m : fm_meta) {
    r = write_meta(m.first, m.second);
    ceph_assert(r == 0);
  }
  return r;
}

int BlueStore::_create_alloc()
{
  ceph_assert(shared_alloc.a == NULL);
  ceph_assert(bdev->get_size());

  uint64_t alloc_size = min_alloc_size;
  if (bdev->is_smr()) {
    int r = _zoned_check_config_settings();
    if (r < 0)
      return r;
    alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
  }

  shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
				     bdev->get_size(),
				     alloc_size, "block"));

  if (!shared_alloc.a) {
    lderr(cct) << __func__ << " failed to create allocator: "
	       << cct->_conf->bluestore_allocator
	       << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_init_alloc()
{
  int r = _create_alloc();
  if (r < 0) {
    return r;
  }
  ceph_assert(shared_alloc.a != NULL);

  if (bdev->is_smr()) {
    shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    shared_alloc.a->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  dout(1) << __func__
	  << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
	  << std::hex
	  << ", allocator type " << shared_alloc.a->get_type()
	  << ", capacity 0x" << shared_alloc.a->get_capacity()
	  << ", block size 0x" << shared_alloc.a->get_block_size()
	  << ", free 0x" << shared_alloc.a->get_free()
	  << ", fragmentation " << shared_alloc.a->get_fragmentation()
	  << std::dec << dendl;

  return 0;
}
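
// The in-memory allocator carries no persistent state of its own: it is
// rebuilt on every startup by replaying the freelist enumeration above,
// which is why the capacity/free/fragmentation figures are logged only
// after the enumerate loop completes and reflect the freshly loaded
// freelist rather than anything cached from a previous run.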

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(shared_alloc.a);
  shared_alloc.a->shutdown();
  delete shared_alloc.a;
  shared_alloc.reset();
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}

int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
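
// With l_whence = SEEK_SET and l_start/l_len left zeroed by the memset,
// the write lock above covers the entire fsid file.  F_SETLK is the
// non-blocking variant, so if another ceph-osd already holds the lock,
// fcntl() fails immediately (with EAGAIN or EACCES per POSIX) rather
// than waiting.  The lock is advisory and is released automatically
// when fsid_fd is closed.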

bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
	    << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true;  // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}

int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard,
      SUPER_RESERVED);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_DB),
	"bluefs db", create);
      if (r < 0) {
	derr << __func__
	     << " check block device(" << bfn << ") label returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
			       0, // no need to provide valid 'reserved' for shared dev
			       &shared_alloc);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
	 << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
				 create && cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_WAL),
	"bluefs wal", create);
      if (r < 0) {
	derr << __func__ << " check block device(" << bfn
	     << ") label returned: " << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }

    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
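
// The probing above yields one of two layouts: if a block.db symlink
// exists, BlueFS gets a dedicated DB device and the main device is
// registered as BDEV_SLOW (the shared_bdev); otherwise the main device
// itself serves as BDEV_DB.  block.wal is optional in both layouts and
// merely adds a dedicated WAL device when present.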

int BlueStore::_open_bluefs(bool create, bool read_only)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  BlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;
    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    rocksdb::Options rocks_opts;
    r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }
    if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
      vselector = new FitToFastVolumeSelector(
	bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
    } else {
      double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
      vselector =
	new RocksDBBlueFSVolumeSelector(
	  bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
	  bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
	  bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
	  1024 * 1024 * 1024, //FIXME: set expected l0 size here
	  rocks_opts.max_bytes_for_level_base,
	  rocks_opts.max_bytes_for_level_multiplier,
	  reserved_factor,
	  cct->_conf->bluestore_volume_selection_reserved,
	  cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
    }
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}

void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}

int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
	   << dendl;
      return -EIO;
    }
  }
  return 0;
}

/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
{
  dout(0) << __func__ << " read-only:" << read_only
	  << " repair:" << to_repair << dendl;
  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  // open in read-only first to read FM list and init allocator
  // as they might be needed for some BlueFS procedures
  r = _open_db(false, false, true);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0) {
    goto out_db;
  }

  r = _open_fm(nullptr, true);
  if (r < 0)
    goto out_db;

  r = _init_alloc();
  if (r < 0)
    goto out_fm;

  // Re-open in the proper mode(s).
  //
  // We can't simply bypass the second open even for read-only mode, as we
  // need to load the extents allocated to bluefs into the allocator; do
  // that now.
  _close_db(true);

  r = _open_db(false, to_repair, read_only);
  if (r < 0) {
    goto out_alloc;
  }
  return 0;

 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  _close_db(read_only);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
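
// Bring-up order used above:
//   path -> fsid (read + lock) -> bdev -> db (read-only) -> super meta
//   -> freelist manager -> allocator -> db re-opened in the final mode
// The error labels unwind in reverse, so each stage may assume all
// earlier stages are live when it runs.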

void BlueStore::_close_db_and_around(bool read_only)
{
  _close_db(read_only);
  _close_fm();
  _close_alloc();
  _close_bdev();
  _close_fsid();
  _close_path();
}

int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
{
  _kv_only = true;
  int r = _open_db_and_around(false, to_repair);
  if (r == 0) {
    *pdb = db;
  } else {
    *pdb = nullptr;
  }
  return r;
}

int BlueStore::close_db_environment()
{
  _close_db_and_around(false);
  return 0;
}

int BlueStore::_prepare_db_environment(bool create, bool read_only,
				       std::string* _fn, std::string* _kv_backend)
{
  int r;
  ceph_assert(!db);
  std::string& fn = *_fn;
  std::string& kv_backend = *_kv_backend;
  fn = path + "/db";
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create, read_only);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
	string cmd = "rm -rf " + path + "/db " +
	  path + "/db.slow " +
	  path + "/db.wal";
	int r = system(cmd.c_str());
	(void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
	if (!first) {
	  db_paths << " ";
	}
	first = false;
	db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
	env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
	kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
	kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  kv_options,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db, freelist_type);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);
  return 0;
}
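
// When both block.db and block are in play, the db_paths value assembled
// above is a space-separated list of "<path>,<size>" pairs, e.g.
// (illustrative sizes only)
//   db,64424509440 db.slow,960197124096
// which the kv layer parses into per-path placement targets for rocksdb.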

int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!(create && read_only));
  string options;
  string options_annex;
  stringstream err;
  string kv_dir_fn;
  string kv_backend;
  std::string sharding_def;
  r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
  if (r < 0) {
    derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
    return -EIO;
  }
  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;
    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    r = db->create_and_open(err, sharding_def);
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, sharding_def) :
      db->open(err, sharding_def);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << kv_dir_fn << " options " << options << dendl;
  return 0;
}

void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}

void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    shared_alloc.a->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}

int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
	this,
	onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
	buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
	cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
	decode(c->cnode, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " failed to decode cnode, key:"
	     << pretty_binary_string(it->key()) << dendl;
	return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
	       << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;

    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}

void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
	derr << __func__ << " unrecognized collection " << it->key() << dendl;
	if (errors) {
	  (*errors)++;
	}
      }
    }
  }
}

void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = OMAP_BULK;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    auto s = bl.to_str();
    if (s == stringify(OMAP_PER_POOL)) {
      per_pool_omap = OMAP_PER_POOL;
    } else {
      ceph_assert(s == stringify(OMAP_PER_PG));
      per_pool_omap = OMAP_PER_PG;
    }
    dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pg_or_pool_omap_alert();
}
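
// per_pool_omap records which omap key scheme the store was created
// with: OMAP_BULK (legacy flat namespace), OMAP_PER_POOL, or OMAP_PER_PG;
// the newer schemes prefix omap keys so usage can be attributed (and
// fscked) per pool or per PG.  New stores get OMAP_PER_PG at mkfs() time;
// this function only reads the recorded value back from the SUPER
// namespace.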

void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {

      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl;
      bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
	st.decode(p);
	vstatfs += st;

	dout(30) << __func__ << " pool " << pool_id
		 << " statfs " << st << dendl;
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " failed to decode pool stats, key:"
	     << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}
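
// The single global statfs key acts as the mode switch here: if it is
// present, the store uses the legacy store-wide accounting; if it is
// absent, per-pool collection is enabled and one PREFIX_STAT record per
// pool is decoded and summed into vstatfs, so the aggregate stays
// available in either mode.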

int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
	   << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
	   << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
	r = -errno;
	derr << __func__ << " failed to open " << epath << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
	      << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
	  S_ISREG(st.st_mode) &&  // if it is a regular file
	  st.st_size == 0) {      // and is 0 bytes
	r = ::ftruncate(fd, size);
	if (r < 0) {
	  r = -errno;
	  derr << __func__ << " failed to resize " << name << " file to "
	       << size << ": " << cpp_strerror(r) << dendl;
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	  return r;
	}

	if (cct->_conf->bluestore_block_preallocate_file) {
	  r = ::ceph_posix_fallocate(fd, 0, size);
	  if (r > 0) {
	    derr << __func__ << " failed to preallocate " << name << " file to "
		 << size << ": " << cpp_strerror(r) << dendl;
	    VOID_TEMP_FAILURE_RETRY(::close(fd));
	    return -r;
	  }
	}
	dout(1) << __func__ << " resized " << name << " file to "
		<< byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
	derr << __func__ << " failed to open " << name << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
    }
  }
  return 0;
}

int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;
  uint64_t reserved;
  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
	r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
	if (r < 0) {
	  derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
	       << dendl;
	  return r;
	}
	if (r > 0) {
	  derr << __func__ << " fsck found " << r << " errors" << dendl;
	  r = -EIO;
	}
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
	return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
				     cct->_conf->bluestore_block_wal_size,
				     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
				     cct->_conf->bluestore_block_db_size,
				     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }
  _validate_bdev();

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
	 << std::hex << min_alloc_size << std::dec
	 << " is not power of 2 aligned!"
	 << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _create_alloc();
  if (r < 0) {
    goto out_close_bdev;
  }

  reserved = _get_ondisk_reserved();
  shared_alloc.a->init_add_free(reserved,
    p2align(bdev->get_size(), min_alloc_size) - reserved);

  r = _open_db(true);
  if (r < 0)
    goto out_close_alloc;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }

    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append(stringify(OMAP_PER_PG));
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_alloc:
  _close_alloc();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
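
// mkfs() is idempotent: an existing "mkfs_done" meta short-circuits it.
// The happy path above is, roughly: pick/validate the fsid -> create the
// block/block.db/block.wal links -> open the bdev -> choose
// min_alloc_size -> create the allocator -> create the db -> seed the
// SUPER keys (nid_max, blobid_max, min_alloc_size, per_pool_omap, ondisk
// format) -> persist meta -> tear everything down and optionally fsck.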

int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _open_db_and_around(true);

  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
				     cct->_conf->bluestore_block_wal_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }

    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
				     cct->_conf->bluestore_block_db_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  bluefs->umount();
  bluefs->mount();

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _close_db_and_around(true);
  return r;
}

int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
						 int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _open_db_and_around(true);

  uint64_t used_space = 0;
  for (auto src_id : devs_source) {
    used_space += bluefs->get_used(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (target_free < used_space) {
    derr << __func__
	 << " can't migrate, free space at target: " << target_free
	 << " is less than required space: " << used_space
	 << dendl;
    r = -ENOSPC;
    goto shutdown;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (devs_source.count(BlueFS::BDEV_DB)) {
    r = unlink(string(path + "/block.db").c_str());
    ceph_assert(r == 0);
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    r = unlink(string(path + "/block.wal").c_str());
    ceph_assert(r == 0);
  }

shutdown:
  _close_db_and_around(true);
  return r;
}

int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
					    int id,
					    const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _open_db_and_around(true);

  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  size_t target_size;
  string target_name;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
  }

  bluefs->umount();
  bluefs->mount();

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);

  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (!link_db.empty()) {
    r = unlink(link_db.c_str());
    ceph_assert(r == 0);
  }
  if (!link_wal.empty()) {
    r = unlink(link_wal.c_str());
    ceph_assert(r == 0);
  }
  r = _setup_block_symlink_or_file(
    target_name,
    dev_path,
    target_size,
    true);
  ceph_assert(r == 0);
  dout(0) << __func__ << " success" << dendl;

shutdown:
  _close_db_and_around(true);

  return r;
}

string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
	res = path + "/block";
      } else {
	res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}

int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0) {
    derr << "unable to read label for " << path << ": "
	 << cpp_strerror(r) << dendl;
  } else {
    label.size = size;
    r = _write_bdev_label(cct, path, label);
    if (r < 0) {
      derr << "unable to write label for " << path << ": "
	   << cpp_strerror(r) << dendl;
    }
  }
  return r;
}

int BlueStore::expand_devices(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    out << devid
	<< " : expanding to 0x" << std::hex << size << std::dec << std::endl;
    string p = get_device_path(devid);
    const char* path = p.c_str();
    if (path == nullptr) {
      derr << devid
	   << ": can't find device path " << dendl;
      continue;
    }
    if (bluefs->bdev_support_label(devid)) {
      if (_set_bdev_label_size(p, size) >= 0) {
	out << devid
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
	<< " : expanding from 0x" << std::hex
	<< size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size);
    if (bdev->supported_bdev_label()) {
      if (_set_bdev_label_size(path, size) >= 0) {
	out << bluefs_layout.shared_bdev
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
    _close_db_and_around(true);

    // mount in read/write to sync expansion changes
    r = _mount();
    ceph_assert(r == 0);
    umount();
  } else {
    _close_db_and_around(true);
  }
  return r;
}
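
// Expansion above is metadata-only work: the size recorded in the bdev
// label and the freelist manager meta are raised to the new physical
// size, then a brief read/write mount syncs the change; no user data is
// relocated.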

int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  _close_db_and_around(true);
  return r;
}

void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			      logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			       logger);
  }
}

int BlueStore::_mount()
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = false;
  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_db_and_around(false);
  if (r < 0) {
    return r;
  }

  r = _upgrade_super();
  if (r < 0) {
    goto out_db;
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  _kv_start();

  if (bdev->is_smr()) {
    _zoned_cleaner_start();
  }

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {

    auto was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    //reread statfs
    //FIXME minor: replace with actual open/close?
    _open_statfs();
    _check_legacy_statfs_alert();

    //set again as hopefully it has been fixed
    if (was_per_pool_omap != OMAP_PER_PG) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

 out_stop:
  if (bdev->is_smr()) {
    _zoned_cleaner_stop();
  }
  _kv_stop();
 out_coll:
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);
  return r;
}
7057
7058int BlueStore::umount()
7059{
11fdf7f2 7060 ceph_assert(_kv_only || mounted);
7c673cae
FG
7061 dout(1) << __func__ << dendl;
7062
7063 _osr_drain_all();
7c673cae 7064
7c673cae 7065 mounted = false;
3efd9988
FG
7066 if (!_kv_only) {
7067 mempool_thread.shutdown();
f67539c2
TL
7068 if (bdev->is_smr()) {
7069 dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
7070 _zoned_cleaner_stop();
7071 }
3efd9988
FG
7072 dout(20) << __func__ << " stopping kv thread" << dendl;
7073 _kv_stop();
f6b5b4d7 7074 _shutdown_cache();
3efd9988
FG
7075 dout(20) << __func__ << " closing" << dendl;
7076
3efd9988 7077 }
1911f103 7078 _close_db_and_around(false);
7c673cae
FG
7079
7080 if (cct->_conf->bluestore_fsck_on_umount) {
7081 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7082 if (rc < 0)
7083 return rc;
7084 if (rc > 0) {
7085 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7086 return -EIO;
7087 }
7088 }
7089 return 0;
7090}
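// Illustrative lifecycle for the pair above, assuming the public mount()
// wrapper that forwards to _mount():
//
//   BlueStore store(cct, path);
//   if (store.mount() == 0) {   // may fsck first per bluestore_fsck_on_mount
//     // ... submit transactions ...
//     store.umount();           // may fsck again per bluestore_fsck_on_umount
//   }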
7091
eafe8130
TL
7092int BlueStore::cold_open()
7093{
f67539c2 7094 return _open_db_and_around(true);
eafe8130 7095}
f67539c2 7096
eafe8130
TL
7097int BlueStore::cold_close()
7098{
1911f103 7099 _close_db_and_around(true);
eafe8130
TL
7100 return 0;
7101}
7102
9f95a23c
TL
7103// derr wrapper to limit enormous output and avoid log flooding.
7104// Of limited use where such output is expected for now
7105#define fsck_derr(err_cnt, threshold) \
7106 if (err_cnt <= threshold) { \
7107 bool need_skip_print = err_cnt == threshold; \
7108 derr
7109
7110#define fsck_dendl \
7111 dendl; \
7112 if (need_skip_print) \
7113 derr << "more error lines skipped..." << dendl; \
7c673cae 7114 }
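// Usage note: the two macros are deliberately unbalanced and must appear as
// a pair, with fsck_dendl closing the block that fsck_derr opens, e.g.:
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: " << oid << " ..." << fsck_dendl;
//
// Lines print while err_cnt <= threshold; the line that lands exactly on the
// threshold additionally prints "more error lines skipped...", and everything
// after that is swallowed.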
7c673cae 7115
eafe8130
TL
7116int _fsck_sum_extents(
7117 const PExtentVector& extents,
7118 bool compressed,
7119 store_statfs_t& expected_statfs)
7120{
7121 for (auto e : extents) {
7122 if (!e.is_valid())
7123 continue;
7124 expected_statfs.allocated += e.length;
7125 if (compressed) {
7126 expected_statfs.data_compressed_allocated += e.length;
7127 }
7128 }
7129 return 0;
7130}
7131
7c673cae 7132int BlueStore::_fsck_check_extents(
11fdf7f2 7133 const coll_t& cid,
7c673cae
FG
7134 const ghobject_t& oid,
7135 const PExtentVector& extents,
7136 bool compressed,
7137 mempool_dynamic_bitset &used_blocks,
b32b8144 7138 uint64_t granularity,
11fdf7f2 7139 BlueStoreRepairer* repairer,
eafe8130
TL
7140 store_statfs_t& expected_statfs,
7141 FSCKDepth depth)
7c673cae
FG
7142{
7143 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7144 int errors = 0;
7145 for (auto e : extents) {
7146 if (!e.is_valid())
7147 continue;
7148 expected_statfs.allocated += e.length;
7149 if (compressed) {
11fdf7f2 7150 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7151 }
eafe8130
TL
7152 if (depth != FSCK_SHALLOW) {
7153 bool already = false;
9f95a23c 7154 apply_for_bitset_range(
eafe8130
TL
7155 e.offset, e.length, granularity, used_blocks,
7156 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7157 if (bs.test(pos)) {
7158 if (repairer) {
7159 repairer->note_misreference(
7160 pos * min_alloc_size, min_alloc_size, !already);
7161 }
7162 if (!already) {
7163 derr << "fsck error: " << oid << " extent " << e
7164 << " or a subset is already allocated (misreferenced)" << dendl;
7165 ++errors;
7166 already = true;
7167 }
11fdf7f2 7168 }
eafe8130
TL
7169 else
7170 bs.set(pos);
7171 });
7172 if (repairer) {
7173 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
7174 }
11fdf7f2 7175
eafe8130
TL
7176 if (e.end() > bdev->get_size()) {
7177 derr << "fsck error: " << oid << " extent " << e
7178 << " past end of block device" << dendl;
7179 ++errors;
7180 }
7c673cae
FG
7181 }
7182 }
7183 return errors;
7184}
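// A minimal sketch of the misreference detection above, with a plain
// std::vector<bool> standing in for mempool_dynamic_bitset and `au` for the
// allocation-unit size (illustrative, not the real implementation):
//
//   bool mark_extent(std::vector<bool>& used, uint64_t off, uint64_t len,
//                    uint64_t au) {
//     bool misref = false;
//     for (uint64_t pos = off / au; pos <= (off + len - 1) / au; ++pos) {
//       if (used[pos]) misref = true;  // AU already claimed by another owner
//       else used[pos] = true;         // first (legitimate) claim
//     }
//     return misref;
//   }
//
// Extents whose allocation units are already set in the bitmap are reported
// as misreferenced and, in repair mode, queued via note_misreference().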
7185
11fdf7f2
TL
7186void BlueStore::_fsck_check_pool_statfs(
7187 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7188 int64_t& errors,
7189 int64_t& warnings,
11fdf7f2
TL
7190 BlueStoreRepairer* repairer)
7191{
f67539c2 7192 auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2
TL
7193 if (it) {
7194 for (it->lower_bound(string()); it->valid(); it->next()) {
7195 string key = it->key();
7196 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7197 if (repairer) {
eafe8130
TL
7198 ++errors;
7199 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7200 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7201 << dendl;
7202 }
7203 continue;
7204 }
11fdf7f2
TL
7205 uint64_t pool_id;
7206 if (get_key_pool_stat(key, &pool_id) < 0) {
7207 derr << "fsck error: bad key " << key
7208 << "in statfs namespece" << dendl;
7209 if (repairer) {
7210 repairer->remove_key(db, PREFIX_STAT, key);
7211 }
7212 ++errors;
7213 continue;
7214 }
7215
7216 volatile_statfs vstatfs;
7217 bufferlist bl = it->value();
7218 auto blp = bl.cbegin();
7219 try {
7220 vstatfs.decode(blp);
f67539c2 7221 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
7222 derr << "fsck error: failed to decode Pool StatFS record"
7223 << pretty_binary_string(key) << dendl;
7224 if (repairer) {
7225 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7226 << pretty_binary_string(key)
7227 << "', removing" << dendl;
7228 repairer->remove_key(db, PREFIX_STAT, key);
7229 }
7230 ++errors;
7231 vstatfs.reset();
7232 }
7233 auto stat_it = expected_pool_statfs.find(pool_id);
7234 if (stat_it == expected_pool_statfs.end()) {
7235 if (vstatfs.is_empty()) {
7236 // we don't consider that as an error since empty pool statfs
7237 // are left in DB for now
7238 dout(20) << "fsck info: found empty stray Pool StatFS record for pool id 0x"
7239 << std::hex << pool_id << std::dec << dendl;
7240 if (repairer) {
7241 // but we need to increment error count in case of repair
7242 // to have proper counters at the end
7243 // (as repairer increments recovery counter anyway).
7244 ++errors;
7245 }
7246 } else {
7247 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7248 << std::hex << pool_id << std::dec << dendl;
7249 ++errors;
7250 }
7251 if (repairer) {
7252 repairer->remove_key(db, PREFIX_STAT, key); // stray statfs record lives in the stat namespace
7253 }
7254 continue;
7255 }
7256 store_statfs_t statfs;
7257 vstatfs.publish(&statfs);
7258 if (!(stat_it->second == statfs)) {
7259 derr << "fsck error: actual " << statfs
7260 << " != expected " << stat_it->second
7261 << " for pool "
7262 << std::hex << pool_id << std::dec << dendl;
7263 if (repairer) {
7264 repairer->fix_statfs(db, key, stat_it->second);
7265 }
7266 ++errors;
7267 }
7268 expected_pool_statfs.erase(stat_it);
7269 }
7270 } // if (it)
eafe8130
TL
7271 for (auto& s : expected_pool_statfs) {
7272 if (s.second.is_zero()) {
11fdf7f2
TL
7273 // we might lack empty statfs recs in DB
7274 continue;
7275 }
7276 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7277 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7278 if (repairer) {
7279 string key;
eafe8130
TL
7280 get_pool_stat_key(s.first, &key);
7281 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7282 }
7283 ++errors;
7284 }
eafe8130 7285 if (!per_pool_stat_collection &&
eafe8130
TL
7286 repairer) {
7287 // by virtue of running this method, we correct the top-level
7288 // error of having global stats
7289 repairer->inc_repaired();
7290 }
11fdf7f2
TL
7291}
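// The reconciliation pattern above in miniature (illustrative): match stored
// records off against the computed map; leftovers on either side are stray
// or missing records respectively:
//
//   for (auto& [key, stored] : stored_records) {
//     auto it = expected.find(key);
//     if (it == expected.end()) { /* stray record */ continue; }
//     if (stored != it->second) { /* mismatch -> fix_statfs() */ }
//     expected.erase(it);
//   }
//   for (auto& rest : expected) { /* missing record -> fix_statfs() */ }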
7292
eafe8130
TL
7293BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7294 BlueStore::FSCKDepth depth,
7295 int64_t pool_id,
7296 BlueStore::CollectionRef c,
7297 const ghobject_t& oid,
7298 const string& key,
7299 const bufferlist& value,
9f95a23c 7300 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
7301 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7302 const BlueStore::FSCK_ObjectCtx& ctx)
7303{
7304 auto& errors = ctx.errors;
7305 auto& num_objects = ctx.num_objects;
7306 auto& num_extents = ctx.num_extents;
7307 auto& num_blobs = ctx.num_blobs;
7308 auto& num_sharded_objects = ctx.num_sharded_objects;
7309 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7310 auto used_blocks = ctx.used_blocks;
7311 auto sb_info_lock = ctx.sb_info_lock;
7312 auto& sb_info = ctx.sb_info;
7313 auto repairer = ctx.repairer;
7314
7315 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7316 &ctx.expected_pool_statfs[pool_id] :
7317 &ctx.expected_store_statfs;
7318
7319 dout(10) << __func__ << " " << oid << dendl;
7320 OnodeRef o;
7321 o.reset(Onode::decode(c, oid, key, value));
7322 ++num_objects;
7c673cae 7323
eafe8130 7324 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 7325
eafe8130
TL
7326 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7327 _dump_onode<30>(cct, *o);
7328 // shards
7329 if (!o->extent_map.shards.empty()) {
7330 ++num_sharded_objects;
7331 if (depth != FSCK_SHALLOW) {
9f95a23c 7332 ceph_assert(expecting_shards);
eafe8130
TL
7333 for (auto& s : o->extent_map.shards) {
7334 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 7335 expecting_shards->push_back(string());
eafe8130 7336 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 7337 &expecting_shards->back());
eafe8130
TL
7338 if (s.shard_info->offset >= o->onode.size) {
7339 derr << "fsck error: " << oid << " shard 0x" << std::hex
7340 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7341 << std::dec << dendl;
7342 ++errors;
7343 }
7344 }
7345 }
7346 }
7c673cae 7347
eafe8130
TL
7348 // lextents
7349 uint64_t pos = 0;
7350 mempool::bluestore_fsck::map<BlobRef,
7351 bluestore_blob_use_tracker_t> ref_map;
7352 for (auto& l : o->extent_map.extent_map) {
7353 dout(20) << __func__ << " " << l << dendl;
7354 if (l.logical_offset < pos) {
7355 derr << "fsck error: " << oid << " lextent at 0x"
7356 << std::hex << l.logical_offset
7357 << " overlaps with the previous, which ends at 0x" << pos
7358 << std::dec << dendl;
7359 ++errors;
7360 }
7361 if (depth != FSCK_SHALLOW &&
7362 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7363 derr << "fsck error: " << oid << " lextent at 0x"
7364 << std::hex << l.logical_offset << "~" << l.length
7365 << " spans a shard boundary"
7366 << std::dec << dendl;
7367 ++errors;
7368 }
7369 pos = l.logical_offset + l.length;
7370 res_statfs->data_stored += l.length;
7371 ceph_assert(l.blob);
7372 const bluestore_blob_t& blob = l.blob->get_blob();
7373
7374 auto& ref = ref_map[l.blob];
7375 if (ref.is_empty()) {
7376 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7377 uint32_t blob_len = blob.get_logical_length(); // don't shadow the lextent 'l'
7378 ref.init(blob_len, min_release_size);
7379 }
7380 ref.get(
7381 l.blob_offset,
7382 l.length);
7383 ++num_extents;
7384 if (depth != FSCK_SHALLOW &&
7385 blob.has_unused()) {
7386 ceph_assert(referenced);
7387 auto p = referenced->find(l.blob);
7388 bluestore_blob_t::unused_t* pu;
7389 if (p == referenced->end()) {
7390 pu = &(*referenced)[l.blob];
7391 }
7392 else {
7393 pu = &p->second;
7394 }
7395 uint64_t blob_len = blob.get_logical_length();
7396 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7397 ceph_assert(l.blob_offset + l.length <= blob_len);
7398 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7399 uint64_t start = l.blob_offset / chunk_size;
7400 uint64_t end =
7401 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7402 for (auto i = start; i < end; ++i) {
7403 (*pu) |= (1u << i);
7404 }
7405 }
7406 } //for (auto& l : o->extent_map.extent_map)
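// Worked example for the unused-bitmap arithmetic above (illustrative
// numbers): for a 64 KiB blob and, say, a 16-bit unused_t, chunk_size is
// 64K / 16 = 4 KiB. An lextent at blob_offset 0x2000 with length 0x3000
// spans chunks [2, 5) and sets bits 2..4 (mask 0b0001'1100). The accumulated
// per-extent masks are later compared against blob.unused to catch chunks
// claimed as unused while actually referenced.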
7407
7408 for (auto& i : ref_map) {
7409 ++num_blobs;
7410 const bluestore_blob_t& blob = i.first->get_blob();
7411 bool equal =
7412 depth == FSCK_SHALLOW ? true :
7413 i.first->get_blob_use_tracker().equal(i.second);
7414 if (!equal) {
7415 derr << "fsck error: " << oid << " blob " << *i.first
7416 << " doesn't match expected ref_map " << i.second << dendl;
7417 ++errors;
7418 }
7419 if (blob.is_compressed()) {
7420 res_statfs->data_compressed += blob.get_compressed_payload_length();
7421 res_statfs->data_compressed_original +=
7422 i.first->get_referenced_bytes();
7423 }
7424 if (blob.is_shared()) {
7425 if (i.first->shared_blob->get_sbid() > blobid_max) {
7426 derr << "fsck error: " << oid << " blob " << blob
7427 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7428 << blobid_max << dendl;
7429 ++errors;
7430 }
7431 else if (i.first->shared_blob->get_sbid() == 0) {
7432 derr << "fsck error: " << oid << " blob " << blob
7433 << " marked as shared but has uninitialized sbid"
7434 << dendl;
7435 ++errors;
7436 }
7437 // the below lock is optional and provided in multithreading mode only
7438 if (sb_info_lock) {
7439 sb_info_lock->lock();
7440 }
7441 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7442 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7443 ceph_assert(sbi.pool_id == INT64_MIN ||
7444 sbi.pool_id == oid.hobj.get_logical_pool());
7445 sbi.cid = c->cid;
7446 sbi.pool_id = oid.hobj.get_logical_pool();
7447 sbi.sb = i.first->shared_blob;
7448 sbi.oids.push_back(oid);
7449 sbi.compressed = blob.is_compressed();
7450 for (auto e : blob.get_extents()) {
7451 if (e.is_valid()) {
7452 sbi.ref_map.get(e.offset, e.length);
7453 }
7454 }
7455 if (sb_info_lock) {
7456 sb_info_lock->unlock();
7457 }
7458 } else if (depth != FSCK_SHALLOW) {
7459 ceph_assert(used_blocks);
7460 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7461 blob.is_compressed(),
7462 *used_blocks,
7463 fm->get_alloc_size(),
7464 repairer,
7465 *res_statfs,
7466 depth);
7467 } else {
7468 errors += _fsck_sum_extents(
7469 blob.get_extents(),
7470 blob.is_compressed(),
7471 *res_statfs);
7472 }
7473 } // for (auto& i : ref_map)
9f95a23c 7474
adb31ebb
TL
7475 {
7476 auto &sbm = o->extent_map.spanning_blob_map;
7477 size_t broken = 0;
7478 BlobRef first_broken;
7479 for (auto it = sbm.begin(); it != sbm.end();) {
7480 auto it1 = it++;
7481 if (ref_map.count(it1->second) == 0) {
7482 if (!broken) {
7483 first_broken = it1->second;
7484 ++errors;
7485 }
7486 broken++;
7487 if (repairer) {
7488 sbm.erase(it1);
7489 }
7490 }
7491 }
7492 if (broken) {
7493 derr << "fsck error: " << oid << " - " << broken
7494 << " zombie spanning blob(s) found, the first one: "
7495 << *first_broken << dendl;
7496 if(repairer) {
7497 auto txn = repairer->fix_spanning_blobs(db);
7498 _record_onode(o, txn);
7499 }
7500 }
7501 }
7502
9f95a23c
TL
7503 if (o->onode.has_omap()) {
7504 _fsck_check_object_omap(depth, o, ctx);
7505 }
7506
eafe8130
TL
7507 return o;
7508}
7509
7510#include "common/WorkQueue.h"
7511
7512class ShallowFSCKThreadPool : public ThreadPool
7513{
7514public:
7515 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7516 ThreadPool(cct_, nm, tn, n) {
7517 }
7518 void worker(ThreadPool::WorkThread* wt) override {
7519 int next_wq = 0;
7520 while (!_stop) {
7521 next_wq %= work_queues.size();
7522 WorkQueue_ *wq = work_queues[next_wq++];
7523
7524 void* item = wq->_void_dequeue();
7525 if (item) {
7526 processing++;
7527 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7528 wq->_void_process(item, tp_handle);
7529 processing--;
7530 }
7531 }
7532 }
7533 template <size_t BatchLen>
7534 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7535 {
7536 struct Entry {
7537 int64_t pool_id;
7538 BlueStore::CollectionRef c;
7539 ghobject_t oid;
7540 string key;
7541 bufferlist value;
7542 };
7543 struct Batch {
7544 std::atomic<size_t> running = { 0 };
7545 size_t entry_count = 0;
7546 std::array<Entry, BatchLen> entries;
7547
7548 int64_t errors = 0;
7549 int64_t warnings = 0;
7550 uint64_t num_objects = 0;
7551 uint64_t num_extents = 0;
7552 uint64_t num_blobs = 0;
7553 uint64_t num_sharded_objects = 0;
7554 uint64_t num_spanning_blobs = 0;
7555 store_statfs_t expected_store_statfs;
7556 BlueStore::per_pool_statfs expected_pool_statfs;
7557 };
7558
7559 size_t batchCount;
7560 BlueStore* store = nullptr;
7561
eafe8130
TL
7562 ceph::mutex* sb_info_lock = nullptr;
7563 BlueStore::sb_info_map_t* sb_info = nullptr;
7564 BlueStoreRepairer* repairer = nullptr;
7565
7566 Batch* batches = nullptr;
7567 size_t last_batch_pos = 0;
7568 bool batch_acquired = false;
7569
7570 FSCKWorkQueue(std::string n,
7571 size_t _batchCount,
7572 BlueStore* _store,
eafe8130
TL
7573 ceph::mutex* _sb_info_lock,
7574 BlueStore::sb_info_map_t& _sb_info,
7575 BlueStoreRepairer* _repairer) :
f67539c2 7576 WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
eafe8130
TL
7577 batchCount(_batchCount),
7578 store(_store),
eafe8130
TL
7579 sb_info_lock(_sb_info_lock),
7580 sb_info(&_sb_info),
7581 repairer(_repairer)
7582 {
7583 batches = new Batch[batchCount];
7584 }
7585 ~FSCKWorkQueue() {
7586 delete[] batches;
7587 }
7588
7589 /// Remove all work items from the queue.
7590 void _clear() override {
7591 //do nothing
7592 }
7593 /// Check whether there is anything to do.
7594 bool _empty() override {
7595 ceph_assert(false);
7596 }
7597
7598 /// Get the next work item to process.
7599 void* _void_dequeue() override {
7600 size_t pos = rand() % batchCount;
7601 size_t pos0 = pos;
7602 do {
7603 auto& batch = batches[pos];
7604 if (batch.running.fetch_add(1) == 0) {
7605 if (batch.entry_count) {
7606 return &batch;
7607 }
7608 }
7609 batch.running--;
7610 pos++;
7611 pos %= batchCount;
7612 } while (pos != pos0);
7613 return nullptr;
7614 }
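// Design note: a batch doubles as its own lock. The first thread to move
// `running` from 0 to 1 via fetch_add() owns the batch; everyone else undoes
// its increment and probes the next slot (illustrative shape):
//
//   if (batch.running.fetch_add(1) == 0) { /* exclusive access */ }
//   else { batch.running--; /* back off, try another batch */ }
//
// queue() below follows the same protocol, so producers and workers never
// need a mutex around batch contents.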
7615 /** @brief Process the work item.
7616 * This function will be called several times in parallel
7617 * and must therefore be thread-safe. */
7618 void _void_process(void* item, TPHandle& handle) override {
7619 Batch* batch = (Batch*)item;
7620
7621 BlueStore::FSCK_ObjectCtx ctx(
7622 batch->errors,
7623 batch->warnings,
7624 batch->num_objects,
7625 batch->num_extents,
7626 batch->num_blobs,
7627 batch->num_sharded_objects,
7628 batch->num_spanning_blobs,
7629 nullptr, // used_blocks
9f95a23c 7630 nullptr, //used_omap_head
eafe8130
TL
7631 sb_info_lock,
7632 *sb_info,
7633 batch->expected_store_statfs,
7634 batch->expected_pool_statfs,
7635 repairer);
7636
7637 for (size_t i = 0; i < batch->entry_count; i++) {
7638 auto& entry = batch->entries[i];
7639
7640 store->fsck_check_objects_shallow(
7641 BlueStore::FSCK_SHALLOW,
7642 entry.pool_id,
7643 entry.c,
7644 entry.oid,
7645 entry.key,
7646 entry.value,
9f95a23c 7647 nullptr, // expecting_shards - this will need a protection if passed
eafe8130
TL
7648 nullptr, // referenced
7649 ctx);
7650 }
7652 batch->entry_count = 0;
7653 batch->running--;
7654 }
7655 /** @brief Synchronously finish processing a work item.
7656 * This function is called after _void_process with the global thread pool lock held,
7657 * so at most one copy will execute simultaneously for a given thread pool.
7658 * It can be used for non-thread-safe finalization. */
7659 void _void_process_finish(void*) override {
7660 ceph_assert(false);
7661 }
7662
7663 bool queue(
7664 int64_t pool_id,
7665 BlueStore::CollectionRef c,
7666 const ghobject_t& oid,
7667 const string& key,
7668 const bufferlist& value) {
7669 bool res = false;
7670 size_t pos0 = last_batch_pos;
7671 if (!batch_acquired) {
7672 do {
7673 auto& batch = batches[last_batch_pos];
7674 if (batch.running.fetch_add(1) == 0) {
7675 if (batch.entry_count < BatchLen) {
7676 batch_acquired = true;
7677 break;
7678 }
7679 }
7680 batch.running.fetch_sub(1);
7681 last_batch_pos++;
7682 last_batch_pos %= batchCount;
7683 } while (last_batch_pos != pos0);
7684 }
7685 if (batch_acquired) {
7686 auto& batch = batches[last_batch_pos];
7687 ceph_assert(batch.running);
7688 ceph_assert(batch.entry_count < BatchLen);
7689
7690 auto& entry = batch.entries[batch.entry_count];
7691 entry.pool_id = pool_id;
7692 entry.c = c;
7693 entry.oid = oid;
7694 entry.key = key;
7695 entry.value = value;
7696
7697 ++batch.entry_count;
7698 if (batch.entry_count == BatchLen) {
7699 batch_acquired = false;
7700 batch.running.fetch_sub(1);
7701 last_batch_pos++;
7702 last_batch_pos %= batchCount;
7703 }
7704 res = true;
7705 }
7706 return res;
7707 }
7708
7709 void finalize(ThreadPool& tp,
7710 BlueStore::FSCK_ObjectCtx& ctx) {
7711 if (batch_acquired) {
7712 auto& batch = batches[last_batch_pos];
7713 ceph_assert(batch.running);
7714 batch.running.fetch_sub(1);
7715 }
7716 tp.stop();
7717
7718 for (size_t i = 0; i < batchCount; i++) {
7719 auto& batch = batches[i];
7720
7721 //process leftovers if any
7722 if (batch.entry_count) {
7723 TPHandle tp_handle(store->cct,
7724 nullptr,
7725 timeout_interval,
7726 suicide_interval);
7727 ceph_assert(batch.running == 0);
7728
7729 batch.running++; // just to be on-par with the regular call
7730 _void_process(&batch, tp_handle);
7731 }
7732 ceph_assert(batch.entry_count == 0);
7733
7734 ctx.errors += batch.errors;
7735 ctx.warnings += batch.warnings;
7736 ctx.num_objects += batch.num_objects;
7737 ctx.num_extents += batch.num_extents;
7738 ctx.num_blobs += batch.num_blobs;
7739 ctx.num_sharded_objects += batch.num_sharded_objects;
7740 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 7741
eafe8130
TL
7742 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7743
7744 for (auto it = batch.expected_pool_statfs.begin();
7745 it != batch.expected_pool_statfs.end();
7746 it++) {
7747 ctx.expected_pool_statfs[it->first].add(it->second);
7748 }
7749 }
7750 }
7751 };
7752};
7753
9f95a23c
TL
7754void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
7755 OnodeRef& o,
7756 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 7757{
9f95a23c
TL
7758 auto& errors = ctx.errors;
7759 auto& warnings = ctx.warnings;
7760 auto repairer = ctx.repairer;
7761
7762 ceph_assert(o->onode.has_omap());
7763 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
f67539c2 7764 if (per_pool_omap == OMAP_PER_POOL) {
9f95a23c
TL
7765 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7766 << "fsck error: " << o->oid
7767 << " has omap that is not per-pool or pgmeta"
7768 << fsck_dendl;
7769 ++errors;
7770 } else {
7771 const char* w;
7772 int64_t num;
7773 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
7774 ++errors;
7775 num = errors;
7776 w = "error";
7777 } else {
7778 ++warnings;
7779 num = warnings;
7780 w = "warning";
7781 }
7782 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7783 << "fsck " << w << ": " << o->oid
7784 << " has omap that is not per-pool or pgmeta"
7785 << fsck_dendl;
7786 }
f67539c2
TL
7787 } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
7788 if (per_pool_omap == OMAP_PER_PG) {
7789 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7790 << "fsck error: " << o->oid
7791 << " has omap that is not per-pg or pgmeta"
7792 << fsck_dendl;
7793 ++errors;
7794 } else {
7795 const char* w;
7796 int64_t num;
7797 if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
7798 ++errors;
7799 num = errors;
7800 w = "error";
7801 } else {
7802 ++warnings;
7803 num = warnings;
7804 w = "warning";
7805 }
7806 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7807 << "fsck " << w << ": " << o->oid
7808 << " has omap that is not per-pg or pgmeta"
7809 << fsck_dendl;
7810 }
9f95a23c
TL
7811 }
7812 if (repairer &&
f67539c2 7813 !o->onode.is_perpg_omap() &&
9f95a23c 7814 !o->onode.is_pgmeta_omap()) {
f67539c2 7815 dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
9f95a23c
TL
7816 bufferlist h;
7817 map<string, bufferlist> kv;
7818 int r = _onode_omap_get(o, &h, &kv);
7819 if (r < 0) {
7820 derr << " got " << r << " " << cpp_strerror(r) << dendl;
7821 } else {
7822 KeyValueDB::Transaction txn = db->get_transaction();
7823 // remove old keys
7824 const string& old_omap_prefix = o->get_omap_prefix();
7825 string old_head, old_tail;
7826 o->get_omap_header(&old_head);
7827 o->get_omap_tail(&old_tail);
7828 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
7829 txn->rmkey(old_omap_prefix, old_tail);
7830 // set flag
f67539c2 7831 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
9f95a23c
TL
7832 _record_onode(o, txn);
7833 const string& new_omap_prefix = o->get_omap_prefix();
7834 // head
7835 if (h.length()) {
7836 string new_head;
7837 o->get_omap_header(&new_head);
7838 txn->set(new_omap_prefix, new_head, h);
7839 }
7840 // tail
7841 string new_tail;
7842 o->get_omap_tail(&new_tail);
7843 bufferlist empty;
7844 txn->set(new_omap_prefix, new_tail, empty);
7845 // values
7846 string final_key;
7847 o->get_omap_key(string(), &final_key);
7848 size_t base_key_len = final_key.size();
7849 for (auto& i : kv) {
7850 final_key.resize(base_key_len);
7851 final_key += i.first;
7852 txn->set(new_omap_prefix, final_key, i.second);
7853 }
7854 db->submit_transaction_sync(txn);
7855 repairer->inc_repaired();
7856 }
eafe8130 7857 }
9f95a23c 7858}
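// Shape of the omap migration transaction above (illustrative): the flag
// update must precede new-key generation because it changes what
// get_omap_prefix()/get_omap_key() compute:
//
//   txn->rm_range_keys(old_prefix, old_head, old_tail);      // drop old keys
//   o->onode.set_flag(FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP);
//   _record_onode(o, txn);                                   // persist new flags
//   txn->set(new_prefix, new_head, header);
//   // for each (k, v): txn->set(new_prefix, base_key + k, v);
//   txn->set(new_prefix, new_tail, empty);
//   db->submit_transaction_sync(txn);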
eafe8130 7859
9f95a23c
TL
7860void BlueStore::_fsck_check_objects(FSCKDepth depth,
7861 BlueStore::FSCK_ObjectCtx& ctx)
7862{
eafe8130 7863 auto& errors = ctx.errors;
eafe8130
TL
7864 auto sb_info_lock = ctx.sb_info_lock;
7865 auto& sb_info = ctx.sb_info;
7866 auto repairer = ctx.repairer;
7867
7868 uint64_t_btree_t used_nids;
7869
7870 size_t processed_myself = 0;
7871
f67539c2 7872 auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
7873 mempool::bluestore_fsck::list<string> expecting_shards;
7874 if (it) {
7875 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7876 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7877 std::unique_ptr<WQ> wq(
7878 new WQ(
7879 "FSCKWorkQueue",
7880 (thread_count ? : 1) * 32,
7881 this,
eafe8130
TL
7882 sb_info_lock,
7883 sb_info,
7884 repairer));
7885
7886 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7887
7888 thread_pool.add_work_queue(wq.get());
7889 if (depth == FSCK_SHALLOW && thread_count > 0) {
7890 //not the best place but let's check anyway
7891 ceph_assert(sb_info_lock);
7892 thread_pool.start();
7893 }
7894
7895 //fill global if not overridden below
7896 CollectionRef c;
7897 int64_t pool_id = -1;
7898 spg_t pgid;
7899 for (it->lower_bound(string()); it->valid(); it->next()) {
7900 dout(30) << __func__ << " key "
7901 << pretty_binary_string(it->key()) << dendl;
7902 if (is_extent_shard_key(it->key())) {
7903 if (depth == FSCK_SHALLOW) {
7904 continue;
7905 }
7906 while (!expecting_shards.empty() &&
7907 expecting_shards.front() < it->key()) {
7908 derr << "fsck error: missing shard key "
7909 << pretty_binary_string(expecting_shards.front())
7910 << dendl;
7911 ++errors;
7912 expecting_shards.pop_front();
7913 }
7914 if (!expecting_shards.empty() &&
7915 expecting_shards.front() == it->key()) {
7916 // all good
7917 expecting_shards.pop_front();
7918 continue;
7919 }
7920
7921 uint32_t offset;
7922 string okey;
7923 get_key_extent_shard(it->key(), &okey, &offset);
7924 derr << "fsck error: stray shard 0x" << std::hex << offset
7925 << std::dec << dendl;
7926 if (expecting_shards.empty()) {
7927 derr << "fsck error: " << pretty_binary_string(it->key())
7928 << " is unexpected" << dendl;
7929 ++errors;
7930 continue;
7931 }
7932 while (expecting_shards.front() > it->key()) {
7933 derr << "fsck error: saw " << pretty_binary_string(it->key())
7934 << dendl;
7935 derr << "fsck error: exp "
7936 << pretty_binary_string(expecting_shards.front()) << dendl;
7937 ++errors;
7938 expecting_shards.pop_front();
7939 if (expecting_shards.empty()) {
7940 break;
7941 }
7942 }
7943 continue;
7944 }
7945
7946 ghobject_t oid;
7947 int r = get_key_object(it->key(), &oid);
7948 if (r < 0) {
7949 derr << "fsck error: bad object key "
7950 << pretty_binary_string(it->key()) << dendl;
7951 ++errors;
7952 continue;
7953 }
7954 if (!c ||
7955 oid.shard_id != pgid.shard ||
7956 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7957 !c->contains(oid)) {
7958 c = nullptr;
7959 for (auto& p : coll_map) {
7960 if (p.second->contains(oid)) {
7961 c = p.second;
7962 break;
7963 }
7964 }
7965 if (!c) {
7966 derr << "fsck error: stray object " << oid
7967 << " not owned by any collection" << dendl;
7968 ++errors;
7969 continue;
7970 }
7971 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7972 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7973 << dendl;
7974 }
7975
7976 if (depth != FSCK_SHALLOW &&
7977 !expecting_shards.empty()) {
7978 for (auto& k : expecting_shards) {
7979 derr << "fsck error: missing shard key "
7980 << pretty_binary_string(k) << dendl;
7981 }
7982 ++errors;
7983 expecting_shards.clear();
7984 }
7985
7986 bool queued = false;
7987 if (depth == FSCK_SHALLOW && thread_count > 0) {
7988 queued = wq->queue(
7989 pool_id,
7990 c,
7991 oid,
7992 it->key(),
7993 it->value());
7994 }
7995 OnodeRef o;
7996 map<BlobRef, bluestore_blob_t::unused_t> referenced;
7997
7998 if (!queued) {
7999 ++processed_myself;
8000
8001 o = fsck_check_objects_shallow(
8002 depth,
8003 pool_id,
8004 c,
8005 oid,
8006 it->key(),
8007 it->value(),
9f95a23c 8008 &expecting_shards,
eafe8130
TL
8009 &referenced,
8010 ctx);
8011 }
8012
8013 if (depth != FSCK_SHALLOW) {
8014 ceph_assert(o != nullptr);
8015 if (o->onode.nid) {
8016 if (o->onode.nid > nid_max) {
8017 derr << "fsck error: " << oid << " nid " << o->onode.nid
8018 << " > nid_max " << nid_max << dendl;
8019 ++errors;
8020 }
8021 if (used_nids.count(o->onode.nid)) {
8022 derr << "fsck error: " << oid << " nid " << o->onode.nid
8023 << " already in use" << dendl;
8024 ++errors;
8025 continue; // go for next object
8026 }
8027 used_nids.insert(o->onode.nid);
8028 }
8029 for (auto& i : referenced) {
8030 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8031 << std::dec << " for " << *i.first << dendl;
8032 const bluestore_blob_t& blob = i.first->get_blob();
8033 if (i.second & blob.unused) {
8034 derr << "fsck error: " << oid << " blob claims unused 0x"
8035 << std::hex << blob.unused
8036 << " but extents reference 0x" << i.second << std::dec
8037 << " on blob " << *i.first << dendl;
8038 ++errors;
8039 }
8040 if (blob.has_csum()) {
8041 uint64_t blob_len = blob.get_logical_length();
8042 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8043 unsigned csum_count = blob.get_csum_count();
8044 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8045 for (unsigned p = 0; p < csum_count; ++p) {
8046 unsigned pos = p * csum_chunk_size;
8047 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8048 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8049 unsigned mask = 1u << firstbit;
8050 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8051 mask |= 1u << b;
8052 }
8053 if ((blob.unused & mask) == mask) {
8054 // this csum chunk region is marked unused
8055 if (blob.get_csum_item(p) != 0) {
8056 derr << "fsck error: " << oid
8057 << " blob claims csum chunk 0x" << std::hex << pos
8058 << "~" << csum_chunk_size
8059 << " is unused (mask 0x" << mask << " of unused 0x"
8060 << blob.unused << ") but csum is non-zero 0x"
8061 << blob.get_csum_item(p) << std::dec << " on blob "
8062 << *i.first << dendl;
8063 ++errors;
8064 }
8065 }
8066 }
8067 }
8068 }
8069 // omap
8070 if (o->onode.has_omap()) {
9f95a23c
TL
8071 ceph_assert(ctx.used_omap_head);
8072 if (ctx.used_omap_head->count(o->onode.nid)) {
8073 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8074 << " already in use" << dendl;
eafe8130
TL
8075 ++errors;
8076 } else {
9f95a23c 8077 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8078 }
9f95a23c 8079 } // if (o->onode.has_omap())
eafe8130
TL
8080 if (depth == FSCK_DEEP) {
8081 bufferlist bl;
8082 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8083 uint64_t offset = 0;
8084 do {
8085 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8086 int r = _do_read(c.get(), o, offset, l, bl,
8087 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8088 if (r < 0) {
8089 ++errors;
8090 derr << "fsck error: " << oid << std::hex
8091 << " error during read: "
8092 << " " << offset << "~" << l
8093 << " " << cpp_strerror(r) << std::dec
8094 << dendl;
8095 break;
8096 }
8097 offset += l;
8098 } while (offset < o->onode.size);
8099 } // deep
8100 } //if (depth != FSCK_SHALLOW)
8101 } // for (it->lower_bound(string()); it->valid(); it->next())
8102 if (depth == FSCK_SHALLOW && thread_count > 0) {
8103 wq->finalize(thread_pool, ctx);
8104 if (processed_myself) {
8105 // maybe it needs more threads?
8106 dout(0) << __func__ << " partial offload"
8107 << ", done myself " << processed_myself
8108 << " of " << ctx.num_objects
8109 << "objects, threads " << thread_count
8110 << dendl;
8111 }
8112 }
8113 } // if (it)
8114}
8115/**
8116An overview of the repair logic currently implemented in fsck,
8117performed in two stages: detection (+ preparation) and commit.
8118Detection stage (in processing order):
8119 (Issue -> Repair action to schedule)
8120 - Detect undecodable keys for Shared Blobs -> Remove
8121 - Detect undecodable records for Shared Blobs -> Remove
8122 (might trigger missed Shared Blob detection below)
8123 - Detect stray records for Shared Blobs -> Remove
8124 - Detect misreferenced pextents -> Fix
8125 Prepare Bloom-like filter to track cid/oid -> pextent
8126 Prepare list of extents that are improperly referenced
8127 Enumerate Onode records that might use 'misreferenced' pextents
8128 (Bloom-like filter applied to reduce computation)
8129 For each questionable Onode enumerate all blobs and identify broken ones
8130 (i.e. blobs having 'misreferences')
8131 Rewrite each broken blob data by allocating another extents and
8132 copying data there
8133 If blob is shared - unshare it and mark corresponding Shared Blob
8134 for removal
8135 Release previously allocated space
8136 Update Extent Map
8137 - Detect missed Shared Blobs -> Recreate
8138 - Detect undecodable deferred transaction -> Remove
8139 - Detect Freelist Manager's 'false free' entries -> Mark as used
8140 - Detect Freelist Manager's leaked entries -> Mark as free
8141 - Detect statfs inconsistency - Update
8142 Commit stage (separate DB commit per each step):
8143 - Apply leaked FM entries fix
8144 - Apply 'false free' FM entries fix
8145 - Apply 'Remove' actions
8146 - Apply fix for misreference pextents
8147 - Apply Shared Blob recreate
8148 (can be merged with the step above if misreferences were detected)
8149 - Apply StatFS update
8150*/
8151int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8152{
8153 dout(1) << __func__
8154 << (repair ? " repair" : " check")
8155 << (depth == FSCK_DEEP ? " (deep)" :
8156 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8157 << dendl;
8158
8159 // in deep mode we need R/W write access to be able to replay deferred ops
8160 bool read_only = !(repair || depth == FSCK_DEEP);
8161
f67539c2 8162 int r = _open_db_and_around(read_only);
eafe8130
TL
8163 if (r < 0)
8164 return r;
7c673cae 8165
11fdf7f2
TL
8166 if (!read_only) {
8167 r = _upgrade_super();
8168 if (r < 0) {
8169 goto out_db;
8170 }
8171 }
7c673cae 8172
eafe8130 8173 r = _open_collections();
7c673cae 8174 if (r < 0)
11fdf7f2 8175 goto out_db;
7c673cae
FG
8176
8177 mempool_thread.init();
8178
11fdf7f2
TL
8179 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8180 // enable in repair or deep mode modes only
8181 if (!read_only) {
8182 _kv_start();
8183 r = _deferred_replay();
8184 _kv_stop();
8185 }
7c673cae
FG
8186 if (r < 0)
8187 goto out_scan;
8188
eafe8130
TL
8189 r = _fsck_on_open(depth, repair);
8190
8191out_scan:
8192 mempool_thread.shutdown();
f6b5b4d7 8193 _shutdown_cache();
eafe8130 8194out_db:
1911f103 8195 _close_db_and_around(false);
eafe8130
TL
8196
8197 return r;
8198}
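// Caller sketch (illustrative): the public entry points are thin wrappers
// over _fsck(), roughly:
//
//   int BlueStore::fsck(bool deep) {
//     return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
//   }
//   int BlueStore::repair(bool deep) {
//     return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
//   }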
8199
8200int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8201{
8202 dout(1) << __func__
8203 << " <<<START>>>"
8204 << (repair ? " repair" : " check")
8205 << (depth == FSCK_DEEP ? " (deep)" :
8206 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8207 << " start" << dendl;
8208 int64_t errors = 0;
8209 int64_t warnings = 0;
8210 unsigned repaired = 0;
8211
8212 uint64_t_btree_t used_omap_head;
eafe8130
TL
8213 uint64_t_btree_t used_sbids;
8214
f67539c2 8215 mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
eafe8130
TL
8216 KeyValueDB::Iterator it;
8217 store_statfs_t expected_store_statfs, actual_statfs;
8218 per_pool_statfs expected_pool_statfs;
8219
8220 sb_info_map_t sb_info;
8221
8222 uint64_t num_objects = 0;
8223 uint64_t num_extents = 0;
8224 uint64_t num_blobs = 0;
8225 uint64_t num_spanning_blobs = 0;
8226 uint64_t num_shared_blobs = 0;
8227 uint64_t num_sharded_objects = 0;
8228 BlueStoreRepairer repairer;
8229
f67539c2
TL
8230 auto alloc_size = fm->get_alloc_size();
8231
eafe8130
TL
8232 utime_t start = ceph_clock_now();
8233
8234 _fsck_collections(&errors);
b32b8144 8235 used_blocks.resize(fm->get_alloc_units());
7c673cae
FG
8236
8237 if (bluefs) {
f67539c2 8238 interval_set<uint64_t> bluefs_extents;
11fdf7f2 8239
f67539c2
TL
8240 int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
8241 ceph_assert(r == 0);
8242 for (auto [start, len] : bluefs_extents) {
8243 apply_for_bitset_range(start, len, alloc_size, used_blocks,
8244 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
8245 ceph_assert(pos < bs.size());
7c673cae 8246 bs.set(pos);
f67539c2
TL
8247 }
8248 );
8249 }
8250 }
8251
8252 bluefs_used_blocks = used_blocks;
8253
8254 apply_for_bitset_range(
8255 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
8256 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8257 bs.set(pos);
7c673cae 8258 }
f67539c2
TL
8259 );
8260
8261
8262 if (repair) {
8263 repairer.get_space_usage_tracker().init(
8264 bdev->get_size(),
8265 min_alloc_size);
8266 }
8267
8268 if (bluefs) {
eafe8130 8269 int r = bluefs->fsck();
7c673cae 8270 if (r < 0) {
eafe8130 8271 return r;
7c673cae
FG
8272 }
8273 if (r > 0)
8274 errors += r;
8275 }
8276
eafe8130
TL
8277 if (!per_pool_stat_collection) {
8278 const char *w;
8279 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8280 w = "error";
8281 ++errors;
8282 } else {
8283 w = "warning";
8284 ++warnings;
8285 }
8286 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8287 << dendl;
8288 }
f67539c2 8289 if (per_pool_omap != OMAP_PER_PG) {
9f95a23c
TL
8290 const char *w;
8291 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8292 w = "error";
8293 ++errors;
8294 } else {
8295 w = "warning";
8296 ++warnings;
8297 }
f67539c2 8298 derr << "fsck " << w << ": store not yet converted to per-pg omap"
9f95a23c
TL
8299 << dendl;
8300 }
8301
11fdf7f2 8302 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
8303 // structs
8304 statfs(&actual_statfs);
11fdf7f2
TL
8305 actual_statfs.total = 0;
8306 actual_statfs.internally_reserved = 0;
8307 actual_statfs.available = 0;
8308 actual_statfs.internal_metadata = 0;
8309 actual_statfs.omap_allocated = 0;
8310
eafe8130
TL
8311 if (g_conf()->bluestore_debug_fsck_abort) {
8312 dout(1) << __func__ << " debug abort" << dendl;
8313 goto out_scan;
8314 }
7c673cae 8315 // walk PREFIX_OBJ
eafe8130
TL
8316 {
8317 dout(1) << __func__ << " walking object keyspace" << dendl;
8318 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8319 BlueStore::FSCK_ObjectCtx ctx(
8320 errors,
8321 warnings,
8322 num_objects,
8323 num_extents,
8324 num_blobs,
8325 num_sharded_objects,
8326 num_spanning_blobs,
8327 &used_blocks,
8328 &used_omap_head,
9f95a23c
TL
8329 //no need for the below lock when in non-shallow mode as
8330 // there is no multithreading in this case
8331 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130
TL
8332 sb_info,
8333 expected_store_statfs,
8334 expected_pool_statfs,
8335 repair ? &repairer : nullptr);
9f95a23c
TL
8336
8337 _fsck_check_objects(depth, ctx);
eafe8130 8338 }
11fdf7f2 8339
7c673cae 8340 dout(1) << __func__ << " checking shared_blobs" << dendl;
f67539c2 8341 it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
7c673cae 8342 if (it) {
eafe8130
TL
8343 // FIXME minor: perhaps simplify for shallow mode?
8344 // fill global if not overridden below
8345 auto expected_statfs = &expected_store_statfs;
11fdf7f2 8346
7c673cae
FG
8347 for (it->lower_bound(string()); it->valid(); it->next()) {
8348 string key = it->key();
8349 uint64_t sbid;
8350 if (get_key_shared_blob(key, &sbid)) {
3efd9988 8351 derr << "fsck error: bad key '" << key
7c673cae 8352 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
8353 if (repair) {
8354 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8355 }
7c673cae
FG
8356 ++errors;
8357 continue;
8358 }
8359 auto p = sb_info.find(sbid);
8360 if (p == sb_info.end()) {
3efd9988 8361 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 8362 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
8363 if (repair) {
8364 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8365 }
7c673cae
FG
8366 ++errors;
8367 } else {
8368 ++num_shared_blobs;
8369 sb_info_t& sbi = p->second;
8370 bluestore_shared_blob_t shared_blob(sbid);
8371 bufferlist bl = it->value();
11fdf7f2
TL
8372 auto blp = bl.cbegin();
8373 try {
8374 decode(shared_blob, blp);
f67539c2 8375 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
8376 ++errors;
8377 // Force update and don't report as missing
8378 sbi.updated = sbi.passed = true;
8379
8380 derr << "fsck error: failed to decode Shared Blob"
8381 << pretty_binary_string(it->key()) << dendl;
8382 if (repair) {
8383 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8384 << pretty_binary_string(it->key())
8385 << "', removing" << dendl;
8386 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key()); // key is in the shared-blob namespace
8387 }
8388 continue;
8389 }
7c673cae
FG
8390 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8391 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 8392 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
8393 << std::dec << " ref_map " << shared_blob.ref_map
8394 << " != expected " << sbi.ref_map << dendl;
8395 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
8396 ++errors;
8397 }
8398 PExtentVector extents;
8399 for (auto &r : shared_blob.ref_map.ref_map) {
8400 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8401 }
eafe8130 8402 if (per_pool_stat_collection || repair) {
11fdf7f2
TL
8403 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8404 }
8405 errors += _fsck_check_extents(sbi.cid,
8406 p->second.oids.front(),
7c673cae
FG
8407 extents,
8408 p->second.compressed,
b32b8144
FG
8409 used_blocks,
8410 fm->get_alloc_size(),
11fdf7f2 8411 repair ? &repairer : nullptr,
eafe8130
TL
8412 *expected_statfs,
8413 depth);
11fdf7f2
TL
8414 sbi.passed = true;
8415 }
8416 }
8417 } // if (it)
8418
8419 if (repair && repairer.preprocess_misreference(db)) {
8420
8421 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8422 auto& space_tracker = repairer.get_space_usage_tracker();
8423 auto& misref_extents = repairer.get_misreferences();
8424 interval_set<uint64_t> to_release;
f67539c2 8425 it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
11fdf7f2 8426 if (it) {
eafe8130
TL
8427 // fill global if not overridden below
8428 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
8429
8430 CollectionRef c;
8431 spg_t pgid;
8432 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8433 bool bypass_rest = false;
8434 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8435 it->next()) {
8436 dout(30) << __func__ << " key "
8437 << pretty_binary_string(it->key()) << dendl;
8438 if (is_extent_shard_key(it->key())) {
8439 continue;
8440 }
8441
8442 ghobject_t oid;
8443 int r = get_key_object(it->key(), &oid);
8444 if (r < 0 || !space_tracker.is_used(oid)) {
8445 continue;
8446 }
8447
8448 if (!c ||
8449 oid.shard_id != pgid.shard ||
8450 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8451 !c->contains(oid)) {
8452 c = nullptr;
8453 for (auto& p : coll_map) {
8454 if (p.second->contains(oid)) {
8455 c = p.second;
8456 break;
8457 }
8458 }
8459 if (!c) {
8460 continue;
8461 }
eafe8130
TL
8462 if (per_pool_stat_collection || repair) {
8463 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
8464 expected_statfs = &expected_pool_statfs[pool_id];
8465 }
8466 }
8467 if (!space_tracker.is_used(c->cid)) {
8468 continue;
8469 }
8470
8471 dout(20) << __func__ << " check misreference for col:" << c->cid
8472 << " obj:" << oid << dendl;
8473
eafe8130
TL
8474 OnodeRef o;
8475 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
8476 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8477 mempool::bluestore_fsck::set<BlobRef> blobs;
8478
8479 for (auto& e : o->extent_map.extent_map) {
8480 blobs.insert(e.blob);
8481 }
8482 bool need_onode_update = false;
8483 bool first_dump = true;
8484 for(auto b : blobs) {
8485 bool broken_blob = false;
8486 auto& pextents = b->dirty_blob().dirty_extents();
8487 for (auto& e : pextents) {
8488 if (!e.is_valid()) {
8489 continue;
8490 }
8491 // for the sake of simplicity and proper shared blob handling
8492 // always rewrite the whole blob even when it's partially
8493 // misreferenced.
8494 if (misref_extents.intersects(e.offset, e.length)) {
8495 if (first_dump) {
8496 first_dump = false;
81eedcae 8497 _dump_onode<10>(cct, *o);
11fdf7f2
TL
8498 }
8499 broken_blob = true;
8500 break;
8501 }
8502 }
8503 if (!broken_blob)
8504 continue;
8505 bool compressed = b->get_blob().is_compressed();
8506 need_onode_update = true;
8507 dout(10) << __func__
8508 << " fix misreferences in oid:" << oid
8509 << " " << *b << dendl;
8510 uint64_t b_off = 0;
8511 PExtentVector pext_to_release;
8512 pext_to_release.reserve(pextents.size());
8513 // rewriting all valid pextents
8514 for (auto e = pextents.begin(); e != pextents.end();
8515 b_off += e->length, e++) {
8516 if (!e->is_valid()) {
8517 continue;
8518 }
8519 PExtentVector exts;
f67539c2
TL
8520 int64_t alloc_len =
8521 shared_alloc.a->allocate(e->length, min_alloc_size,
8522 0, 0, &exts);
eafe8130 8523 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
8524 derr << __func__
8525 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8526 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2 8527 << " min_alloc_size 0x" << min_alloc_size
f67539c2 8528 << " available 0x" << shared_alloc.a->get_free()
11fdf7f2
TL
8529 << std::dec << dendl;
8530 if (alloc_len > 0) {
f67539c2 8531 shared_alloc.a->release(exts);
11fdf7f2
TL
8532 }
8533 bypass_rest = true;
8534 break;
8535 }
8536 expected_statfs->allocated += e->length;
8537 if (compressed) {
8538 expected_statfs->data_compressed_allocated += e->length;
8539 }
8540
8541 bufferlist bl;
8542 IOContext ioc(cct, NULL, true); // allow EIO
8543 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8544 if (r < 0) {
8545 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8546 <<"~" << e->length << std::dec << dendl;
8547 ceph_abort_msg("read failed, wtf");
8548 }
8549 pext_to_release.push_back(*e);
8550 e = pextents.erase(e);
8551 e = pextents.insert(e, exts.begin(), exts.end());
8552 b->get_blob().map_bl(
8553 b_off, bl,
8554 [&](uint64_t offset, bufferlist& t) {
8555 int r = bdev->write(offset, t, false);
8556 ceph_assert(r == 0);
8557 });
8558 e += exts.size() - 1;
8559 for (auto& p : exts) {
8560 fm->allocate(p.offset, p.length, txn);
8561 }
8562 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8563
8564 if (b->get_blob().is_shared()) {
8565 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8566
8567 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8568 ceph_assert(sb_it != sb_info.end());
8569 sb_info_t& sbi = sb_it->second;
8570
8571 for (auto& r : sbi.ref_map.ref_map) {
8572 expected_statfs->allocated -= r.second.length;
8573 if (sbi.compressed) {
8574 // NB: it's crucial to use compressed flag from sb_info_t
8575 // as we originally used that value while accumulating
8576 // expected_statfs
8577 expected_statfs->data_compressed_allocated -= r.second.length;
8578 }
8579 }
8580 sbi.updated = sbi.passed = true;
8581 sbi.ref_map.clear();
8582
8583 // relying on blob's pextents to decide what to release.
8584 for (auto& p : pext_to_release) {
8585 to_release.union_insert(p.offset, p.length);
8586 }
8587 } else {
8588 for (auto& p : pext_to_release) {
8589 expected_statfs->allocated -= p.length;
8590 if (compressed) {
8591 expected_statfs->data_compressed_allocated -= p.length;
8592 }
8593 to_release.union_insert(p.offset, p.length);
8594 }
8595 }
8596 if (bypass_rest) {
8597 break;
8598 }
8599 } // for(auto b : blobs)
8600 if (need_onode_update) {
8601 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8602 _record_onode(o, txn);
8603 }
8604 } // for (it->lower_bound(string()); it->valid(); it->next())
8605
8606 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8607 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8608 << "~" << it.get_len() << std::dec << dendl;
8609 fm->release(it.get_start(), it.get_len(), txn);
8610 }
f67539c2 8611 shared_alloc.a->release(to_release);
11fdf7f2
TL
8612 to_release.clear();
8613 } // if (it) {
8614 } //if (repair && repairer.preprocess_misreference()) {
8615
eafe8130
TL
8616 if (depth != FSCK_SHALLOW) {
8617 for (auto &p : sb_info) {
8618 sb_info_t& sbi = p.second;
8619 if (!sbi.passed) {
8620 derr << "fsck error: missing " << *sbi.sb << dendl;
8621 ++errors;
8622 }
8623 if (repair && (!sbi.passed || sbi.updated)) {
8624 auto sbid = p.first;
8625 if (sbi.ref_map.empty()) {
8626 ceph_assert(sbi.passed);
8627 dout(20) << __func__ << " " << *sbi.sb
8628 << " is empty, removing" << dendl;
8629 repairer.fix_shared_blob(db, sbid, nullptr);
8630 } else {
8631 bufferlist bl;
8632 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8633 encode(persistent, bl);
8634 dout(20) << __func__ << " " << *sbi.sb
8635 << " is " << bl.length() << " bytes, updating" << dendl;
11fdf7f2 8636
eafe8130
TL
8637 repairer.fix_shared_blob(db, sbid, &bl);
8638 }
7c673cae
FG
8639 }
8640 }
8641 }
11fdf7f2
TL
8642 sb_info.clear();
8643
eafe8130
TL
8644 // check global stats only if fscking (not repairing) w/o per-pool stats
8645 if (!per_pool_stat_collection &&
8646 !repair &&
8647 !(actual_statfs == expected_store_statfs)) {
8648 derr << "fsck error: actual " << actual_statfs
8649 << " != expected " << expected_store_statfs << dendl;
8650 if (repair) {
8651 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8652 expected_store_statfs);
11fdf7f2 8653 }
eafe8130 8654 ++errors;
7c673cae
FG
8655 }
8656
eafe8130
TL
8657 dout(1) << __func__ << " checking pool_statfs" << dendl;
8658 _fsck_check_pool_statfs(expected_pool_statfs,
8659 errors, warnings, repair ? &repairer : nullptr);
8660
8661 if (depth != FSCK_SHALLOW) {
9f95a23c 8662 dout(1) << __func__ << " checking for stray omap data " << dendl;
f67539c2 8663 it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8664 if (it) {
9f95a23c 8665 uint64_t last_omap_head = 0;
eafe8130
TL
8666 for (it->lower_bound(string()); it->valid(); it->next()) {
8667 uint64_t omap_head;
f67539c2 8668
eafe8130 8669 _key_decode_u64(it->key().c_str(), &omap_head);
f67539c2 8670
9f95a23c 8671 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8672 omap_head != last_omap_head) {
9f95a23c
TL
8673 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8674 << "fsck error: found stray omap data on omap_head "
f67539c2
TL
8675 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8676 ++errors;
8677 last_omap_head = omap_head;
eafe8130 8678 }
7c673cae
FG
8679 }
8680 }
f67539c2 8681 it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
eafe8130 8682 if (it) {
9f95a23c 8683 uint64_t last_omap_head = 0;
eafe8130
TL
8684 for (it->lower_bound(string()); it->valid(); it->next()) {
8685 uint64_t omap_head;
8686 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
8687 if (used_omap_head.count(omap_head) == 0 &&
8688 omap_head != last_omap_head) {
8689 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8690 << "fsck error: found stray (pgmeta) omap data on omap_head "
8691 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8692 last_omap_head = omap_head;
eafe8130
TL
8693 ++errors;
8694 }
11fdf7f2
TL
8695 }
8696 }
f67539c2 8697 it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
9f95a23c
TL
8698 if (it) {
8699 uint64_t last_omap_head = 0;
8700 for (it->lower_bound(string()); it->valid(); it->next()) {
8701 uint64_t pool;
8702 uint64_t omap_head;
8703 string k = it->key();
8704 const char *c = k.c_str();
8705 c = _key_decode_u64(c, &pool);
8706 c = _key_decode_u64(c, &omap_head);
8707 if (used_omap_head.count(omap_head) == 0 &&
f67539c2 8708 omap_head != last_omap_head) {
9f95a23c
TL
8709 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8710 << "fsck error: found stray (per-pool) omap data on omap_head "
8711 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8712 ++errors;
f67539c2
TL
8713 last_omap_head = omap_head;
8714 }
8715 }
8716 }
8717 it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
8718 if (it) {
8719 uint64_t last_omap_head = 0;
8720 for (it->lower_bound(string()); it->valid(); it->next()) {
8721 uint64_t pool;
8722 uint32_t hash;
8723 uint64_t omap_head;
8724 string k = it->key();
8725 const char* c = k.c_str();
8726 c = _key_decode_u64(c, &pool);
8727 c = _key_decode_u32(c, &hash);
8728 c = _key_decode_u64(c, &omap_head);
8729 if (used_omap_head.count(omap_head) == 0 &&
8730 omap_head != last_omap_head) {
8731 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8732 << "fsck error: found stray (per-pg) omap data on omap_head "
8733 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8734 ++errors;
8735 last_omap_head = omap_head;
9f95a23c
TL
8736 }
8737 }
8738 }
eafe8130 8739 dout(1) << __func__ << " checking deferred events" << dendl;
f67539c2 8740 it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
eafe8130
TL
8741 if (it) {
8742 for (it->lower_bound(string()); it->valid(); it->next()) {
8743 bufferlist bl = it->value();
8744 auto p = bl.cbegin();
8745 bluestore_deferred_transaction_t wt;
8746 try {
8747 decode(wt, p);
f67539c2 8748 } catch (ceph::buffer::error& e) {
eafe8130
TL
8749 derr << "fsck error: failed to decode deferred txn "
8750 << pretty_binary_string(it->key()) << dendl;
8751 if (repair) {
8752 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8753 << pretty_binary_string(it->key())
8754 << "', removing" << dendl;
8755 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8756 }
8757 continue;
8758 }
8759 dout(20) << __func__ << " deferred " << wt.seq
8760 << " ops " << wt.ops.size()
8761 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8762 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 8763 apply_for_bitset_range(
f67539c2 8764 e.get_start(), e.get_len(), alloc_size, used_blocks,
eafe8130 8765 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
8766 bs.set(pos);
8767 }
8768 );
8769 }
7c673cae 8770 }
eafe8130
TL
8771 }
8772
8773 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8774 {
eafe8130
TL
8775 fm->enumerate_reset();
8776 uint64_t offset, length;
8777 while (fm->enumerate_next(db, &offset, &length)) {
8778 bool intersects = false;
9f95a23c 8779 apply_for_bitset_range(
f67539c2 8780 offset, length, alloc_size, used_blocks,
eafe8130 8781 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
f67539c2
TL
8782 ceph_assert(pos < bs.size());
8783 if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
eafe8130
TL
8784 if (offset == SUPER_RESERVED &&
8785 length == min_alloc_size - SUPER_RESERVED) {
 8786 // this is due to the change just after luminous to min_alloc_size
 8787 // granularity allocations, and our baked-in assumption at the top
 8788 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
 8789 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). Harmless,
 8790 // since we will never allocate this region below min_alloc_size.
8791 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8792 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8793 << length << std::dec << dendl;
8794 } else {
8795 intersects = true;
8796 if (repair) {
8797 repairer.fix_false_free(db, fm,
8798 pos * min_alloc_size,
8799 min_alloc_size);
8800 }
11fdf7f2 8801 }
eafe8130
TL
8802 } else {
8803 bs.set(pos);
8804 }
7c673cae 8805 }
eafe8130
TL
8806 );
8807 if (intersects) {
8808 derr << "fsck error: free extent 0x" << std::hex << offset
8809 << "~" << length << std::dec
8810 << " intersects allocated blocks" << dendl;
8811 ++errors;
7c673cae 8812 }
b5b8bbf5 8813 }
eafe8130
TL
8814 fm->enumerate_reset();
8815 size_t count = used_blocks.count();
8816 if (used_blocks.size() != count) {
8817 ceph_assert(used_blocks.size() > count);
8818 used_blocks.flip();
8819 size_t start = used_blocks.find_first();
8820 while (start != decltype(used_blocks)::npos) {
8821 size_t cur = start;
8822 while (true) {
8823 size_t next = used_blocks.find_next(cur);
8824 if (next != cur + 1) {
8825 ++errors;
8826 derr << "fsck error: leaked extent 0x" << std::hex
8827 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8828 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8829 << dendl;
8830 if (repair) {
8831 repairer.fix_leaked(db,
8832 fm,
8833 start * min_alloc_size,
8834 (cur + 1 - start) * min_alloc_size);
8835 }
8836 start = next;
8837 break;
11fdf7f2 8838 }
eafe8130 8839 cur = next;
b5b8bbf5 8840 }
eafe8130
TL
8841 }
8842 used_blocks.flip();
b5b8bbf5 8843 }
7c673cae
FG
8844 }
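  // Leak-check sketch with hypothetical numbers: once every known extent has
  // set its bit, used_blocks should be all ones. flip() turns unreferenced
  // bits into set bits, and find_first()/find_next() walk maximal runs of
  // them; e.g. leftover bits {5,6,7} with a 64 KiB alloc unit report a
  // single leaked extent 0x50000~0x30000.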
8845 }
11fdf7f2 8846 if (repair) {
f67539c2
TL
8847 if (per_pool_omap != OMAP_PER_PG) {
8848 dout(5) << __func__ << " fixing per_pg_omap" << dendl;
8849 repairer.fix_per_pool_omap(db, OMAP_PER_PG);
9f95a23c
TL
8850 }
8851
11fdf7f2
TL
8852 dout(5) << __func__ << " applying repair results" << dendl;
8853 repaired = repairer.apply(db);
8854 dout(5) << __func__ << " repair applied" << dendl;
8855 }
7c673cae 8856
eafe8130 8857out_scan:
7c673cae
FG
8858 dout(2) << __func__ << " " << num_objects << " objects, "
8859 << num_sharded_objects << " of them sharded. "
8860 << dendl;
8861 dout(2) << __func__ << " " << num_extents << " extents to "
8862 << num_blobs << " blobs, "
8863 << num_spanning_blobs << " spanning, "
8864 << num_shared_blobs << " shared."
8865 << dendl;
8866
8867 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
8868 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8869 << warnings << " warnings, "
8870 << repaired << " repaired, "
8871 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 8872 << duration << " seconds" << dendl;
9f95a23c
TL
8873
 8874 // In non-repair mode return the error count only, as it alone
 8875 // indicates whether the store status is OK.
 8876 // In repair mode both errors and warnings are taken into account,
 8877 // since the repaired counter relates to them both.
8878 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
8879}
8880
8881/// methods to inject various errors fsck can repair
8882void BlueStore::inject_broken_shared_blob_key(const string& key,
8883 const bufferlist& bl)
8884{
8885 KeyValueDB::Transaction txn;
8886 txn = db->get_transaction();
8887 txn->set(PREFIX_SHARED_BLOB, key, bl);
8888 db->submit_transaction_sync(txn);
8889};
8890
8891void BlueStore::inject_leaked(uint64_t len)
8892{
8893 KeyValueDB::Transaction txn;
8894 txn = db->get_transaction();
8895
8896 PExtentVector exts;
f67539c2 8897 int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
11fdf7f2
TL
8898 min_alloc_size * 256, 0, &exts);
8899 ceph_assert(alloc_len >= (int64_t)len);
8900 for (auto& p : exts) {
8901 fm->allocate(p.offset, p.length, txn);
8902 }
8903 db->submit_transaction_sync(txn);
8904}
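// What inject_leaked sets up: the extents are marked allocated in the
// freelist manager but never referenced by any onode, so a later fsck sees
// them as leaked and (in repair mode) returns them via fix_leaked above.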
8905
8906void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8907{
8908 KeyValueDB::Transaction txn;
8909 OnodeRef o;
8910 CollectionRef c = _get_collection(cid);
8911 ceph_assert(c);
8912 {
9f95a23c 8913 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
8914 o = c->get_onode(oid, false);
8915 ceph_assert(o);
8916 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8917 }
8918
8919 bool injected = false;
8920 txn = db->get_transaction();
8921 auto& em = o->extent_map.extent_map;
8922 std::vector<const PExtentVector*> v;
8923 if (em.size()) {
8924 v.push_back(&em.begin()->blob->get_blob().get_extents());
8925 }
8926 if (em.size() > 1) {
8927 auto it = em.end();
8928 --it;
8929 v.push_back(&(it->blob->get_blob().get_extents()));
8930 }
8931 for (auto pext : v) {
8932 if (pext->size()) {
8933 auto p = pext->begin();
8934 while (p != pext->end()) {
8935 if (p->is_valid()) {
8936 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8937 << "~" << p->length << std::dec << dendl;
8938 fm->release(p->offset, p->length, txn);
8939 injected = true;
8940 break;
8941 }
8942 ++p;
8943 }
8944 }
8945 }
8946 ceph_assert(injected);
8947 db->submit_transaction_sync(txn);
8948}
8949
9f95a23c
TL
8950void BlueStore::inject_legacy_omap()
8951{
8952 dout(1) << __func__ << dendl;
f67539c2 8953 per_pool_omap = OMAP_BULK;
9f95a23c
TL
8954 KeyValueDB::Transaction txn;
8955 txn = db->get_transaction();
8956 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
8957 db->submit_transaction_sync(txn);
8958}
8959
8960void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
8961{
8962 dout(1) << __func__ << " "
8963 << cid << " " << oid
 8964 << dendl;
8965 KeyValueDB::Transaction txn;
8966 OnodeRef o;
8967 CollectionRef c = _get_collection(cid);
8968 ceph_assert(c);
8969 {
8970 std::unique_lock l{ c->lock }; // just to avoid internal asserts
8971 o = c->get_onode(oid, false);
8972 ceph_assert(o);
8973 }
f67539c2
TL
8974 o->onode.clear_flag(
8975 bluestore_onode_t::FLAG_PERPG_OMAP |
8976 bluestore_onode_t::FLAG_PERPOOL_OMAP |
8977 bluestore_onode_t::FLAG_PGMETA_OMAP);
9f95a23c
TL
8978 txn = db->get_transaction();
8979 _record_onode(o, txn);
8980 db->submit_transaction_sync(txn);
8981}
8982
8983
11fdf7f2
TL
8984void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
8985{
8986 BlueStoreRepairer repairer;
8987 repairer.fix_statfs(db, key, new_statfs);
8988 repairer.apply(db);
8989}
8990
eafe8130
TL
8991void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
8992{
8993 KeyValueDB::Transaction t = db->get_transaction();
8994 volatile_statfs v;
8995 v = new_statfs;
8996 bufferlist bl;
8997 v.encode(bl);
8998 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
8999 db->submit_transaction_sync(t);
9000}
9001
11fdf7f2
TL
9002void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9003 coll_t cid2, ghobject_t oid2,
9004 uint64_t offset)
9005{
9006 OnodeRef o1;
9007 CollectionRef c1 = _get_collection(cid1);
9008 ceph_assert(c1);
9009 {
9f95a23c 9010 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
9011 o1 = c1->get_onode(oid1, false);
9012 ceph_assert(o1);
9013 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9014 }
9015 OnodeRef o2;
9016 CollectionRef c2 = _get_collection(cid2);
9017 ceph_assert(c2);
9018 {
9f95a23c 9019 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
9020 o2 = c2->get_onode(oid2, false);
9021 ceph_assert(o2);
9022 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9023 }
9024 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9025 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9026
9027 // require onode/extent layout to be the same (and simple)
9028 // to make things easier
9029 ceph_assert(o1->onode.extent_map_shards.empty());
9030 ceph_assert(o2->onode.extent_map_shards.empty());
9031 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9032 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9033 ceph_assert(e1.logical_offset == e2.logical_offset);
9034 ceph_assert(e1.length == e2.length);
9035 ceph_assert(e1.blob_offset == e2.blob_offset);
9036
9037 KeyValueDB::Transaction txn;
9038 txn = db->get_transaction();
9039
9040 // along with misreference error this will create space leaks errors
9041 e2.blob->dirty_blob() = e1.blob->get_blob();
9042 o2->extent_map.dirty_range(offset, e2.length);
9043 o2->extent_map.update(txn, false);
9044
9045 _record_onode(o2, txn);
9046 db->submit_transaction_sync(txn);
7c673cae
FG
9047}
9048
adb31ebb
TL
9049void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
9050 int16_t blob_id)
9051{
9052 OnodeRef o;
9053 CollectionRef c = _get_collection(cid);
9054 ceph_assert(c);
9055 {
9056 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9057 o = c->get_onode(oid, false);
9058 ceph_assert(o);
9059 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9060 }
9061
9062 BlobRef b = c->new_blob();
9063 b->id = blob_id;
9064 o->extent_map.spanning_blob_map[blob_id] = b;
9065
9066 KeyValueDB::Transaction txn;
9067 txn = db->get_transaction();
9068
9069 _record_onode(o, txn);
9070 db->submit_transaction_sync(txn);
9071}
9072
7c673cae
FG
9073void BlueStore::collect_metadata(map<string,string> *pm)
9074{
9075 dout(10) << __func__ << dendl;
9076 bdev->collect_metadata("bluestore_bdev_", pm);
9077 if (bluefs) {
9078 (*pm)["bluefs"] = "1";
9f95a23c
TL
9079 // this value is for backward compatibility only
9080 (*pm)["bluefs_single_shared_device"] = \
9081 stringify((int)bluefs_layout.single_shared_device());
9082 (*pm)["bluefs_dedicated_db"] = \
9083 stringify((int)bluefs_layout.dedicated_db);
9084 (*pm)["bluefs_dedicated_wal"] = \
9085 stringify((int)bluefs_layout.dedicated_wal);
9086 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
9087 } else {
9088 (*pm)["bluefs"] = "0";
9089 }
11fdf7f2
TL
9090
9091 // report numa mapping for underlying devices
9092 int node = -1;
9093 set<int> nodes;
9094 set<string> failed;
9095 int r = get_numa_node(&node, &nodes, &failed);
9096 if (r >= 0) {
9097 if (!failed.empty()) {
9098 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9099 }
9100 if (!nodes.empty()) {
9101 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9102 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9103 }
9104 if (node >= 0) {
9105 (*pm)["objectstore_numa_node"] = stringify(node);
9106 }
9107 }
9108}
9109
9110int BlueStore::get_numa_node(
9111 int *final_node,
9112 set<int> *out_nodes,
9113 set<string> *out_failed)
9114{
9115 int node = -1;
9116 set<string> devices;
9117 get_devices(&devices);
9118 set<int> nodes;
9119 set<string> failed;
9120 for (auto& devname : devices) {
9121 int n;
9122 BlkDev bdev(devname);
9123 int r = bdev.get_numa_node(&n);
9124 if (r < 0) {
9125 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9126 << dendl;
9127 failed.insert(devname);
9128 continue;
9129 }
9130 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9131 << dendl;
9132 nodes.insert(n);
9133 if (node < 0) {
9134 node = n;
9135 }
9136 }
9137 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9138 *final_node = node;
9139 }
9140 if (out_nodes) {
9141 *out_nodes = nodes;
9142 }
9143 if (out_failed) {
9144 *out_failed = failed;
9145 }
9146 return 0;
9147}
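// Note: detection failures are not fatal here; *final_node is filled in
// only when every device maps to a single node and nothing failed, so
// callers such as collect_metadata can still report partial results via
// out_nodes/out_failed.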
9148
9149int BlueStore::get_devices(set<string> *ls)
9150{
9151 if (bdev) {
9152 bdev->get_devices(ls);
9153 if (bluefs) {
9154 bluefs->get_devices(ls);
9155 }
9156 return 0;
9157 }
9158
9159 // grumble, we haven't started up yet.
9160 int r = _open_path();
9161 if (r < 0)
9162 goto out;
9163 r = _open_fsid(false);
9164 if (r < 0)
9165 goto out_path;
9166 r = _read_fsid(&fsid);
9167 if (r < 0)
9168 goto out_fsid;
9169 r = _lock_fsid();
9170 if (r < 0)
9171 goto out_fsid;
9172 r = _open_bdev(false);
9173 if (r < 0)
9174 goto out_fsid;
9175 r = _minimal_open_bluefs(false);
9176 if (r < 0)
9177 goto out_bdev;
9178 bdev->get_devices(ls);
9179 if (bluefs) {
9180 bluefs->get_devices(ls);
9181 }
9182 r = 0;
9183 _minimal_close_bluefs();
9184 out_bdev:
9185 _close_bdev();
9186 out_fsid:
9187 _close_fsid();
9188 out_path:
9189 _close_path();
9190 out:
9191 return r;
7c673cae
FG
9192}
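// The goto ladder above is the usual teardown-in-reverse idiom: each
// successfully opened resource has a matching close label, letting a cold
// call (before mount) probe the devices and still unwind cleanly.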
9193
11fdf7f2 9194void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
9195{
9196 buf->reset();
11fdf7f2 9197
f67539c2
TL
9198 auto prefix = per_pool_omap == OMAP_BULK ?
9199 PREFIX_OMAP :
9200 per_pool_omap == OMAP_PER_POOL ?
9201 PREFIX_PERPOOL_OMAP :
9202 PREFIX_PERPG_OMAP;
9f95a23c 9203 buf->omap_allocated =
f67539c2 9204 db->estimate_prefix_size(prefix, string());
11fdf7f2 9205
f67539c2 9206 uint64_t bfree = shared_alloc.a->get_free();
7c673cae
FG
9207
9208 if (bluefs) {
f67539c2 9209 buf->internally_reserved = 0;
11fdf7f2 9210 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9211 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9212 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9213 }
11fdf7f2
TL
9214 // call any non-omap bluefs space "internal metadata"
9215 buf->internal_metadata =
f67539c2 9216 bluefs->get_used()
11fdf7f2 9217 - buf->omap_allocated;
7c673cae
FG
9218 }
9219
11fdf7f2
TL
9220 uint64_t thin_total, thin_avail;
9221 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9222 buf->total += thin_total;
9223
9224 // we are limited by both the size of the virtual device and the
9225 // underlying physical device.
9226 bfree = std::min(bfree, thin_avail);
9227
9228 buf->allocated = thin_total - thin_avail;
9229 } else {
9230 buf->total += bdev->get_size();
9231 }
9232 buf->available = bfree;
9233}
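// Worked example for the thin-provisioned branch (hypothetical numbers):
// with thin_total = 1 TiB, thin_avail = 200 GiB and an allocator free count
// of 300 GiB, available = min(300 GiB, 200 GiB) = 200 GiB and
// allocated = 1 TiB - 200 GiB = 824 GiB; both the virtual device and the
// backing store constrain us.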
9234
9235int BlueStore::statfs(struct store_statfs_t *buf,
9236 osd_alert_list_t* alerts)
9237{
9238 if (alerts) {
9239 alerts->clear();
9240 _log_alerts(*alerts);
9241 }
9242 _get_statfs_overall(buf);
31f18b77 9243 {
11fdf7f2 9244 std::lock_guard l(vstatfs_lock);
31f18b77 9245 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
9246 buf->data_stored = vstatfs.stored();
9247 buf->data_compressed = vstatfs.compressed();
9248 buf->data_compressed_original = vstatfs.compressed_original();
9249 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9250 }
9251
9252 dout(20) << __func__ << " " << *buf << dendl;
9253 return 0;
9254}
9255
9f95a23c
TL
9256int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9257 bool *out_per_pool_omap)
11fdf7f2
TL
9258{
 9259 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 9260
11fdf7f2
TL
9261 if (!per_pool_stat_collection) {
9262 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9263 return -ENOTSUP;
7c673cae 9264 }
11fdf7f2 9265 buf->reset();
7c673cae 9266
11fdf7f2
TL
9267 {
9268 std::lock_guard l(vstatfs_lock);
9269 osd_pools[pool_id].publish(buf);
9270 }
9f95a23c
TL
9271
9272 string key_prefix;
9273 _key_encode_u64(pool_id, &key_prefix);
f67539c2
TL
9274 *out_per_pool_omap = per_pool_omap != OMAP_BULK;
9275 if (*out_per_pool_omap) {
9276 auto prefix = per_pool_omap == OMAP_PER_POOL ?
9277 PREFIX_PERPOOL_OMAP :
9278 PREFIX_PERPG_OMAP;
9279 buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
9280 }
9f95a23c 9281
11fdf7f2 9282 dout(10) << __func__ << " " << *buf << dendl;
7c673cae
FG
9283 return 0;
9284}
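// Sketch of the prefix probe above: key_prefix is the 8-byte big-endian
// pool id produced by _key_encode_u64, so
// db->estimate_prefix_size(prefix, key_prefix) approximates the omap bytes
// belonging to just this pool; per-pg keys sort the same way since they
// also lead with the pool id.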
9285
81eedcae
TL
9286void BlueStore::_check_legacy_statfs_alert()
9287{
9288 string s;
9289 if (!per_pool_stat_collection &&
eafe8130 9290 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
9291 s = "legacy statfs reporting detected, "
9292 "suggest to run store repair to get consistent statistic reports";
9293 }
9294 std::lock_guard l(qlock);
9295 legacy_statfs_alert = s;
9296}
9297
f67539c2 9298void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9f95a23c 9299{
f67539c2
TL
9300 string per_pg, per_pool;
9301 if (per_pool_omap != OMAP_PER_PG) {
9302 if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
9303 per_pg = "legacy (not per-pg) omap detected, "
9304 "suggest to run store repair to benefit from faster PG removal";
9305 }
9306 if (per_pool_omap != OMAP_PER_POOL) {
9307 if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9308 per_pool = "legacy (not per-pool) omap detected, "
9309 "suggest to run store repair to benefit from per-pool omap usage statistics";
9310 }
9311 }
9f95a23c
TL
9312 }
9313 std::lock_guard l(qlock);
f67539c2
TL
9314 no_per_pg_omap_alert = per_pg;
9315 no_per_pool_omap_alert = per_pool;
9f95a23c
TL
9316}
9317
7c673cae
FG
9318// ---------------
9319// cache
9320
9321BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9322{
9f95a23c 9323 std::shared_lock l(coll_lock);
7c673cae
FG
9324 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9325 if (cp == coll_map.end())
9326 return CollectionRef();
9327 return cp->second;
9328}
9329
9330void BlueStore::_queue_reap_collection(CollectionRef& c)
9331{
9332 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
 9333 // _reap_collections and this run in the same thread,
 9334 // so no lock is needed.
7c673cae
FG
9335 removed_collections.push_back(c);
9336}
9337
9338void BlueStore::_reap_collections()
9339{
94b18763 9340
7c673cae
FG
9341 list<CollectionRef> removed_colls;
9342 {
94b18763
FG
 9343 // _queue_reap_collection and this run in the same thread,
 9344 // so no lock is needed.
9345 if (!removed_collections.empty())
9346 removed_colls.swap(removed_collections);
9347 else
9348 return;
7c673cae
FG
9349 }
9350
94b18763
FG
9351 list<CollectionRef>::iterator p = removed_colls.begin();
9352 while (p != removed_colls.end()) {
7c673cae
FG
9353 CollectionRef c = *p;
9354 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
adb31ebb 9355 if (c->onode_map.map_any([&](Onode* o) {
11fdf7f2 9356 ceph_assert(!o->exists);
7c673cae
FG
9357 if (o->flushing_count.load()) {
9358 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9359 << " flush_txns " << o->flushing_count << dendl;
94b18763 9360 return true;
7c673cae 9361 }
94b18763 9362 return false;
7c673cae 9363 })) {
94b18763 9364 ++p;
7c673cae
FG
9365 continue;
9366 }
9367 c->onode_map.clear();
94b18763 9368 p = removed_colls.erase(p);
7c673cae
FG
9369 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9370 }
94b18763 9371 if (removed_colls.empty()) {
7c673cae 9372 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
9373 } else {
9374 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
9375 }
9376}
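// Note: map_any() above returns true while any onode still has in-flight
// flushes; such collections are kept on the list (spliced back onto
// removed_collections) so reaping retries on a later pass rather than
// blocking here.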
9377
9378void BlueStore::_update_cache_logger()
9379{
9380 uint64_t num_onodes = 0;
9f95a23c 9381 uint64_t num_pinned_onodes = 0;
7c673cae
FG
9382 uint64_t num_extents = 0;
9383 uint64_t num_blobs = 0;
9384 uint64_t num_buffers = 0;
9385 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
9386 for (auto c : onode_cache_shards) {
9387 c->add_stats(&num_onodes, &num_pinned_onodes);
9388 }
9389 for (auto c : buffer_cache_shards) {
9390 c->add_stats(&num_extents, &num_blobs,
9391 &num_buffers, &num_buffer_bytes);
7c673cae
FG
9392 }
9393 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9394 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
9395 logger->set(l_bluestore_extents, num_extents);
9396 logger->set(l_bluestore_blobs, num_blobs);
9397 logger->set(l_bluestore_buffers, num_buffers);
9398 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9399}
9400
9401// ---------------
9402// read operations
9403
9404ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9405{
9406 return _get_collection(cid);
9407}
9408
11fdf7f2
TL
9409ObjectStore::CollectionHandle BlueStore::create_new_collection(
9410 const coll_t& cid)
7c673cae 9411{
9f95a23c
TL
9412 std::unique_lock l{coll_lock};
9413 auto c = ceph::make_ref<Collection>(
11fdf7f2 9414 this,
9f95a23c
TL
9415 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9416 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
9417 cid);
9418 new_coll_map[cid] = c;
9f95a23c 9419 _osr_attach(c.get());
11fdf7f2
TL
9420 return c;
9421}
9422
9423void BlueStore::set_collection_commit_queue(
9424 const coll_t& cid,
9425 ContextQueue *commit_queue)
9426{
9427 if (commit_queue) {
9f95a23c 9428 std::shared_lock l(coll_lock);
11fdf7f2
TL
9429 if (coll_map.count(cid)) {
9430 coll_map[cid]->commit_queue = commit_queue;
9431 } else if (new_coll_map.count(cid)) {
9432 new_coll_map[cid]->commit_queue = commit_queue;
9433 }
9434 }
7c673cae
FG
9435}
9436
11fdf7f2 9437
7c673cae
FG
9438bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9439{
9440 Collection *c = static_cast<Collection *>(c_.get());
9441 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9442 if (!c->exists)
9443 return false;
9444
9445 bool r = true;
9446
9447 {
9f95a23c 9448 std::shared_lock l(c->lock);
7c673cae
FG
9449 OnodeRef o = c->get_onode(oid, false);
9450 if (!o || !o->exists)
9451 r = false;
9452 }
9453
7c673cae
FG
9454 return r;
9455}
9456
7c673cae
FG
9457int BlueStore::stat(
9458 CollectionHandle &c_,
9459 const ghobject_t& oid,
9460 struct stat *st,
9461 bool allow_eio)
9462{
9463 Collection *c = static_cast<Collection *>(c_.get());
9464 if (!c->exists)
9465 return -ENOENT;
9466 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9467
9468 {
9f95a23c 9469 std::shared_lock l(c->lock);
7c673cae
FG
9470 OnodeRef o = c->get_onode(oid, false);
9471 if (!o || !o->exists)
9472 return -ENOENT;
9473 st->st_size = o->onode.size;
9474 st->st_blksize = 4096;
9475 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9476 st->st_nlink = 1;
9477 }
9478
7c673cae
FG
9479 int r = 0;
9480 if (_debug_mdata_eio(oid)) {
9481 r = -EIO;
9482 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9483 }
9484 return r;
9485}
9486int BlueStore::set_collection_opts(
11fdf7f2 9487 CollectionHandle& ch,
7c673cae
FG
9488 const pool_opts_t& opts)
9489{
7c673cae 9490 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9491 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
9492 if (!c->exists)
9493 return -ENOENT;
9f95a23c 9494 std::unique_lock l{c->lock};
7c673cae
FG
9495 c->pool_opts = opts;
9496 return 0;
9497}
9498
7c673cae
FG
9499int BlueStore::read(
9500 CollectionHandle &c_,
9501 const ghobject_t& oid,
9502 uint64_t offset,
9503 size_t length,
9504 bufferlist& bl,
224ce89b 9505 uint32_t op_flags)
7c673cae 9506{
11fdf7f2 9507 auto start = mono_clock::now();
7c673cae
FG
9508 Collection *c = static_cast<Collection *>(c_.get());
9509 const coll_t &cid = c->get_cid();
9510 dout(15) << __func__ << " " << cid << " " << oid
9511 << " 0x" << std::hex << offset << "~" << length << std::dec
9512 << dendl;
9513 if (!c->exists)
9514 return -ENOENT;
9515
9516 bl.clear();
9517 int r;
9518 {
9f95a23c 9519 std::shared_lock l(c->lock);
11fdf7f2 9520 auto start1 = mono_clock::now();
7c673cae 9521 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
9522 log_latency("get_onode@read",
9523 l_bluestore_read_onode_meta_lat,
9524 mono_clock::now() - start1,
9525 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9526 if (!o || !o->exists) {
9527 r = -ENOENT;
9528 goto out;
9529 }
9530
9531 if (offset == length && offset == 0)
9532 length = o->onode.size;
9533
9534 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
9535 if (r == -EIO) {
9536 logger->inc(l_bluestore_read_eio);
9537 }
7c673cae
FG
9538 }
9539
9540 out:
28e407b8 9541 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
9542 r = -EIO;
9543 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
9544 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9545 cct->_conf->bluestore_debug_random_read_err &&
9546 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9547 100.0)) == 0) {
224ce89b
WB
9548 dout(0) << __func__ << ": inject random EIO" << dendl;
9549 r = -EIO;
7c673cae
FG
9550 }
9551 dout(10) << __func__ << " " << cid << " " << oid
9552 << " 0x" << std::hex << offset << "~" << length << std::dec
9553 << " = " << r << dendl;
494da23a
TL
9554 log_latency(__func__,
9555 l_bluestore_read_lat,
9556 mono_clock::now() - start,
9557 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9558 return r;
9559}
9560
9f95a23c 9561void BlueStore::_read_cache(
7c673cae
FG
9562 OnodeRef o,
9563 uint64_t offset,
9564 size_t length,
9f95a23c
TL
9565 int read_cache_policy,
9566 ready_regions_t& ready_regions,
9567 blobs2read_t& blobs2read)
7c673cae 9568{
7c673cae 9569 // build a blob-wise list of stuff to read (that isn't cached)
7c673cae
FG
9570 unsigned left = length;
9571 uint64_t pos = offset;
7c673cae
FG
9572 auto lp = o->extent_map.seek_lextent(offset);
9573 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9574 if (pos < lp->logical_offset) {
9575 unsigned hole = lp->logical_offset - pos;
9576 if (hole >= left) {
9f95a23c 9577 break;
7c673cae
FG
9578 }
9579 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9580 << std::dec << dendl;
7c673cae
FG
9581 pos += hole;
9582 left -= hole;
9583 }
94b18763 9584 BlobRef& bptr = lp->blob;
7c673cae
FG
9585 unsigned l_off = pos - lp->logical_offset;
9586 unsigned b_off = l_off + lp->blob_offset;
9587 unsigned b_len = std::min(left, lp->length - l_off);
9588
9589 ready_regions_t cache_res;
9590 interval_set<uint32_t> cache_interval;
9591 bptr->shared_blob->bc.read(
91327a77
AA
9592 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9593 read_cache_policy);
7c673cae 9594 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
9595 << " need 0x" << b_off << "~" << b_len
9596 << " cache has 0x" << cache_interval
9597 << std::dec << dendl;
7c673cae
FG
9598
9599 auto pc = cache_res.begin();
11fdf7f2 9600 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
9601 while (b_len > 0) {
9602 unsigned l;
9603 if (pc != cache_res.end() &&
9f95a23c
TL
9604 pc->first == b_off) {
9605 l = pc->second.length();
f67539c2 9606 ready_regions[pos] = std::move(pc->second);
9f95a23c
TL
9607 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9608 << b_off << "~" << l << std::dec << dendl;
9609 ++pc;
7c673cae 9610 } else {
9f95a23c
TL
9611 l = b_len;
9612 if (pc != cache_res.end()) {
9613 ceph_assert(pc->first > b_off);
9614 l = pc->first - b_off;
9615 }
9616 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9617 << b_off << "~" << l << std::dec << dendl;
9618 // merge regions
9619 {
9620 uint64_t r_off = b_off;
9621 uint64_t r_len = l;
9622 uint64_t front = r_off % chunk_size;
9623 if (front) {
9624 r_off -= front;
9625 r_len += front;
9626 }
9627 unsigned tail = r_len % chunk_size;
9628 if (tail) {
9629 r_len += chunk_size - tail;
9630 }
9631 bool merged = false;
9632 regions2read_t& r2r = blobs2read[bptr];
9633 if (r2r.size()) {
9634 read_req_t& pre = r2r.back();
9635 if (r_off <= (pre.r_off + pre.r_len)) {
9636 front += (r_off - pre.r_off);
9637 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9638 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9639 merged = true;
9640 }
9641 }
9642 if (!merged) {
9643 read_req_t req(r_off, r_len);
9644 req.regs.emplace_back(region_t(pos, b_off, l, front));
9645 r2r.emplace_back(std::move(req));
9646 }
9647 }
7c673cae
FG
9648 }
9649 pos += l;
9650 b_off += l;
9651 left -= l;
9652 b_len -= l;
9653 }
9654 ++lp;
9655 }
9f95a23c 9656}
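// Worked example of the merge above (hypothetical values, chunk_size =
// 0x1000): a cache miss at b_off 0x1234 of length 0x100 widens to the
// chunk-aligned read 0x1000~0x1000 with front = 0x234; if the previous
// read_req_t already ends at or past 0x1000 the two coalesce into one
// request, and each region keeps its own front offset for later
// substr_of() pruning.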
7c673cae 9657
9f95a23c
TL
9658int BlueStore::_prepare_read_ioc(
9659 blobs2read_t& blobs2read,
9660 vector<bufferlist>* compressed_blob_bls,
9661 IOContext* ioc)
9662{
7c673cae 9663 for (auto& p : blobs2read) {
94b18763 9664 const BlobRef& bptr = p.first;
11fdf7f2 9665 regions2read_t& r2r = p.second;
7c673cae 9666 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9667 << " need " << r2r << std::dec << dendl;
7c673cae
FG
9668 if (bptr->get_blob().is_compressed()) {
9669 // read the whole thing
9f95a23c
TL
9670 if (compressed_blob_bls->empty()) {
9671 // ensure we avoid any reallocation on subsequent blobs
9672 compressed_blob_bls->reserve(blobs2read.size());
9673 }
9674 compressed_blob_bls->push_back(bufferlist());
9675 bufferlist& bl = compressed_blob_bls->back();
9676 auto r = bptr->get_blob().map(
9677 0, bptr->get_blob().get_ondisk_length(),
9678 [&](uint64_t offset, uint64_t length) {
9679 int r = bdev->aio_read(offset, length, &bl, ioc);
9680 if (r < 0)
7c673cae
FG
9681 return r;
9682 return 0;
9f95a23c 9683 });
b32b8144
FG
9684 if (r < 0) {
9685 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9686 if (r == -EIO) {
9687 // propagate EIO to caller
9688 return r;
9689 }
11fdf7f2 9690 ceph_assert(r == 0);
b32b8144 9691 }
7c673cae
FG
9692 } else {
9693 // read the pieces
11fdf7f2 9694 for (auto& req : r2r) {
9f95a23c
TL
9695 dout(20) << __func__ << " region 0x" << std::hex
9696 << req.regs.front().logical_offset
9697 << ": 0x" << req.regs.front().blob_xoffset
9698 << " reading 0x" << req.r_off
9699 << "~" << req.r_len << std::dec
9700 << dendl;
7c673cae 9701
9f95a23c
TL
9702 // read it
9703 auto r = bptr->get_blob().map(
9704 req.r_off, req.r_len,
9705 [&](uint64_t offset, uint64_t length) {
9706 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9707 if (r < 0)
7c673cae
FG
9708 return r;
9709 return 0;
9f95a23c 9710 });
b32b8144
FG
9711 if (r < 0) {
9712 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9713 << dendl;
9714 if (r == -EIO) {
9715 // propagate EIO to caller
9716 return r;
9717 }
11fdf7f2 9718 ceph_assert(r == 0);
b32b8144 9719 }
9f95a23c 9720 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
9721 }
9722 }
9723 }
9f95a23c
TL
9724 return 0;
9725}
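// Note on the two branches above: a compressed blob must be fetched whole
// (0 ~ get_ondisk_length()) because decompression needs the entire stream,
// while an uncompressed blob is read per merged region. Everything is
// queued as aio on the shared IOContext; only EIO may propagate, anything
// else trips ceph_assert(r == 0).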
11fdf7f2 9726
9f95a23c
TL
9727int BlueStore::_generate_read_result_bl(
9728 OnodeRef o,
9729 uint64_t offset,
9730 size_t length,
9731 ready_regions_t& ready_regions,
9732 vector<bufferlist>& compressed_blob_bls,
9733 blobs2read_t& blobs2read,
9734 bool buffered,
9735 bool* csum_error,
9736 bufferlist& bl)
9737{
9738 // enumerate and decompress desired blobs
7c673cae
FG
9739 auto p = compressed_blob_bls.begin();
9740 blobs2read_t::iterator b2r_it = blobs2read.begin();
9741 while (b2r_it != blobs2read.end()) {
94b18763 9742 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9743 regions2read_t& r2r = b2r_it->second;
7c673cae 9744 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9745 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9746 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9747 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
9748 bufferlist& compressed_bl = *p++;
9749 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
9750 r2r.front().regs.front().logical_offset) < 0) {
9751 *csum_error = true;
9752 return -EIO;
7c673cae
FG
9753 }
9754 bufferlist raw_bl;
9f95a23c 9755 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9756 if (r < 0)
9f95a23c 9757 return r;
7c673cae 9758 if (buffered) {
9f95a23c
TL
9759 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9760 raw_bl);
7c673cae 9761 }
11fdf7f2
TL
9762 for (auto& req : r2r) {
9763 for (auto& r : req.regs) {
9764 ready_regions[r.logical_offset].substr_of(
9765 raw_bl, r.blob_xoffset, r.length);
9766 }
7c673cae
FG
9767 }
9768 } else {
11fdf7f2 9769 for (auto& req : r2r) {
9f95a23c
TL
9770 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9771 req.regs.front().logical_offset) < 0) {
9772 *csum_error = true;
9773 return -EIO;
9774 }
9775 if (buffered) {
9776 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9777 req.r_off, req.bl);
9778 }
7c673cae 9779
9f95a23c
TL
9780 // prune and keep result
9781 for (const auto& r : req.regs) {
9782 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 9783 }
7c673cae
FG
9784 }
9785 }
9786 ++b2r_it;
9787 }
9788
9789 // generate a resulting buffer
9790 auto pr = ready_regions.begin();
9791 auto pr_end = ready_regions.end();
9f95a23c 9792 uint64_t pos = 0;
7c673cae
FG
9793 while (pos < length) {
9794 if (pr != pr_end && pr->first == pos + offset) {
9795 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9796 << ": data from 0x" << pr->first << "~" << pr->second.length()
9797 << std::dec << dendl;
7c673cae
FG
9798 pos += pr->second.length();
9799 bl.claim_append(pr->second);
9800 ++pr;
9801 } else {
9802 uint64_t l = length - pos;
9803 if (pr != pr_end) {
11fdf7f2 9804 ceph_assert(pr->first > pos + offset);
9f95a23c 9805 l = pr->first - (pos + offset);
7c673cae
FG
9806 }
9807 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9808 << ": zeros for 0x" << (pos + offset) << "~" << l
9809 << std::dec << dendl;
7c673cae
FG
9810 bl.append_zero(l);
9811 pos += l;
9812 }
9813 }
11fdf7f2
TL
9814 ceph_assert(bl.length() == length);
9815 ceph_assert(pos == length);
9816 ceph_assert(pr == pr_end);
9f95a23c
TL
9817 return 0;
9818}
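// Assembly sketch: ready_regions is keyed by logical offset, so the final
// loop walks [offset, offset+length) appending region data where present
// and append_zero() for the gaps (unwritten extents); the closing asserts
// guarantee the result is exactly `length` bytes.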
9819
9820int BlueStore::_do_read(
9821 Collection *c,
9822 OnodeRef o,
9823 uint64_t offset,
9824 size_t length,
9825 bufferlist& bl,
9826 uint32_t op_flags,
9827 uint64_t retry_count)
9828{
9829 FUNCTRACE(cct);
9830 int r = 0;
9831 int read_cache_policy = 0; // do not bypass clean or dirty cache
9832
9833 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9834 << " size 0x" << o->onode.size << " (" << std::dec
9835 << o->onode.size << ")" << dendl;
9836 bl.clear();
9837
9838 if (offset >= o->onode.size) {
9839 return r;
9840 }
9841
9842 // generally, don't buffer anything, unless the client explicitly requests
9843 // it.
9844 bool buffered = false;
9845 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9846 dout(20) << __func__ << " will do buffered read" << dendl;
9847 buffered = true;
9848 } else if (cct->_conf->bluestore_default_buffered_read &&
9849 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9850 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9851 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9852 buffered = true;
9853 }
9854
9855 if (offset + length > o->onode.size) {
9856 length = o->onode.size - offset;
9857 }
9858
9859 auto start = mono_clock::now();
9860 o->extent_map.fault_range(db, offset, length);
9861 log_latency(__func__,
9862 l_bluestore_read_onode_meta_lat,
9863 mono_clock::now() - start,
9864 cct->_conf->bluestore_log_op_age);
9865 _dump_onode<30>(cct, *o);
9866
9867 // for deep-scrub, we only read dirty cache and bypass clean cache in
9868 // order to read underlying block device in case there are silent disk errors.
9869 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9870 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9871 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9872 }
9873
 9874 // build a blob-wise list of stuff to read (that isn't cached)
9875 ready_regions_t ready_regions;
9876 blobs2read_t blobs2read;
9877 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
9878
9879
9880 // read raw blob data.
 9881 start = mono_clock::now(); // for the sake of simplicity,
 9882 // measure the whole block below;
 9883 // the resulting error is small.
9884 vector<bufferlist> compressed_blob_bls;
9885 IOContext ioc(cct, NULL, true); // allow EIO
9886 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
9887 // we always issue aio for reading, so errors other than EIO are not allowed
9888 if (r < 0)
9889 return r;
9890
f67539c2 9891 int64_t num_ios = blobs2read.size();
9f95a23c 9892 if (ioc.has_pending_aios()) {
f67539c2 9893 num_ios = ioc.get_num_ios();
9f95a23c
TL
9894 bdev->aio_submit(&ioc);
9895 dout(20) << __func__ << " waiting for aio" << dendl;
9896 ioc.aio_wait();
9897 r = ioc.get_return_value();
9898 if (r < 0) {
9899 ceph_assert(r == -EIO); // no other errors allowed
9900 return -EIO;
9901 }
9902 }
9903 log_latency_fn(__func__,
9904 l_bluestore_read_wait_aio_lat,
9905 mono_clock::now() - start,
9906 cct->_conf->bluestore_log_op_age,
9907 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9908 );
9909
9910 bool csum_error = false;
9911 r = _generate_read_result_bl(o, offset, length, ready_regions,
9912 compressed_blob_bls, blobs2read,
9913 buffered, &csum_error, bl);
9914 if (csum_error) {
9915 // Handles spurious read errors caused by a kernel bug.
9916 // We sometimes get all-zero pages as a result of the read under
9917 // high memory pressure. Retrying the failing read succeeds in most
9918 // cases.
9919 // See also: http://tracker.ceph.com/issues/22464
9920 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9921 return -EIO;
9922 }
9923 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9924 }
7c673cae 9925 r = bl.length();
f64942e4
AA
9926 if (retry_count) {
9927 logger->inc(l_bluestore_reads_with_retries);
9928 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9929 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
f67539c2
TL
9930 stringstream s;
9931 s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
9932 _set_spurious_read_errors_alert(s.str());
f64942e4 9933 }
7c673cae
FG
9934 return r;
9935}
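// Retry sketch: a checksum failure is treated as potentially spurious (see
// the tracker link above), so _do_read re-issues itself with retry_count + 1
// until bluestore_retry_disk_reads is exhausted, after which the EIO
// surfaces to read(), which counts it via l_bluestore_read_eio.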
9936
9937int BlueStore::_verify_csum(OnodeRef& o,
9938 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9939 const bufferlist& bl,
9940 uint64_t logical_offset) const
9941{
9942 int bad;
9943 uint64_t bad_csum;
11fdf7f2 9944 auto start = mono_clock::now();
7c673cae 9945 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
9946 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9947 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
 9948 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
9949 bad = blob_xoffset;
9950 r = -1;
9951 bad_csum = 0xDEADBEEF;
9952 }
7c673cae
FG
9953 if (r < 0) {
9954 if (r == -1) {
9955 PExtentVector pex;
9956 blob->map(
9957 bad,
9958 blob->get_csum_chunk_size(),
9959 [&](uint64_t offset, uint64_t length) {
9960 pex.emplace_back(bluestore_pextent_t(offset, length));
9961 return 0;
9962 });
9963 derr << __func__ << " bad "
9964 << Checksummer::get_csum_type_string(blob->csum_type)
9965 << "/0x" << std::hex << blob->get_csum_chunk_size()
9966 << " checksum at blob offset 0x" << bad
9967 << ", got 0x" << bad_csum << ", expected 0x"
9968 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
9969 << ", device location " << pex
9970 << ", logical extent 0x" << std::hex
9971 << (logical_offset + bad - blob_xoffset) << "~"
9972 << blob->get_csum_chunk_size() << std::dec
9973 << ", object " << o->oid
9974 << dendl;
9975 } else {
9976 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
9977 }
9978 }
494da23a
TL
9979 log_latency(__func__,
9980 l_bluestore_csum_lat,
9981 mono_clock::now() - start,
9982 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
9983 if (cct->_conf->bluestore_ignore_data_csum) {
9984 return 0;
9985 }
7c673cae
FG
9986 return r;
9987}
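// Note: blob->map() above translates the bad csum chunk back to physical
// extents purely for the error message; with bluestore_ignore_data_csum
// set, the mismatch is still logged but the caller is told the read
// verified (0).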
9988
9989int BlueStore::_decompress(bufferlist& source, bufferlist* result)
9990{
9991 int r = 0;
11fdf7f2
TL
9992 auto start = mono_clock::now();
9993 auto i = source.cbegin();
7c673cae 9994 bluestore_compression_header_t chdr;
11fdf7f2 9995 decode(chdr, i);
7c673cae
FG
9996 int alg = int(chdr.type);
9997 CompressorRef cp = compressor;
9998 if (!cp || (int)cp->get_type() != alg) {
9999 cp = Compressor::create(cct, alg);
10000 }
10001
10002 if (!cp.get()) {
 10003 // if the compressor isn't available, fail: we cannot return
 10004 // the decompressed data
11fdf7f2
TL
10005
10006 const char* alg_name = Compressor::get_comp_alg_name(alg);
10007 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10008 _set_compression_alert(false, alg_name);
7c673cae
FG
10009 r = -EIO;
10010 } else {
f67539c2 10011 r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
7c673cae
FG
10012 if (r < 0) {
10013 derr << __func__ << " decompression failed with exit code " << r << dendl;
10014 r = -EIO;
10015 }
10016 }
494da23a
TL
10017 log_latency(__func__,
10018 l_bluestore_decompress_lat,
10019 mono_clock::now() - start,
10020 cct->_conf->bluestore_log_op_age);
7c673cae
FG
10021 return r;
10022}
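// Rough on-disk shape of what gets decoded above (a sketch; see
// bluestore_compression_header_t for the authoritative layout):
//
//   [header: type (algorithm), length (raw size), compressor_message]
//   [compressed payload]
//
// The cached `compressor` is used when its type matches; otherwise a
// matching decompressor is instantiated via Compressor::create().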
10023
10024// this stores fiemap into interval_set, other variations
10025// use it internally
10026int BlueStore::_fiemap(
10027 CollectionHandle &c_,
10028 const ghobject_t& oid,
10029 uint64_t offset,
10030 size_t length,
10031 interval_set<uint64_t>& destset)
10032{
10033 Collection *c = static_cast<Collection *>(c_.get());
10034 if (!c->exists)
10035 return -ENOENT;
10036 {
9f95a23c 10037 std::shared_lock l(c->lock);
7c673cae
FG
10038
10039 OnodeRef o = c->get_onode(oid, false);
10040 if (!o || !o->exists) {
10041 return -ENOENT;
10042 }
81eedcae 10043 _dump_onode<30>(cct, *o);
7c673cae
FG
10044
10045 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10046 << " size 0x" << o->onode.size << std::dec << dendl;
10047
10048 boost::intrusive::set<Extent>::iterator ep, eend;
10049 if (offset >= o->onode.size)
10050 goto out;
10051
10052 if (offset + length > o->onode.size) {
10053 length = o->onode.size - offset;
10054 }
10055
10056 o->extent_map.fault_range(db, offset, length);
10057 eend = o->extent_map.extent_map.end();
10058 ep = o->extent_map.seek_lextent(offset);
10059 while (length > 0) {
10060 dout(20) << __func__ << " offset " << offset << dendl;
10061 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10062 ++ep;
10063 continue;
10064 }
10065
10066 uint64_t x_len = length;
10067 if (ep != eend && ep->logical_offset <= offset) {
10068 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10069 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
10070 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10071 << x_len << std::dec << " blob " << ep->blob << dendl;
10072 destset.insert(offset, x_len);
10073 length -= x_len;
10074 offset += x_len;
10075 if (x_off + x_len == ep->length)
10076 ++ep;
10077 continue;
10078 }
10079 if (ep != eend &&
10080 ep->logical_offset > offset &&
10081 ep->logical_offset - offset < x_len) {
10082 x_len = ep->logical_offset - offset;
10083 }
10084 offset += x_len;
10085 length -= x_len;
10086 }
10087 }
9f95a23c
TL
10088
10089 out:
10090 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10091 << " size = 0x(" << destset << ")" << std::dec << dendl;
10092 return 0;
10093}
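// Walk sketch: ep tracks the lextent at or after the current offset; each
// iteration either consumes overlap with that extent (destset.insert, then
// advance) or skips the hole up to the next logical_offset. interval_set
// coalesces adjacent ranges, which is what the fiemap wrappers below encode.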
10094
10095int BlueStore::fiemap(
10096 CollectionHandle &c_,
10097 const ghobject_t& oid,
10098 uint64_t offset,
10099 size_t length,
10100 bufferlist& bl)
10101{
10102 interval_set<uint64_t> m;
10103 int r = _fiemap(c_, oid, offset, length, m);
10104 if (r >= 0) {
10105 encode(m, bl);
10106 }
10107 return r;
10108}
10109
10110int BlueStore::fiemap(
10111 CollectionHandle &c_,
10112 const ghobject_t& oid,
10113 uint64_t offset,
10114 size_t length,
10115 map<uint64_t, uint64_t>& destmap)
10116{
10117 interval_set<uint64_t> m;
10118 int r = _fiemap(c_, oid, offset, length, m);
10119 if (r >= 0) {
10120 destmap = std::move(m).detach();
10121 }
10122 return r;
10123}
10124
10125int BlueStore::readv(
10126 CollectionHandle &c_,
10127 const ghobject_t& oid,
10128 interval_set<uint64_t>& m,
10129 bufferlist& bl,
10130 uint32_t op_flags)
10131{
10132 auto start = mono_clock::now();
10133 Collection *c = static_cast<Collection *>(c_.get());
10134 const coll_t &cid = c->get_cid();
10135 dout(15) << __func__ << " " << cid << " " << oid
10136 << " fiemap " << m
10137 << dendl;
10138 if (!c->exists)
10139 return -ENOENT;
10140
10141 bl.clear();
10142 int r;
10143 {
10144 std::shared_lock l(c->lock);
10145 auto start1 = mono_clock::now();
10146 OnodeRef o = c->get_onode(oid, false);
10147 log_latency("get_onode@read",
10148 l_bluestore_read_onode_meta_lat,
10149 mono_clock::now() - start1,
10150 cct->_conf->bluestore_log_op_age);
10151 if (!o || !o->exists) {
10152 r = -ENOENT;
10153 goto out;
10154 }
10155
10156 if (m.empty()) {
10157 r = 0;
10158 goto out;
10159 }
10160
10161 r = _do_readv(c, o, m, bl, op_flags);
10162 if (r == -EIO) {
10163 logger->inc(l_bluestore_read_eio);
10164 }
10165 }
10166
10167 out:
10168 if (r >= 0 && _debug_data_eio(oid)) {
10169 r = -EIO;
10170 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10171 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10172 cct->_conf->bluestore_debug_random_read_err &&
10173 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10174 100.0)) == 0) {
10175 dout(0) << __func__ << ": inject random EIO" << dendl;
10176 r = -EIO;
10177 }
10178 dout(10) << __func__ << " " << cid << " " << oid
10179 << " fiemap " << m << std::dec
10180 << " = " << r << dendl;
10181 log_latency(__func__,
10182 l_bluestore_read_lat,
10183 mono_clock::now() - start,
10184 cct->_conf->bluestore_log_op_age);
10185 return r;
10186}
10187
10188int BlueStore::_do_readv(
10189 Collection *c,
10190 OnodeRef o,
10191 const interval_set<uint64_t>& m,
10192 bufferlist& bl,
10193 uint32_t op_flags,
10194 uint64_t retry_count)
10195{
10196 FUNCTRACE(cct);
10197 int r = 0;
10198 int read_cache_policy = 0; // do not bypass clean or dirty cache
10199
10200 dout(20) << __func__ << " fiemap " << m << std::hex
10201 << " size 0x" << o->onode.size << " (" << std::dec
10202 << o->onode.size << ")" << dendl;
10203
10204 // generally, don't buffer anything, unless the client explicitly requests
10205 // it.
10206 bool buffered = false;
10207 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10208 dout(20) << __func__ << " will do buffered read" << dendl;
10209 buffered = true;
10210 } else if (cct->_conf->bluestore_default_buffered_read &&
10211 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10212 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10213 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10214 buffered = true;
10215 }
10216 // this method must be idempotent since we may call it several times
10217 // before we finally read the expected result.
10218 bl.clear();
10219
10220 // call fiemap first!
10221 ceph_assert(m.range_start() <= o->onode.size);
10222 ceph_assert(m.range_end() <= o->onode.size);
10223 auto start = mono_clock::now();
10224 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10225 log_latency(__func__,
10226 l_bluestore_read_onode_meta_lat,
10227 mono_clock::now() - start,
10228 cct->_conf->bluestore_log_op_age);
10229 _dump_onode<30>(cct, *o);
10230
10231 IOContext ioc(cct, NULL, true); // allow EIO
10232 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10233 raw_results.reserve(m.num_intervals());
10234 int i = 0;
10235 for (auto p = m.begin(); p != m.end(); p++, i++) {
10236 raw_results.push_back({});
10237 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10238 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10239 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10240 // we always issue aio for reading, so errors other than EIO are not allowed
10241 if (r < 0)
10242 return r;
10243 }
10244
10245 auto num_ios = m.size();
10246 if (ioc.has_pending_aios()) {
10247 num_ios = ioc.get_num_ios();
10248 bdev->aio_submit(&ioc);
10249 dout(20) << __func__ << " waiting for aio" << dendl;
10250 ioc.aio_wait();
10251 r = ioc.get_return_value();
10252 if (r < 0) {
10253 ceph_assert(r == -EIO); // no other errors allowed
10254 return -EIO;
10255 }
10256 }
10257 log_latency_fn(__func__,
10258 l_bluestore_read_wait_aio_lat,
10259 mono_clock::now() - start,
10260 cct->_conf->bluestore_log_op_age,
10261 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10262 );
10263
10264 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10265 i = 0;
10266 for (auto p = m.begin(); p != m.end(); p++, i++) {
10267 bool csum_error = false;
10268 bufferlist t;
10269 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10270 std::get<0>(raw_results[i]),
10271 std::get<1>(raw_results[i]),
10272 std::get<2>(raw_results[i]),
10273 buffered, &csum_error, t);
10274 if (csum_error) {
10275 // Handles spurious read errors caused by a kernel bug.
10276 // We sometimes get all-zero pages as a result of the read under
10277 // high memory pressure. Retrying the failing read succeeds in most
10278 // cases.
10279 // See also: http://tracker.ceph.com/issues/22464
10280 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10281 return -EIO;
10282 }
10283 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10284 }
10285 bl.claim_append(t);
10286 }
10287 if (retry_count) {
10288 logger->inc(l_bluestore_reads_with_retries);
10289 dout(5) << __func__ << " read fiemap " << m
10290 << " failed " << retry_count << " times before succeeding"
10291 << dendl;
10292 }
10293 return bl.length();
7c673cae
FG
10294}
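// Note: unlike _do_read, this batches every interval's reads into a single
// IOContext and one aio_submit()/aio_wait() round trip, then assembles each
// interval separately; a csum error on any interval retries the whole
// readv, which is why the method is written to be idempotent (the
// bl.clear() up front).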
10295
9f95a23c 10296int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10297 const ghobject_t& oid,
9f95a23c
TL
10298 const string& section_name,
10299 Formatter *f)
7c673cae 10300{
9f95a23c
TL
10301 Collection *c = static_cast<Collection *>(c_.get());
10302 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10303 if (!c->exists)
10304 return -ENOENT;
7c673cae 10305
9f95a23c
TL
10306 int r;
10307 {
10308 std::shared_lock l(c->lock);
10309
10310 OnodeRef o = c->get_onode(oid, false);
10311 if (!o || !o->exists) {
10312 r = -ENOENT;
10313 goto out;
10314 }
10315 // FIXME minor: actually the next line isn't enough to
 10316 // load shared blobs. Leaving as-is for now.
10317 //
10318 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10319
10320 _dump_onode<0>(cct, *o);
10321 f->open_object_section(section_name.c_str());
10322 o->dump(f);
10323 f->close_section();
10324 r = 0;
7c673cae 10325 }
9f95a23c
TL
10326 out:
10327 dout(10) << __func__ << " " << c->cid << " " << oid
10328 << " = " << r << dendl;
7c673cae
FG
10329 return r;
10330}
10331
7c673cae
FG
10332int BlueStore::getattr(
10333 CollectionHandle &c_,
10334 const ghobject_t& oid,
10335 const char *name,
10336 bufferptr& value)
10337{
10338 Collection *c = static_cast<Collection *>(c_.get());
10339 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10340 if (!c->exists)
10341 return -ENOENT;
10342
10343 int r;
10344 {
9f95a23c 10345 std::shared_lock l(c->lock);
f91f0fd5 10346 mempool::bluestore_cache_meta::string k(name);
7c673cae
FG
10347
10348 OnodeRef o = c->get_onode(oid, false);
10349 if (!o || !o->exists) {
10350 r = -ENOENT;
10351 goto out;
10352 }
10353
10354 if (!o->onode.attrs.count(k)) {
10355 r = -ENODATA;
10356 goto out;
10357 }
10358 value = o->onode.attrs[k];
10359 r = 0;
10360 }
10361 out:
7c673cae
FG
10362 if (r == 0 && _debug_mdata_eio(oid)) {
10363 r = -EIO;
10364 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10365 }
10366 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10367 << " = " << r << dendl;
10368 return r;
10369}
10370
7c673cae
FG
10371int BlueStore::getattrs(
10372 CollectionHandle &c_,
10373 const ghobject_t& oid,
10374 map<string,bufferptr>& aset)
10375{
10376 Collection *c = static_cast<Collection *>(c_.get());
10377 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10378 if (!c->exists)
10379 return -ENOENT;
10380
10381 int r;
10382 {
9f95a23c 10383 std::shared_lock l(c->lock);
7c673cae
FG
10384
10385 OnodeRef o = c->get_onode(oid, false);
10386 if (!o || !o->exists) {
10387 r = -ENOENT;
10388 goto out;
10389 }
10390 for (auto& i : o->onode.attrs) {
10391 aset.emplace(i.first.c_str(), i.second);
10392 }
10393 r = 0;
10394 }
10395
10396 out:
7c673cae
FG
10397 if (r == 0 && _debug_mdata_eio(oid)) {
10398 r = -EIO;
10399 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10400 }
10401 dout(10) << __func__ << " " << c->cid << " " << oid
10402 << " = " << r << dendl;
10403 return r;
10404}
10405
10406int BlueStore::list_collections(vector<coll_t>& ls)
10407{
9f95a23c 10408 std::shared_lock l(coll_lock);
11fdf7f2 10409 ls.reserve(coll_map.size());
7c673cae
FG
10410 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10411 p != coll_map.end();
10412 ++p)
10413 ls.push_back(p->first);
10414 return 0;
10415}
10416
10417bool BlueStore::collection_exists(const coll_t& c)
10418{
9f95a23c 10419 std::shared_lock l(coll_lock);
7c673cae
FG
10420 return coll_map.count(c);
10421}
10422
11fdf7f2 10423int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 10424{
11fdf7f2 10425 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
10426 vector<ghobject_t> ls;
10427 ghobject_t next;
11fdf7f2 10428 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
10429 &ls, &next);
10430 if (r < 0) {
10431 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10432 << dendl;
10433 return r;
10434 }
10435 *empty = ls.empty();
11fdf7f2 10436 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
10437 return 0;
10438}
10439
11fdf7f2 10440int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 10441{
11fdf7f2
TL
10442 dout(15) << __func__ << " " << ch->cid << dendl;
10443 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 10444 std::shared_lock l(c->lock);
11fdf7f2 10445 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
10446 return c->cnode.bits;
10447}
10448
7c673cae
FG
10449int BlueStore::collection_list(
10450 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10451 vector<ghobject_t> *ls, ghobject_t *pnext)
10452{
10453 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 10454 c->flush();
7c673cae
FG
10455 dout(15) << __func__ << " " << c->cid
10456 << " start " << start << " end " << end << " max " << max << dendl;
10457 int r;
10458 {
9f95a23c 10459 std::shared_lock l(c->lock);
f91f0fd5
TL
10460 r = _collection_list(c, start, end, max, false, ls, pnext);
10461 }
10462
10463 dout(10) << __func__ << " " << c->cid
10464 << " start " << start << " end " << end << " max " << max
10465 << " = " << r << ", ls.size() = " << ls->size()
10466 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10467 return r;
10468}
10469
10470int BlueStore::collection_list_legacy(
10471 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10472 vector<ghobject_t> *ls, ghobject_t *pnext)
10473{
10474 Collection *c = static_cast<Collection *>(c_.get());
10475 c->flush();
10476 dout(15) << __func__ << " " << c->cid
10477 << " start " << start << " end " << end << " max " << max << dendl;
10478 int r;
10479 {
10480 std::shared_lock l(c->lock);
10481 r = _collection_list(c, start, end, max, true, ls, pnext);
7c673cae
FG
10482 }
10483
7c673cae
FG
10484 dout(10) << __func__ << " " << c->cid
10485 << " start " << start << " end " << end << " max " << max
10486 << " = " << r << ", ls.size() = " << ls->size()
10487 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10488 return r;
10489}
10490
10491int BlueStore::_collection_list(
10492 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
f91f0fd5 10493 bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
7c673cae
FG
10494{
10495
10496 if (!c->exists)
10497 return -ENOENT;
10498
494da23a 10499 auto start_time = mono_clock::now();
7c673cae
FG
10500 int r = 0;
10501 ghobject_t static_next;
f91f0fd5
TL
10502 std::unique_ptr<CollectionListIterator> it;
10503 ghobject_t coll_range_temp_start, coll_range_temp_end;
10504 ghobject_t coll_range_start, coll_range_end;
7c673cae 10505 bool set_next = false;
f91f0fd5 10506 ghobject_t pend;
7c673cae
FG
10507 bool temp;
10508
10509 if (!pnext)
10510 pnext = &static_next;
10511
11fdf7f2 10512 if (start.is_max() || start.hobj.is_max()) {
7c673cae
FG
10513 goto out;
10514 }
f91f0fd5
TL
10515 get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
10516 &coll_range_temp_end, &coll_range_start, &coll_range_end);
7c673cae 10517 dout(20) << __func__
f91f0fd5
TL
10518 << " range " << coll_range_temp_start
10519 << " to " << coll_range_temp_end
10520 << " and " << coll_range_start
10521 << " to " << coll_range_end
7c673cae 10522 << " start " << start << dendl;
f91f0fd5
TL
10523 if (legacy) {
10524 it = std::make_unique<SimpleCollectionListIterator>(
10525 cct, db->get_iterator(PREFIX_OBJ));
10526 } else {
10527 it = std::make_unique<SortedCollectionListIterator>(
10528 db->get_iterator(PREFIX_OBJ));
10529 }
7c673cae
FG
10530 if (start == ghobject_t() ||
10531 start.hobj == hobject_t() ||
10532 start == c->cid.get_min_hobj()) {
f91f0fd5 10533 it->upper_bound(coll_range_temp_start);
7c673cae
FG
10534 temp = true;
10535 } else {
7c673cae
FG
10536 if (start.hobj.is_temp()) {
10537 temp = true;
f91f0fd5 10538 ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
7c673cae
FG
10539 } else {
10540 temp = false;
f91f0fd5 10541 ceph_assert(start >= coll_range_start && start < coll_range_end);
7c673cae 10542 }
f91f0fd5
TL
10543 dout(20) << __func__ << " temp=" << (int)temp << dendl;
10544 it->lower_bound(start);
7c673cae
FG
10545 }
10546 if (end.hobj.is_max()) {
f91f0fd5 10547 pend = temp ? coll_range_temp_end : coll_range_end;
7c673cae 10548 } else {
7c673cae
FG
10549 if (end.hobj.is_temp()) {
10550 if (temp)
f91f0fd5 10551 pend = end;
7c673cae 10552 else
f91f0fd5 10553 goto out;
7c673cae 10554 } else {
f91f0fd5 10555 pend = temp ? coll_range_temp_end : end;
7c673cae
FG
10556 }
10557 }
f91f0fd5 10558 dout(20) << __func__ << " pend " << pend << dendl;
7c673cae 10559 while (true) {
adb31ebb 10560 if (!it->valid() || it->is_ge(pend)) {
7c673cae
FG
10561 if (!it->valid())
10562 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10563 else
f91f0fd5 10564 dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
7c673cae
FG
10565 if (temp) {
10566 if (end.hobj.is_temp()) {
adb31ebb 10567 if (it->valid() && it->is_lt(coll_range_temp_end)) {
f91f0fd5
TL
10568 *pnext = it->oid();
10569 set_next = true;
10570 }
7c673cae
FG
10571 break;
10572 }
10573 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10574 temp = false;
f91f0fd5
TL
10575 it->upper_bound(coll_range_start);
10576 if (end.hobj.is_max())
10577 pend = coll_range_end;
10578 else
10579 pend = end;
10580 dout(30) << __func__ << " pend " << pend << dendl;
7c673cae
FG
10581 continue;
10582 }
adb31ebb 10583 if (it->valid() && it->is_lt(coll_range_end)) {
f91f0fd5
TL
10584 *pnext = it->oid();
10585 set_next = true;
10586 }
7c673cae
FG
10587 break;
10588 }
f91f0fd5 10589 dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
7c673cae
FG
10590 if (ls->size() >= (unsigned)max) {
10591 dout(20) << __func__ << " reached max " << max << dendl;
f91f0fd5 10592 *pnext = it->oid();
7c673cae
FG
10593 set_next = true;
10594 break;
10595 }
f91f0fd5 10596 ls->push_back(it->oid());
7c673cae
FG
10597 it->next();
10598 }
10599out:
10600 if (!set_next) {
10601 *pnext = ghobject_t::get_max();
10602 }
494da23a
TL
10603 log_latency_fn(
10604 __func__,
10605 l_bluestore_clist_lat,
10606 mono_clock::now() - start_time,
10607 cct->_conf->bluestore_log_collection_list_age,
10608 [&] (const ceph::timespan& lat) {
10609 ostringstream ostr;
10610 ostr << ", lat = " << timespan_str(lat)
10611 << " cid =" << c->cid
10612 << " start " << start << " end " << end
10613 << " max " << max;
10614 return ostr.str();
10615 }
10616 );
7c673cae
FG
10617 return r;
10618}
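// Key-space sketch for the scan above (illustrative): objects live in two
// disjoint ranges of PREFIX_OBJ, and one logical listing walks both.
//
//   [coll_range_temp_start, coll_range_temp_end)  temp objects
//   [coll_range_start,      coll_range_end)       durable objects
//
// The iterator starts in the temp range; when it runs off the end (or past
// pend), it reseeks to coll_range_start and continues, so temp objects
// always sort before durable ones in the returned listing.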
10619
7c673cae
FG
10620int BlueStore::omap_get(
10621 CollectionHandle &c_, ///< [in] Collection containing oid
10622 const ghobject_t &oid, ///< [in] Object containing omap
10623 bufferlist *header, ///< [out] omap header
10624 map<string, bufferlist> *out ///< [out] Key to value map
10625 )
10626{
10627 Collection *c = static_cast<Collection *>(c_.get());
9f95a23c
TL
10628 return _omap_get(c, oid, header, out);
10629}
10630
10631int BlueStore::_omap_get(
10632 Collection *c, ///< [in] Collection containing oid
10633 const ghobject_t &oid, ///< [in] Object containing omap
10634 bufferlist *header, ///< [out] omap header
10635 map<string, bufferlist> *out ///< [out] Key to value map
10636 )
10637{
7c673cae
FG
10638 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10639 if (!c->exists)
10640 return -ENOENT;
9f95a23c 10641 std::shared_lock l(c->lock);
7c673cae
FG
10642 int r = 0;
10643 OnodeRef o = c->get_onode(oid, false);
10644 if (!o || !o->exists) {
10645 r = -ENOENT;
10646 goto out;
10647 }
9f95a23c
TL
10648 r = _onode_omap_get(o, header, out);
10649 out:
10650 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10651 << dendl;
10652 return r;
10653}
10654
10655int BlueStore::_onode_omap_get(
10656 const OnodeRef &o, ///< [in] Object containing omap
10657 bufferlist *header, ///< [out] omap header
10658 map<string, bufferlist> *out ///< [out] Key to value map
10659)
10660{
10661 int r = 0;
10662 if (!o || !o->exists) {
10663 r = -ENOENT;
10664 goto out;
10665 }
7c673cae
FG
10666 if (!o->onode.has_omap())
10667 goto out;
10668 o->flush();
10669 {
9f95a23c 10670 const string& prefix = o->get_omap_prefix();
11fdf7f2 10671 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10672 string head, tail;
9f95a23c
TL
10673 o->get_omap_header(&head);
10674 o->get_omap_tail(&tail);
7c673cae
FG
10675 it->lower_bound(head);
10676 while (it->valid()) {
10677 if (it->key() == head) {
9f95a23c
TL
10678 dout(30) << __func__ << " got header" << dendl;
10679 *header = it->value();
7c673cae 10680 } else if (it->key() >= tail) {
9f95a23c
TL
10681 dout(30) << __func__ << " reached tail" << dendl;
10682 break;
7c673cae 10683 } else {
9f95a23c
TL
10684 string user_key;
10685 o->decode_omap_key(it->key(), &user_key);
10686 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10687 << " -> " << user_key << dendl;
10688 (*out)[user_key] = it->value();
7c673cae
FG
10689 }
10690 it->next();
10691 }
10692 }
9f95a23c 10693out:
7c673cae
FG
10694 return r;
10695}
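// Omap key layout assumed by the loop above (illustrative): within the
// onode's omap prefix, the header key sorts first, encoded user keys follow,
// and a sentinel tail key bounds the range, so one lower_bound()+next() scan
// visits the header and all user keys in order:
//
//   <header key> < <encoded user key "a"> < ... < <tail key>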
10696
7c673cae
FG
10697int BlueStore::omap_get_header(
10698 CollectionHandle &c_, ///< [in] Collection containing oid
10699 const ghobject_t &oid, ///< [in] Object containing omap
10700 bufferlist *header, ///< [out] omap header
10701 bool allow_eio ///< [in] don't assert on eio
10702 )
10703{
10704 Collection *c = static_cast<Collection *>(c_.get());
10705 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10706 if (!c->exists)
10707 return -ENOENT;
9f95a23c 10708 std::shared_lock l(c->lock);
7c673cae
FG
10709 int r = 0;
10710 OnodeRef o = c->get_onode(oid, false);
10711 if (!o || !o->exists) {
10712 r = -ENOENT;
10713 goto out;
10714 }
10715 if (!o->onode.has_omap())
10716 goto out;
10717 o->flush();
10718 {
10719 string head;
9f95a23c
TL
10720 o->get_omap_header(&head);
10721 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
7c673cae
FG
10722 dout(30) << __func__ << " got header" << dendl;
10723 } else {
10724 dout(30) << __func__ << " no header" << dendl;
10725 }
10726 }
10727 out:
10728 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10729 << dendl;
10730 return r;
10731}
10732
7c673cae
FG
10733int BlueStore::omap_get_keys(
10734 CollectionHandle &c_, ///< [in] Collection containing oid
10735 const ghobject_t &oid, ///< [in] Object containing omap
10736 set<string> *keys ///< [out] Keys defined on oid
10737 )
10738{
10739 Collection *c = static_cast<Collection *>(c_.get());
10740 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10741 if (!c->exists)
10742 return -ENOENT;
adb31ebb 10743 auto start1 = mono_clock::now();
9f95a23c 10744 std::shared_lock l(c->lock);
7c673cae
FG
10745 int r = 0;
10746 OnodeRef o = c->get_onode(oid, false);
10747 if (!o || !o->exists) {
10748 r = -ENOENT;
10749 goto out;
10750 }
10751 if (!o->onode.has_omap())
10752 goto out;
10753 o->flush();
10754 {
9f95a23c 10755 const string& prefix = o->get_omap_prefix();
11fdf7f2 10756 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10757 string head, tail;
9f95a23c
TL
10758 o->get_omap_key(string(), &head);
10759 o->get_omap_tail(&tail);
7c673cae
FG
10760 it->lower_bound(head);
10761 while (it->valid()) {
10762 if (it->key() >= tail) {
10763 dout(30) << __func__ << " reached tail" << dendl;
10764 break;
10765 }
10766 string user_key;
9f95a23c 10767 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 10768 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
10769 << " -> " << user_key << dendl;
10770 keys->insert(user_key);
10771 it->next();
11fdf7f2
TL
10772 }
10773 }
10774 out:
adb31ebb
TL
10775 c->store->log_latency(
10776 __func__,
10777 l_bluestore_omap_get_keys_lat,
10778 mono_clock::now() - start1,
10779 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10780
11fdf7f2
TL
10781 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10782 << dendl;
10783 return r;
7c673cae
FG
10784}
10785
10786int BlueStore::omap_get_values(
10787 CollectionHandle &c_, ///< [in] Collection containing oid
10788 const ghobject_t &oid, ///< [in] Object containing omap
10789 const set<string> &keys, ///< [in] Keys to get
10790 map<string, bufferlist> *out ///< [out] Returned keys and values
10791 )
10792{
10793 Collection *c = static_cast<Collection *>(c_.get());
10794 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10795 if (!c->exists)
10796 return -ENOENT;
9f95a23c 10797 std::shared_lock l(c->lock);
adb31ebb 10798 auto start1 = mono_clock::now();
7c673cae
FG
10799 int r = 0;
10800 string final_key;
10801 OnodeRef o = c->get_onode(oid, false);
10802 if (!o || !o->exists) {
10803 r = -ENOENT;
10804 goto out;
10805 }
9f95a23c 10806 if (!o->onode.has_omap()) {
7c673cae 10807 goto out;
9f95a23c
TL
10808 }
10809 o->flush();
11fdf7f2 10810 {
9f95a23c
TL
10811 const string& prefix = o->get_omap_prefix();
10812 o->get_omap_key(string(), &final_key);
10813 size_t base_key_len = final_key.size();
11fdf7f2 10814 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10815 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
10816 final_key += *p;
10817 bufferlist val;
10818 if (db->get(prefix, final_key, &val) >= 0) {
10819 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10820 << " -> " << *p << dendl;
10821 out->insert(make_pair(*p, val));
10822 }
7c673cae
FG
10823 }
10824 }
10825 out:
adb31ebb
TL
10826 c->store->log_latency(
10827 __func__,
10828 l_bluestore_omap_get_values_lat,
10829 mono_clock::now() - start1,
10830 c->store->cct->_conf->bluestore_log_omap_iterator_age);
10831
7c673cae
FG
10832 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10833 << dendl;
10834 return r;
10835}
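// Hypothetical caller sketch for the point lookups above (key names are
// made up for illustration):
//
//   set<string> keys = {"k1", "k2"};
//   map<string, bufferlist> vals;
//   int r = store->omap_get_values(ch, oid, keys, &vals);
//   // vals contains only the keys that exist; absent keys are skipped,
//   // and r is 0 even if none were found (or -ENOENT if oid is missing).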
10836
9f95a23c
TL
10837#ifdef WITH_SEASTAR
10838int BlueStore::omap_get_values(
10839 CollectionHandle &c_, ///< [in] Collection containing oid
10840 const ghobject_t &oid, ///< [in] Object containing omap
10841 const std::optional<string> &start_after, ///< [in] Return keys after this key
10842 map<string, bufferlist> *output ///< [out] Returned keys and values
10843 )
10844{
10845 Collection *c = static_cast<Collection *>(c_.get());
10846 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10847 if (!c->exists)
10848 return -ENOENT;
10849 std::shared_lock l(c->lock);
10850 int r = 0;
10851 OnodeRef o = c->get_onode(oid, false);
10852 if (!o || !o->exists) {
10853 r = -ENOENT;
10854 goto out;
10855 }
10856 if (!o->onode.has_omap()) {
10857 goto out;
10858 }
10859 o->flush();
10860 {
10861 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10862 if (!iter) {
10863 r = -ENOENT;
10864 goto out;
10865 }
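// note: start_after appears to be assumed engaged here (it is dereferenced
// unconditionally below); callers of this WITH_SEASTAR overload are
// presumably expected to pass a concrete key.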
10866 iter->upper_bound(*start_after);
10867 for (; iter->valid(); iter->next()) {
10868 output->insert(make_pair(iter->key(), iter->value()));
10869 }
10870 }
10871
10872out:
10873 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10874 << dendl;
10875 return r;
10876}
10877#endif
10878
7c673cae
FG
10879int BlueStore::omap_check_keys(
10880 CollectionHandle &c_, ///< [in] Collection containing oid
10881 const ghobject_t &oid, ///< [in] Object containing omap
10882 const set<string> &keys, ///< [in] Keys to check
10883 set<string> *out ///< [out] Subset of keys defined on oid
10884 )
10885{
10886 Collection *c = static_cast<Collection *>(c_.get());
10887 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10888 if (!c->exists)
10889 return -ENOENT;
9f95a23c 10890 std::shared_lock l(c->lock);
7c673cae
FG
10891 int r = 0;
10892 string final_key;
10893 OnodeRef o = c->get_onode(oid, false);
10894 if (!o || !o->exists) {
10895 r = -ENOENT;
10896 goto out;
10897 }
9f95a23c 10898 if (!o->onode.has_omap()) {
7c673cae 10899 goto out;
9f95a23c
TL
10900 }
10901 o->flush();
11fdf7f2 10902 {
9f95a23c
TL
10903 const string& prefix = o->get_omap_prefix();
10904 o->get_omap_key(string(), &final_key);
10905 size_t base_key_len = final_key.size();
11fdf7f2 10906 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10907 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
10908 final_key += *p;
10909 bufferlist val;
10910 if (db->get(prefix, final_key, &val) >= 0) {
10911 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10912 << " -> " << *p << dendl;
10913 out->insert(*p);
10914 } else {
10915 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10916 << " -> " << *p << dendl;
10917 }
7c673cae
FG
10918 }
10919 }
10920 out:
10921 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10922 << dendl;
10923 return r;
10924}
10925
7c673cae
FG
10926ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10927 CollectionHandle &c_, ///< [in] collection
10928 const ghobject_t &oid ///< [in] object
10929 )
10930{
10931 Collection *c = static_cast<Collection *>(c_.get());
10932 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10933 if (!c->exists) {
10934 return ObjectMap::ObjectMapIterator();
10935 }
9f95a23c 10936 std::shared_lock l(c->lock);
7c673cae
FG
10937 OnodeRef o = c->get_onode(oid, false);
10938 if (!o || !o->exists) {
10939 dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
10940 return ObjectMap::ObjectMapIterator();
10941 }
10942 o->flush();
10943 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
9f95a23c 10944 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
7c673cae
FG
10945 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10946}
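// Hypothetical iteration sketch using the returned handle:
//
//   ObjectMap::ObjectMapIterator it = store->get_omap_iterator(ch, oid);
//   if (it) {
//     for (it->seek_to_first(); it->valid(); it->next()) {
//       // it->key() is the decoded user key, it->value() the bufferlist
//     }
//   }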
10947
10948// -----------------
10949// write helpers
10950
11fdf7f2 10951uint64_t BlueStore::_get_ondisk_reserved() const {
f67539c2 10952 ceph_assert(min_alloc_size);
11fdf7f2
TL
10953 return round_up_to(
10954 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
10955}
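// Worked example (illustrative, assuming SUPER_RESERVED is 8192): with
// min_alloc_size = 4096 the reserve is round_up_to(8192, 4096) = 8192;
// with min_alloc_size = 65536 it becomes round_up_to(65536, 65536) = 65536.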
10956
7c673cae
FG
10957void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
10958{
10959 dout(10) << __func__ << " ondisk_format " << ondisk_format
10960 << " min_compat_ondisk_format " << min_compat_ondisk_format
10961 << dendl;
11fdf7f2 10962 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
10963 {
10964 bufferlist bl;
11fdf7f2 10965 encode(ondisk_format, bl);
7c673cae
FG
10966 t->set(PREFIX_SUPER, "ondisk_format", bl);
10967 }
10968 {
10969 bufferlist bl;
11fdf7f2 10970 encode(min_compat_ondisk_format, bl);
7c673cae
FG
10971 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
10972 }
10973}
10974
10975int BlueStore::_open_super_meta()
10976{
10977 // nid
10978 {
10979 nid_max = 0;
10980 bufferlist bl;
10981 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 10982 auto p = bl.cbegin();
7c673cae
FG
10983 try {
10984 uint64_t v;
11fdf7f2 10985 decode(v, p);
7c673cae 10986 nid_max = v;
f67539c2 10987 } catch (ceph::buffer::error& e) {
7c673cae
FG
10988 derr << __func__ << " unable to read nid_max" << dendl;
10989 return -EIO;
10990 }
f67539c2 10991 dout(1) << __func__ << " old nid_max " << nid_max << dendl;
7c673cae
FG
10992 nid_last = nid_max.load();
10993 }
10994
10995 // blobid
10996 {
10997 blobid_max = 0;
10998 bufferlist bl;
10999 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 11000 auto p = bl.cbegin();
7c673cae
FG
11001 try {
11002 uint64_t v;
11fdf7f2 11003 decode(v, p);
7c673cae 11004 blobid_max = v;
f67539c2 11005 } catch (ceph::buffer::error& e) {
7c673cae
FG
11006 derr << __func__ << " unable to read blobid_max" << dendl;
11007 return -EIO;
11008 }
f67539c2 11009 dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
7c673cae
FG
11010 blobid_last = blobid_max.load();
11011 }
11012
11013 // freelist
11014 {
11015 bufferlist bl;
11016 db->get(PREFIX_SUPER, "freelist_type", &bl);
11017 if (bl.length()) {
11018 freelist_type = std::string(bl.c_str(), bl.length());
f67539c2 11019 dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
7c673cae 11020 } else {
11fdf7f2 11021 ceph_abort_msg("Unsupported extent freelist manager");
7c673cae 11022 }
7c673cae
FG
11023 }
11024
11025 // ondisk format
11026 int32_t compat_ondisk_format = 0;
11027 {
11028 bufferlist bl;
11029 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
11030 if (r < 0) {
11031 // base case: kraken bluestore is v1 and readable by v1
11032 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
11033 << dendl;
11034 ondisk_format = 1;
11035 compat_ondisk_format = 1;
11036 } else {
11fdf7f2 11037 auto p = bl.cbegin();
7c673cae 11038 try {
11fdf7f2 11039 decode(ondisk_format, p);
f67539c2 11040 } catch (ceph::buffer::error& e) {
7c673cae
FG
11041 derr << __func__ << " unable to read ondisk_format" << dendl;
11042 return -EIO;
11043 }
11044 bl.clear();
11045 {
11046 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
11047 ceph_assert(!r);
11048 auto p = bl.cbegin();
7c673cae 11049 try {
11fdf7f2 11050 decode(compat_ondisk_format, p);
f67539c2 11051 } catch (ceph::buffer::error& e) {
7c673cae
FG
11052 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
11053 return -EIO;
11054 }
11055 }
11056 }
f67539c2 11057 dout(1) << __func__ << " ondisk_format " << ondisk_format
7c673cae
FG
11058 << " compat_ondisk_format " << compat_ondisk_format
11059 << dendl;
11060 }
11061
11062 if (latest_ondisk_format < compat_ondisk_format) {
11063 derr << __func__ << " compat_ondisk_format is "
11064 << compat_ondisk_format << " but we only understand version "
11065 << latest_ondisk_format << dendl;
11066 return -EPERM;
11067 }
7c673cae
FG
11068
11069 {
11070 bufferlist bl;
11071 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 11072 auto p = bl.cbegin();
7c673cae
FG
11073 try {
11074 uint64_t val;
11fdf7f2 11075 decode(val, p);
7c673cae 11076 min_alloc_size = val;
224ce89b 11077 min_alloc_size_order = ctz(val);
11fdf7f2 11078 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
f67539c2 11079 } catch (ceph::buffer::error& e) {
7c673cae
FG
11080 derr << __func__ << " unable to read min_alloc_size" << dendl;
11081 return -EIO;
11082 }
f67539c2 11083 dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
7c673cae
FG
11084 << std::dec << dendl;
11085 }
9f95a23c
TL
11086
11087 _set_per_pool_omap();
11088
224ce89b 11089 _open_statfs();
7c673cae
FG
11090 _set_alloc_sizes();
11091 _set_throttle_params();
11092
11093 _set_csum();
11094 _set_compression();
11095 _set_blob_size();
11096
11fdf7f2 11097 _validate_bdev();
7c673cae
FG
11098 return 0;
11099}
11100
11101int BlueStore::_upgrade_super()
11102{
11103 dout(1) << __func__ << " from " << ondisk_format << ", latest "
11104 << latest_ondisk_format << dendl;
11fdf7f2
TL
11105 if (ondisk_format < latest_ondisk_format) {
11106 ceph_assert(ondisk_format > 0);
11107 ceph_assert(ondisk_format < latest_ondisk_format);
11108
1911f103 11109 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
11110 if (ondisk_format == 1) {
11111 // changes:
11112 // - super: added ondisk_format
11113 // - super: added min_readable_ondisk_format
11114 // - super: added min_compat_ondisk_format
11115 // - super: added min_alloc_size
11116 // - super: removed min_min_alloc_size
11fdf7f2
TL
11117 {
11118 bufferlist bl;
11119 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11120 auto p = bl.cbegin();
11121 try {
11122 uint64_t val;
11123 decode(val, p);
11124 min_alloc_size = val;
f67539c2 11125 } catch (ceph::buffer::error& e) {
11fdf7f2
TL
11126 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11127 return -EIO;
11128 }
11129 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11130 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 11131 }
11fdf7f2 11132 ondisk_format = 2;
7c673cae 11133 }
9f95a23c
TL
11134 if (ondisk_format == 2) {
11135 // changes:
f67539c2
TL
11136 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
11137 // onodes are using the per-pool prefix until a repair is run; at that
9f95a23c
TL
11138 // point the per_pool_omap=1 key will be set.
11139 // - super: added per_pool_omap key, which indicates that *all* objects
11140 // are using the new prefix and key format
11141 ondisk_format = 3;
1911f103
TL
11142 }
11143 if (ondisk_format == 3) {
11144 // changes:
11145 // - FreelistManager keeps meta within bdev label
11146 int r = _write_out_fm_meta(0);
9f95a23c 11147 ceph_assert(r == 0);
1911f103 11148 ondisk_format = 4;
9f95a23c 11149 }
1911f103
TL
11150 // This must be the last operation
11151 _prepare_ondisk_format_super(t);
11152 int r = db->submit_transaction_sync(t);
11153 ceph_assert(r == 0);
7c673cae 11154 }
7c673cae
FG
11155 // done
11156 dout(1) << __func__ << " done" << dendl;
11157 return 0;
11158}
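// On-disk format history, summarized from the upgrade steps above:
//   v1: kraken baseline (no ondisk_format key in SUPER)
//   v2: min_alloc_size recorded in SUPER (min_min_alloc_size removed)
//   v3: per-pool omap prefixes/keys (per_pool_omap marker)
//   v4: FreelistManager metadata kept within the bdev label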
11159
11160void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11161{
224ce89b 11162 if (o->onode.nid) {
11fdf7f2 11163 ceph_assert(o->exists);
7c673cae 11164 return;
224ce89b 11165 }
7c673cae
FG
11166 uint64_t nid = ++nid_last;
11167 dout(20) << __func__ << " " << nid << dendl;
11168 o->onode.nid = nid;
11169 txc->last_nid = nid;
224ce89b 11170 o->exists = true;
7c673cae
FG
11171}
11172
11173uint64_t BlueStore::_assign_blobid(TransContext *txc)
11174{
11175 uint64_t bid = ++blobid_last;
11176 dout(20) << __func__ << " " << bid << dendl;
11177 txc->last_blobid = bid;
11178 return bid;
11179}
11180
11181void BlueStore::get_db_statistics(Formatter *f)
11182{
11183 db->get_statistics(f);
11184}
11185
11fdf7f2
TL
11186BlueStore::TransContext *BlueStore::_txc_create(
11187 Collection *c, OpSequencer *osr,
f67539c2
TL
11188 list<Context*> *on_commits,
11189 TrackedOpRef osd_op)
7c673cae 11190{
11fdf7f2 11191 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae 11192 txc->t = db->get_transaction();
f67539c2
TL
11193
11194#ifdef WITH_BLKIN
11195 if (osd_op && osd_op->pg_trace) {
11196 txc->trace.init("TransContext", &trace_endpoint,
11197 &osd_op->pg_trace);
11198 txc->trace.event("txc create");
11199 txc->trace.keyval("txc seq", txc->seq);
11200 }
11201#endif
11202
7c673cae
FG
11203 osr->queue_new(txc);
11204 dout(20) << __func__ << " osr " << osr << " = " << txc
11205 << " seq " << txc->seq << dendl;
11206 return txc;
11207}
11208
11209void BlueStore::_txc_calc_cost(TransContext *txc)
11210{
11fdf7f2
TL
11211 // one "io" for the kv commit
11212 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
11213 auto cost = throttle_cost_per_io.load();
11214 txc->cost = ios * cost + txc->bytes;
9f95a23c 11215 txc->ios = ios;
7c673cae
FG
11216 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11217 << ios << " ios * " << cost << " + " << txc->bytes
11218 << " bytes)" << dendl;
11219}
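// Worked example (illustrative numbers): a txc with 2 pending aios writes
// 65536 bytes and throttle_cost_per_io is 4000. Then ios = 1 + 2 = 3 (one
// extra "io" for the kv commit) and cost = 3 * 4000 + 65536 = 77536.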
11220
11221void BlueStore::_txc_update_store_statfs(TransContext *txc)
11222{
11223 if (txc->statfs_delta.is_empty())
11224 return;
11225
11226 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11227 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11228 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11229 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11230 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11231
11232 bufferlist bl;
11233 txc->statfs_delta.encode(bl);
11fdf7f2
TL
11234 if (per_pool_stat_collection) {
11235 string key;
11236 get_pool_stat_key(txc->osd_pool_id, &key);
11237 txc->t->merge(PREFIX_STAT, key, bl);
11238
11239 std::lock_guard l(vstatfs_lock);
11240 auto& stats = osd_pools[txc->osd_pool_id];
11241 stats += txc->statfs_delta;
11242
11243 vstatfs += txc->statfs_delta; //non-persistent in this mode
11244
11245 } else {
11246 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 11247
11fdf7f2
TL
11248 std::lock_guard l(vstatfs_lock);
11249 vstatfs += txc->statfs_delta;
11250 }
7c673cae
FG
11251 txc->statfs_delta.reset();
11252}
11253
11254void BlueStore::_txc_state_proc(TransContext *txc)
11255{
11256 while (true) {
11257 dout(10) << __func__ << " txc " << txc
11258 << " " << txc->get_state_name() << dendl;
f67539c2 11259 switch (txc->get_state()) {
7c673cae 11260 case TransContext::STATE_PREPARE:
9f95a23c 11261 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae 11262 if (txc->ioc.has_pending_aios()) {
f67539c2
TL
11263 txc->set_state(TransContext::STATE_AIO_WAIT);
11264#ifdef WITH_BLKIN
11265 if (txc->trace) {
11266 txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
11267 }
11268#endif
7c673cae
FG
11269 txc->had_ios = true;
11270 _txc_aio_submit(txc);
11271 return;
11272 }
11273 // ** fall-thru **
11274
11275 case TransContext::STATE_AIO_WAIT:
11fdf7f2 11276 {
9f95a23c
TL
11277 mono_clock::duration lat = throttle.log_state_latency(
11278 *txc, logger, l_bluestore_state_aio_wait_lat);
11279 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11fdf7f2
TL
11280 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11281 << ", latency = " << lat
11282 << dendl;
11283 }
11284 }
11285
7c673cae
FG
11286 _txc_finish_io(txc); // may trigger blocked txc's too
11287 return;
11288
11289 case TransContext::STATE_IO_DONE:
11fdf7f2 11290 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
11291 if (txc->had_ios) {
11292 ++txc->osr->txc_with_unstable_io;
11293 }
9f95a23c 11294 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
f67539c2 11295 txc->set_state(TransContext::STATE_KV_QUEUED);
7c673cae
FG
11296 if (cct->_conf->bluestore_sync_submit_transaction) {
11297 if (txc->last_nid >= nid_max ||
11298 txc->last_blobid >= blobid_max) {
11299 dout(20) << __func__
11300 << " last_{nid,blobid} exceeds max, submit via kv thread"
11301 << dendl;
11302 } else if (txc->osr->kv_committing_serially) {
11303 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11304 << dendl;
11305 // note: this is starvation-prone. once we have a txc in a busy
11306 // sequencer that is committing serially it is possible to keep
11307 // submitting new transactions fast enough that we get stuck doing
11308 // so. the alternative is to block here... fixme?
11309 } else if (txc->osr->txc_with_unstable_io) {
11310 dout(20) << __func__ << " prior txc(s) with unstable ios "
11311 << txc->osr->txc_with_unstable_io.load() << dendl;
11312 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11313 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11314 == 0) {
11315 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11316 << dendl;
11317 } else {
9f95a23c 11318 _txc_apply_kv(txc, true);
7c673cae
FG
11319 }
11320 }
11321 {
11fdf7f2 11322 std::lock_guard l(kv_lock);
7c673cae 11323 kv_queue.push_back(txc);
9f95a23c
TL
11324 if (!kv_sync_in_progress) {
11325 kv_sync_in_progress = true;
11326 kv_cond.notify_one();
11327 }
f67539c2 11328 if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
7c673cae
FG
11329 kv_queue_unsubmitted.push_back(txc);
11330 ++txc->osr->kv_committing_serially;
11331 }
31f18b77
FG
11332 if (txc->had_ios)
11333 kv_ios++;
11334 kv_throttle_costs += txc->cost;
7c673cae
FG
11335 }
11336 return;
11337 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
11338 _txc_committed_kv(txc);
11339 // ** fall-thru **
11340
11341 case TransContext::STATE_KV_DONE:
9f95a23c 11342 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae 11343 if (txc->deferred_txn) {
f67539c2 11344 txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
7c673cae
FG
11345 _deferred_queue(txc);
11346 return;
11347 }
f67539c2 11348 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
11349 break;
11350
11351 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 11352 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
f67539c2 11353 txc->set_state(TransContext::STATE_FINISHING);
7c673cae
FG
11354 // ** fall-thru **
11355
11356 case TransContext::STATE_FINISHING:
9f95a23c 11357 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
7c673cae
FG
11358 _txc_finish(txc);
11359 return;
11360
11361 default:
11362 derr << __func__ << " unexpected txc " << txc
11363 << " state " << txc->get_state_name() << dendl;
11fdf7f2 11364 ceph_abort_msg("unexpected txc state");
7c673cae
FG
11365 return;
11366 }
11367 }
11368}
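// Summary of the normal state progression driven above (the deferred states
// only appear when txc->deferred_txn is set):
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//     -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP] -> FINISHING -> DONE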
11369
11370void BlueStore::_txc_finish_io(TransContext *txc)
11371{
11372 dout(20) << __func__ << " " << txc << dendl;
11373
11374 /*
11375 * we need to preserve the order of kv transactions,
11376 * even though aio will complete in any order.
11377 */
11378
11379 OpSequencer *osr = txc->osr.get();
11fdf7f2 11380 std::lock_guard l(osr->qlock);
f67539c2 11381 txc->set_state(TransContext::STATE_IO_DONE);
11fdf7f2 11382 txc->ioc.release_running_aios();
7c673cae
FG
11383 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11384 while (p != osr->q.begin()) {
11385 --p;
f67539c2 11386 if (p->get_state() < TransContext::STATE_IO_DONE) {
7c673cae
FG
11387 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11388 << p->get_state_name() << dendl;
11389 return;
11390 }
f67539c2 11391 if (p->get_state() > TransContext::STATE_IO_DONE) {
7c673cae
FG
11392 ++p;
11393 break;
11394 }
11395 }
11396 do {
11397 _txc_state_proc(&*p++);
11398 } while (p != osr->q.end() &&
f67539c2 11399 p->get_state() == TransContext::STATE_IO_DONE);
7c673cae 11400
11fdf7f2 11401 if (osr->kv_submitted_waiters) {
7c673cae
FG
11402 osr->qcond.notify_all();
11403 }
11404}
11405
11406void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11407{
11408 dout(20) << __func__ << " txc " << txc
11409 << " onodes " << txc->onodes
11410 << " shared_blobs " << txc->shared_blobs
11411 << dendl;
11412
11413 // finalize onodes
11414 for (auto o : txc->onodes) {
11fdf7f2 11415 _record_onode(o, t);
7c673cae
FG
11416 o->flushing_count++;
11417 }
11418
11419 // objects we modified but didn't affect the onode
11420 auto p = txc->modified_objects.begin();
11421 while (p != txc->modified_objects.end()) {
11422 if (txc->onodes.count(*p) == 0) {
11423 (*p)->flushing_count++;
11424 ++p;
11425 } else {
11426 // remove dups with onodes list to avoid problems in _txc_finish
11427 p = txc->modified_objects.erase(p);
11428 }
11429 }
11430
11431 // finalize shared_blobs
11432 for (auto sb : txc->shared_blobs) {
11433 string key;
11434 auto sbid = sb->get_sbid();
11435 get_shared_blob_key(sbid, &key);
11436 if (sb->persistent->empty()) {
11fdf7f2
TL
11437 dout(20) << __func__ << " shared_blob 0x"
11438 << std::hex << sbid << std::dec
7c673cae
FG
11439 << " is empty" << dendl;
11440 t->rmkey(PREFIX_SHARED_BLOB, key);
11441 } else {
11442 bufferlist bl;
11fdf7f2
TL
11443 encode(*(sb->persistent), bl);
11444 dout(20) << __func__ << " shared_blob 0x"
11445 << std::hex << sbid << std::dec
31f18b77 11446 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
11447 t->set(PREFIX_SHARED_BLOB, key, bl);
11448 }
11449 }
11450}
11451
11452void BlueStore::BSPerfTracker::update_from_perfcounters(
11453 PerfCounters &logger)
11454{
11fdf7f2
TL
11455 os_commit_latency_ns.consume_next(
11456 logger.get_tavg_ns(
7c673cae 11457 l_bluestore_commit_lat));
11fdf7f2
TL
11458 os_apply_latency_ns.consume_next(
11459 logger.get_tavg_ns(
7c673cae
FG
11460 l_bluestore_commit_lat));
11461}
11462
f67539c2
TL
11463// For every object we maintain <zone_num+oid, offset> tuple in the key-value
11464 // store. When a new object is written to a zone, we insert the corresponding
11465// tuple to the database. When an object is truncated, we remove the
11466// corresponding tuple. When an object is overwritten, we remove the old tuple
11467// and insert a new tuple corresponding to the new location of the object. The
11468// cleaner can now identify live objects within the zone <zone_num> by
11469// enumerating all the keys starting with <zone_num> prefix.
11470void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
11471 for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
11472 std::string key;
11473 get_object_key(cct, o->oid, &key);
11474 for (auto offset : offsets) {
11475 if (offset > 0) {
11476 bufferlist offset_bl;
11477 encode(offset, offset_bl);
11478 txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
11479 } else {
11480 txc->t->rmkey(_zoned_get_prefix(-offset), key);
11481 }
11482 }
11483 }
11484}
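// Illustrative key shape (hypothetical values): for an object in zone 7 at
// offset 0x40000000, the entry lands under prefix PREFIX_ZONED_CL_INFO +
// _key_encode_u64(7), with the object key mapping to the encoded offset; a
// negative offset in the map above means "remove the old tuple" instead.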
11485
11486std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
11487 uint64_t zone_num = offset / bdev->get_zone_size();
11488 std::string zone_key;
11489 _key_encode_u64(zone_num, &zone_key);
11490 return PREFIX_ZONED_CL_INFO + zone_key;
11491}
11492
11493// For now, to avoid interface changes we piggyback zone_size (in MiB) and the
11494// first sequential zone number onto min_alloc_size and pass it to functions
11495// Allocator::create and FreelistManager::create.
11496uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
11497 uint64_t zone_size = bdev->get_zone_size();
11498 uint64_t zone_size_mb = zone_size / (1024 * 1024);
11499 uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
11500 min_alloc_size |= (zone_size_mb << 32);
11501 min_alloc_size |= (first_seq_zone << 48);
11502 return min_alloc_size;
11503}
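// Worked example (illustrative): min_alloc_size = 0x10000 (64 KiB),
// zone_size = 256 MiB so zone_size_mb = 0x100, and first_seq_zone = 3:
//
//   0x10000 | (0x100 << 32) | (3ull << 48) == 0x0003'0100'0001'0000
//
// i.e. bits 0-31 keep min_alloc_size, bits 32-47 carry the zone size in
// MiB, and bits 48+ carry the first sequential zone number.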
11504
11505int BlueStore::_zoned_check_config_settings() {
11506 if (cct->_conf->bluestore_allocator != "zoned") {
11507 dout(1) << __func__ << " The drive is HM-SMR but "
11508 << cct->_conf->bluestore_allocator << " allocator is specified. "
11509 << "Only zoned allocator can be used with HM-SMR drive." << dendl;
11510 return -EINVAL;
11511 }
11512
11513 // At least for now we want to use large min_alloc_size with HM-SMR drives.
11514 // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
11515 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
11516 if (min_alloc_size < 64 * 1024) {
11517 dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
11518 << min_alloc_size << ". "
11519 << "Please set to at least 64 KiB." << dendl;
11520 return -EINVAL;
11521 }
11522
11523 // We don't want to defer writes with HM-SMR because it violates the sequential
11524 // write requirement.
11525 if (prefer_deferred_size) {
11526 dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
11527 << prefer_deferred_size << ". "
11528 << "Please set to 0." << dendl;
11529 return -EINVAL;
11530 }
11531 return 0;
11532}
11533
7c673cae
FG
11534void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11535{
11536 dout(20) << __func__ << " txc " << txc << std::hex
11537 << " allocated 0x" << txc->allocated
11538 << " released 0x" << txc->released
11539 << std::dec << dendl;
11540
11541 // We have to handle the case where we allocate *and* deallocate the
11542 // same region in this transaction. The freelist doesn't like that.
11543 // (Actually, the only thing that cares is the BitmapFreelistManager
11544 // debug check. But that's important.)
11545 interval_set<uint64_t> tmp_allocated, tmp_released;
11546 interval_set<uint64_t> *pallocated = &txc->allocated;
11547 interval_set<uint64_t> *preleased = &txc->released;
11548 if (!txc->allocated.empty() && !txc->released.empty()) {
11549 interval_set<uint64_t> overlap;
11550 overlap.intersection_of(txc->allocated, txc->released);
11551 if (!overlap.empty()) {
11552 tmp_allocated = txc->allocated;
11553 tmp_allocated.subtract(overlap);
11554 tmp_released = txc->released;
11555 tmp_released.subtract(overlap);
11556 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11557 << ", new allocated 0x" << tmp_allocated
11558 << " released 0x" << tmp_released << std::dec
11559 << dendl;
11560 pallocated = &tmp_allocated;
11561 preleased = &tmp_released;
11562 }
11563 }
11564
11565 // update freelist with non-overlap sets
11566 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11567 p != pallocated->end();
11568 ++p) {
11569 fm->allocate(p.get_start(), p.get_len(), t);
11570 }
11571 for (interval_set<uint64_t>::iterator p = preleased->begin();
11572 p != preleased->end();
11573 ++p) {
11574 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11575 << "~" << p.get_len() << std::dec << dendl;
11576 fm->release(p.get_start(), p.get_len(), t);
11577 }
11578
f67539c2
TL
11579 if (bdev->is_smr()) {
11580 _zoned_update_cleaning_metadata(txc);
11581 }
11582
7c673cae
FG
11583 _txc_update_store_statfs(txc);
11584}
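// Worked example of the overlap handling above (illustrative extents): if
// the txc allocated {0x1000~0x2000} and released {0x2000~0x2000}, the
// overlap is {0x2000~0x1000}; the freelist then sees allocate {0x1000~0x1000}
// and release {0x3000~0x1000}, so no extent is both allocated and released
// within the same transaction.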
11585
9f95a23c 11586void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 11587{
f67539c2 11588 ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
9f95a23c
TL
11589 {
11590#if defined(WITH_LTTNG)
11591 auto start = mono_clock::now();
11592#endif
11593
f67539c2
TL
11594#ifdef WITH_BLKIN
11595 if (txc->trace) {
11596 txc->trace.event("db async submit");
11597 }
11598#endif
11599
9f95a23c
TL
11600 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11601 ceph_assert(r == 0);
f67539c2 11602 txc->set_state(TransContext::STATE_KV_SUBMITTED);
9f95a23c
TL
11603 if (txc->osr->kv_submitted_waiters) {
11604 std::lock_guard l(txc->osr->qlock);
11605 txc->osr->qcond.notify_all();
11606 }
11607
11608#if defined(WITH_LTTNG)
11609 if (txc->tracing) {
11610 tracepoint(
11611 bluestore,
11612 transaction_kv_submit_latency,
11613 txc->osr->get_sequencer_id(),
11614 txc->seq,
11615 sync_submit_transaction,
11616 ceph::to_seconds<double>(mono_clock::now() - start));
11617 }
11618#endif
11619 }
11620
7c673cae
FG
11621 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11622 for (auto& o : *ls) {
11623 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11624 << dendl;
9f95a23c 11625 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 11626 std::lock_guard l(o->flush_lock);
7c673cae
FG
11627 o->flush_cond.notify_all();
11628 }
11629 }
11630 }
11631}
11632
11633void BlueStore::_txc_committed_kv(TransContext *txc)
11634{
11635 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 11636 throttle.complete_kv(*txc);
1adf2230 11637 {
11fdf7f2 11638 std::lock_guard l(txc->osr->qlock);
f67539c2 11639 txc->set_state(TransContext::STATE_KV_DONE);
11fdf7f2
TL
11640 if (txc->ch->commit_queue) {
11641 txc->ch->commit_queue->queue(txc->oncommits);
11642 } else {
11643 finisher.queue(txc->oncommits);
1adf2230 11644 }
7c673cae 11645 }
9f95a23c 11646 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
11647 log_latency_fn(
11648 __func__,
11649 l_bluestore_commit_lat,
9f95a23c 11650 mono_clock::now() - txc->start,
494da23a
TL
11651 cct->_conf->bluestore_log_op_age,
11652 [&](auto lat) {
11653 return ", txc = " + stringify(txc);
11654 }
11fdf7f2 11655 );
7c673cae
FG
11656}
11657
11658void BlueStore::_txc_finish(TransContext *txc)
11659{
11660 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
f67539c2 11661 ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
7c673cae
FG
11662
11663 for (auto& sb : txc->shared_blobs_written) {
f64942e4 11664 sb->finish_write(txc->seq);
7c673cae
FG
11665 }
11666 txc->shared_blobs_written.clear();
11667
11668 while (!txc->removed_collections.empty()) {
11669 _queue_reap_collection(txc->removed_collections.front());
11670 txc->removed_collections.pop_front();
11671 }
11672
11673 OpSequencerRef osr = txc->osr;
7c673cae 11674 bool empty = false;
31f18b77 11675 bool submit_deferred = false;
7c673cae
FG
11676 OpSequencer::q_list_t releasing_txc;
11677 {
11fdf7f2 11678 std::lock_guard l(osr->qlock);
f67539c2 11679 txc->set_state(TransContext::STATE_DONE);
7c673cae
FG
11680 bool notify = false;
11681 while (!osr->q.empty()) {
11682 TransContext *txc = &osr->q.front();
11683 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11684 << dendl;
f67539c2
TL
11685 if (txc->get_state() != TransContext::STATE_DONE) {
11686 if (txc->get_state() == TransContext::STATE_PREPARE &&
7c673cae
FG
11687 deferred_aggressive) {
11688 // for _osr_drain_preceding()
11689 notify = true;
11690 }
f67539c2 11691 if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 11692 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
11693 submit_deferred = true;
11694 }
7c673cae
FG
11695 break;
11696 }
11697
7c673cae
FG
11698 osr->q.pop_front();
11699 releasing_txc.push_back(*txc);
7c673cae 11700 }
9f95a23c 11701
7c673cae
FG
11702 if (osr->q.empty()) {
11703 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11704 empty = true;
11705 }
9f95a23c
TL
11706
11707 // only drain()/drain_preceding() need wakeup,
11708 // other cases use kv_submitted_waiters
11709 if (notify || empty) {
11710 osr->qcond.notify_all();
11711 }
7c673cae 11712 }
9f95a23c 11713
7c673cae
FG
11714 while (!releasing_txc.empty()) {
11715 // release to allocator only after all preceding txc's have also
11716 // finished any deferred writes that potentially land in these
11717 // blocks
11718 auto txc = &releasing_txc.front();
11719 _txc_release_alloc(txc);
11720 releasing_txc.pop_front();
9f95a23c
TL
11721 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11722 throttle.complete(*txc);
7c673cae
FG
11723 delete txc;
11724 }
11725
31f18b77
FG
11726 if (submit_deferred) {
11727 // we're pinning memory; flush! we could be more fine-grained here but
11728 // i'm not sure it's worth the bother.
11729 deferred_try_submit();
7c673cae
FG
11730 }
11731
7c673cae 11732 if (empty && osr->zombie) {
11fdf7f2
TL
11733 std::lock_guard l(zombie_osr_lock);
11734 if (zombie_osr_set.erase(osr->cid)) {
11735 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11736 } else {
11737 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11738 << dendl;
11739 }
7c673cae 11740 }
9f95a23c 11741}
7c673cae
FG
11742
11743void BlueStore::_txc_release_alloc(TransContext *txc)
11744{
a8e16298 11745 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
11746 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11747 int r = 0;
11748 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11749 r = bdev->queue_discard(txc->released);
11750 if (r == 0) {
11751 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11752 << txc->released << std::dec << dendl;
11753 goto out;
11754 }
11755 } else if (cct->_conf->bdev_enable_discard) {
11756 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11757 bdev->discard(p.get_start(), p.get_len());
11758 }
11759 }
11760 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 11761 << txc->released << std::dec << dendl;
f67539c2 11762 shared_alloc.a->release(txc->released);
7c673cae
FG
11763 }
11764
11fdf7f2 11765out:
7c673cae
FG
11766 txc->allocated.clear();
11767 txc->released.clear();
11768}
11769
11fdf7f2
TL
11770void BlueStore::_osr_attach(Collection *c)
11771{
11772 // note: caller has RWLock on coll_map
11773 auto q = coll_map.find(c->cid);
11774 if (q != coll_map.end()) {
11775 c->osr = q->second->osr;
11776 ldout(cct, 10) << __func__ << " " << c->cid
11777 << " reusing osr " << c->osr << " from existing coll "
11778 << q->second << dendl;
11779 } else {
11780 std::lock_guard l(zombie_osr_lock);
11781 auto p = zombie_osr_set.find(c->cid);
11782 if (p == zombie_osr_set.end()) {
9f95a23c 11783 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11fdf7f2
TL
11784 ldout(cct, 10) << __func__ << " " << c->cid
11785 << " fresh osr " << c->osr << dendl;
11786 } else {
11787 c->osr = p->second;
11788 zombie_osr_set.erase(p);
11789 ldout(cct, 10) << __func__ << " " << c->cid
11790 << " resurrecting zombie osr " << c->osr << dendl;
11791 c->osr->zombie = false;
11792 }
11793 }
11794}
11795
11796void BlueStore::_osr_register_zombie(OpSequencer *osr)
11797{
11798 std::lock_guard l(zombie_osr_lock);
11799 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11800 osr->zombie = true;
11801 auto i = zombie_osr_set.emplace(osr->cid, osr);
11802 // this is either a new insertion or the same osr is already there
11803 ceph_assert(i.second || i.first->second == osr);
11804}
11805
7c673cae
FG
11806void BlueStore::_osr_drain_preceding(TransContext *txc)
11807{
11808 OpSequencer *osr = txc->osr.get();
11809 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11810 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11811 {
11812 // submit anything pending
f67539c2 11813 osr->deferred_lock.lock();
11fdf7f2 11814 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
11815 _deferred_submit_unlock(osr);
11816 } else {
f67539c2 11817 osr->deferred_lock.unlock();
7c673cae
FG
11818 }
11819 }
11820 {
11821 // wake up any previously finished deferred events
11fdf7f2 11822 std::lock_guard l(kv_lock);
9f95a23c
TL
11823 if (!kv_sync_in_progress) {
11824 kv_sync_in_progress = true;
11825 kv_cond.notify_one();
11826 }
7c673cae
FG
11827 }
11828 osr->drain_preceding(txc);
11829 --deferred_aggressive;
11830 dout(10) << __func__ << " " << osr << " done" << dendl;
11831}
11832
11fdf7f2
TL
11833void BlueStore::_osr_drain(OpSequencer *osr)
11834{
11835 dout(10) << __func__ << " " << osr << dendl;
11836 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11837 {
11838 // submit anything pending
f67539c2 11839 osr->deferred_lock.lock();
11fdf7f2
TL
11840 if (osr->deferred_pending && !osr->deferred_running) {
11841 _deferred_submit_unlock(osr);
11842 } else {
f67539c2 11843 osr->deferred_lock.unlock();
11fdf7f2
TL
11844 }
11845 }
11846 {
11847 // wake up any previously finished deferred events
11848 std::lock_guard l(kv_lock);
9f95a23c
TL
11849 if (!kv_sync_in_progress) {
11850 kv_sync_in_progress = true;
11851 kv_cond.notify_one();
11852 }
11fdf7f2
TL
11853 }
11854 osr->drain();
11855 --deferred_aggressive;
11856 dout(10) << __func__ << " " << osr << " done" << dendl;
11857}
11858
7c673cae
FG
11859void BlueStore::_osr_drain_all()
11860{
11861 dout(10) << __func__ << dendl;
11862
11863 set<OpSequencerRef> s;
11fdf7f2
TL
11864 vector<OpSequencerRef> zombies;
11865 {
9f95a23c 11866 std::shared_lock l(coll_lock);
11fdf7f2
TL
11867 for (auto& i : coll_map) {
11868 s.insert(i.second->osr);
11869 }
11870 }
7c673cae 11871 {
11fdf7f2
TL
11872 std::lock_guard l(zombie_osr_lock);
11873 for (auto& i : zombie_osr_set) {
11874 s.insert(i.second);
11875 zombies.push_back(i.second);
11876 }
7c673cae
FG
11877 }
11878 dout(20) << __func__ << " osr_set " << s << dendl;
11879
11880 ++deferred_aggressive;
11881 {
11882 // submit anything pending
224ce89b 11883 deferred_try_submit();
7c673cae
FG
11884 }
11885 {
11886 // wake up any previously finished deferred events
11fdf7f2 11887 std::lock_guard l(kv_lock);
7c673cae
FG
11888 kv_cond.notify_one();
11889 }
31f18b77 11890 {
11fdf7f2 11891 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
11892 kv_finalize_cond.notify_one();
11893 }
7c673cae
FG
11894 for (auto osr : s) {
11895 dout(20) << __func__ << " drain " << osr << dendl;
11896 osr->drain();
11897 }
11898 --deferred_aggressive;
11899
7c673cae 11900 {
11fdf7f2
TL
11901 std::lock_guard l(zombie_osr_lock);
11902 for (auto& osr : zombies) {
11903 if (zombie_osr_set.erase(osr->cid)) {
11904 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11905 ceph_assert(osr->q.empty());
11906 } else if (osr->zombie) {
11907 dout(10) << __func__ << " empty zombie osr " << osr
11908 << " already reaped" << dendl;
11909 ceph_assert(osr->q.empty());
11910 } else {
11911 dout(10) << __func__ << " empty zombie osr " << osr
11912 << " resurrected" << dendl;
11913 }
7c673cae
FG
11914 }
11915 }
11fdf7f2
TL
11916
11917 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
11918}
11919
11fdf7f2 11920
31f18b77
FG
11921void BlueStore::_kv_start()
11922{
11923 dout(10) << __func__ << dendl;
11924
11fdf7f2 11925 finisher.start();
31f18b77
FG
11926 kv_sync_thread.create("bstore_kv_sync");
11927 kv_finalize_thread.create("bstore_kv_final");
11928}
11929
11930void BlueStore::_kv_stop()
11931{
11932 dout(10) << __func__ << dendl;
11933 {
9f95a23c 11934 std::unique_lock l{kv_lock};
31f18b77
FG
11935 while (!kv_sync_started) {
11936 kv_cond.wait(l);
11937 }
11938 kv_stop = true;
11939 kv_cond.notify_all();
11940 }
11941 {
9f95a23c 11942 std::unique_lock l{kv_finalize_lock};
31f18b77
FG
11943 while (!kv_finalize_started) {
11944 kv_finalize_cond.wait(l);
11945 }
11946 kv_finalize_stop = true;
11947 kv_finalize_cond.notify_all();
11948 }
11949 kv_sync_thread.join();
11950 kv_finalize_thread.join();
11fdf7f2 11951 ceph_assert(removed_collections.empty());
31f18b77 11952 {
11fdf7f2 11953 std::lock_guard l(kv_lock);
31f18b77
FG
11954 kv_stop = false;
11955 }
11956 {
11fdf7f2 11957 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
11958 kv_finalize_stop = false;
11959 }
11960 dout(10) << __func__ << " stopping finishers" << dendl;
11fdf7f2
TL
11961 finisher.wait_for_empty();
11962 finisher.stop();
31f18b77
FG
11963 dout(10) << __func__ << " stopped" << dendl;
11964}
11965
7c673cae
FG
11966void BlueStore::_kv_sync_thread()
11967{
11968 dout(10) << __func__ << " start" << dendl;
11fdf7f2 11969 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 11970 std::unique_lock l{kv_lock};
11fdf7f2 11971 ceph_assert(!kv_sync_started);
31f18b77
FG
11972 kv_sync_started = true;
11973 kv_cond.notify_all();
adb31ebb
TL
11974
11975 auto t0 = mono_clock::now();
11976 timespan twait = ceph::make_timespan(0);
11977 size_t kv_submitted = 0;
11978
7c673cae 11979 while (true) {
adb31ebb
TL
11980 auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
11981 auto observation_period =
11982 ceph::make_timespan(period);
11983 auto elapsed = mono_clock::now() - t0;
11984 if (period && elapsed >= observation_period) {
11985 dout(5) << __func__ << " utilization: idle "
11986 << twait << " of " << elapsed
11987 << ", submitted: " << kv_submitted
11988 << dendl;
11989 t0 = mono_clock::now();
11990 twait = ceph::make_timespan(0);
11991 kv_submitted = 0;
11992 }
11fdf7f2 11993 ceph_assert(kv_committing.empty());
7c673cae
FG
11994 if (kv_queue.empty() &&
11995 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 11996 !deferred_aggressive)) {
7c673cae
FG
11997 if (kv_stop)
11998 break;
11999 dout(20) << __func__ << " sleep" << dendl;
adb31ebb 12000 auto t = mono_clock::now();
9f95a23c 12001 kv_sync_in_progress = false;
11fdf7f2 12002 kv_cond.wait(l);
adb31ebb
TL
12003 twait += mono_clock::now() - t;
12004
7c673cae
FG
12005 dout(20) << __func__ << " wake" << dendl;
12006 } else {
12007 deque<TransContext*> kv_submitting;
12008 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
12009 uint64_t aios = 0, costs = 0;
12010
7c673cae
FG
12011 dout(20) << __func__ << " committing " << kv_queue.size()
12012 << " submitting " << kv_queue_unsubmitted.size()
12013 << " deferred done " << deferred_done_queue.size()
12014 << " stable " << deferred_stable_queue.size()
12015 << dendl;
12016 kv_committing.swap(kv_queue);
12017 kv_submitting.swap(kv_queue_unsubmitted);
12018 deferred_done.swap(deferred_done_queue);
12019 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
12020 aios = kv_ios;
12021 costs = kv_throttle_costs;
12022 kv_ios = 0;
12023 kv_throttle_costs = 0;
7c673cae
FG
12024 l.unlock();
12025
12026 dout(30) << __func__ << " committing " << kv_committing << dendl;
12027 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12028 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12029 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12030
11fdf7f2
TL
12031 auto start = mono_clock::now();
12032
7c673cae
FG
12033 bool force_flush = false;
12034 // if bluefs is sharing the same device as data (only), then we
12035 // can rely on the bluefs commit to flush the device and make
12036 // deferred aios stable. that means that if we do have completed ("done")
12037 // deferred txcs AND we are not on a single device, we need to force a flush.
9f95a23c 12038 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 12039 if (aios) {
7c673cae 12040 force_flush = true;
11fdf7f2 12041 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
12042 force_flush = true; // there's nothing else to commit!
12043 } else if (deferred_aggressive) {
12044 force_flush = true;
12045 }
11fdf7f2
TL
12046 } else {
12047 if (aios || !deferred_done.empty()) {
12048 force_flush = true;
12049 } else {
12050 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12051 }
12052 }
7c673cae
FG
12053
12054 if (force_flush) {
31f18b77 12055 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
12056 << " force_flush=" << (int)force_flush
12057 << ", flushing, deferred done->stable" << dendl;
12058 // flush/barrier on block device
12059 bdev->flush();
12060
12061 // if we flush then deferred done are now deferred stable
12062 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12063 deferred_done.end());
12064 deferred_done.clear();
12065 }
11fdf7f2 12066 auto after_flush = mono_clock::now();
7c673cae
FG
12067
12068 // we will use one final transaction to force a sync
12069 KeyValueDB::Transaction synct = db->get_transaction();
12070
12071 // increase {nid,blobid}_max? note that this covers both the
12072 // case where we are approaching the max and the case we passed
12073 // it. in either case, we increase the max in the earlier txn
12074 // we submit.
12075 uint64_t new_nid_max = 0, new_blobid_max = 0;
12076 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12077 KeyValueDB::Transaction t =
12078 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12079 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12080 bufferlist bl;
11fdf7f2 12081 encode(new_nid_max, bl);
7c673cae
FG
12082 t->set(PREFIX_SUPER, "nid_max", bl);
12083 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12084 }
12085 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12086 KeyValueDB::Transaction t =
12087 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12088 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12089 bufferlist bl;
11fdf7f2 12090 encode(new_blobid_max, bl);
7c673cae
FG
12091 t->set(PREFIX_SUPER, "blobid_max", bl);
12092 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12093 }
c07f9fc5
FG
12094
12095 for (auto txc : kv_committing) {
9f95a23c 12096 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
f67539c2 12097 if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
adb31ebb 12098 ++kv_submitted;
9f95a23c 12099 _txc_apply_kv(txc, false);
c07f9fc5 12100 --txc->osr->kv_committing_serially;
c07f9fc5 12101 } else {
f67539c2 12102 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 12103 }
7c673cae
FG
12104 if (txc->had_ios) {
12105 --txc->osr->txc_with_unstable_io;
12106 }
7c673cae
FG
12107 }
12108
31f18b77
FG
12109 // release throttle *before* we commit. this allows new ops
12110 // to be prepared and enter pipeline while we are waiting on
12111 // the kv commit sync/flush. then hopefully on the next
12112 // iteration there will already be ops awake. otherwise, we
12113 // end up going to sleep, and then wake up when the very first
12114 // transaction is ready for commit.
9f95a23c 12115 throttle.release_kv_throttle(costs);
31f18b77 12116
7c673cae
FG
12117 // cleanup sync deferred keys
12118 for (auto b : deferred_stable) {
12119 for (auto& txc : b->txcs) {
12120 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 12121 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
12122 string key;
12123 get_deferred_key(wt.seq, &key);
12124 synct->rm_single_key(PREFIX_DEFERRED, key);
12125 }
12126 }
12127
9f95a23c
TL
12128#if defined(WITH_LTTNG)
12129 auto sync_start = mono_clock::now();
12130#endif
7c673cae 12131 // submit synct synchronously (block and wait for it to commit)
31f18b77 12132 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
12133 ceph_assert(r == 0);
12134
12135#ifdef WITH_BLKIN
12136 for (auto txc : kv_committing) {
12137 if (txc->trace) {
12138 txc->trace.event("db sync submit");
12139 txc->trace.keyval("kv_committing size", kv_committing.size());
12140 }
12141 }
12142#endif
12143
12144 int committing_size = kv_committing.size();
12145 int deferred_size = deferred_stable.size();
12146
12147#if defined(WITH_LTTNG)
12148 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12149 for (auto txc: kv_committing) {
12150 if (txc->tracing) {
12151 tracepoint(
12152 bluestore,
12153 transaction_kv_sync_latency,
12154 txc->osr->get_sequencer_id(),
12155 txc->seq,
12156 kv_committing.size(),
12157 deferred_done.size(),
12158 deferred_stable.size(),
12159 sync_latency);
12160 }
12161 }
12162#endif
12163
11fdf7f2 12164 {
9f95a23c 12165 std::unique_lock m{kv_finalize_lock};
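 // Hand-off sketch: when the finalize queue is empty a swap suffices
 // (O(1), no copying); otherwise append and clear. E.g. (hypothetical
 // contents) [A] + [B, C] -> kv_committing_to_finalize = [A, B, C],
 // kv_committing = [].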
12166 if (kv_committing_to_finalize.empty()) {
12167 kv_committing_to_finalize.swap(kv_committing);
12168 } else {
12169 kv_committing_to_finalize.insert(
12170 kv_committing_to_finalize.end(),
12171 kv_committing.begin(),
12172 kv_committing.end());
12173 kv_committing.clear();
12174 }
12175 if (deferred_stable_to_finalize.empty()) {
12176 deferred_stable_to_finalize.swap(deferred_stable);
12177 } else {
12178 deferred_stable_to_finalize.insert(
12179 deferred_stable_to_finalize.end(),
12180 deferred_stable.begin(),
12181 deferred_stable.end());
12182 deferred_stable.clear();
12183 }
12184 if (!kv_finalize_in_progress) {
12185 kv_finalize_in_progress = true;
12186 kv_finalize_cond.notify_one();
12187 }
11fdf7f2 12188 }
12189
12190 if (new_nid_max) {
12191 nid_max = new_nid_max;
12192 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12193 }
12194 if (new_blobid_max) {
12195 blobid_max = new_blobid_max;
12196 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12197 }
12198
224ce89b 12199 {
12200 auto finish = mono_clock::now();
12201 ceph::timespan dur_flush = after_flush - start;
12202 ceph::timespan dur_kv = finish - after_flush;
12203 ceph::timespan dur = finish - start;
12204 dout(20) << __func__ << " committed " << committing_size
12205 << " cleaned " << deferred_size
12206 << " in " << dur
12207 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12208 << dendl;
12209 log_latency("kv_flush",
12210 l_bluestore_kv_flush_lat,
12211 dur_flush,
12212 cct->_conf->bluestore_log_op_age);
12213 log_latency("kv_commit",
12214 l_bluestore_kv_commit_lat,
12215 dur_kv,
12216 cct->_conf->bluestore_log_op_age);
12217 log_latency("kv_sync",
12218 l_bluestore_kv_sync_lat,
12219 dur,
12220 cct->_conf->bluestore_log_op_age);
7c673cae 12221 }
31f18b77 12222
12223 l.lock();
12224 // previously deferred "done" are now "stable" by virtue of this
12225 // commit cycle.
12226 deferred_stable_queue.swap(deferred_done);
12227 }
12228 }
12229 dout(10) << __func__ << " finish" << dendl;
12230 kv_sync_started = false;
12231}
12232
12233void BlueStore::_kv_finalize_thread()
12234{
12235 deque<TransContext*> kv_committed;
12236 deque<DeferredBatch*> deferred_stable;
12237 dout(10) << __func__ << " start" << dendl;
12238 std::unique_lock l(kv_finalize_lock);
12239 ceph_assert(!kv_finalize_started);
12240 kv_finalize_started = true;
12241 kv_finalize_cond.notify_all();
12242 while (true) {
12243 ceph_assert(kv_committed.empty());
12244 ceph_assert(deferred_stable.empty());
12245 if (kv_committing_to_finalize.empty() &&
12246 deferred_stable_to_finalize.empty()) {
12247 if (kv_finalize_stop)
12248 break;
12249 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 12250 kv_finalize_in_progress = false;
12251 kv_finalize_cond.wait(l);
12252 dout(20) << __func__ << " wake" << dendl;
12253 } else {
12254 kv_committed.swap(kv_committing_to_finalize);
12255 deferred_stable.swap(deferred_stable_to_finalize);
12256 l.unlock();
12257 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12258 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12259
12260 auto start = mono_clock::now();
12261
12262 while (!kv_committed.empty()) {
12263 TransContext *txc = kv_committed.front();
f67539c2 12264 ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
7c673cae 12265 _txc_state_proc(txc);
31f18b77 12266 kv_committed.pop_front();
7c673cae 12267 }
31f18b77 12268
12269 for (auto b : deferred_stable) {
12270 auto p = b->txcs.begin();
12271 while (p != b->txcs.end()) {
12272 TransContext *txc = &*p;
12273 p = b->txcs.erase(p); // unlink here because
12274 _txc_state_proc(txc); // this may destroy txc
12275 }
12276 delete b;
12277 }
31f18b77 12278 deferred_stable.clear();
12279
12280 if (!deferred_aggressive) {
31f18b77 12281 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 12282 throttle.should_submit_deferred()) {
224ce89b 12283 deferred_try_submit();
12284 }
12285 }
12286
12287 // this is as good a place as any ...
12288 _reap_collections();
12289
11fdf7f2 12290 logger->set(l_bluestore_fragmentation,
f67539c2 12291 (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));
11fdf7f2 12292
12293 log_latency("kv_final",
12294 l_bluestore_kv_final_lat,
12295 mono_clock::now() - start,
12296 cct->_conf->bluestore_log_op_age);
11fdf7f2 12297
7c673cae 12298 l.lock();
12299 }
12300 }
12301 dout(10) << __func__ << " finish" << dendl;
31f18b77 12302 kv_finalize_started = false;
12303}
12304
12305void BlueStore::_zoned_cleaner_start() {
12306 dout(10) << __func__ << dendl;
12307
12308 zoned_cleaner_thread.create("bstore_zcleaner");
12309}
12310
12311void BlueStore::_zoned_cleaner_stop() {
12312 dout(10) << __func__ << dendl;
12313 {
12314 std::unique_lock l{zoned_cleaner_lock};
12315 while (!zoned_cleaner_started) {
12316 zoned_cleaner_cond.wait(l);
12317 }
12318 zoned_cleaner_stop = true;
12319 zoned_cleaner_cond.notify_all();
12320 }
12321 zoned_cleaner_thread.join();
12322 {
12323 std::lock_guard l{zoned_cleaner_lock};
12324 zoned_cleaner_stop = false;
12325 }
12326 dout(10) << __func__ << " done" << dendl;
12327}
12328
12329void BlueStore::_zoned_cleaner_thread() {
12330 dout(10) << __func__ << " start" << dendl;
12331 std::unique_lock l{zoned_cleaner_lock};
12332 ceph_assert(!zoned_cleaner_started);
12333 zoned_cleaner_started = true;
12334 zoned_cleaner_cond.notify_all();
12335 std::deque<uint64_t> zones_to_clean;
12336 while (true) {
12337 if (zoned_cleaner_queue.empty()) {
12338 if (zoned_cleaner_stop) {
12339 break;
12340 }
12341 dout(20) << __func__ << " sleep" << dendl;
12342 zoned_cleaner_cond.wait(l);
12343 dout(20) << __func__ << " wake" << dendl;
12344 } else {
12345 zones_to_clean.swap(zoned_cleaner_queue);
12346 l.unlock();
12347 while (!zones_to_clean.empty()) {
12348 _zoned_clean_zone(zones_to_clean.front());
12349 zones_to_clean.pop_front();
12350 }
12351 l.lock();
12352 }
12353 }
12354 dout(10) << __func__ << " finish" << dendl;
12355 zoned_cleaner_started = false;
12356}
12357
12358void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
12359 dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
12360}
12361
7c673cae 12362bluestore_deferred_op_t *BlueStore::_get_deferred_op(
9f95a23c 12363 TransContext *txc)
12364{
12365 if (!txc->deferred_txn) {
12366 txc->deferred_txn = new bluestore_deferred_transaction_t;
12367 }
12368 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12369 return &txc->deferred_txn->ops.back();
12370}
12371
12372void BlueStore::_deferred_queue(TransContext *txc)
12373{
12374 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
12375
12376 DeferredBatch *tmp;
12377 txc->osr->deferred_lock.lock();
12378 {
12379 if (!txc->osr->deferred_pending) {
12380 tmp = new DeferredBatch(cct, txc->osr.get());
12381 } else {
12382 tmp = txc->osr->deferred_pending;
12383 }
7c673cae 12384 }
12385
12386 tmp->txcs.push_back(*txc);
12387 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12388 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12389 const auto& op = *opi;
11fdf7f2 12390 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12391 bufferlist::const_iterator p = op.data.begin();
12392 for (auto e : op.extents) {
f67539c2 12393 tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
12394 }
12395 }
12396
12397 {
12398 ++deferred_queue_size;
12399 txc->osr->deferred_pending = tmp;
12400 // the condition "tmp->txcs.size() == 1" means deferred_pending was originally
12401 // empty, so we should add the osr to deferred_queue.
12402 if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
12403 deferred_lock.lock();
12404 deferred_queue.push_back(*txc->osr);
12405 deferred_lock.unlock();
12406 }
12407
12408 if (deferred_aggressive &&
12409 !txc->osr->deferred_running) {
12410 _deferred_submit_unlock(txc->osr.get());
12411 } else {
12412 txc->osr->deferred_lock.unlock();
12413 }
7c673cae 12414 }
f67539c2 12415 }
7c673cae 12416
224ce89b 12417void BlueStore::deferred_try_submit()
12418{
12419 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12420 << deferred_queue_size << " txcs" << dendl;
224ce89b 12421 vector<OpSequencerRef> osrs;
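 // snapshot the queue under deferred_lock, then do the actual submission
 // holding only each osr's own deferred_lock, so new arrivals are not
 // serialized behind slow submits.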
12422
12423 {
12424 std::lock_guard l(deferred_lock);
12425 osrs.reserve(deferred_queue.size());
12426 for (auto& osr : deferred_queue) {
12427 osrs.push_back(&osr);
12428 }
224ce89b 12429 }
f67539c2 12430
224ce89b 12431 for (auto& osr : osrs) {
f67539c2 12432 osr->deferred_lock.lock();
12433 if (osr->deferred_pending) {
12434 if (!osr->deferred_running) {
12435 _deferred_submit_unlock(osr.get());
181888fb 12436 } else {
f67539c2 12437 osr->deferred_lock.unlock();
12438 dout(20) << __func__ << " osr " << osr << " already has running"
12439 << dendl;
12440 }
12441 } else {
f67539c2 12442 osr->deferred_lock.unlock();
181888fb 12443 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12444 }
12445 }
9f95a23c 12446
12447 {
12448 std::lock_guard l(deferred_lock);
12449 deferred_last_submitted = ceph_clock_now();
12450 }
12451}
12452
224ce89b 12453void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12454{
12455 dout(10) << __func__ << " osr " << osr
12456 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12457 << dendl;
12458 ceph_assert(osr->deferred_pending);
12459 ceph_assert(!osr->deferred_running);
12460
12461 auto b = osr->deferred_pending;
12462 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 12463 ceph_assert(deferred_queue_size >= 0);
12464
12465 osr->deferred_running = osr->deferred_pending;
12466 osr->deferred_pending = nullptr;
12467
f67539c2 12468 osr->deferred_lock.unlock();
12469
12470 for (auto& txc : b->txcs) {
9f95a23c 12471 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
11fdf7f2 12472 }
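 // The loop below coalesces physically contiguous iomap entries (keyed by
 // disk offset) into single aio_writes. Hypothetical example: entries at
 // 0x1000~0x1000, 0x2000~0x1000 and 0x8000~0x1000 become two writes,
 // 0x1000~0x2000 and 0x8000~0x1000.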
12473 uint64_t start = 0, pos = 0;
12474 bufferlist bl;
12475 auto i = b->iomap.begin();
12476 while (true) {
12477 if (i == b->iomap.end() || i->first != pos) {
12478 if (bl.length()) {
12479 dout(20) << __func__ << " write 0x" << std::hex
12480 << start << "~" << bl.length()
12481 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 12482 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12483 logger->inc(l_bluestore_deferred_write_ops);
12484 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12485 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 12486 ceph_assert(r == 0);
12487 }
12488 }
12489 if (i == b->iomap.end()) {
12490 break;
12491 }
12492 start = 0;
12493 pos = i->first;
12494 bl.clear();
12495 }
12496 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12497 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12498 << dendl;
12499 if (!bl.length()) {
12500 start = pos;
12501 }
12502 pos += i->second.bl.length();
12503 bl.claim_append(i->second.bl);
12504 ++i;
12505 }
224ce89b 12506
12507 bdev->aio_submit(&b->ioc);
12508}
12509
12510struct C_DeferredTrySubmit : public Context {
12511 BlueStore *store;
12512 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12513 void finish(int r) {
12514 store->deferred_try_submit();
12515 }
12516};
12517
12518void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12519{
12520 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 12521 ceph_assert(osr->deferred_running);
12522 DeferredBatch *b = osr->deferred_running;
12523
12524 {
f67539c2 12525 osr->deferred_lock.lock();
11fdf7f2 12526 ceph_assert(osr->deferred_running == b);
12527 osr->deferred_running = nullptr;
12528 if (!osr->deferred_pending) {
181888fb 12529 dout(20) << __func__ << " dequeueing" << dendl;
12530 {
12531 deferred_lock.lock();
12532 auto q = deferred_queue.iterator_to(*osr);
12533 deferred_queue.erase(q);
12534 deferred_lock.unlock();
12535 }
12536 osr->deferred_lock.unlock();
181888fb 12537 } else {
f67539c2 12538 osr->deferred_lock.unlock();
12539 if (deferred_aggressive) {
12540 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12541 finisher.queue(new C_DeferredTrySubmit(this));
12542 } else {
12543 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12544 }
12545 }
12546 }
12547
12548 {
31f18b77 12549 uint64_t costs = 0;
11fdf7f2 12550 {
12551 for (auto& i : b->txcs) {
12552 TransContext *txc = &i;
9f95a23c 12553 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
f67539c2 12554 txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
12555 costs += txc->cost;
12556 }
7c673cae 12557 }
9f95a23c 12558 throttle.release_deferred_throttle(costs);
12559 }
12560
9f95a23c 12561 {
11fdf7f2 12562 std::lock_guard l(kv_lock);
12563 deferred_done_queue.emplace_back(b);
12564
12565 // in the normal case, do not bother waking up the kv thread; it will
12566 // catch us on the next commit anyway.
12567 if (deferred_aggressive && !kv_sync_in_progress) {
12568 kv_sync_in_progress = true;
12569 kv_cond.notify_one();
12570 }
12571 }
12572}
12573
12574int BlueStore::_deferred_replay()
12575{
12576 dout(10) << __func__ << " start" << dendl;
12577 int count = 0;
12578 int r = 0;
12579 CollectionRef ch = _get_collection(coll_t::meta());
12580 bool fake_ch = false;
12581 if (!ch) {
12582 // hmm, replaying initial mkfs?
12583 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12584 fake_ch = true;
12585 }
12586 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12587 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12588 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12589 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12590 << dendl;
12591 bluestore_deferred_transaction_t *deferred_txn =
12592 new bluestore_deferred_transaction_t;
12593 bufferlist bl = it->value();
11fdf7f2 12594 auto p = bl.cbegin();
7c673cae 12595 try {
11fdf7f2 12596 decode(*deferred_txn, p);
f67539c2 12597 } catch (ceph::buffer::error& e) {
12598 derr << __func__ << " failed to decode deferred txn "
12599 << pretty_binary_string(it->key()) << dendl;
12600 delete deferred_txn;
12601 r = -EIO;
12602 goto out;
12603 }
11fdf7f2 12604 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
7c673cae 12605 txc->deferred_txn = deferred_txn;
f67539c2 12606 txc->set_state(TransContext::STATE_KV_DONE);
12607 _txc_state_proc(txc);
12608 }
12609 out:
12610 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 12611 _osr_register_zombie(osr);
7c673cae 12612 _osr_drain_all();
12613 if (fake_ch) {
12614 new_coll_map.clear();
12615 }
12616 dout(10) << __func__ << " completed " << count << " events" << dendl;
12617 return r;
12618}
12619
12620// ---------------------------
12621// transactions
12622
12623int BlueStore::queue_transactions(
12624 CollectionHandle& ch,
12625 vector<Transaction>& tls,
12626 TrackedOpRef op,
12627 ThreadPool::TPHandle *handle)
12628{
12629 FUNCTRACE(cct);
12630 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 12631 ObjectStore::Transaction::collect_contexts(
11fdf7f2 12632 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae 12633
12634 auto start = mono_clock::now();
12635
12636 Collection *c = static_cast<Collection*>(ch.get());
12637 OpSequencer *osr = c->osr.get();
12638 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
12639
12640 // prepare
11fdf7f2 12641 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
f67539c2 12642 &on_commit, op);
7c673cae 12643
12644 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
12645 // submission to happen atomically because if I/O submission happens in a
12646 // different order than I/O allocation, we end up issuing non-sequential
12647 // writes to the drive. This is a temporary solution until ZONE APPEND
12648 // support matures in the kernel. For more information please see:
12649 // https://www.usenix.org/conference/vault20/presentation/bjorling
12650 if (bdev->is_smr()) {
12651 atomic_alloc_and_submit_lock.lock();
12652 }
7c673cae 12653 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12654 txc->bytes += (*p).get_num_bytes();
12655 _txc_add_transaction(txc, &(*p));
12656 }
12657 _txc_calc_cost(txc);
12658
12659 _txc_write_nodes(txc, txc->t);
12660
12661 // journal deferred items
12662 if (txc->deferred_txn) {
12663 txc->deferred_txn->seq = ++deferred_seq;
12664 bufferlist bl;
11fdf7f2 12665 encode(*txc->deferred_txn, bl);
12666 string key;
12667 get_deferred_key(txc->deferred_txn->seq, &key);
12668 txc->t->set(PREFIX_DEFERRED, key, bl);
12669 }
12670
12671 _txc_finalize_kv(txc, txc->t);
12672
12673#ifdef WITH_BLKIN
12674 if (txc->trace) {
12675 txc->trace.event("txc encode finished");
12676 }
12677#endif
12678
12679 if (handle)
12680 handle->suspend_tp_timeout();
12681
11fdf7f2 12682 auto tstart = mono_clock::now();
12683
12684 if (!throttle.try_start_transaction(
12685 *db,
12686 *txc,
12687 tstart)) {
7c673cae 12688 // ensure we do not block here because of deferred writes
12689 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
12690 << dendl;
12691 ++deferred_aggressive;
12692 deferred_try_submit();
12693 {
12694 // wake up any previously finished deferred events
12695 std::lock_guard l(kv_lock);
12696 if (!kv_sync_in_progress) {
12697 kv_sync_in_progress = true;
12698 kv_cond.notify_one();
12699 }
12700 }
12701 throttle.finish_start_transaction(*db, *txc, tstart);
12702 --deferred_aggressive;
7c673cae 12703 }
11fdf7f2 12704 auto tend = mono_clock::now();
12705
12706 if (handle)
12707 handle->reset_tp_timeout();
12708
12709 logger->inc(l_bluestore_txc);
12710
12711 // execute (start)
12712 _txc_state_proc(txc);
12713
12714 if (bdev->is_smr()) {
12715 atomic_alloc_and_submit_lock.unlock();
12716 }
12717
12718 // we're immediately readable (unlike FileStore)
12719 for (auto c : on_applied_sync) {
12720 c->complete(0);
12721 }
12722 if (!on_applied.empty()) {
12723 if (c->commit_queue) {
12724 c->commit_queue->queue(on_applied);
12725 } else {
12726 finisher.queue(on_applied);
12727 }
12728 }
12729
12730#ifdef WITH_BLKIN
12731 if (txc->trace) {
12732 txc->trace.event("txc applied");
12733 }
12734#endif
12735
12736 log_latency("submit_transact",
12737 l_bluestore_submit_lat,
12738 mono_clock::now() - start,
12739 cct->_conf->bluestore_log_op_age);
12740 log_latency("throttle_transact",
12741 l_bluestore_throttle_lat,
12742 tend - tstart,
12743 cct->_conf->bluestore_log_op_age);
12744 return 0;
12745}
12746
12747void BlueStore::_txc_aio_submit(TransContext *txc)
12748{
12749 dout(10) << __func__ << " txc " << txc << dendl;
12750 bdev->aio_submit(&txc->ioc);
12751}
12752
12753void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12754{
12755 Transaction::iterator i = t->begin();
12756
81eedcae 12757 _dump_transaction<30>(cct, t);
12758
12759 vector<CollectionRef> cvec(i.colls.size());
12760 unsigned j = 0;
12761 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12762 ++p, ++j) {
12763 cvec[j] = _get_collection(*p);
7c673cae 12764 }
11fdf7f2 12765
12766 vector<OnodeRef> ovec(i.objects.size());
12767
12768 for (int pos = 0; i.have_op(); ++pos) {
12769 Transaction::Op *op = i.decode_op();
12770 int r = 0;
12771
12772 // no coll or obj
12773 if (op->op == Transaction::OP_NOP)
12774 continue;
12775
11fdf7f2 12776
12777 // collection operations
12778 CollectionRef &c = cvec[op->cid];
12779
12780 // initialize osd_pool_id and do a smoke test that all collections belong
12781 // to the same pool
12782 spg_t pgid;
12783 if (!!c ? c->cid.is_pg(&pgid) : false) {
12784 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12785 txc->osd_pool_id == pgid.pool());
12786 txc->osd_pool_id = pgid.pool();
12787 }
12788
12789 switch (op->op) {
12790 case Transaction::OP_RMCOLL:
12791 {
12792 const coll_t &cid = i.get_cid(op->cid);
12793 r = _remove_collection(txc, cid, &c);
12794 if (!r)
12795 continue;
12796 }
12797 break;
12798
12799 case Transaction::OP_MKCOLL:
12800 {
11fdf7f2 12801 ceph_assert(!c);
12802 const coll_t &cid = i.get_cid(op->cid);
12803 r = _create_collection(txc, cid, op->split_bits, &c);
12804 if (!r)
12805 continue;
12806 }
12807 break;
12808
12809 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 12810 ceph_abort_msg("deprecated");
12811 break;
12812
12813 case Transaction::OP_SPLIT_COLLECTION2:
12814 {
12815 uint32_t bits = op->split_bits;
12816 uint32_t rem = op->split_rem;
12817 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12818 if (!r)
12819 continue;
12820 }
12821 break;
12822
12823 case Transaction::OP_MERGE_COLLECTION:
12824 {
12825 uint32_t bits = op->split_bits;
12826 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12827 if (!r)
12828 continue;
12829 }
12830 break;
12831
12832 case Transaction::OP_COLL_HINT:
12833 {
f67539c2 12834 uint32_t type = op->hint;
12835 bufferlist hint;
12836 i.decode_bl(hint);
11fdf7f2 12837 auto hiter = hint.cbegin();
12838 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12839 uint32_t pg_num;
12840 uint64_t num_objs;
12841 decode(pg_num, hiter);
12842 decode(num_objs, hiter);
12843 dout(10) << __func__ << " collection hint objects is a no-op, "
12844 << " pg_num " << pg_num << " num_objects " << num_objs
12845 << dendl;
12846 } else {
12847 // Ignore the hint
12848 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12849 }
12850 continue;
12851 }
12852 break;
12853
12854 case Transaction::OP_COLL_SETATTR:
12855 r = -EOPNOTSUPP;
12856 break;
12857
12858 case Transaction::OP_COLL_RMATTR:
12859 r = -EOPNOTSUPP;
12860 break;
12861
12862 case Transaction::OP_COLL_RENAME:
11fdf7f2 12863 ceph_abort_msg("not implemented");
12864 break;
12865 }
12866 if (r < 0) {
12867 derr << __func__ << " error " << cpp_strerror(r)
12868 << " not handled on operation " << op->op
12869 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 12870 _dump_transaction<0>(cct, t);
11fdf7f2 12871 ceph_abort_msg("unexpected error");
12872 }
12873
12874 // these operations implicitly create the object
12875 bool create = false;
12876 if (op->op == Transaction::OP_TOUCH ||
9f95a23c 12877 op->op == Transaction::OP_CREATE ||
12878 op->op == Transaction::OP_WRITE ||
12879 op->op == Transaction::OP_ZERO) {
12880 create = true;
12881 }
12882
12883 // object operations
9f95a23c 12884 std::unique_lock l(c->lock);
12885 OnodeRef &o = ovec[op->oid];
12886 if (!o) {
12887 ghobject_t oid = i.get_oid(op->oid);
9f95a23c 12888 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
12889 }
12890 if (!create && (!o || !o->exists)) {
12891 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
12892 << i.get_oid(op->oid) << dendl;
12893 r = -ENOENT;
12894 goto endop;
12895 }
12896
12897 switch (op->op) {
9f95a23c 12898 case Transaction::OP_CREATE:
12899 case Transaction::OP_TOUCH:
12900 r = _touch(txc, c, o);
12901 break;
12902
12903 case Transaction::OP_WRITE:
12904 {
12905 uint64_t off = op->off;
12906 uint64_t len = op->len;
12907 uint32_t fadvise_flags = i.get_fadvise_flags();
12908 bufferlist bl;
12909 i.decode_bl(bl);
12910 r = _write(txc, c, o, off, len, bl, fadvise_flags);
12911 }
12912 break;
12913
12914 case Transaction::OP_ZERO:
12915 {
12916 uint64_t off = op->off;
12917 uint64_t len = op->len;
12918 r = _zero(txc, c, o, off, len);
12919 }
12920 break;
12921
12922 case Transaction::OP_TRIMCACHE:
12923 {
12924 // deprecated, no-op
12925 }
12926 break;
12927
12928 case Transaction::OP_TRUNCATE:
12929 {
12930 uint64_t off = op->off;
35e4c445 12931 r = _truncate(txc, c, o, off);
12932 }
12933 break;
12934
12935 case Transaction::OP_REMOVE:
12936 {
12937 r = _remove(txc, c, o);
12938 }
12939 break;
12940
12941 case Transaction::OP_SETATTR:
12942 {
12943 string name = i.decode_string();
12944 bufferptr bp;
12945 i.decode_bp(bp);
12946 r = _setattr(txc, c, o, name, bp);
12947 }
12948 break;
12949
12950 case Transaction::OP_SETATTRS:
12951 {
12952 map<string, bufferptr> aset;
12953 i.decode_attrset(aset);
12954 r = _setattrs(txc, c, o, aset);
12955 }
12956 break;
12957
12958 case Transaction::OP_RMATTR:
12959 {
12960 string name = i.decode_string();
12961 r = _rmattr(txc, c, o, name);
12962 }
12963 break;
12964
12965 case Transaction::OP_RMATTRS:
12966 {
12967 r = _rmattrs(txc, c, o);
12968 }
12969 break;
12970
12971 case Transaction::OP_CLONE:
12972 {
12973 OnodeRef& no = ovec[op->dest_oid];
12974 if (!no) {
12975 const ghobject_t& noid = i.get_oid(op->dest_oid);
12976 no = c->get_onode(noid, true);
12977 }
12978 r = _clone(txc, c, o, no);
12979 }
12980 break;
12981
12982 case Transaction::OP_CLONERANGE:
11fdf7f2 12983 ceph_abort_msg("deprecated");
12984 break;
12985
12986 case Transaction::OP_CLONERANGE2:
12987 {
12988 OnodeRef& no = ovec[op->dest_oid];
12989 if (!no) {
12990 const ghobject_t& noid = i.get_oid(op->dest_oid);
12991 no = c->get_onode(noid, true);
12992 }
12993 uint64_t srcoff = op->off;
12994 uint64_t len = op->len;
12995 uint64_t dstoff = op->dest_off;
12996 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
12997 }
12998 break;
12999
13000 case Transaction::OP_COLL_ADD:
11fdf7f2 13001 ceph_abort_msg("not implemented");
13002 break;
13003
13004 case Transaction::OP_COLL_REMOVE:
11fdf7f2 13005 ceph_abort_msg("not implemented");
13006 break;
13007
13008 case Transaction::OP_COLL_MOVE:
11fdf7f2 13009 ceph_abort_msg("deprecated");
13010 break;
13011
13012 case Transaction::OP_COLL_MOVE_RENAME:
13013 case Transaction::OP_TRY_RENAME:
13014 {
11fdf7f2 13015 ceph_assert(op->cid == op->dest_cid);
13016 const ghobject_t& noid = i.get_oid(op->dest_oid);
13017 OnodeRef& no = ovec[op->dest_oid];
13018 if (!no) {
13019 no = c->get_onode(noid, false);
13020 }
13021 r = _rename(txc, c, o, no, noid);
13022 }
13023 break;
13024
13025 case Transaction::OP_OMAP_CLEAR:
13026 {
13027 r = _omap_clear(txc, c, o);
13028 }
13029 break;
13030 case Transaction::OP_OMAP_SETKEYS:
13031 {
13032 bufferlist aset_bl;
13033 i.decode_attrset_bl(&aset_bl);
13034 r = _omap_setkeys(txc, c, o, aset_bl);
13035 }
13036 break;
13037 case Transaction::OP_OMAP_RMKEYS:
13038 {
13039 bufferlist keys_bl;
13040 i.decode_keyset_bl(&keys_bl);
13041 r = _omap_rmkeys(txc, c, o, keys_bl);
13042 }
13043 break;
13044 case Transaction::OP_OMAP_RMKEYRANGE:
13045 {
13046 string first, last;
13047 first = i.decode_string();
13048 last = i.decode_string();
13049 r = _omap_rmkey_range(txc, c, o, first, last);
13050 }
13051 break;
13052 case Transaction::OP_OMAP_SETHEADER:
13053 {
13054 bufferlist bl;
13055 i.decode_bl(bl);
13056 r = _omap_setheader(txc, c, o, bl);
13057 }
13058 break;
13059
13060 case Transaction::OP_SETALLOCHINT:
13061 {
13062 r = _set_alloc_hint(txc, c, o,
13063 op->expected_object_size,
13064 op->expected_write_size,
f67539c2 13065 op->hint);
13066 }
13067 break;
13068
13069 default:
11fdf7f2 13070 derr << __func__ << " bad op " << op->op << dendl;
13071 ceph_abort();
13072 }
13073
13074 endop:
13075 if (r < 0) {
13076 bool ok = false;
13077
13078 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
13079 op->op == Transaction::OP_CLONE ||
13080 op->op == Transaction::OP_CLONERANGE2 ||
13081 op->op == Transaction::OP_COLL_ADD ||
13082 op->op == Transaction::OP_SETATTR ||
13083 op->op == Transaction::OP_SETATTRS ||
13084 op->op == Transaction::OP_RMATTR ||
13085 op->op == Transaction::OP_OMAP_SETKEYS ||
13086 op->op == Transaction::OP_OMAP_RMKEYS ||
13087 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
13088 op->op == Transaction::OP_OMAP_SETHEADER))
13089 // -ENOENT is usually okay
13090 ok = true;
13091 if (r == -ENODATA)
13092 ok = true;
13093
13094 if (!ok) {
13095 const char *msg = "unexpected error code";
13096
13097 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
13098 op->op == Transaction::OP_CLONE ||
13099 op->op == Transaction::OP_CLONERANGE2))
13100 msg = "ENOENT on clone suggests osd bug";
13101
13102 if (r == -ENOSPC)
13103 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13104 // by partially applying transactions.
13105 msg = "ENOSPC from bluestore, misconfigured cluster";
13106
13107 if (r == -ENOTEMPTY) {
13108 msg = "ENOTEMPTY suggests garbage data in osd data dir";
13109 }
13110
13111 derr << __func__ << " error " << cpp_strerror(r)
13112 << " not handled on operation " << op->op
13113 << " (op " << pos << ", counting from 0)"
13114 << dendl;
13115 derr << msg << dendl;
81eedcae 13116 _dump_transaction<0>(cct, t);
11fdf7f2 13117 ceph_abort_msg("unexpected error");
13118 }
13119 }
13120 }
13121}
13122
13123
13124
13125// -----------------
13126// write operations
13127
13128int BlueStore::_touch(TransContext *txc,
13129 CollectionRef& c,
13130 OnodeRef &o)
13131{
13132 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13133 int r = 0;
13134 _assign_nid(txc, o);
13135 txc->write_onode(o);
13136 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13137 return r;
13138}
13139
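// Zero-pads *bl out to chunk_size boundaries, pulling *offset back to the
// preceding boundary. Worked example with hypothetical numbers: *offset =
// 0x1234, length = 0x100, chunk_size = 0x1000 => front_pad = 0x234,
// back_pad = 0xccc, yielding one 0x1000~0x1000 chunk with the payload at
// inner offset 0x234.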
13140void BlueStore::_pad_zeros(
13141 bufferlist *bl, uint64_t *offset,
13142 uint64_t chunk_size)
13143{
13144 auto length = bl->length();
13145 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
13146 << " chunk_size 0x" << chunk_size << std::dec << dendl;
13147 dout(40) << "before:\n";
13148 bl->hexdump(*_dout);
13149 *_dout << dendl;
13150 // front
13151 size_t front_pad = *offset % chunk_size;
13152 size_t back_pad = 0;
13153 size_t pad_count = 0;
13154 if (front_pad) {
11fdf7f2 13155 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
f67539c2 13156 bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
224ce89b 13157 z.zero(0, front_pad, false);
7c673cae 13158 pad_count += front_pad;
9f95a23c 13159 bl->begin().copy(front_copy, z.c_str() + front_pad);
13160 if (front_copy + front_pad < chunk_size) {
13161 back_pad = chunk_size - (length + front_pad);
224ce89b 13162 z.zero(front_pad + length, back_pad, false);
13163 pad_count += back_pad;
13164 }
13165 bufferlist old, t;
13166 old.swap(*bl);
13167 t.substr_of(old, front_copy, length - front_copy);
13168 bl->append(z);
13169 bl->claim_append(t);
13170 *offset -= front_pad;
224ce89b 13171 length += pad_count;
13172 }
13173
13174 // back
13175 uint64_t end = *offset + length;
13176 unsigned back_copy = end % chunk_size;
13177 if (back_copy) {
11fdf7f2 13178 ceph_assert(back_pad == 0);
7c673cae 13179 back_pad = chunk_size - back_copy;
11fdf7f2 13180 ceph_assert(back_copy <= length);
7c673cae 13181 bufferptr tail(chunk_size);
9f95a23c 13182 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
224ce89b 13183 tail.zero(back_copy, back_pad, false);
13184 bufferlist old;
13185 old.swap(*bl);
13186 bl->substr_of(old, 0, length - back_copy);
13187 bl->append(tail);
13188 length += back_pad;
13189 pad_count += back_pad;
13190 }
13191 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
13192 << back_pad << " on front/back, now 0x" << *offset << "~"
13193 << length << std::dec << dendl;
13194 dout(40) << "after:\n";
13195 bl->hexdump(*_dout);
13196 *_dout << dendl;
13197 if (pad_count)
13198 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 13199 ceph_assert(bl->length() == length);
13200}
13201
13202void BlueStore::_do_write_small(
13203 TransContext *txc,
13204 CollectionRef &c,
13205 OnodeRef o,
13206 uint64_t offset, uint64_t length,
13207 bufferlist::iterator& blp,
13208 WriteContext *wctx)
13209{
13210 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13211 << std::dec << dendl;
11fdf7f2 13212 ceph_assert(length < min_alloc_size);
f67539c2 13213
13214 uint64_t end_offs = offset + length;
13215
13216 logger->inc(l_bluestore_write_small);
13217 logger->inc(l_bluestore_write_small_bytes, length);
13218
13219 bufferlist bl;
13220 blp.copy(length, bl);
13221
13222 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13223 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13224 uint32_t alloc_len = min_alloc_size;
13225 auto offset0 = p2align<uint64_t>(offset, alloc_len);
13226
13227 bool any_change;
13228
13229 // search for a suitable extent in both the forward and reverse directions
13230 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
13231 // range, then check if the blob can be reused via the can_reuse_blob func
13232 // or apply a direct/deferred write (the latter only for extents covering
13233 // 'offset' or higher).
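 // Hypothetical example: with offset = 0x30000 and max_bsize = 0x10000,
 // lextents in [0x20000, 0x40000) are faulted in, and candidates are
 // walked forward from the extent containing 'offset' and backward from
 // its predecessor until neither direction makes progress.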
13234 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
13235
13236 // On zoned devices, the first goal is to support non-overwrite workloads,
13237 // such as RGW, with large, aligned objects. Therefore, for user writes
13238 // _do_write_small should not trigger. OSDs, however, write and update a tiny
13239 // amount of metadata, such as OSD maps, to disk. For those cases, we
13240 // temporarily just pad them to min_alloc_size and write them to a new place
13241 // on every update.
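 // Illustrative pad with hypothetical numbers: a 0x100-byte metadata write
 // at offset 0x42 with min_alloc_size = 0x1000 gets b_off = 0x42, is
 // zero-padded by _pad_zeros() to a full 0x0~0x1000 chunk, and lands in a
 // fresh blob while the old 0x42~0x100 lextent is punched out of the map.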
13242 if (bdev->is_smr()) {
13243 BlobRef b = c->new_blob();
13244 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13245 uint64_t b_off0 = b_off;
13246 _pad_zeros(&bl, &b_off0, min_alloc_size);
13247 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13248 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
13249 return;
13250 }
13251
13252 // Look for an existing mutable blob we can use.
13253 auto begin = o->extent_map.extent_map.begin();
13254 auto end = o->extent_map.extent_map.end();
13255 auto ep = o->extent_map.seek_lextent(offset);
13256 if (ep != begin) {
13257 --ep;
13258 if (ep->blob_end() <= offset) {
13259 ++ep;
13260 }
13261 }
13262 auto prev_ep = end;
13263 if (ep != begin) {
13264 prev_ep = ep;
7c673cae 13265 --prev_ep;
13266 }
13267
13268 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13269 // We don't want to have more blobs than the number of min alloc units that
13270 // fit into 2 max blobs
13271 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
13272 bool above_blob_threshold = false;
13273
13274 inspected_blobs.reserve(blob_threshold);
13275
13276 uint64_t max_off = 0;
13277 auto start_ep = ep;
13278 auto end_ep = ep; // exclusively
13279 do {
13280 any_change = false;
13281
13282 if (ep != end && ep->logical_offset < offset + max_bsize) {
13283 BlobRef b = ep->blob;
13284 if (!above_blob_threshold) {
13285 inspected_blobs.insert(&b->get_blob());
13286 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13287 }
13288 max_off = ep->logical_end();
7c673cae 13289 auto bstart = ep->blob_start();
eafe8130 13290
13291 dout(20) << __func__ << " considering " << *b
13292 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13293 if (bstart >= end_offs) {
13294 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13295 } else if (!b->get_blob().is_mutable()) {
13296 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13297 } else if (ep->logical_offset % min_alloc_size !=
13298 ep->blob_offset % min_alloc_size) {
13299 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13300 } else {
13301 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13302 // can we pad our head/tail out with zeros?
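 // p2phase(o, c) is o's offset within its power-of-two chunk; p2nphase(e, c)
 // is the distance from e up to the next chunk boundary (0 if aligned).
 // Hypothetical example with chunk_size = 0x1000: offset = 0x1234 =>
 // head_pad = 0x234; end_offs = 0x1334 => tail_pad = 0x1000 - 0x334 = 0xccc.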
13303 uint64_t head_pad, tail_pad;
13304 head_pad = p2phase(offset, chunk_size);
13305 tail_pad = p2nphase(end_offs, chunk_size);
13306 if (head_pad || tail_pad) {
13307 o->extent_map.fault_range(db, offset - head_pad,
13308 end_offs - offset + head_pad + tail_pad);
13309 }
13310 if (head_pad &&
13311 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13312 head_pad = 0;
13313 }
13314 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13315 tail_pad = 0;
13316 }
13317
13318 uint64_t b_off = offset - head_pad - bstart;
13319 uint64_t b_len = length + head_pad + tail_pad;
13320
13321 // direct write into unused blocks of an existing mutable blob?
13322 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13323 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13324 b->get_blob().is_unused(b_off, b_len) &&
13325 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 13326 _apply_padding(head_pad, tail_pad, bl);
13327
13328 dout(20) << __func__ << " write to unused 0x" << std::hex
13329 << b_off << "~" << b_len
13330 << " pad 0x" << head_pad << " + 0x" << tail_pad
13331 << std::dec << " of mutable " << *b << dendl;
224ce89b 13332 _buffer_cache_write(txc, b, b_off, bl,
13333 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13334
11fdf7f2 13335 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13336 if (b_len <= prefer_deferred_size) {
13337 dout(20) << __func__ << " deferring small 0x" << std::hex
13338 << b_len << std::dec << " unused write via deferred" << dendl;
9f95a23c 13339 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13340 op->op = bluestore_deferred_op_t::OP_WRITE;
13341 b->get_blob().map(
13342 b_off, b_len,
13343 [&](uint64_t offset, uint64_t length) {
13344 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13345 return 0;
13346 });
224ce89b 13347 op->data = bl;
13348 } else {
13349 b->get_blob().map_bl(
224ce89b 13350 b_off, bl,
13351 [&](uint64_t offset, bufferlist& t) {
13352 bdev->aio_write(offset, t,
13353 &txc->ioc, wctx->buffered);
13354 });
13355 }
13356 }
224ce89b 13357 b->dirty_blob().calc_csum(b_off, bl);
13358 dout(20) << __func__ << " lex old " << *ep << dendl;
13359 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13360 b,
13361 &wctx->old_extents);
13362 b->dirty_blob().mark_used(le->blob_offset, le->length);
f67539c2 13363
13364 txc->statfs_delta.stored() += le->length;
13365 dout(20) << __func__ << " lex " << *le << dendl;
13366 logger->inc(l_bluestore_write_small_unused);
13367 return;
13368 }
13369 // read some data to fill out the chunk?
13370 uint64_t head_read = p2phase(b_off, chunk_size);
13371 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
13372 if ((head_read || tail_read) &&
13373 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13374 head_read + tail_read < min_alloc_size) {
13375 b_off -= head_read;
13376 b_len += head_read + tail_read;
13377
13378 } else {
13379 head_read = tail_read = 0;
13380 }
13381
13382 // chunk-aligned deferred overwrite?
13383 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13384 b_off % chunk_size == 0 &&
13385 b_len % chunk_size == 0 &&
13386 b->get_blob().is_allocated(b_off, b_len)) {
13387
224ce89b 13388 _apply_padding(head_pad, tail_pad, bl);
13389
13390 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13391 << " and tail 0x" << tail_read << std::dec << dendl;
13392 if (head_read) {
13393 bufferlist head_bl;
13394 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13395 head_bl, 0);
11fdf7f2 13396 ceph_assert(r >= 0 && r <= (int)head_read);
13397 size_t zlen = head_read - r;
13398 if (zlen) {
13399 head_bl.append_zero(zlen);
13400 logger->inc(l_bluestore_write_pad_bytes, zlen);
13401 }
13402 head_bl.claim_append(bl);
13403 bl.swap(head_bl);
13404 logger->inc(l_bluestore_write_penalty_read_ops);
13405 }
13406 if (tail_read) {
13407 bufferlist tail_bl;
13408 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13409 tail_bl, 0);
11fdf7f2 13410 ceph_assert(r >= 0 && r <= (int)tail_read);
13411 size_t zlen = tail_read - r;
13412 if (zlen) {
13413 tail_bl.append_zero(zlen);
13414 logger->inc(l_bluestore_write_pad_bytes, zlen);
13415 }
224ce89b 13416 bl.claim_append(tail_bl);
13417 logger->inc(l_bluestore_write_penalty_read_ops);
13418 }
f67539c2 13419 logger->inc(l_bluestore_write_small_pre_read);
7c673cae 13420
224ce89b 13421 _buffer_cache_write(txc, b, b_off, bl,
13422 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13423
f67539c2 13424 b->dirty_blob().calc_csum(b_off, bl);
13425
13426 if (!g_conf()->bluestore_debug_omit_block_device_write) {
9f95a23c 13427 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13428 op->op = bluestore_deferred_op_t::OP_WRITE;
13429 int r = b->get_blob().map(
13430 b_off, b_len,
13431 [&](uint64_t offset, uint64_t length) {
13432 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13433 return 0;
13434 });
13435 ceph_assert(r == 0);
f67539c2 13436 op->data = std::move(bl);
13437 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13438 << b_len << std::dec << " of mutable " << *b
13439 << " at " << op->extents << dendl;
13440 }
13441
13442 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13443 b, &wctx->old_extents);
13444 b->dirty_blob().mark_used(le->blob_offset, le->length);
13445 txc->statfs_delta.stored() += le->length;
13446 dout(20) << __func__ << " lex " << *le << dendl;
f67539c2 13447 logger->inc(l_bluestore_write_deferred);
13448 return;
13449 }
13450 // try to reuse blob if we can
13451 if (b->can_reuse_blob(min_alloc_size,
13452 max_bsize,
13453 offset0 - bstart,
13454 &alloc_len)) {
11fdf7f2 13455 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13456 // fit into reused blob
13457 // Need to check for pending writes desiring to
13458 // reuse the same pextent. The rationale is that during GC two chunks
13459 // from garbage blobs (compressed?) can share logical space within the same
13460 // AU. That in turn might be caused by an unaligned len in clone_range2.
13461 // Hence the second write will fail in an attempt to reuse blob at
13462 // do_alloc_write().
13463 if (!wctx->has_conflict(b,
13464 offset0,
13465 offset0 + alloc_len,
13466 min_alloc_size)) {
13467
13468 // we can't reuse pad_head/pad_tail since they might be truncated
13469 // due to existing extents
13470 uint64_t b_off = offset - bstart;
13471 uint64_t b_off0 = b_off;
13472 _pad_zeros(&bl, &b_off0, chunk_size);
13473
13474 dout(20) << __func__ << " reuse blob " << *b << std::hex
13475 << " (0x" << b_off0 << "~" << bl.length() << ")"
13476 << " (0x" << b_off << "~" << length << ")"
13477 << std::dec << dendl;
13478
13479 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13480 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13481 false, false);
13482 logger->inc(l_bluestore_write_small_unused);
13483 return;
13484 }
13485 }
13486 }
13487 ++ep;
eafe8130 13488 end_ep = ep;
13489 any_change = true;
13490 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13491
13492 // check extent for reuse in reverse order
13493 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13494 BlobRef b = prev_ep->blob;
13495 if (!above_blob_threshold) {
13496 inspected_blobs.insert(&b->get_blob());
13497 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13498 }
13499 start_ep = prev_ep;
13500 auto bstart = prev_ep->blob_start();
13501 dout(20) << __func__ << " considering " << *b
13502 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 13503 if (b->can_reuse_blob(min_alloc_size,
13504 max_bsize,
13505 offset0 - bstart,
13506 &alloc_len)) {
11fdf7f2 13507 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13508 // fit into reused blob
13509 // Need to check for pending writes desiring to
13510 // reuse the same pextent. The rationale is that during GC two chunks
13511 // from garbage blobs (compressed?) can share logical space within the same
13512 // AU. That in turn might be caused by an unaligned len in clone_range2.
13513 // Hence the second write will fail in an attempt to reuse blob at
13514 // do_alloc_write().
13515 if (!wctx->has_conflict(b,
13516 offset0,
13517 offset0 + alloc_len,
13518 min_alloc_size)) {
13519
13520 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13521 uint64_t b_off = offset - bstart;
13522 uint64_t b_off0 = b_off;
13523 _pad_zeros(&bl, &b_off0, chunk_size);
13524
13525 dout(20) << __func__ << " reuse blob " << *b << std::hex
13526 << " (0x" << b_off0 << "~" << bl.length() << ")"
13527 << " (0x" << b_off << "~" << length << ")"
13528 << std::dec << dendl;
13529
13530 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13531 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13532 false, false);
13533 logger->inc(l_bluestore_write_small_unused);
13534 return;
13535 }
13536 }
13537 if (prev_ep != begin) {
13538 --prev_ep;
13539 any_change = true;
13540 } else {
13541 prev_ep = end; // to avoid useless first extent re-check
13542 }
13543 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13544 } while (any_change);
13545
13546 if (above_blob_threshold) {
13547 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13548 << " " << std::hex << min_off << "~" << max_off << std::dec
13549 << dendl;
13550 ceph_assert(start_ep != end_ep);
13551 for (auto ep = start_ep; ep != end_ep; ++ep) {
13552 dout(20) << __func__ << " inserting for GC "
13553 << std::hex << ep->logical_offset << "~" << ep->length
13554 << std::dec << dendl;
13555
13556 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13557 }
13558 // insert newly written extent to GC
13559 wctx->extents_to_gc.union_insert(offset, length);
13560 dout(20) << __func__ << " inserting (last) for GC "
13561 << std::hex << offset << "~" << length
13562 << std::dec << dendl;
13563 }
7c673cae 13564 // new blob.
7c673cae 13565 BlobRef b = c->new_blob();
11fdf7f2 13566 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13567 uint64_t b_off0 = b_off;
13568 _pad_zeros(&bl, &b_off0, block_size);
13569 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13570 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13571 min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
13572 // doesn't match disk one only
13573 true);
13574
13575 return;
13576}
13577
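// Decides whether a big overwrite may take the deferred path: the target
// must land inside a mutable blob, align to the blob's chunk size once
// extended by head_read/tail_read, stay within prefer_deferred_size, and be
// fully allocated on disk. Alignment sketch with hypothetical numbers
// (chunk 0x1000): an overwrite at b_off 0x1200 of 0x800 bytes becomes
// b_off 0x1000 with head_read 0x200 and tail_read 0x600, so
// blob_aligned_len() = 0x1000.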
13578bool BlueStore::BigDeferredWriteContext::can_defer(
13579 BlueStore::extent_map_t::iterator ep,
13580 uint64_t prefer_deferred_size,
13581 uint64_t block_size,
13582 uint64_t offset,
13583 uint64_t l)
13584{
13585 bool res = false;
13586 auto& blob = ep->blob->get_blob();
13587 if (offset >= ep->blob_start() &&
13588 blob.is_mutable()) {
13589 off = offset;
13590 b_off = offset - ep->blob_start();
13591 uint64_t chunk_size = blob.get_chunk_size(block_size);
13592 uint64_t ondisk = blob.get_ondisk_length();
13593 used = std::min(l, ondisk - b_off);
13594
13595 // will read some data to fill out the chunk?
13596 head_read = p2phase<uint64_t>(b_off, chunk_size);
13597 tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
13598 b_off -= head_read;
13599
13600 ceph_assert(b_off % chunk_size == 0);
13601 ceph_assert(blob_aligned_len() % chunk_size == 0);
13602
13603 res = blob_aligned_len() <= prefer_deferred_size &&
13604 blob_aligned_len() <= ondisk &&
13605 blob.is_allocated(b_off, blob_aligned_len());
13606 if (res) {
13607 blob_ref = ep->blob;
13608 blob_start = ep->blob_start();
13609 }
13610 }
13611 return res;
13612}
13613
13614bool BlueStore::BigDeferredWriteContext::apply_defer()
13615{
13616 int r = blob_ref->get_blob().map(
13617 b_off, blob_aligned_len(),
13618 [&](const bluestore_pextent_t& pext,
13619 uint64_t offset,
13620 uint64_t length) {
13621 // apply deferred only if the overwrite breaks blob continuity;
13622 // if it totally overlaps some pextent, fall back to a regular write
13623 if (pext.offset < offset ||
13624 pext.end() > offset + length) {
13625 res_extents.emplace_back(bluestore_pextent_t(offset, length));
13626 return 0;
13627 }
13628 return -1;
13629 });
13630 return r >= 0;
13631}
13632
13633void BlueStore::_do_write_big_apply_deferred(
13634 TransContext* txc,
13635 CollectionRef& c,
13636 OnodeRef o,
13637 BlueStore::BigDeferredWriteContext& dctx,
13638 bufferlist::iterator& blp,
13639 WriteContext* wctx)
13640{
13641 bufferlist bl;
13642 dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
13643 << " and tail 0x" << dctx.tail_read << std::dec << dendl;
13644 if (dctx.head_read) {
13645 int r = _do_read(c.get(), o,
13646 dctx.off - dctx.head_read,
13647 dctx.head_read,
13648 bl,
13649 0);
13650 ceph_assert(r >= 0 && r <= (int)dctx.head_read);
13651 size_t zlen = dctx.head_read - r;
13652 if (zlen) {
13653 bl.append_zero(zlen);
13654 logger->inc(l_bluestore_write_pad_bytes, zlen);
13655 }
13656 logger->inc(l_bluestore_write_penalty_read_ops);
13657 }
13658 blp.copy(dctx.used, bl);
13659
13660 if (dctx.tail_read) {
13661 bufferlist tail_bl;
13662 int r = _do_read(c.get(), o,
13663 dctx.off + dctx.used, dctx.tail_read,
13664 tail_bl, 0);
13665 ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
13666 size_t zlen = dctx.tail_read - r;
13667 if (zlen) {
13668 tail_bl.append_zero(zlen);
13669 logger->inc(l_bluestore_write_pad_bytes, zlen);
13670 }
13671 bl.claim_append(tail_bl);
13672 logger->inc(l_bluestore_write_penalty_read_ops);
13673 }
13674 auto& b0 = dctx.blob_ref;
13675 _buffer_cache_write(txc, b0, dctx.b_off, bl,
13676 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13677
13678 b0->dirty_blob().calc_csum(dctx.b_off, bl);
13679
13680 Extent* le = o->extent_map.set_lextent(c, dctx.off,
13681 dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
13682
13683 // in fact this is a no-op for big writes, but it is kept here to maintain
13684 // uniformity and so it is not missed after some future refactor.
13685 b0->dirty_blob().mark_used(le->blob_offset, le->length);
13686 txc->statfs_delta.stored() += le->length;
13687
13688 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13689 bluestore_deferred_op_t* op = _get_deferred_op(txc);
13690 op->op = bluestore_deferred_op_t::OP_WRITE;
13691 op->extents.swap(dctx.res_extents);
13692 op->data = std::move(bl);
13693 }
13694}
13695
13696void BlueStore::_do_write_big(
13697 TransContext *txc,
13698 CollectionRef &c,
13699 OnodeRef o,
13700 uint64_t offset, uint64_t length,
13701 bufferlist::iterator& blp,
13702 WriteContext *wctx)
13703{
13704 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13705 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13706 << " compress " << (int)wctx->compress
13707 << dendl;
13708 logger->inc(l_bluestore_write_big);
13709 logger->inc(l_bluestore_write_big_bytes, length);
11fdf7f2 13710 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
f67539c2 13711 uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
13712 while (length > 0) {
13713 bool new_blob = false;
11fdf7f2 13714 uint32_t l = std::min(max_bsize, length);
13715 BlobRef b;
13716 uint32_t b_off = 0;
13717
13718 //attempting to reuse existing blob
13719 if (!wctx->compress) {
7c673cae 13720 auto end = o->extent_map.extent_map.end();
13721
13722 if (prefer_deferred_size_snapshot &&
13723 l <= prefer_deferred_size_snapshot * 2) {
13724 // A single write that spans two adjacent existing blobs can result
13725 // in up to two deferred blocks of 'prefer_deferred_size'.
13726 // So we try to minimize the number of resulting blobs
13727 // and preserve the 2 existing blobs rather than inserting one more in between.
13728 // E.g. a write of 0x10000~20000 over existing blobs
13729 // (0x0~20000 and 0x20000~20000) is better off (from a subsequent read
13730 // performance point of view) resulting in two deferred writes to the
13731 // existing blobs than ending up with 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
13732
13733 // look for an existing mutable blob we can write into
13734 auto ep = o->extent_map.seek_lextent(offset);
13735 auto ep_next = end;
13736 BigDeferredWriteContext head_info, tail_info;
13737
13738 bool will_defer = ep != end ?
13739 head_info.can_defer(ep,
13740 prefer_deferred_size_snapshot,
13741 block_size,
13742 offset,
13743 l) :
13744 false;
13745 auto offset_next = offset + head_info.used;
13746 auto remaining = l - head_info.used;
13747 if (will_defer && remaining) {
13748 will_defer = false;
13749 if (remaining <= prefer_deferred_size_snapshot) {
13750 ep_next = o->extent_map.seek_lextent(offset_next);
13751 // check if we can defer remaining totally
13752 will_defer = ep_next == end ?
13753 false :
13754 tail_info.can_defer(ep_next,
13755 prefer_deferred_size_snapshot,
13756 block_size,
13757 offset_next,
13758 remaining);
13759 will_defer = will_defer && remaining == tail_info.used;
13760 }
13761 }
13762 if (will_defer) {
13763 dout(20) << __func__ << " " << *(head_info.blob_ref)
13764 << " deferring big " << std::hex
13765 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
13766 << std::dec << " write via deferred"
13767 << dendl;
13768 if (remaining) {
13769 dout(20) << __func__ << " " << *(tail_info.blob_ref)
13770 << " deferring big " << std::hex
13771 << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
13772 << std::dec << " write via deferred"
13773 << dendl;
13774 }
13775
13776 will_defer = head_info.apply_defer();
13777 if (!will_defer) {
13778 dout(20) << __func__
13779 << " deferring big fell back, head isn't continuous"
13780 << dendl;
13781 } else if (remaining) {
13782 will_defer = tail_info.apply_defer();
13783 if (!will_defer) {
13784 dout(20) << __func__
13785 << " deferring big fell back, tail isn't continuous"
13786 << dendl;
13787 }
13788 }
13789 }
13790 if (will_defer) {
13791 _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
13792 if (remaining) {
13793 _do_write_big_apply_deferred(txc, c, o, tail_info,
13794 blp, wctx);
13795 }
13796 offset += l;
13797 length -= l;
13798 logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
13799 logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
13800 continue;
13801 }
13802 }
13803
13804 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13805
13806 // seek again as punch_hole could invalidate ep
7c673cae 13807 auto ep = o->extent_map.seek_lextent(offset);
13808 auto begin = o->extent_map.extent_map.begin();
13809 auto prev_ep = end;
13810 if (ep != begin) {
13811 prev_ep = ep;
7c673cae 13812 --prev_ep;
7c673cae 13813 }
13814 dout(20) << __func__ << " no deferred" << dendl;
13815
13816 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13817 // search suitable extent in both forward and reverse direction in
13818 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 13819 // then check if blob can be reused via can_reuse_blob func.
13820 bool any_change;
13821 do {
13822 any_change = false;
13823 if (ep != end && ep->logical_offset < offset + max_bsize) {
13824 dout(20) << __func__ << " considering " << *ep << dendl;
13825 dout(20) << __func__ << " considering " << *(ep->blob)
13826 << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
13827
13828 if (offset >= ep->blob_start() &&
224ce89b 13829 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13830 offset - ep->blob_start(),
13831 &l)) {
13832 b = ep->blob;
f67539c2 13833 b_off = offset - ep->blob_start();
13834 prev_ep = end; // to avoid check below
13835 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13836 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13837 } else {
13838 ++ep;
13839 any_change = true;
13840 }
13841 }
13842
13843 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13844 dout(20) << __func__ << " considering rev " << *prev_ep << dendl;
13845 dout(20) << __func__ << " considering reverse " << *(prev_ep->blob)
13846 << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
13847 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13848 offset - prev_ep->blob_start(),
13849 &l)) {
13850 b = prev_ep->blob;
13851 b_off = offset - prev_ep->blob_start();
13852 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13853 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13854 } else if (prev_ep != begin) {
13855 --prev_ep;
13856 any_change = true;
13857 } else {
13858 prev_ep = end; // to avoid useless first extent re-check
13859 }
13860 }
13861 } while (b == nullptr && any_change);
13862 } else {
13863 o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
13864 } // if (!wctx->compress)
13865
13866 if (b == nullptr) {
13867 b = c->new_blob();
13868 b_off = 0;
13869 new_blob = true;
13870 }
13871 bufferlist t;
13872 blp.copy(l, t);
13873 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13874 offset += l;
13875 length -= l;
13876 logger->inc(l_bluestore_write_big_blobs);
13877 }
13878}
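// ------------------------------------------------------------------
// Editor's sketch, not part of the original source: the loop above
// cuts a big write into target-blob-sized pieces before trying to
// reuse or allocate blobs. A standalone model of that chunking, where
// max_bsize stands for max(target_blob_size, min_alloc_size) as
// computed by the caller and the helper name is hypothetical:
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static std::vector<std::pair<uint64_t, uint32_t>>
chop_big_write(uint64_t offset, uint64_t length, uint32_t max_bsize)
{
  std::vector<std::pair<uint64_t, uint32_t>> pieces;
  while (length > 0) {
    // never emit a piece larger than the target blob size
    uint32_t l = std::min<uint64_t>(max_bsize, length);
    pieces.emplace_back(offset, l);
    offset += l;
    length -= l;
  }
  return pieces;
}
// e.g. chop_big_write(0x0, 0x30000, 0x10000) yields three 0x10000
// pieces, matching the three passes the loop above would make.
// ------------------------------------------------------------------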
13879
13880int BlueStore::_do_alloc_write(
13881 TransContext *txc,
13882 CollectionRef coll,
13883 OnodeRef o,
13884 WriteContext *wctx)
13885{
13886 dout(20) << __func__ << " txc " << txc
13887 << " " << wctx->writes.size() << " blobs"
13888 << dendl;
13889 if (wctx->writes.empty()) {
13890 return 0;
13891 }
13892
13893 CompressorRef c;
13894 double crr = 0;
13895 if (wctx->compress) {
13896 c = select_option(
13897 "compression_algorithm",
13898 compressor,
13899 [&]() {
13900 string val;
13901 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13902 CompressorRef cp = compressor;
13903 if (!cp || cp->get_type_name() != val) {
13904 cp = Compressor::create(cct, val);
13905 if (!cp) {
13906 if (_set_compression_alert(false, val.c_str())) {
13907 derr << __func__ << " unable to initialize " << val.c_str()
13908 << " compressor" << dendl;
13909 }
13910 }
13911 }
13912 return boost::optional<CompressorRef>(cp);
13913 }
13914 return boost::optional<CompressorRef>();
13915 }
13916 );
13917
13918 crr = select_option(
13919 "compression_required_ratio",
13920 cct->_conf->bluestore_compression_required_ratio,
13921 [&]() {
13922 double val;
3efd9988 13923 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13924 return boost::optional<double>(val);
13925 }
13926 return boost::optional<double>();
13927 }
13928 );
13929 }
13930
13931 // checksum
11fdf7f2 13932 int64_t csum = csum_type.load();
13933 csum = select_option(
13934 "csum_type",
13935 csum,
13936 [&]() {
11fdf7f2 13937 int64_t val;
3efd9988 13938 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 13939 return boost::optional<int64_t>(val);
7c673cae 13940 }
11fdf7f2 13941 return boost::optional<int64_t>();
13942 }
13943 );
13944
13945 // compress (as needed) and calc needed space
13946 uint64_t need = 0;
11fdf7f2 13947 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 13948 for (auto& wi : wctx->writes) {
3efd9988 13949 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 13950 auto start = mono_clock::now();
13951
13952 // compress
13953 ceph_assert(wi.b_off == 0);
13954 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 13955
13956 // FIXME: memory alignment here is bad
13957 bufferlist t;
13958 boost::optional<int32_t> compressor_message;
13959 int r = c->compress(wi.bl, t, compressor_message);
3efd9988 13960 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 13961 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
13962 bool rejected = false;
13963 uint64_t compressed_len = t.length();
13964 // do an approximate (fast) estimation for resulting blob size
13965 // that doesn't take header overhead into account
11fdf7f2 13966 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
13967 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13968 bluestore_compression_header_t chdr;
13969 chdr.type = c->get_type();
13970 chdr.length = t.length();
f67539c2 13971 chdr.compressor_message = compressor_message;
13972 encode(chdr, wi.compressed_bl);
13973 wi.compressed_bl.claim_append(t);
13974
13975 compressed_len = wi.compressed_bl.length();
11fdf7f2 13976 result_len = p2roundup(compressed_len, min_alloc_size);
13977 if (result_len <= want_len && result_len < wi.blob_length) {
13978 // Cool. We compressed at least as much as we were hoping to.
13979 // pad out to min_alloc_size
13980 wi.compressed_bl.append_zero(result_len - compressed_len);
13981 wi.compressed_len = compressed_len;
13982 wi.compressed = true;
13983 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13984 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13985 << " -> 0x" << compressed_len << " => 0x" << result_len
13986 << " with " << c->get_type()
13987 << std::dec << dendl;
13988 txc->statfs_delta.compressed() += compressed_len;
13989 txc->statfs_delta.compressed_original() += wi.blob_length;
13990 txc->statfs_delta.compressed_allocated() += result_len;
13991 logger->inc(l_bluestore_compress_success_count);
13992 need += result_len;
13993 } else {
13994 rejected = true;
13995 }
13996 } else if (r != 0) {
13997 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13998 << " bytes compressed using " << c->get_type_name()
13999 << std::dec
14000 << " failed with errcode = " << r
14001 << ", leaving uncompressed"
14002 << dendl;
14003 logger->inc(l_bluestore_compress_rejected_count);
14004 need += wi.blob_length;
7c673cae 14005 } else {
14006 rejected = true;
14007 }
14008
14009 if (rejected) {
3efd9988 14010 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 14011 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
14012 << " with " << c->get_type()
14013 << ", which is more than required 0x" << want_len_raw
7c673cae 14014 << " -> 0x" << want_len
14015 << ", leaving uncompressed"
14016 << std::dec << dendl;
14017 logger->inc(l_bluestore_compress_rejected_count);
14018 need += wi.blob_length;
7c673cae 14019 }
14020 log_latency("compress@_do_alloc_write",
14021 l_bluestore_compress_lat,
14022 mono_clock::now() - start,
14023 cct->_conf->bluestore_log_op_age );
14024 } else {
14025 need += wi.blob_length;
7c673cae 14026 }
3efd9988 14027 }
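// Editor's note, illustrative only: the accept/reject test above boils
// down to comparing min_alloc_size-rounded lengths (p2roundup() from
// include/intarith.h rounds up to a power-of-two multiple):
//   want_len   = p2roundup(uint64_t(blob_length * crr), min_alloc_size)
//   result_len = p2roundup(compressed_len,              min_alloc_size)
//   accept iff result_len <= want_len && result_len < blob_length
// Worked example: blob_length = 0x10000, crr = 0.875, min_alloc_size =
// 0x1000 give want_len = 0xe000, so a 0x9f00-byte compressed copy
// (result_len 0xa000) is kept, while 0xe5ff (result_len 0xf000) is
// rejected and the blob stays uncompressed.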
a8e16298 14028 PExtentVector prealloc;
3efd9988 14029 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 14030 int64_t prealloc_left = 0;
f67539c2 14031 prealloc_left = shared_alloc.a->allocate(
14032 need, min_alloc_size, need,
14033 0, &prealloc);
eafe8130 14034 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 14035 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 14036 << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2 14037 << " min_alloc_size 0x" << min_alloc_size
f67539c2 14038 << " available 0x" << shared_alloc.a->get_free()
14039 << std::dec << dendl;
14040 if (prealloc.size()) {
f67539c2 14041 shared_alloc.a->release(prealloc);
11fdf7f2 14042 }
14043 return -ENOSPC;
14044 }
9f95a23c 14045 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
a8e16298 14046
14047 if (bdev->is_smr()) {
14048 std::deque<uint64_t> zones_to_clean;
14049 if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
14050 std::lock_guard l{zoned_cleaner_lock};
14051 zoned_cleaner_queue.swap(zones_to_clean);
14052 zoned_cleaner_cond.notify_one();
14053 }
14054 }
14055
14056 dout(20) << __func__ << " prealloc " << prealloc << dendl;
14057 auto prealloc_pos = prealloc.begin();
14058
14059 for (auto& wi : wctx->writes) {
14060 BlobRef b = wi.b;
14061 bluestore_blob_t& dblob = b->dirty_blob();
14062 uint64_t b_off = wi.b_off;
14063 bufferlist *l = &wi.bl;
14064 uint64_t final_length = wi.blob_length;
14065 uint64_t csum_length = wi.blob_length;
14066 if (wi.compressed) {
14067 final_length = wi.compressed_bl.length();
14068 csum_length = final_length;
adb31ebb 14069 unsigned csum_order = ctz(csum_length);
14070 l = &wi.compressed_bl;
14071 dblob.set_compressed(wi.blob_length, wi.compressed_len);
14072 if (csum != Checksummer::CSUM_NONE) {
14073 dout(20) << __func__ << " initialize csum setting for compressed blob " << *b
14074 << " csum_type " << Checksummer::get_csum_type_string(csum)
14075 << " csum_order " << csum_order
14076 << " csum_length 0x" << std::hex << csum_length
14077 << " blob_length 0x" << wi.blob_length
14078 << " compressed_length 0x" << wi.compressed_len << std::dec
14079 << dendl;
14080 dblob.init_csum(csum, csum_order, csum_length);
14081 }
3efd9988 14082 } else if (wi.new_blob) {
adb31ebb 14083 unsigned csum_order;
7c673cae 14084 // initialize newly created blob only
11fdf7f2 14085 ceph_assert(dblob.is_mutable());
14086 if (l->length() != wi.blob_length) {
14087 // hrm, maybe we could do better here, but let's not bother.
14088 dout(20) << __func__ << " forcing csum_order to block_size_order "
14089 << block_size_order << dendl;
31f18b77 14090 csum_order = block_size_order;
14091 } else {
14092 csum_order = std::min(wctx->csum_order, ctz(l->length()));
14093 }
14094 // try to align blob with max_blob_size to improve
14095 // its reuse ratio, e.g. in case of reverse write
14096 uint32_t suggested_boff =
14097 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
14098 if ((suggested_boff % (1 << csum_order)) == 0 &&
14099 suggested_boff + final_length <= max_bsize &&
14100 suggested_boff > b_off) {
181888fb 14101 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 14102 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 14103 ceph_assert(suggested_boff >= b_off);
14104 csum_length += suggested_boff - b_off;
14105 b_off = suggested_boff;
14106 }
14107 if (csum != Checksummer::CSUM_NONE) {
14108 dout(20) << __func__ << " initialize csum setting for new blob " << *b
14109 << " csum_type " << Checksummer::get_csum_type_string(csum)
14110 << " csum_order " << csum_order
14111 << " csum_length 0x" << std::hex << csum_length << std::dec
14112 << dendl;
14113 dblob.init_csum(csum, csum_order, csum_length);
14114 }
14115 }
14116
a8e16298 14117 PExtentVector extents;
14118 int64_t left = final_length;
14119 while (left > 0) {
11fdf7f2 14120 ceph_assert(prealloc_left > 0);
14121 if (prealloc_pos->length <= left) {
14122 prealloc_left -= prealloc_pos->length;
14123 left -= prealloc_pos->length;
14124 txc->statfs_delta.allocated() += prealloc_pos->length;
14125 extents.push_back(*prealloc_pos);
14126 ++prealloc_pos;
14127 } else {
14128 extents.emplace_back(prealloc_pos->offset, left);
14129 prealloc_pos->offset += left;
14130 prealloc_pos->length -= left;
14131 prealloc_left -= left;
14132 txc->statfs_delta.allocated() += left;
14133 left = 0;
14134 break;
14135 }
14136 }
7c673cae 14137 for (auto& p : extents) {
3efd9988 14138 txc->allocated.insert(p.offset, p.length);
7c673cae 14139 }
11fdf7f2 14140 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 14141
14142 dout(20) << __func__ << " blob " << *b << dendl;
14143 if (dblob.has_csum()) {
14144 dblob.calc_csum(b_off, *l);
14145 }
181888fb 14146
7c673cae 14147 if (wi.mark_unused) {
1911f103 14148 ceph_assert(!dblob.is_compressed());
14149 auto b_end = b_off + wi.bl.length();
14150 if (b_off) {
14151 dblob.add_unused(0, b_off);
14152 }
14153 uint64_t llen = dblob.get_logical_length();
14154 if (b_end < llen) {
14155 dblob.add_unused(b_end, llen - b_end);
14156 }
14157 }
14158
14159 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
14160 b_off + (wi.b_off0 - wi.b_off),
14161 wi.length0,
14162 wi.b,
14163 nullptr);
14164 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
14165 txc->statfs_delta.stored() += le->length;
14166 dout(20) << __func__ << " lex " << *le << dendl;
14167 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
14168 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
14169
14170 // queue io
11fdf7f2 14171 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae 14172 if (l->length() <= prefer_deferred_size.load()) {
f67539c2 14173 dout(20) << __func__ << " deferring 0x" << std::hex
7c673cae 14174 << l->length() << std::dec << " write via deferred" << dendl;
9f95a23c 14175 bluestore_deferred_op_t *op = _get_deferred_op(txc);
14176 op->op = bluestore_deferred_op_t::OP_WRITE;
14177 int r = b->get_blob().map(
14178 b_off, l->length(),
14179 [&](uint64_t offset, uint64_t length) {
14180 op->extents.emplace_back(bluestore_pextent_t(offset, length));
14181 return 0;
14182 });
11fdf7f2 14183 ceph_assert(r == 0);
7c673cae 14184 op->data = *l;
f67539c2 14185 logger->inc(l_bluestore_write_deferred);
14186 } else {
14187 b->get_blob().map_bl(
14188 b_off, *l,
14189 [&](uint64_t offset, bufferlist& t) {
14190 bdev->aio_write(offset, t, &txc->ioc, false);
14191 });
f67539c2 14192 logger->inc(l_bluestore_write_new);
14193 }
14194 }
14195 }
14196 ceph_assert(prealloc_pos == prealloc.end());
14197 ceph_assert(prealloc_left == 0);
14198 return 0;
14199}
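// ------------------------------------------------------------------
// Editor's sketch, not part of the original source: the inner while
// loop above carves each blob's physical extents off the front of the
// shared preallocation. The same logic in isolation; pext is a
// simplified stand-in for bluestore_pextent_t:
#include <algorithm>
#include <cstdint>
#include <deque>
#include <vector>

struct pext { uint64_t offset; uint64_t length; };

static std::vector<pext> carve(std::deque<pext>& prealloc, uint64_t need)
{
  std::vector<pext> out;
  while (need > 0 && !prealloc.empty()) {
    pext& p = prealloc.front();
    uint64_t take = std::min(need, p.length);  // split the front extent
    out.push_back({p.offset, take});
    p.offset += take;
    p.length -= take;
    need -= take;
    if (p.length == 0)
      prealloc.pop_front();  // extent fully consumed
  }
  return out;
}
// One allocate() call serves all blobs, then the carving runs per
// blob; this is why prealloc_left is asserted to be exactly zero at
// the end of _do_alloc_write().
// ------------------------------------------------------------------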
14200
14201void BlueStore::_wctx_finish(
14202 TransContext *txc,
14203 CollectionRef& c,
14204 OnodeRef o,
14205 WriteContext *wctx,
14206 set<SharedBlob*> *maybe_unshared_blobs)
14207{
14208 auto oep = wctx->old_extents.begin();
14209 while (oep != wctx->old_extents.end()) {
14210 auto &lo = *oep;
14211 oep = wctx->old_extents.erase(oep);
14212 dout(20) << __func__ << " lex_old " << lo.e << dendl;
14213 BlobRef b = lo.e.blob;
14214 const bluestore_blob_t& blob = b->get_blob();
14215 if (blob.is_compressed()) {
14216 if (lo.blob_empty) {
14217 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
14218 }
14219 txc->statfs_delta.compressed_original() -= lo.e.length;
14220 }
14221 auto& r = lo.r;
14222 txc->statfs_delta.stored() -= lo.e.length;
14223 if (!r.empty()) {
f67539c2 14224 dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
14225 if (blob.is_shared()) {
14226 PExtentVector final;
14227 c->load_shared_blob(b->shared_blob);
14228 bool unshare = false;
14229 bool* unshare_ptr =
14230 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 14231 for (auto e : r) {
14232 b->shared_blob->put_ref(
14233 e.offset, e.length, &final,
14234 unshare_ptr);
14235 }
14236 if (unshare) {
14237 ceph_assert(maybe_unshared_blobs);
14238 maybe_unshared_blobs->insert(b->shared_blob.get());
14239 }
14240 dout(20) << __func__ << " shared_blob release " << final
14241 << " from " << *b->shared_blob << dendl;
14242 txc->write_shared_blob(b->shared_blob);
14243 r.clear();
14244 r.swap(final);
14245 }
14246 }
14247 // we can't invalidate our logical extents as we drop them because
14248 // other lextents (either in our onode or others) may still
14249 // reference them. but we can throw out anything that is no
14250 // longer allocated. Note that this will leave behind edge bits
14251 // that are no longer referenced but not deallocated (until they
14252 // age out of the cache naturally).
14253 b->discard_unallocated(c.get());
14254 for (auto e : r) {
14255 dout(20) << __func__ << " release " << e << dendl;
14256 txc->released.insert(e.offset, e.length);
14257 txc->statfs_delta.allocated() -= e.length;
14258 if (blob.is_compressed()) {
14259 txc->statfs_delta.compressed_allocated() -= e.length;
14260 }
14261 }
14262
14263 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
14264 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14265 << dendl;
14266 o->extent_map.spanning_blob_map.erase(b->id);
14267 }
9f95a23c 14268 delete &lo;
14269 }
14270}
14271
14272void BlueStore::_do_write_data(
14273 TransContext *txc,
14274 CollectionRef& c,
14275 OnodeRef o,
14276 uint64_t offset,
14277 uint64_t length,
14278 bufferlist& bl,
14279 WriteContext *wctx)
14280{
14281 uint64_t end = offset + length;
14282 bufferlist::iterator p = bl.begin();
14283
14284 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14285 (length != min_alloc_size)) {
14286 // we fall within the same block
14287 _do_write_small(txc, c, o, offset, length, p, wctx);
14288 } else {
14289 uint64_t head_offset, head_length;
14290 uint64_t middle_offset, middle_length;
14291 uint64_t tail_offset, tail_length;
14292
14293 head_offset = offset;
11fdf7f2 14294 head_length = p2nphase(offset, min_alloc_size);
7c673cae 14295
14296 tail_offset = p2align(end, min_alloc_size);
14297 tail_length = p2phase(end, min_alloc_size);
14298
14299 middle_offset = head_offset + head_length;
14300 middle_length = length - head_length - tail_length;
14301
14302 if (head_length) {
14303 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14304 }
14305
f67539c2 14306 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
14307
14308 if (tail_length) {
14309 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14310 }
14311 }
14312}
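// Editor's note, illustrative only: with min_alloc_size a power of
// two, the split above is pure bit arithmetic (p2nphase/p2phase/
// p2align from include/intarith.h). Worked example with
// min_alloc_size = 0x1000 and a write 0x800~0x2000 (end = 0x2800):
//   head_length   = p2nphase(0x800, 0x1000) = 0x800   -> small write
//   tail_offset   = p2align(0x2800, 0x1000) = 0x2000
//   tail_length   = p2phase(0x2800, 0x1000) = 0x800   -> small write
//   middle_offset = 0x1000, middle_length  = 0x1000   -> big write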
14313
14314void BlueStore::_choose_write_options(
14315 CollectionRef& c,
14316 OnodeRef o,
14317 uint32_t fadvise_flags,
14318 WriteContext *wctx)
7c673cae 14319{
14320 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14321 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 14322 wctx->buffered = true;
14323 } else if (cct->_conf->bluestore_default_buffered_write &&
14324 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14325 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14326 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 14327 wctx->buffered = true;
14328 }
14329
14330 // apply basic csum block size
14331 wctx->csum_order = block_size_order;
14332
14333 // compression parameters
14334 unsigned alloc_hints = o->onode.alloc_hint_flags;
14335 auto cm = select_option(
14336 "compression_mode",
31f18b77 14337 comp_mode.load(),
14338 [&]() {
14339 string val;
11fdf7f2 14340 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
14341 return boost::optional<Compressor::CompressionMode>(
14342 Compressor::get_comp_mode_type(val));
14343 }
14344 return boost::optional<Compressor::CompressionMode>();
14345 }
14346 );
14347
14348 wctx->compress = (cm != Compressor::COMP_NONE) &&
14349 ((cm == Compressor::COMP_FORCE) ||
14350 (cm == Compressor::COMP_AGGRESSIVE &&
14351 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14352 (cm == Compressor::COMP_PASSIVE &&
14353 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
14354
14355 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14356 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
14357 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14358 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 14359 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 14360
7c673cae 14361 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 14362
7c673cae 14363 if (o->onode.expected_write_size) {
224ce89b 14364 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 14365 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 14366 } else {
224ce89b 14367 wctx->csum_order = min_alloc_size_order;
14368 }
14369
14370 if (wctx->compress) {
14371 wctx->target_blob_size = select_option(
7c673cae 14372 "compression_max_blob_size",
31f18b77 14373 comp_max_blob_size.load(),
7c673cae 14374 [&]() {
14375 int64_t val;
14376 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
14377 return boost::optional<uint64_t>((uint64_t)val);
14378 }
14379 return boost::optional<uint64_t>();
14380 }
14381 );
14382 }
14383 } else {
14384 if (wctx->compress) {
14385 wctx->target_blob_size = select_option(
7c673cae 14386 "compression_min_blob_size",
31f18b77 14387 comp_min_blob_size.load(),
7c673cae 14388 [&]() {
14389 int64_t val;
14390 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
14391 return boost::optional<uint64_t>((uint64_t)val);
14392 }
14393 return boost::optional<uint64_t>();
14394 }
14395 );
14396 }
14397 }
31f18b77 14398
7c673cae 14399 uint64_t max_bsize = max_blob_size.load();
14400 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14401 wctx->target_blob_size = max_bsize;
7c673cae 14402 }
31f18b77 14403
14404 // set the min blob size floor at 2x the min_alloc_size, or else we
14405 // won't be able to allocate a smaller extent for the compressed
14406 // data.
14407 if (wctx->compress &&
14408 wctx->target_blob_size < min_alloc_size * 2) {
14409 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 14410 }
14411
14412 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14413 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
14414 << " compress=" << (int)wctx->compress
14415 << " buffered=" << (int)wctx->buffered
14416 << std::dec << dendl;
14417}
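// Editor's note, illustrative only: for sequential, non-random hints
// the checksum granularity follows the expected write size; ctz() of a
// power-of-two size is its log2. E.g. expected_write_size = 64 KiB on
// a device with min_alloc_size_order = 12 (4 KiB) gives
//   csum_order = max(12, ctz(0x10000)) = 16
// i.e. one checksum per 64 KiB chunk rather than per 4 KiB block.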
14418
14419int BlueStore::_do_gc(
14420 TransContext *txc,
14421 CollectionRef& c,
14422 OnodeRef o,
14423 const WriteContext& wctx,
14424 uint64_t *dirty_start,
14425 uint64_t *dirty_end)
14426{
31f18b77 14427
1adf2230 14428 bool dirty_range_updated = false;
31f18b77 14429 WriteContext wctx_gc;
7c673cae 14430 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 14431
eafe8130 14432 auto & extents_to_collect = wctx.extents_to_gc;
14433 for (auto it = extents_to_collect.begin();
14434 it != extents_to_collect.end();
14435 ++it) {
14436 bufferlist bl;
14437 auto offset = (*it).first;
14438 auto length = (*it).second;
14439 dout(20) << __func__ << " processing " << std::hex
14440 << offset << "~" << length << std::dec
14441 << dendl;
14442 int r = _do_read(c.get(), o, offset, length, bl, 0);
14443 ceph_assert(r == (int)length);
31f18b77 14444
14445 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14446 logger->inc(l_bluestore_gc_merged, length);
31f18b77 14447
14448 if (*dirty_start > offset) {
14449 *dirty_start = offset;
1adf2230 14450 dirty_range_updated = true;
14451 }
14452
14453 if (*dirty_end < offset + length) {
14454 *dirty_end = offset + length;
1adf2230 14455 dirty_range_updated = true;
14456 }
14457 }
14458 if (dirty_range_updated) {
14459 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14460 }
14461
14462 dout(30) << __func__ << " alloc write" << dendl;
14463 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14464 if (r < 0) {
14465 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14466 << dendl;
14467 return r;
14468 }
14469
14470 _wctx_finish(txc, c, o, &wctx_gc);
14471 return 0;
14472}
14473
14474int BlueStore::_do_write(
14475 TransContext *txc,
14476 CollectionRef& c,
14477 OnodeRef o,
14478 uint64_t offset,
14479 uint64_t length,
14480 bufferlist& bl,
14481 uint32_t fadvise_flags)
14482{
14483 int r = 0;
14484
14485 dout(20) << __func__
14486 << " " << o->oid
14487 << " 0x" << std::hex << offset << "~" << length
14488 << " - have 0x" << o->onode.size
14489 << " (" << std::dec << o->onode.size << ")"
14490 << " bytes" << std::hex
14491 << " fadvise_flags 0x" << fadvise_flags
14492 << " alloc_hint 0x" << o->onode.alloc_hint_flags
14493 << " expected_object_size " << o->onode.expected_object_size
14494 << " expected_write_size " << o->onode.expected_write_size
14495 << std::dec
31f18b77 14496 << dendl;
81eedcae 14497 _dump_onode<30>(cct, *o);
14498
14499 if (length == 0) {
14500 return 0;
14501 }
14502
14503 uint64_t end = offset + length;
14504
14505 GarbageCollector gc(c->store->cct);
eafe8130 14506 int64_t benefit = 0;
14507 auto dirty_start = offset;
14508 auto dirty_end = end;
14509
14510 WriteContext wctx;
14511 _choose_write_options(c, o, fadvise_flags, &wctx);
14512 o->extent_map.fault_range(db, offset, length);
14513 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14514 r = _do_alloc_write(txc, c, o, &wctx);
14515 if (r < 0) {
14516 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14517 << dendl;
14518 goto out;
14519 }
14520
14521 if (wctx.extents_to_gc.empty() ||
14522 wctx.extents_to_gc.range_start() > offset ||
14523 wctx.extents_to_gc.range_end() < offset + length) {
14524 benefit = gc.estimate(offset,
14525 length,
14526 o->extent_map,
14527 wctx.old_extents,
14528 min_alloc_size);
14529 }
14530
14531 if (bdev->is_smr()) {
14532 if (wctx.old_extents.empty()) {
14533 txc->zoned_note_new_object(o);
14534 } else {
14535 int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14536 txc->zoned_note_updated_object(o, old_ondisk_offset);
14537 }
14538 }
14539
14540 // NB: _wctx_finish() will empty old_extents
14541 // so we must do gc estimation before that
14542 _wctx_finish(txc, c, o, &wctx);
14543 if (end > o->onode.size) {
14544 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 14545 << std::dec << dendl;
14546 o->onode.size = end;
14547 }
14548
11fdf7f2 14549 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14550 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14551 dout(20) << __func__
14552 << " perform garbage collection for compressed extents, "
14553 << "expected benefit = " << benefit << " AUs" << dendl;
14554 }
14555 if (!wctx.extents_to_gc.empty()) {
14556 dout(20) << __func__ << " perform garbage collection" << dendl;
14557
14558 r = _do_gc(txc, c, o,
14559 wctx,
14560 &dirty_start, &dirty_end);
14561 if (r < 0) {
14562 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14563 << dendl;
14564 goto out;
7c673cae 14565 }
14566 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14567 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 14568 }
7c673cae 14569 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14570 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14571
14572 r = 0;
14573
14574 out:
14575 return r;
14576}
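// Editor's note, illustrative only: gc.estimate() expresses the
// expected benefit as the number of allocation units that rewriting
// the overlapping compressed blobs would free; _do_gc() is invoked
// only once that count reaches bluestore_gc_enable_total_threshold,
// so small overwrites of compressed data do not trigger constant
// rewriting.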
14577
14578int BlueStore::_write(TransContext *txc,
14579 CollectionRef& c,
14580 OnodeRef& o,
14581 uint64_t offset, size_t length,
14582 bufferlist& bl,
14583 uint32_t fadvise_flags)
14584{
14585 dout(15) << __func__ << " " << c->cid << " " << o->oid
14586 << " 0x" << std::hex << offset << "~" << length << std::dec
14587 << dendl;
14588 int r = 0;
14589 if (offset + length >= OBJECT_MAX_SIZE) {
14590 r = -E2BIG;
14591 } else {
14592 _assign_nid(txc, o);
14593 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14594 txc->write_onode(o);
14595 }
14596 dout(10) << __func__ << " " << c->cid << " " << o->oid
14597 << " 0x" << std::hex << offset << "~" << length << std::dec
14598 << " = " << r << dendl;
14599 return r;
14600}
14601
14602int BlueStore::_zero(TransContext *txc,
14603 CollectionRef& c,
14604 OnodeRef& o,
14605 uint64_t offset, size_t length)
14606{
14607 dout(15) << __func__ << " " << c->cid << " " << o->oid
14608 << " 0x" << std::hex << offset << "~" << length << std::dec
14609 << dendl;
14610 int r = 0;
14611 if (offset + length >= OBJECT_MAX_SIZE) {
14612 r = -E2BIG;
14613 } else {
14614 _assign_nid(txc, o);
14615 r = _do_zero(txc, c, o, offset, length);
14616 }
14617 dout(10) << __func__ << " " << c->cid << " " << o->oid
14618 << " 0x" << std::hex << offset << "~" << length << std::dec
14619 << " = " << r << dendl;
14620 return r;
14621}
14622
14623int BlueStore::_do_zero(TransContext *txc,
14624 CollectionRef& c,
14625 OnodeRef& o,
14626 uint64_t offset, size_t length)
14627{
14628 dout(15) << __func__ << " " << c->cid << " " << o->oid
14629 << " 0x" << std::hex << offset << "~" << length << std::dec
14630 << dendl;
14631 int r = 0;
14632
81eedcae 14633 _dump_onode<30>(cct, *o);
14634
14635 WriteContext wctx;
14636 o->extent_map.fault_range(db, offset, length);
14637 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14638 o->extent_map.dirty_range(offset, length);
14639 _wctx_finish(txc, c, o, &wctx);
14640
b32b8144 14641 if (length > 0 && offset + length > o->onode.size) {
14642 o->onode.size = offset + length;
14643 dout(20) << __func__ << " extending size to " << offset + length
14644 << dendl;
14645 }
14646 txc->write_onode(o);
14647
14648 dout(10) << __func__ << " " << c->cid << " " << o->oid
14649 << " 0x" << std::hex << offset << "~" << length << std::dec
14650 << " = " << r << dendl;
14651 return r;
14652}
14653
14654void BlueStore::_do_truncate(
14655 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14656 set<SharedBlob*> *maybe_unshared_blobs)
14657{
14658 dout(15) << __func__ << " " << c->cid << " " << o->oid
14659 << " 0x" << std::hex << offset << std::dec << dendl;
14660
81eedcae 14661 _dump_onode<30>(cct, *o);
14662
14663 if (offset == o->onode.size)
31f18b77 14664 return;
7c673cae 14665
f67539c2 14666 WriteContext wctx;
7c673cae 14667 if (offset < o->onode.size) {
14668 uint64_t length = o->onode.size - offset;
14669 o->extent_map.fault_range(db, offset, length);
14670 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14671 o->extent_map.dirty_range(offset, length);
14672 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14673
14674 // if we have shards past EOF, ask for a reshard
14675 if (!o->onode.extent_map_shards.empty() &&
14676 o->onode.extent_map_shards.back().offset >= offset) {
14677 dout(10) << __func__ << " request reshard past EOF" << dendl;
14678 if (offset) {
14679 o->extent_map.request_reshard(offset - 1, offset + length);
14680 } else {
14681 o->extent_map.request_reshard(0, length);
14682 }
14683 }
14684 }
14685
14686 o->onode.size = offset;
14687
14688 if (bdev->is_smr()) {
14689 // On zoned devices, we currently support only removing an object or
14690 // truncating it to zero size, both of which fall through this code path.
14691 ceph_assert(offset == 0 && !wctx.old_extents.empty());
14692 int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
14693 txc->zoned_note_truncated_object(o, ondisk_offset);
14694 }
14695
14696 txc->write_onode(o);
14697}
14698
35e4c445 14699int BlueStore::_truncate(TransContext *txc,
14700 CollectionRef& c,
14701 OnodeRef& o,
14702 uint64_t offset)
14703{
14704 dout(15) << __func__ << " " << c->cid << " " << o->oid
14705 << " 0x" << std::hex << offset << std::dec
14706 << dendl;
14707 int r = 0;
14708 if (offset >= OBJECT_MAX_SIZE) {
14709 r = -E2BIG;
14710 } else {
14711 _do_truncate(txc, c, o, offset);
14712 }
14713 dout(10) << __func__ << " " << c->cid << " " << o->oid
14714 << " 0x" << std::hex << offset << std::dec
14715 << " = " << r << dendl;
14716 return r;
14717}
14718
14719int BlueStore::_do_remove(
14720 TransContext *txc,
14721 CollectionRef& c,
14722 OnodeRef o)
14723{
31f18b77 14724 set<SharedBlob*> maybe_unshared_blobs;
14725 bool is_gen = !o->oid.is_no_gen();
14726 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14727 if (o->onode.has_omap()) {
14728 o->flush();
9f95a23c 14729 _do_omap_clear(txc, o);
14730 }
14731 o->exists = false;
14732 string key;
14733 for (auto &s : o->extent_map.shards) {
14734 dout(20) << __func__ << " removing shard 0x" << std::hex
14735 << s.shard_info->offset << std::dec << dendl;
14736 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14737 [&](const string& final_key) {
14738 txc->t->rmkey(PREFIX_OBJ, final_key);
14739 }
14740 );
14741 }
14742 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14743 txc->note_removed_object(o);
14744 o->extent_map.clear();
14745 o->onode = bluestore_onode_t();
14746 _debug_obj_on_delete(o->oid);
31f18b77 14747
14748 if (!is_gen || maybe_unshared_blobs.empty()) {
14749 return 0;
14750 }
31f18b77 14751
14752 // see if we can unshare blobs still referenced by the head
14753 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14754 << maybe_unshared_blobs << dendl;
14755 ghobject_t nogen = o->oid;
14756 nogen.generation = ghobject_t::NO_GEN;
f67539c2 14757 OnodeRef h = c->get_onode(nogen, false);
14758
14759 if (!h || !h->exists) {
14760 return 0;
14761 }
14762
14763 dout(20) << __func__ << " checking for unshareable blobs on " << h
14764 << " " << h->oid << dendl;
14765 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14766 for (auto& e : h->extent_map.extent_map) {
14767 const bluestore_blob_t& b = e.blob->get_blob();
14768 SharedBlob *sb = e.blob->shared_blob.get();
14769 if (b.is_shared() &&
14770 sb->loaded &&
14771 maybe_unshared_blobs.count(sb)) {
14772 if (b.is_compressed()) {
14773 expect[sb].get(0, b.get_ondisk_length());
14774 } else {
14775 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14776 expect[sb].get(off, len);
14777 return 0;
14778 });
14779 }
14780 }
14781 }
31f18b77 14782
14783 vector<SharedBlob*> unshared_blobs;
14784 unshared_blobs.reserve(maybe_unshared_blobs.size());
14785 for (auto& p : expect) {
14786 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14787 if (p.first->persistent->ref_map == p.second) {
14788 SharedBlob *sb = p.first;
14789 dout(20) << __func__ << " unsharing " << *sb << dendl;
14790 unshared_blobs.push_back(sb);
14791 txc->unshare_blob(sb);
14792 uint64_t sbid = c->make_blob_unshared(sb);
14793 string key;
14794 get_shared_blob_key(sbid, &key);
14795 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14796 }
14797 }
14798
14799 if (unshared_blobs.empty()) {
14800 return 0;
14801 }
14802
14803 for (auto& e : h->extent_map.extent_map) {
14804 const bluestore_blob_t& b = e.blob->get_blob();
14805 SharedBlob *sb = e.blob->shared_blob.get();
14806 if (b.is_shared() &&
14807 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14808 sb) != unshared_blobs.end()) {
14809 dout(20) << __func__ << " unsharing " << e << dendl;
14810 bluestore_blob_t& blob = e.blob->dirty_blob();
14811 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14812 h->extent_map.dirty_range(e.logical_offset, 1);
14813 }
14814 }
14815 txc->write_onode(h);
14816
14817 return 0;
14818}
14819
14820int BlueStore::_remove(TransContext *txc,
14821 CollectionRef& c,
14822 OnodeRef &o)
14823{
14824 dout(15) << __func__ << " " << c->cid << " " << o->oid
14825 << " onode " << o.get()
14826 << " txc "<< txc << dendl;
14827
14828 auto start_time = mono_clock::now();
7c673cae 14829 int r = _do_remove(txc, c, o);
14830 log_latency_fn(
14831 __func__,
14832 l_bluestore_remove_lat,
14833 mono_clock::now() - start_time,
14834 cct->_conf->bluestore_log_op_age,
14835 [&](const ceph::timespan& lat) {
14836 ostringstream ostr;
14837 ostr << ", lat = " << timespan_str(lat)
14838 << " cid =" << c->cid
14839 << " oid =" << o->oid;
14840 return ostr.str();
14841 }
14842 );
14843
14844 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14845 return r;
14846}
14847
14848int BlueStore::_setattr(TransContext *txc,
14849 CollectionRef& c,
14850 OnodeRef& o,
14851 const string& name,
14852 bufferptr& val)
14853{
14854 dout(15) << __func__ << " " << c->cid << " " << o->oid
14855 << " " << name << " (" << val.length() << " bytes)"
14856 << dendl;
14857 int r = 0;
14858 if (val.is_partial()) {
14859 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14860 val.length());
f91f0fd5 14861 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14862 } else {
14863 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 14864 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14865 }
14866 txc->write_onode(o);
14867 dout(10) << __func__ << " " << c->cid << " " << o->oid
14868 << " " << name << " (" << val.length() << " bytes)"
14869 << " = " << r << dendl;
14870 return r;
14871}
14872
14873int BlueStore::_setattrs(TransContext *txc,
14874 CollectionRef& c,
14875 OnodeRef& o,
14876 const map<string,bufferptr>& aset)
14877{
14878 dout(15) << __func__ << " " << c->cid << " " << o->oid
14879 << " " << aset.size() << " keys"
14880 << dendl;
14881 int r = 0;
14882 for (map<string,bufferptr>::const_iterator p = aset.begin();
14883 p != aset.end(); ++p) {
14884 if (p->second.is_partial()) {
14885 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 14886 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 14887 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14888 } else {
14889 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 14890 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14891 }
14892 }
14893 txc->write_onode(o);
14894 dout(10) << __func__ << " " << c->cid << " " << o->oid
14895 << " " << aset.size() << " keys"
14896 << " = " << r << dendl;
14897 return r;
14898}
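// Editor's note, illustrative only: the is_partial() branches above
// copy attribute values that reference a slice of a larger buffer
// (typically an incoming message) into their own allocation;
// otherwise the stored bufferptr would pin the whole message in
// memory for the life of the onode. Either way the value is then
// accounted to the bluestore metadata cache mempool via
// reassign_to_mempool().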
14899
14900
14901int BlueStore::_rmattr(TransContext *txc,
14902 CollectionRef& c,
14903 OnodeRef& o,
14904 const string& name)
14905{
14906 dout(15) << __func__ << " " << c->cid << " " << o->oid
14907 << " " << name << dendl;
14908 int r = 0;
14909 auto it = o->onode.attrs.find(name.c_str());
14910 if (it == o->onode.attrs.end())
14911 goto out;
14912
14913 o->onode.attrs.erase(it);
14914 txc->write_onode(o);
14915
14916 out:
14917 dout(10) << __func__ << " " << c->cid << " " << o->oid
14918 << " " << name << " = " << r << dendl;
14919 return r;
14920}
14921
14922int BlueStore::_rmattrs(TransContext *txc,
14923 CollectionRef& c,
14924 OnodeRef& o)
14925{
14926 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14927 int r = 0;
14928
14929 if (o->onode.attrs.empty())
14930 goto out;
14931
14932 o->onode.attrs.clear();
14933 txc->write_onode(o);
14934
14935 out:
14936 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14937 return r;
14938}
14939
9f95a23c 14940void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 14941{
9f95a23c 14942 const string& omap_prefix = o->get_omap_prefix();
7c673cae 14943 string prefix, tail;
14944 o->get_omap_header(&prefix);
14945 o->get_omap_tail(&tail);
11fdf7f2 14946 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 14947 txc->t->rmkey(omap_prefix, tail);
14948 dout(20) << __func__ << " remove range start: "
14949 << pretty_binary_string(prefix) << " end: "
14950 << pretty_binary_string(tail) << dendl;
14951}
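// Editor's sketch of the per-object omap key layout this relies on
// (separator characters per Onode::get_omap_header/key/tail at this
// vintage of the code; treat the exact bytes as an assumption):
//   <encoded id> '-'             omap header
//   <encoded id> '.' <user key>  one record per user key
//   <encoded id> '~'             tail sentinel
// Those sort contiguously, so rm_range_keys(header, tail) drops the
// header and every user key in one range delete, and the tail
// sentinel needs its own rmkey().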
14952
14953int BlueStore::_omap_clear(TransContext *txc,
14954 CollectionRef& c,
14955 OnodeRef& o)
14956{
14957 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14958 int r = 0;
14959 if (o->onode.has_omap()) {
14960 o->flush();
9f95a23c 14961 _do_omap_clear(txc, o);
14962 o->onode.clear_omap_flag();
14963 txc->write_onode(o);
14964 }
14965 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14966 return r;
14967}
14968
14969int BlueStore::_omap_setkeys(TransContext *txc,
14970 CollectionRef& c,
14971 OnodeRef& o,
14972 bufferlist &bl)
14973{
14974 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14975 int r;
11fdf7f2 14976 auto p = bl.cbegin();
14977 __u32 num;
14978 if (!o->onode.has_omap()) {
11fdf7f2 14979 if (o->oid.is_pgmeta()) {
14980 o->onode.set_omap_flags_pgmeta();
14981 } else {
14982 o->onode.set_omap_flags();
11fdf7f2 14983 }
7c673cae 14984 txc->write_onode(o);
494da23a 14985
9f95a23c 14986 const string& prefix = o->get_omap_prefix();
14987 string key_tail;
14988 bufferlist tail;
9f95a23c 14989 o->get_omap_tail(&key_tail);
494da23a 14990 txc->t->set(prefix, key_tail, tail);
14991 } else {
14992 txc->note_modified_object(o);
14993 }
9f95a23c 14994 const string& prefix = o->get_omap_prefix();
7c673cae 14995 string final_key;
14996 o->get_omap_key(string(), &final_key);
14997 size_t base_key_len = final_key.size();
11fdf7f2 14998 decode(num, p);
14999 while (num--) {
15000 string key;
15001 bufferlist value;
15002 decode(key, p);
15003 decode(value, p);
9f95a23c 15004 final_key.resize(base_key_len); // keep prefix
7c673cae 15005 final_key += key;
11fdf7f2 15006 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 15007 << " <- " << key << dendl;
11fdf7f2 15008 txc->t->set(prefix, final_key, value);
15009 }
15010 r = 0;
15011 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15012 return r;
15013}
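// ------------------------------------------------------------------
// Editor's sketch, not part of the original source: the bufferlist
// decoded above is a __u32 count followed by that many (string key,
// bufferlist value) pairs, each written with the usual ceph::encode
// helpers. Building such a payload by hand (helper name hypothetical):
#include <cstdint>
#include <map>
#include <string>
#include "include/buffer.h"
#include "include/encoding.h"

static ceph::bufferlist make_omap_setkeys_payload(
  const std::map<std::string, ceph::bufferlist>& kv)
{
  ceph::bufferlist payload;
  uint32_t num = kv.size();
  ceph::encode(num, payload);          // leading count
  for (auto& [key, value] : kv) {
    ceph::encode(key, payload);        // length-prefixed string
    ceph::encode(value, payload);      // length-prefixed blob
  }
  return payload;
}
// ------------------------------------------------------------------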
15014
15015int BlueStore::_omap_setheader(TransContext *txc,
15016 CollectionRef& c,
15017 OnodeRef &o,
15018 bufferlist& bl)
15019{
15020 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15021 int r;
15022 string key;
15023 if (!o->onode.has_omap()) {
11fdf7f2 15024 if (o->oid.is_pgmeta()) {
15025 o->onode.set_omap_flags_pgmeta();
15026 } else {
15027 o->onode.set_omap_flags();
11fdf7f2 15028 }
7c673cae 15029 txc->write_onode(o);
494da23a 15030
9f95a23c 15031 const string& prefix = o->get_omap_prefix();
15032 string key_tail;
15033 bufferlist tail;
9f95a23c 15034 o->get_omap_tail(&key_tail);
494da23a 15035 txc->t->set(prefix, key_tail, tail);
15036 } else {
15037 txc->note_modified_object(o);
15038 }
15039 const string& prefix = o->get_omap_prefix();
15040 o->get_omap_header(&key);
11fdf7f2 15041 txc->t->set(prefix, key, bl);
15042 r = 0;
15043 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15044 return r;
15045}
15046
15047int BlueStore::_omap_rmkeys(TransContext *txc,
15048 CollectionRef& c,
15049 OnodeRef& o,
15050 bufferlist& bl)
15051{
15052 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15053 int r = 0;
11fdf7f2 15054 auto p = bl.cbegin();
15055 __u32 num;
15056 string final_key;
15057
15058 if (!o->onode.has_omap()) {
15059 goto out;
15060 }
11fdf7f2 15061 {
15062 const string& prefix = o->get_omap_prefix();
15063 o->get_omap_key(string(), &final_key);
15064 size_t base_key_len = final_key.size();
15065 decode(num, p);
15066 while (num--) {
15067 string key;
15068 decode(key, p);
9f95a23c 15069 final_key.resize(base_key_len); // keep prefix
15070 final_key += key;
15071 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
15072 << " <- " << key << dendl;
15073 txc->t->rmkey(prefix, final_key);
15074 }
15075 }
15076 txc->note_modified_object(o);
15077
15078 out:
15079 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15080 return r;
15081}
15082
15083int BlueStore::_omap_rmkey_range(TransContext *txc,
15084 CollectionRef& c,
15085 OnodeRef& o,
15086 const string& first, const string& last)
15087{
15088 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
15089 string key_first, key_last;
15090 int r = 0;
15091 if (!o->onode.has_omap()) {
15092 goto out;
15093 }
11fdf7f2 15094 {
9f95a23c 15095 const string& prefix = o->get_omap_prefix();
11fdf7f2 15096 o->flush();
15097 o->get_omap_key(first, &key_first);
15098 o->get_omap_key(last, &key_last);
15099 txc->t->rm_range_keys(prefix, key_first, key_last);
15100 dout(20) << __func__ << " remove range start: "
15101 << pretty_binary_string(key_first) << " end: "
15102 << pretty_binary_string(key_last) << dendl;
15103 }
15104 txc->note_modified_object(o);
15105
15106 out:
15107 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
15108 return r;
15109}
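// Editor's note, illustrative only: rm_range_keys() takes a half-open
// interval, so the contract here is that user keys in [first, last)
// are removed while `last` itself survives; a caller that wants an
// inclusive upper bound must pass the successor key as `last`.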
15110
15111int BlueStore::_set_alloc_hint(
15112 TransContext *txc,
15113 CollectionRef& c,
15114 OnodeRef& o,
15115 uint64_t expected_object_size,
15116 uint64_t expected_write_size,
15117 uint32_t flags)
15118{
15119 dout(15) << __func__ << " " << c->cid << " " << o->oid
15120 << " object_size " << expected_object_size
15121 << " write_size " << expected_write_size
15122 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15123 << dendl;
15124 int r = 0;
15125 o->onode.expected_object_size = expected_object_size;
15126 o->onode.expected_write_size = expected_write_size;
15127 o->onode.alloc_hint_flags = flags;
15128 txc->write_onode(o);
15129 dout(10) << __func__ << " " << c->cid << " " << o->oid
15130 << " object_size " << expected_object_size
15131 << " write_size " << expected_write_size
15132 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
15133 << " = " << r << dendl;
15134 return r;
15135}
15136
15137int BlueStore::_clone(TransContext *txc,
15138 CollectionRef& c,
15139 OnodeRef& oldo,
15140 OnodeRef& newo)
15141{
15142 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15143 << newo->oid << dendl;
15144 int r = 0;
15145 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
15146 derr << __func__ << " mismatched hash on " << oldo->oid
15147 << " and " << newo->oid << dendl;
15148 return -EINVAL;
15149 }
15150
15151 _assign_nid(txc, newo);
15152
15153 // clone data
15154 oldo->flush();
15155 _do_truncate(txc, c, newo, 0);
15156 if (cct->_conf->bluestore_clone_cow) {
15157 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
15158 } else {
15159 bufferlist bl;
15160 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
15161 if (r < 0)
15162 goto out;
15163 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
15164 if (r < 0)
15165 goto out;
15166 }
15167
15168 // clone attrs
15169 newo->onode.attrs = oldo->onode.attrs;
15170
15171 // clone omap
15172 if (newo->onode.has_omap()) {
15173 dout(20) << __func__ << " clearing old omap data" << dendl;
15174 newo->flush();
9f95a23c 15175 _do_omap_clear(txc, newo);
494da23a 15176 newo->onode.clear_omap_flag();
15177 }
15178 if (oldo->onode.has_omap()) {
15179 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 15180 if (newo->oid.is_pgmeta()) {
15181 newo->onode.set_omap_flags_pgmeta();
15182 } else {
15183 newo->onode.set_omap_flags();
7c673cae 15184 }
9f95a23c 15185 const string& prefix = newo->get_omap_prefix();
11fdf7f2 15186 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 15187 string head, tail;
15188 oldo->get_omap_header(&head);
15189 oldo->get_omap_tail(&tail);
15190 it->lower_bound(head);
15191 while (it->valid()) {
15192 if (it->key() >= tail) {
15193 dout(30) << __func__ << " reached tail" << dendl;
15194 break;
15195 } else {
15196 dout(30) << __func__ << " got header/data "
15197 << pretty_binary_string(it->key()) << dendl;
15198 string key;
9f95a23c 15199 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 15200 txc->t->set(prefix, key, it->value());
15201 }
15202 it->next();
15203 }
15204 string new_tail;
15205 bufferlist new_tail_value;
9f95a23c 15206 newo->get_omap_tail(&new_tail);
494da23a 15207 txc->t->set(prefix, new_tail, new_tail_value);
15208 }
15209
15210 txc->write_onode(newo);
15211 r = 0;
15212
15213 out:
15214 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15215 << newo->oid << " = " << r << dendl;
15216 return r;
15217}
15218
15219int BlueStore::_do_clone_range(
15220 TransContext *txc,
15221 CollectionRef& c,
15222 OnodeRef& oldo,
15223 OnodeRef& newo,
15224 uint64_t srcoff,
15225 uint64_t length,
15226 uint64_t dstoff)
15227{
15228 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15229 << newo->oid
15230 << " 0x" << std::hex << srcoff << "~" << length << " -> "
15231 << " 0x" << dstoff << "~" << length << std::dec << dendl;
15232 oldo->extent_map.fault_range(db, srcoff, length);
15233 newo->extent_map.fault_range(db, dstoff, length);
15234 _dump_onode<30>(cct, *oldo);
15235 _dump_onode<30>(cct, *newo);
7c673cae 15236
11fdf7f2 15237 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
15238 _dump_onode<30>(cct, *oldo);
15239 _dump_onode<30>(cct, *newo);
15240 return 0;
15241}
15242
15243int BlueStore::_clone_range(TransContext *txc,
15244 CollectionRef& c,
15245 OnodeRef& oldo,
15246 OnodeRef& newo,
15247 uint64_t srcoff, uint64_t length, uint64_t dstoff)
15248{
15249 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15250 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15251 << " to offset 0x" << dstoff << std::dec << dendl;
15252 int r = 0;
15253
15254 if (srcoff + length >= OBJECT_MAX_SIZE ||
15255 dstoff + length >= OBJECT_MAX_SIZE) {
15256 r = -E2BIG;
15257 goto out;
15258 }
15259 if (srcoff + length > oldo->onode.size) {
15260 r = -EINVAL;
15261 goto out;
15262 }
15263
15264 _assign_nid(txc, newo);
15265
15266 if (length > 0) {
15267 if (cct->_conf->bluestore_clone_cow) {
15268 _do_zero(txc, c, newo, dstoff, length);
15269 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
15270 } else {
15271 bufferlist bl;
15272 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
15273 if (r < 0)
15274 goto out;
15275 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
15276 if (r < 0)
15277 goto out;
15278 }
15279 }
15280
15281 txc->write_onode(newo);
15282 r = 0;
15283
15284 out:
15285 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15286 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
15287 << " to offset 0x" << dstoff << std::dec
15288 << " = " << r << dendl;
15289 return r;
15290}
15291
15292int BlueStore::_rename(TransContext *txc,
15293 CollectionRef& c,
15294 OnodeRef& oldo,
15295 OnodeRef& newo,
15296 const ghobject_t& new_oid)
15297{
15298 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15299 << new_oid << dendl;
15300 int r;
15301 ghobject_t old_oid = oldo->oid;
f91f0fd5 15302 mempool::bluestore_cache_meta::string new_okey;
15303
15304 if (newo) {
15305 if (newo->exists) {
15306 r = -EEXIST;
15307 goto out;
15308 }
11fdf7f2 15309 ceph_assert(txc->onodes.count(newo) == 0);
15310 }
15311
15312 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15313
15314 // rewrite shards
15315 {
15316 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15317 get_object_key(cct, new_oid, &new_okey);
15318 string key;
15319 for (auto &s : oldo->extent_map.shards) {
15320 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15321 [&](const string& final_key) {
15322 txc->t->rmkey(PREFIX_OBJ, final_key);
15323 }
15324 );
15325 s.dirty = true;
15326 }
15327 }
15328
15329 newo = oldo;
15330 txc->write_onode(newo);
15331
15332 // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
15333 // Onode in the old slot
15334 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15335 r = 0;
15336
15337 // hold a ref to new Onode in old name position, to ensure we don't drop
15338 // it from the cache before this txc commits (or else someone may come along
15339 // and read newo's metadata via the old name).
15340 txc->note_modified_object(oldo);
15341
15342 out:
15343 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15344 << new_oid << " = " << r << dendl;
15345 return r;
15346}
15347
15348// collections
15349
15350int BlueStore::_create_collection(
15351 TransContext *txc,
15352 const coll_t &cid,
15353 unsigned bits,
15354 CollectionRef *c)
15355{
15356 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15357 int r;
15358 bufferlist bl;
15359
15360 {
9f95a23c 15361 std::unique_lock l(coll_lock);
15362 if (*c) {
15363 r = -EEXIST;
15364 goto out;
15365 }
15366 auto p = new_coll_map.find(cid);
15367 ceph_assert(p != new_coll_map.end());
15368 *c = p->second;
15369 (*c)->cnode.bits = bits;
15370 coll_map[cid] = *c;
11fdf7f2 15371 new_coll_map.erase(p);
7c673cae 15372 }
11fdf7f2 15373 encode((*c)->cnode, bl);
15374 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15375 r = 0;
15376
15377 out:
15378 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15379 return r;
15380}
15381
15382int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15383 CollectionRef *c)
15384{
15385 dout(15) << __func__ << " " << cid << dendl;
15386 int r;
15387
11fdf7f2 15388 (*c)->flush_all_but_last();
7c673cae 15389 {
9f95a23c 15390 std::unique_lock l(coll_lock);
15391 if (!*c) {
15392 r = -ENOENT;
15393 goto out;
15394 }
15395 size_t nonexistent_count = 0;
11fdf7f2 15396 ceph_assert((*c)->exists);
adb31ebb 15397 if ((*c)->onode_map.map_any([&](Onode* o) {
15398 if (o->exists) {
15399 dout(1) << __func__ << " " << o->oid << " " << o
15400 << " exists in onode_map" << dendl;
7c673cae 15401 return true;
15402 }
15403 ++nonexistent_count;
15404 return false;
15405 })) {
15406 r = -ENOTEMPTY;
15407 goto out;
15408 }
15409 vector<ghobject_t> ls;
15410 ghobject_t next;
15411 // Enumerate onodes in db, up to nonexistent_count + 1,
15412 // then check if all of them are marked as non-existent.
11fdf7f2 15413 // Bypass the check if (next != ghobject_t::get_max())
7c673cae 15414 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 15415 nonexistent_count + 1, false, &ls, &next);
7c673cae 15416 if (r >= 0) {
15417 // If true, the collection has more objects than nonexistent_count,
15418 // so bypass the check.
15419 bool exists = (!next.is_max());
15420 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15421 dout(10) << __func__ << " oid " << *it << dendl;
15422 auto onode = (*c)->onode_map.lookup(*it);
15423 exists = !onode || onode->exists;
15424 if (exists) {
494da23a 15425 dout(1) << __func__ << " " << *it
15426 << " exists in db, "
15427 << (!onode ? "not present in ram" : "present in ram")
15428 << dendl;
15429 }
15430 }
15431 if (!exists) {
f67539c2 15432 _do_remove_collection(txc, c);
15433 r = 0;
15434 } else {
15435 dout(10) << __func__ << " " << cid
15436 << " is non-empty" << dendl;
f67539c2 15437 r = -ENOTEMPTY;
15438 }
15439 }
15440 }
f67539c2 15441out:
7c673cae
FG
15442 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15443 return r;
15444}
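
// A worked example of the emptiness check above (hypothetical numbers):
// suppose the in-memory onode_map holds 3 onodes, all with exists ==
// false, so nonexistent_count == 3. We then list up to 4 onodes from the
// db. If the listing is exhausted (next == ghobject_t::get_max()) and
// every returned oid maps to a cached onode with exists == false, the
// collection is provably empty and can be removed; otherwise we return
// -ENOTEMPTY.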

void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer. this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits. note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge, as the target
  // collection's sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move all cached items (onodes and referenced shared blobs) over to
  // the target collection.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits. note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // split_cache() behavior depends on the target's (d) bits, so do this
  // only after those are updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}
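
// Illustrative call (hypothetical values, but matching how callers use
// this helper): record a commit latency sample in the perf counter and
// warn if it exceeds the configured op-age threshold; a threshold of 0
// disables the warning and only the counter is updated.
//
//   log_latency("submit_transact",
//               l_bluestore_commit_lat,
//               mono_clock::now() - txc->start,
//               cct->_conf->bluestore_log_op_age);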

void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif

mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}

bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}
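
// Usage sketch (illustrative; the exact call sites live in the
// transaction submission path): the byte throttle above is always
// acquired, potentially blocking, while the deferred-bytes throttle is
// only try-acquired. On failure the caller is expected to queue the
// transaction and later call finish_start_transaction() below, which
// blocks on the deferred throttle before proceeding.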

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif

// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
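
// Worked example: a 100-byte key lands in slab 100/32 = 3, reported as
// the range "[96,128)"; a 100-byte value lands in slab 100/64 = 1,
// reported as "[64,128)".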

void BlueStore::DBHistogram::update_hist_entry(
  map<string, map<int, struct key_dist> > &key_hist,
  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
                     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}

// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPOOL_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPG_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << dendl;
}

void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For external callers.
// We use a best-effort policy, i.e. we don't care if some pinned
// onodes/data still remain in the cache after this command completes.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}
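
// Note on the two-pass denc idiom above: the first pass runs the same
// denc() calls against a plain size_t to compute an upper bound on the
// encoded size, and the second pass encodes through a contiguous
// appender sized to that bound, so the bufferlist is allocated once and
// never reallocated mid-encode.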

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!spurious_read_errors_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_SPURIOUS_READ_ERRORS",
      spurious_read_errors_alert);
  }
  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pg_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_PG_OMAP",
      no_per_pg_omap_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}
16103
9f95a23c
TL
16104void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
16105 size_t extents)
16106{
16107 alloc_stats_count++;
16108 alloc_stats_fragments += extents;
16109 alloc_stats_size += need;
16110}

void BlueStore::_record_allocation_stats()
{
  // we don't care about data consistency here; fields can be partially
  // modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  ++probe_count;

  for (ssize_t i = alloc_stats_history.size() - 1; i > 0; --i) {
    if ((probe_count % (1 << i)) == 0) {
      alloc_stats_history[i] = alloc_stats_history[i - 1];
    }
  }
  alloc_stats_history[0].swap(t0);
}
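
// Worked example of the history shift (hypothetical probe numbers): slot
// i is refreshed only when probe_count is a multiple of 2^i, so after
// probe 8 slot 3 inherits slot 2's tuple, slot 2 inherits slot 1's, and
// so on; each slot i therefore holds a sample taken roughly 2^i probes
// ago, matching the "-1, -2, -4, -8, -16" labels printed above.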

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't be called a second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
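
// Worked example: with granularity = 1 MiB, an extent starting at
// 0x300000 with length 0x200000 yields pos = 3 and end_pos = 5, so only
// the bloom filters at positions 3 and 4 are kept (and then only if they
// actually contain elements).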

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append(stringify(val));
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
{
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  return fix_onode_txn;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}

// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
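
// Worked example (hypothetical sizes): with db_total = 30 GiB, observed
// peak DB-device usage (LOG + WAL + DB, plus DB data spilled to slow) of
// 20 GiB, and db_avail4slow = 8 GiB, avail = min(8 GiB, 30 - 20 GiB) =
// 8 GiB; SLOW-level data is then steered to BDEV_DB as long as its
// current usage there stays below that 8 GiB budget.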

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    } else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
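
// Mapping summary: "db" (and anything unrecognized) -> LEVEL_DB,
// "db.slow" -> LEVEL_SLOW, "db.wal" -> LEVEL_WAL; the level is handed
// back to select_prefer_bdev() encoded as a pointer-sized opaque hint.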

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================