// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "bluestore_common.h"
#include "BlueStore.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);


// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value(for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits


/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
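
// Illustrative reading of the encoding (an assumption inferred from the flag
// definitions above, not a statement from the original source): a spanning
// blob with id 5 whose extent is contiguous with the previous one and starts
// at blob offset 0 would encode its blobid as
// (5 << BLOBID_SHIFT_BITS) | 0x8 | 0x1 | 0x2 == 0x5b.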

/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering. Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments. Instead we do additional sorting
 * where it is needed.
 */
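
// Worked example (illustrative, derived from append_escaped() below): since
// '#' is 0x23 and '~' is 0x7e, "a#b" escapes to "a#23b!" and "z~y" escapes to
// "z~7ey!"; a string with no special characters, e.g. "foo", simply becomes
// "foo!".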
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {        // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}
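
// Round-trip example (illustrative, derived from the helpers above):
// decode_escaped("a#23b!", &s) appends "a#b" to s and returns 5, the number
// of input characters consumed before the '!' terminator.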

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
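
// Example output (illustrative): the 7-byte input
// {0x00, 0x01, 0x02, 0x03, 'f', 'o', 'o'} prints as "0x00010203'foo'":
// non-printable runs are hex-dumped (a whole u32 at a time when possible)
// and printable runs are quoted.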

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end)
{
  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    end->hobj.set_bitwise_key_u32(end_hash);
    temp_end->hobj.set_bitwise_key_u32(end_hash);
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(0xffffffff);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
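
// Illustrative byte count (derived from the encoders above): 1 byte for the
// biased shard id, 8 for the biased pool id, 4 for the bit-reversed hash,
// i.e. 13 bytes; e.g. shard NO_SHARD (-1) encodes as (uint8_t)-1 + 0x80 == 0x7f.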

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // the key is malformed.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << " r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << " t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}
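
// Illustrative key layout (derived from get_object_key() above): an object
// named "foo" with an empty namespace and no locator key encodes as
//   <13-byte prefix> "!" "foo!" "=" <8-byte snap> <8-byte gen> 'o'
// where '!' terminates each escaped string.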

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
509static void rewrite_extent_shard_key(uint32_t offset, string *key)
510{
11fdf7f2
TL
511 ceph_assert(key->size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
7c673cae
FG
513 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
514}
515
516template<typename S>
517static void generate_extent_shard_key_and_apply(
518 const S& onode_key,
519 uint32_t offset,
520 string *key,
521 std::function<void(const string& final_key)> apply)
522{
523 if (key->empty()) { // make full key
11fdf7f2 524 ceph_assert(!onode_key.empty());
7c673cae
FG
525 get_extent_shard_key(onode_key, offset, key);
526 } else {
527 rewrite_extent_shard_key(offset, key);
528 }
529 apply(*key);
530}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}

template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << " shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << " " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}

template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << " attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}

// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const ceph_le64* lv = (const ceph_le64*)ldata;
    const ceph_le64* rv = (const ceph_le64*)rdata;
    ceph_le64* nv = &(ceph_le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};

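// Illustrative merge semantics (derived from merge() above): treating both
// operands as arrays of little-endian int64s, merging a delta {1, 2} into an
// existing value {10, 20} yields {11, 22}; each element is summed
// independently, which lets counters such as the statfs arrays accumulate
// without a read-modify-write cycle.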

// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

namespace {

/*
 * Due to a bug in key string encoding (see a comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may have wrong order.
 *
 * This is the iterator wrapper that fixes the keys order.
 */

class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

protected:
  KeyValueDB::Iterator m_it;
};

class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    if (!valid()) {
      return;
    }

    if (is_extent_shard_key(m_it->key())) {
      next();
      return;
    }

    m_oid = ghobject_t();
    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};
class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace

// Garbage Collector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{

  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
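// Illustrative benefit calculation (an assumed numeric example, not from the
// original source): with min_alloc_size = 0x10000, a compressed blob
// occupying 0x20000 on disk is expected to release 2 allocation units once
// fully dereferenced; if rewriting its surviving (protrusive) extents is
// expected to consume 1 new unit, the blob's benefit is 2 - 1 = 1, and its
// extents are queued for collection only when that benefit reaches
// bluestore_gc_enable_blob_threshold.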

// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }

  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << " rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};

// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted
  uint64_t buffer_bytes = 0; ///< bytes

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on the average buffer size, which is a
      // reasonable approximation (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *s) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};
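
// Behavior sketch (an illustrative summary of the code above): a buffer's
// first admission lands in warm_in; when warm_in is trimmed, the buffer's
// data is dropped but an empty marker is kept in warm_out; if that marker is
// hit again (via the discard-hint path in _add), the buffer is promoted to
// hot.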

// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}
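
// Usage note (hedged: the type string is assumed to come from configuration,
// commonly the bluestore_cache_type option): "lru" selects the plain LRU
// shard, "2q" the two-queue shard, and anything else aborts.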

// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
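
// Worked example (illustrative): given a cached buffer covering [0x0,0x1000),
// _discard(cache, 0x400, 0x400) takes the "drop middle" path: the original
// buffer is truncated to [0x0,0x400) and a new buffer holding the tail
// [0x800,0x1000) is added, so the discarded middle is no longer cached.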

void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_trim();
  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
  cache->_trim();
}

// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  cache->_trim();
  return o;
}

void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}
7c673cae
FG
1862BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1863{
7c673cae 1864 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b
WB
1865 OnodeRef o;
1866 bool hit = false;
1867
1868 {
11fdf7f2 1869 std::lock_guard l(cache->lock);
224ce89b
WB
1870 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1871 if (p == onode_map.end()) {
1872 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1873 } else {
1874 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
f6b5b4d7
TL
1875 << " " << p->second->nref
1876 << " " << p->second->cached
1877 << " " << p->second->pinned
224ce89b 1878 << dendl;
f6b5b4d7
TL
1879 // This will pin onode and implicitly touch the cache when Onode
1880 // eventually will become unpinned
224ce89b 1881 o = p->second;
f6b5b4d7
TL
1882 ceph_assert(!o->cached || o->pinned);
1883
1884 hit = true;
224ce89b
WB
1885 }
1886 }
1887
1888 if (hit) {
1889 cache->logger->inc(l_bluestore_onode_hits);
1890 } else {
7c673cae 1891 cache->logger->inc(l_bluestore_onode_misses);
7c673cae 1892 }
224ce89b 1893 return o;
7c673cae
FG
1894}
1895
void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at the old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // add at the new position and fix oid, key.
  // This pins 'o' and implicitly touches the cache when it eventually
  // becomes unpinned.
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
  cache->_trim();
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}

// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

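// put() must tolerate a racing Collection::split_cache() moving this
// SharedBlob to another collection/cache shard: the collection pointer is
// re-checked under the newly taken cache lock and the removal restarts if
// it moved. A racing lookup can also resurrect the blob, in which case
// remove() refuses and we return without deleting.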
void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

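// finish_write may race with split_cache() switching this blob's collection
// to a different BufferCacheShard; the loop below re-reads coll->cache
// after taking the lock and retries until the shard is stable.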
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}

// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}

// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

7c673cae
FG
2105ostream& operator<<(ostream& out, const BlueStore::Blob& b)
2106{
2107 out << "Blob(" << &b;
2108 if (b.is_spanning()) {
2109 out << " spanning " << b.id;
2110 }
35e4c445
FG
2111 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
2112 if (b.shared_blob) {
2113 out << " " << *b.shared_blob;
2114 } else {
2115 out << " (shared_blob=NULL)";
2116 }
2117 out << ")";
7c673cae
FG
2118 return out;
2119}
2120
2121void BlueStore::Blob::discard_unallocated(Collection *coll)
2122{
224ce89b 2123 if (get_blob().is_shared()) {
7c673cae
FG
2124 return;
2125 }
224ce89b 2126 if (get_blob().is_compressed()) {
7c673cae
FG
2127 bool discard = false;
2128 bool all_invalid = true;
224ce89b 2129 for (auto e : get_blob().get_extents()) {
7c673cae
FG
2130 if (!e.is_valid()) {
2131 discard = true;
2132 } else {
2133 all_invalid = false;
2134 }
2135 }
11fdf7f2 2136 ceph_assert(discard == all_invalid); // in case of compressed blob all
7c673cae
FG
2137 // or none pextents are invalid.
2138 if (discard) {
224ce89b
WB
2139 shared_blob->bc.discard(shared_blob->get_cache(), 0,
2140 get_blob().get_logical_length());
7c673cae
FG
2141 }
2142 } else {
2143 size_t pos = 0;
224ce89b 2144 for (auto e : get_blob().get_extents()) {
7c673cae 2145 if (!e.is_valid()) {
9f95a23c
TL
2146 dout(20) << __func__ << " 0x" << std::hex << pos
2147 << "~" << e.length
2148 << std::dec << dendl;
7c673cae
FG
2149 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
2150 }
2151 pos += e.length;
2152 }
224ce89b
WB
2153 if (get_blob().can_prune_tail()) {
2154 dirty_blob().prune_tail();
2155 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 2156 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
2157 }
2158 }
2159}
2160
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // The caller has to initialize the Blob's logical length before
  // incrementing references. Otherwise we can neither determine the
  // required number of counters for per-au tracking nor obtain the
  // min_release_size for single-counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}

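// can_reuse_blob decides whether a write of *length0 bytes at b_offset can
// land in this blob instead of allocating a new one. It may shrink *length0
// (leaving the remainder for a follow-up blob) when growing the blob to the
// full request would exceed target_blob_size, and it extends the blob's
// tail bookkeeping when the answer is yes.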
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently, for the sake of simplicity, we omit blob reuse if the data
  // is unaligned with the csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than the current blob length
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data lies entirely beyond the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // unable to decrease the provided length to fit into target_blob_size
    if (overflow >= length) {
      return false;
    }

    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}

void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif

// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = b->get_referenced_bytes() == 0;
  return oe;
}

// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}

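// dup clones the extents of [srcoff, srcoff+length) from oldo into newo at
// dstoff without copying data: each blob touched by the range is converted
// to a shared blob (if it is not one already) so both onodes reference the
// same physical extents. The bluestore_debug_inject_bug21040 branch appears
// to deliberately reproduce the legacy (pre-fix) dirty-range tracking for
// testing purposes.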
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of the new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; // used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << " encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with the next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}

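// Spanning blob ids are small non-negative integers. The common case below
// takes max+1; on signed overflow it falls back to probing from a random
// start, wrapping around until a free id is found, and aborts only if the
// entire id space is exhausted.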
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available
  if (bid >= 0)
    return bid;
  // find the next unused bid
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}

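// reshard rewrites the extent map shards covering
// [needs_reshard_begin, needs_reshard_end): it faults in the affected
// range, deletes the old shard keys, sizes new shards toward
// bluestore_extent_map_shard_target_size using a per-extent size estimate,
// and finally re-evaluates which blobs must be marked spanning (or can be
// split or un-spanned) against the new shard boundaries.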
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning. We prefer to cut the blob if we can. Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
                   (oid_slot &&
                    (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}

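// encode_some serializes the extents of [offset, offset+length) into 'bl'.
// Each extent is prefixed by a varint blob id whose low bits carry the
// BLOBID_FLAG_{CONTIGUOUS,ZEROOFFSET,SAMELENGTH,SPANNING} flags so that an
// offset or length matching the previous extent can be elided; non-spanning
// blobs are encoded inline the first time they appear. Returns true when a
// blob escapes the range and a reshard is required instead.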
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1; // so it is always non-zero
        include_blob = true;
        blobid = 0; // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}

unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << " getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build the ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}

void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}

void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}

void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}

void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}

int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}

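// punch_hole carves [offset, offset+length) out of the logical extent map,
// collecting the dereferenced pieces in old_extents so the caller can
// release the backing space later. An extent that spans the whole hole is
// split and its middle dereferenced; one that starts before the hole loses
// its tail; one fully inside is dereferenced whole; one that extends past
// the end loses its head.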
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}

BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need a completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent putting a reused blob into
  // the old_extents list if we overwrite the blob completely.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}

BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}

// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about the Onode's ref counter is that we do an additional
// increment when a newly pinned instance is detected, and -1 on unpin.
// This prevents a conflict with a delete call (when nref == 0). The latter
// might happen while a thread is still in the unpin() function (e.g.,
// waiting for lock acquisition) after nref has already been decremented,
// while another 'putting' thread releases the instance.
//
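// A rough sketch of the intended nref lifecycle (illustrative, not
// normative): a lookup takes nref from 1 to 2, pinning adds the extra
// reference (nref == 3); dropping the user ref goes 3 -> 2, unpinning
// removes the pin ref (nref == 1, still cached), and the final put()
// deletes the instance at nref == 0.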
void BlueStore::Onode::get() {
  if (++nref == 2) {
    c->get_onode_cache()->pin(this, [&]() {
      bool was_pinned = pinned;
      pinned = nref >= 2;
      // additional increment for newly pinned instance
      bool r = !was_pinned && pinned;
      if (r) {
        ++nref;
      }
      return cached && r;
    });
  }
}
void BlueStore::Onode::put() {
  if (--nref == 2) {
    c->get_onode_cache()->unpin(this, [&]() {
      bool was_pinned = pinned;
      pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                   // +1 due to pinned state
      bool r = was_pinned && !pinned;
      // additional decrement for newly unpinned instance
      if (r) {
        --nref;
      }
      return cached && r;
    });
  }
  if (nref == 0) {
    delete this;
  }
}

BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}


const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

3611// '-' < '.' < '~'
3612
3613void BlueStore::Onode::get_omap_header(string *out)
3614{
3615 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3616 _key_encode_u64(c->pool(), out);
3617 }
3618 _key_encode_u64(onode.nid, out);
3619 out->push_back('-');
3620}
3621
3622void BlueStore::Onode::get_omap_key(const string& key, string *out)
3623{
3624 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3625 _key_encode_u64(c->pool(), out);
3626 }
3627 _key_encode_u64(onode.nid, out);
3628 out->push_back('.');
3629 out->append(key);
3630}
3631
3632void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3633{
3634 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3635 _key_encode_u64(c->pool(), out);
3636 }
3637 _key_encode_u64(onode.nid, out);
3638 out->append(old.c_str() + out->length(), old.size() - out->length());
3639}
3640
3641void BlueStore::Onode::get_omap_tail(string *out)
3642{
3643 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3644 _key_encode_u64(c->pool(), out);
3645 }
3646 _key_encode_u64(onode.nid, out);
3647 out->push_back('~');
3648}
3649
3650void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3651{
3652 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3653 *user_key = key.substr(sizeof(uint64_t)*2 + 1);
3654 } else {
3655 *user_key = key.substr(sizeof(uint64_t) + 1);
3656 }
3657}
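
// Worked example of the resulting key layout (hypothetical values, for
// illustration only). For a non-pgmeta, per-pool onode with pool 3 and
// nid 0x12, and a user key "foo":
//
//   header: <3:u64><0x12:u64>'-'
//   key:    <3:u64><0x12:u64>'.' "foo"
//   tail:   <3:u64><0x12:u64>'~'
//
// Because '-' < '.' < '~' in ASCII, every user key for the onode sorts
// strictly between its header and tail, so a range scan from header to
// tail enumerates exactly this object's omap; decode_omap_key() simply
// strips the fixed-width prefix (two u64s + separator here, one u64 + 1
// in the legacy form).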


// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
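
// Worked example (hypothetical numbers): with min_alloc_size = 0x1000, a
// queued write at logical_offset 0x1800 of length0 0x400 is widened to the
// unit-aligned range [0x1000, 0x2000). A new write covering [0x1000, 0x2000)
// on the same blob then satisfies (loffs <= loffs2 && loffs_end > loffs2)
// and is reported as a conflict, even though the raw byte ranges themselves
// do not overlap.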

// =======================================================

// DeferredBatch
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}

void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << " keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << " keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << " truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << " drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
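
// Worked example (hypothetical numbers): if iomap holds one entry at
// offset 0x0 with a 0x3000-byte bufferlist for seq 5, then
// _discard(cct, 0x1000, 0x1000) keeps a 0x1000-byte head at 0x0, inserts
// a 0x1000-byte tail entry at 0x2000 (still seq 5), and debits 0x1000
// from seq_bytes[5], so the per-seq byte accounting that _audit() checks
// stays consistent.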

void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0; // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}


// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}

void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {

    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}

void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}

BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
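
// Usage sketch (hypothetical caller, for illustration only): a read path
// would typically do
//
//   std::shared_lock l(c->lock);
//   OnodeRef o = c->get_onode(oid, false);  // may return null: no such object
//
// while a write path takes the wlock and passes create=true, getting a
// fresh in-memory Onode when the PREFIX_OBJ key is absent.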

void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard l(cache->lock, std::adopt_lock);
  std::lock_guard l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (get_onode_cache() != dest->get_onode_cache()) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers. cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}


// =======================================================

// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }
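
  // Worked example (hypothetical values): with osd_memory_target = 4 GiB,
  // osd_memory_expected_fragmentation = 0.15, osd_memory_base = 768 MiB and
  // osd_memory_cache_min = 128 MiB, ltarget = 0.85 * 4 GiB ~= 3.4 GiB, which
  // exceeds base + min, so max = 3.4 GiB - 768 MiB ~= 2.65 GiB of cache.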

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  return NULL;
}

void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
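
// Worked example for the shard math above (hypothetical values): with
// meta_alloc = 1 GiB split across 8 onode shards and
// get_bytes_per_onode() ~= 4 KiB, each shard gets a cap of roughly
// (1 GiB / 8) / 4 KiB ~= 32768 onodes; buffer shards are capped in bytes,
// simply data_alloc / buffer_shards.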

void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}

// =======================================================

// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
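
// Usage sketch (hypothetical caller, for illustration only): the standard
// iteration pattern over an object's omap via this implementation is
//
//   it->seek_to_first();
//   while (it->valid()) {
//     process(it->key(), it->value());   // process() is a stand-in
//     it->next();
//   }
//
// valid() bounds the scan by comparing the raw key against 'tail', so the
// loop never walks into the next object's keys.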


// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct


static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
                     const string& path,
                     uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
    _check_no_per_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
           << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  config_changed++;
  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
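
// Worked example (hypothetical ratios): with cache_size = 3 GiB,
// cache_meta_ratio = 0.4 and cache_kv_ratio = 0.4, the checks above pass
// (0.4 + 0.4 <= 1.0) and cache_data_ratio becomes 0.2, i.e. roughly
// 1.2 GiB each for metadata and kv and 0.6 GiB for data buffers before
// any autotuning adjusts the split.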

int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_committing state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
            "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << ": " << e.what()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
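
// On-disk layout implied by the two functions above: the first
// BDEV_LABEL_BLOCK_SIZE bytes of the block device hold the encoded
// bluestore_bdev_label_t, followed by a crc32c (seeded with -1) of the
// encoded label, zero-padded to the full block:
//
//   [ encode(label) | crc32c(encode(label)) | 0x00 ... ]  <- one label block
//
// _read_bdev_label() recomputes the crc over exactly the decoded span and
// rejects the label with -EIO on mismatch.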
5070
5071int BlueStore::_check_or_set_bdev_label(
5072 string path, uint64_t size, string desc, bool create)
5073{
5074 bluestore_bdev_label_t label;
5075 if (create) {
5076 label.osd_uuid = fsid;
5077 label.size = size;
5078 label.btime = ceph_clock_now();
5079 label.description = desc;
3efd9988 5080 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
5081 if (r < 0)
5082 return r;
5083 } else {
5084 int r = _read_bdev_label(cct, path, &label);
5085 if (r < 0)
5086 return r;
31f18b77
FG
5087 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
5088 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5089 << " and fsid " << fsid << " check bypassed" << dendl;
1911f103 5090 } else if (label.osd_uuid != fsid) {
7c673cae
FG
5091 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
5092 << " does not match our fsid " << fsid << dendl;
5093 return -EIO;
5094 }
5095 }
5096 return 0;
5097}
5098
5099void BlueStore::_set_alloc_sizes(void)
5100{
7c673cae
FG
5101 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
5102
5103 if (cct->_conf->bluestore_prefer_deferred_size) {
5104 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
5105 } else {
11fdf7f2 5106 ceph_assert(bdev);
9f95a23c 5107 if (_use_rotational_settings()) {
7c673cae
FG
5108 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
5109 } else {
5110 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
5111 }
5112 }
5113
5114 if (cct->_conf->bluestore_deferred_batch_ops) {
5115 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
5116 } else {
11fdf7f2 5117 ceph_assert(bdev);
9f95a23c 5118 if (_use_rotational_settings()) {
7c673cae
FG
5119 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
5120 } else {
5121 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
5122 }
5123 }
5124
5125 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 5126 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
5127 << " max_alloc_size 0x" << std::hex << max_alloc_size
5128 << " prefer_deferred_size 0x" << prefer_deferred_size
5129 << std::dec
5130 << " deferred_batch_ops " << deferred_batch_ops
5131 << dendl;
5132}
5133
5134int BlueStore::_open_bdev(bool create)
5135{
11fdf7f2 5136 ceph_assert(bdev == NULL);
7c673cae 5137 string p = path + "/block";
11fdf7f2 5138 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
5139 int r = bdev->open(p);
5140 if (r < 0)
5141 goto fail;
5142
11fdf7f2
TL
5143 if (create && cct->_conf->bdev_enable_discard) {
5144 bdev->discard(0, bdev->get_size());
5145 }
5146
7c673cae
FG
5147 if (bdev->supported_bdev_label()) {
5148 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
5149 if (r < 0)
5150 goto fail_close;
5151 }
5152
5153 // initialize global block parameters
5154 block_size = bdev->get_block_size();
5155 block_mask = ~(block_size - 1);
5156 block_size_order = ctz(block_size);
11fdf7f2 5157 ceph_assert(block_size == 1u << block_size_order);
9f95a23c 5158 _set_max_defer_interval();
224ce89b
WB
5159 // and set cache_size based on device type
5160 r = _set_cache_sizes();
5161 if (r < 0) {
5162 goto fail_close;
5163 }
7c673cae
FG
5164 return 0;
5165
5166 fail_close:
5167 bdev->close();
5168 fail:
5169 delete bdev;
5170 bdev = NULL;
5171 return r;
5172}
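The global block parameters above depend on block_size being a power of two: the mask clears the low bits of an offset and ctz yields the shift order. A self-contained check with an illustrative 4 KiB block size (__builtin_ctzll stands in for the ctz() helper):

  #include <cassert>
  #include <cstdint>

  int main()
  {
    uint64_t block_size = 4096;                 // must be a power of two
    uint64_t block_mask = ~(block_size - 1);    // 0xffff'ffff'ffff'f000
    int order = __builtin_ctzll(block_size);    // 12, i.e. log2(4096)
    assert(block_size == (1u << order));
    assert((0x12345 & block_mask) == 0x12000);  // rounds an offset down
    return 0;
  }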
5173
11fdf7f2
TL
5174void BlueStore::_validate_bdev()
5175{
5176 ceph_assert(bdev);
 5177 ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
5178 uint64_t dev_size = bdev->get_size();
5179 if (dev_size <
5180 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
5181 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
5182 << " is too small, disable bluestore_bluefs_min for now"
5183 << dendl;
5184 ceph_assert(dev_size >= _get_ondisk_reserved());
5185
5186 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
5187 ceph_assert(r == 0);
5188 }
5189}
5190
7c673cae
FG
5191void BlueStore::_close_bdev()
5192{
11fdf7f2 5193 ceph_assert(bdev);
7c673cae
FG
5194 bdev->close();
5195 delete bdev;
5196 bdev = NULL;
5197}
5198
1911f103 5199int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
7c673cae 5200{
1911f103
TL
5201 int r;
5202 bluestore_bdev_label_t label;
5203
11fdf7f2
TL
5204 ceph_assert(fm == NULL);
5205 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
5206 ceph_assert(fm);
5207 if (t) {
5208 // create mode. initialize freespace
7c673cae 5209 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
5210 {
5211 bufferlist bl;
5212 bl.append(freelist_type);
5213 t->set(PREFIX_SUPER, "freelist_type", bl);
5214 }
b32b8144
FG
 5215 // allocating in units smaller than the bdev block size
 5216 // seems to be a bad idea.
11fdf7f2 5217 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
b32b8144 5218 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
5219
5220 // allocate superblock reserved space. note that we do not mark
5221 // bluefs space as allocated in the freelist; we instead rely on
5222 // bluefs_extents.
11fdf7f2 5223 auto reserved = _get_ondisk_reserved();
3efd9988 5224 fm->allocate(0, reserved, t);
7c673cae 5225
7c673cae 5226 if (cct->_conf->bluestore_bluefs) {
11fdf7f2 5227 ceph_assert(bluefs_extents.num_intervals() == 1);
7c673cae 5228 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
11fdf7f2 5229 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
5230 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
5231 << " for bluefs" << dendl;
7c673cae
FG
5232 }
5233
5234 if (cct->_conf->bluestore_debug_prefill > 0) {
5235 uint64_t end = bdev->get_size() - reserved;
5236 dout(1) << __func__ << " pre-fragmenting freespace, using "
5237 << cct->_conf->bluestore_debug_prefill << " with max free extent "
5238 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 5239 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
5240 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
5241 float r = cct->_conf->bluestore_debug_prefill;
5242 r /= 1.0 - r;
5243 bool stop = false;
5244
5245 while (!stop && start < end) {
5246 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
5247 if (start + l > end) {
5248 l = end - start;
11fdf7f2 5249 l = p2align(l, min_alloc_size);
7c673cae 5250 }
11fdf7f2 5251 ceph_assert(start + l <= end);
7c673cae
FG
5252
5253 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 5254 u = p2roundup(u, min_alloc_size);
7c673cae
FG
5255 if (start + l + u > end) {
5256 u = end - (start + l);
5257 // trim to align so we don't overflow again
11fdf7f2 5258 u = p2align(u, min_alloc_size);
7c673cae
FG
5259 stop = true;
5260 }
11fdf7f2 5261 ceph_assert(start + l + u <= end);
7c673cae 5262
11fdf7f2 5263 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
5264 << " use 0x" << u << std::dec << dendl;
5265
5266 if (u == 0) {
5267 // break if u has been trimmed to nothing
5268 break;
5269 }
5270
5271 fm->allocate(start + l, u, t);
5272 start += l + u;
5273 }
5274 }
1911f103
TL
5275 r = _write_out_fm_meta(0, false, &label);
5276 ceph_assert(r == 0);
5277 } else {
5278 string p = path + "/block";
5279 r = _read_bdev_label(cct, p, &label);
5280 if (r < 0) {
5281 derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
5282 delete fm;
5283 fm = NULL;
5284 return r;
5285 }
7c673cae 5286 }
1911f103 5287 r = fm->init(label, db, read_only);
7c673cae
FG
5288 if (r < 0) {
5289 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5290 delete fm;
5291 fm = NULL;
5292 return r;
5293 }
81eedcae
TL
 5294 // if the space size tracked by the freelist manager is higher than the
 5295 // actual device size, we can hit an out-of-space allocation, which will
 5296 // result in data loss and/or assertions.
 5297 // Probably the user altered the device size somehow.
 5298 // The only fix for now is to redeploy the OSD.
5299 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5300 ostringstream ss;
5301 ss << "slow device size mismatch detected, "
5302 << " fm size(" << fm->get_size()
5303 << ") > slow device size(" << bdev->get_size()
5304 << "), Please stop using this OSD as it might cause data loss.";
5305 _set_disk_size_mismatch_alert(ss.str());
5306 }
7c673cae
FG
5307 return 0;
5308}
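The prefill loop above alternates a random free extent of length l with a used extent u ≈ 1 + r·l, where r = f/(1-f) for the target used fraction f, so the device converges on roughly f used overall. A standalone check of that arithmetic with a fixed extent length instead of rand() (values are illustrative only):

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    double f = 0.25;           // target used fraction (bluestore_debug_prefill)
    double r = f / (1.0 - f);  // used/free ratio, as computed in _open_fm()
    uint64_t l = 16 * 4096;    // one free extent
    uint64_t u = 1 + (uint64_t)(r * (double)l);  // the paired used extent
    double achieved = (double)u / (double)(l + u);
    std::printf("free %llu used %llu -> used fraction %.3f (target %.3f)\n",
                (unsigned long long)l, (unsigned long long)u, achieved, f);
    return 0;  // prints: ... used fraction 0.250 (target 0.250)
  }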
5309
5310void BlueStore::_close_fm()
5311{
5312 dout(10) << __func__ << dendl;
11fdf7f2 5313 ceph_assert(fm);
7c673cae
FG
5314 fm->shutdown();
5315 delete fm;
5316 fm = NULL;
5317}
5318
1911f103
TL
5319int BlueStore::_write_out_fm_meta(uint64_t target_size,
5320 bool update_root_size,
5321 bluestore_bdev_label_t* res_label)
5322{
5323 string p = path + "/block";
5324
5325 std::vector<std::pair<string, string>> fm_meta;
5326 fm->get_meta(target_size, &fm_meta);
5327
5328 bluestore_bdev_label_t label;
5329 int r = _read_bdev_label(cct, p, &label);
5330 if (r < 0)
5331 return r;
5332
5333 for (auto& m : fm_meta) {
5334 label.meta[m.first] = m.second;
5335 }
5336 if (update_root_size) {
5337 label.size = target_size;
5338 }
5339 r = _write_bdev_label(cct, p, label);
5340 if (res_label) {
5341 *res_label = label;
5342 }
5343
5344 return r;
5345}
5346
7c673cae
FG
5347int BlueStore::_open_alloc()
5348{
11fdf7f2
TL
5349 ceph_assert(alloc == NULL);
5350 ceph_assert(bdev->get_size());
5351
5352 if (bluefs) {
5353 bluefs_extents.clear();
9f95a23c
TL
5354 auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
5355 &bluefs_extents);
11fdf7f2
TL
5356 if (r < 0) {
5357 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
5358 << cpp_strerror(r) << dendl;
5359
5360 return r;
5361 }
5362 dout(10) << __func__ << " bluefs extents 0x"
5363 << std::hex << bluefs_extents << std::dec
5364 << dendl;
5365 }
5366
7c673cae
FG
5367 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
5368 bdev->get_size(),
eafe8130 5369 min_alloc_size, "block");
7c673cae
FG
5370 if (!alloc) {
 5371 lderr(cct) << __func__ << " failed to create allocator, unknown alloc type "
5372 << cct->_conf->bluestore_allocator
5373 << dendl;
5374 return -EINVAL;
5375 }
5376
5377 uint64_t num = 0, bytes = 0;
5378
5379 dout(1) << __func__ << " opening allocation metadata" << dendl;
5380 // initialize from freelist
5381 fm->enumerate_reset();
5382 uint64_t offset, length;
11fdf7f2 5383 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
5384 alloc->init_add_free(offset, length);
5385 ++num;
5386 bytes += length;
5387 }
224ce89b 5388 fm->enumerate_reset();
7c673cae
FG
5389
5390 // also mark bluefs space as allocated
5391 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5392 alloc->init_rm_free(e.get_start(), e.get_len());
5393 }
7c673cae 5394
1911f103
TL
5395 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
5396 << " in " << num << " extents"
5397 << " available " << byte_u_t(alloc->get_free())
5398 << dendl;
5399
7c673cae
FG
5400 return 0;
5401}
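The enumerate_reset()/enumerate_next() loop above is how the in-memory allocator is primed from the persistent freelist at every open. A sketch of the same pattern against hypothetical minimal stand-ins for FreelistManager and Allocator:

  #include <cstddef>
  #include <cstdint>
  #include <utility>
  #include <vector>

  struct MiniFreelist {              // stand-in for FreelistManager
    std::vector<std::pair<uint64_t,uint64_t>> extents;  // (offset, length)
    size_t pos = 0;
    void enumerate_reset() { pos = 0; }
    bool enumerate_next(uint64_t* off, uint64_t* len) {
      if (pos >= extents.size()) return false;
      *off = extents[pos].first;
      *len = extents[pos].second;
      ++pos;
      return true;
    }
  };

  struct MiniAlloc {                 // stand-in for Allocator
    uint64_t free_bytes = 0;
    void init_add_free(uint64_t /*off*/, uint64_t len) { free_bytes += len; }
  };

  uint64_t prime_allocator(MiniFreelist& fm, MiniAlloc& alloc)
  {
    fm.enumerate_reset();
    uint64_t off, len, num = 0;
    while (fm.enumerate_next(&off, &len)) {
      alloc.init_add_free(off, len);
      ++num;
    }
    fm.enumerate_reset();  // leave the cursor rewound, as _open_alloc() does
    return num;
  }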
5402
5403void BlueStore::_close_alloc()
5404{
11fdf7f2
TL
5405 ceph_assert(bdev);
5406 bdev->discard_drain();
5407
5408 ceph_assert(alloc);
7c673cae
FG
5409 alloc->shutdown();
5410 delete alloc;
5411 alloc = NULL;
11fdf7f2 5412 bluefs_extents.clear();
7c673cae
FG
5413}
5414
5415int BlueStore::_open_fsid(bool create)
5416{
11fdf7f2 5417 ceph_assert(fsid_fd < 0);
91327a77 5418 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
5419 if (create)
5420 flags |= O_CREAT;
5421 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5422 if (fsid_fd < 0) {
5423 int err = -errno;
5424 derr << __func__ << " " << cpp_strerror(err) << dendl;
5425 return err;
5426 }
5427 return 0;
5428}
5429
5430int BlueStore::_read_fsid(uuid_d *uuid)
5431{
5432 char fsid_str[40];
5433 memset(fsid_str, 0, sizeof(fsid_str));
5434 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5435 if (ret < 0) {
5436 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5437 return ret;
5438 }
5439 if (ret > 36)
5440 fsid_str[36] = 0;
5441 else
5442 fsid_str[ret] = 0;
5443 if (!uuid->parse(fsid_str)) {
5444 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5445 return -EINVAL;
5446 }
5447 return 0;
5448}
5449
5450int BlueStore::_write_fsid()
5451{
5452 int r = ::ftruncate(fsid_fd, 0);
5453 if (r < 0) {
5454 r = -errno;
5455 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5456 return r;
5457 }
5458 string str = stringify(fsid) + "\n";
5459 r = safe_write(fsid_fd, str.c_str(), str.length());
5460 if (r < 0) {
5461 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5462 return r;
5463 }
5464 r = ::fsync(fsid_fd);
5465 if (r < 0) {
5466 r = -errno;
5467 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5468 return r;
5469 }
5470 return 0;
5471}
5472
5473void BlueStore::_close_fsid()
5474{
5475 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5476 fsid_fd = -1;
5477}
5478
5479int BlueStore::_lock_fsid()
5480{
5481 struct flock l;
5482 memset(&l, 0, sizeof(l));
5483 l.l_type = F_WRLCK;
5484 l.l_whence = SEEK_SET;
5485 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5486 if (r < 0) {
5487 int err = errno;
5488 derr << __func__ << " failed to lock " << path << "/fsid"
5489 << " (is another ceph-osd still running?)"
5490 << cpp_strerror(err) << dendl;
5491 return -err;
5492 }
5493 return 0;
5494}
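_lock_fsid() takes a non-blocking advisory write lock over the whole fsid file; a second process attempting the same lock fails immediately, which is how "is another ceph-osd still running?" is detected. A minimal standalone version of the same fcntl() call:

  #include <errno.h>
  #include <fcntl.h>
  #include <string.h>

  // Try to take an exclusive advisory lock on an already-open fd.
  // Returns 0 on success, -errno (typically -EAGAIN or -EACCES)
  // if another process holds the lock.
  int try_exclusive_lock(int fd)
  {
    struct flock l;
    memset(&l, 0, sizeof(l));
    l.l_type = F_WRLCK;     // write (exclusive) lock
    l.l_whence = SEEK_SET;  // l_start/l_len stay 0: lock the whole file
    if (::fcntl(fd, F_SETLK, &l) < 0)
      return -errno;
    return 0;
  }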
5495
31f18b77
FG
5496bool BlueStore::is_rotational()
5497{
5498 if (bdev) {
5499 return bdev->is_rotational();
5500 }
5501
5502 bool rotational = true;
5503 int r = _open_path();
5504 if (r < 0)
5505 goto out;
5506 r = _open_fsid(false);
5507 if (r < 0)
5508 goto out_path;
5509 r = _read_fsid(&fsid);
5510 if (r < 0)
5511 goto out_fsid;
5512 r = _lock_fsid();
5513 if (r < 0)
5514 goto out_fsid;
5515 r = _open_bdev(false);
5516 if (r < 0)
5517 goto out_fsid;
5518 rotational = bdev->is_rotational();
5519 _close_bdev();
5520 out_fsid:
5521 _close_fsid();
5522 out_path:
5523 _close_path();
5524 out:
5525 return rotational;
5526}
5527
d2e6a577
FG
5528bool BlueStore::is_journal_rotational()
5529{
5530 if (!bluefs) {
5531 dout(5) << __func__ << " bluefs disabled, default to store media type"
5532 << dendl;
5533 return is_rotational();
5534 }
5535 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5536 return bluefs->wal_is_rotational();
5537}
5538
9f95a23c
TL
5539bool BlueStore::_use_rotational_settings()
5540{
5541 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5542 return true;
5543 }
5544 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5545 return false;
5546 }
5547 return bdev->is_rotational();
5548}
5549
7c673cae
FG
5550bool BlueStore::test_mount_in_use()
5551{
5552 // most error conditions mean the mount is not in use (e.g., because
5553 // it doesn't exist). only if we fail to lock do we conclude it is
5554 // in use.
5555 bool ret = false;
5556 int r = _open_path();
5557 if (r < 0)
5558 return false;
5559 r = _open_fsid(false);
5560 if (r < 0)
5561 goto out_path;
5562 r = _lock_fsid();
5563 if (r < 0)
5564 ret = true; // if we can't lock, it is in use
5565 _close_fsid();
5566 out_path:
5567 _close_path();
5568 return ret;
5569}
5570
11fdf7f2 5571int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
5572{
5573 int r;
11fdf7f2 5574 bluefs = new BlueFS(cct);
7c673cae 5575
11fdf7f2
TL
5576 string bfn;
5577 struct stat st;
5578
5579 bfn = path + "/block.db";
5580 if (::stat(bfn.c_str(), &st) == 0) {
eafe8130
TL
5581 r = bluefs->add_block_device(
5582 BlueFS::BDEV_DB, bfn,
5583 create && cct->_conf->bdev_enable_discard);
7c673cae 5584 if (r < 0) {
11fdf7f2
TL
5585 derr << __func__ << " add block device(" << bfn << ") returned: "
5586 << cpp_strerror(r) << dendl;
5587 goto free_bluefs;
7c673cae 5588 }
7c673cae 5589
11fdf7f2
TL
5590 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5591 r = _check_or_set_bdev_label(
5592 bfn,
5593 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5594 "bluefs db", create);
5595 if (r < 0) {
5596 derr << __func__
5597 << " check block device(" << bfn << ") label returned: "
5598 << cpp_strerror(r) << dendl;
5599 goto free_bluefs;
5600 }
7c673cae 5601 }
11fdf7f2
TL
5602 if (create) {
5603 bluefs->add_block_extent(
5604 BlueFS::BDEV_DB,
5605 SUPER_RESERVED,
5606 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5607 }
9f95a23c
TL
5608 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5609 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
5610 } else {
5611 r = -errno;
5612 if (::lstat(bfn.c_str(), &st) == -1) {
5613 r = 0;
9f95a23c 5614 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7c673cae 5615 } else {
11fdf7f2
TL
5616 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5617 << cpp_strerror(r) << dendl;
5618 goto free_bluefs;
7c673cae
FG
5619 }
5620 }
7c673cae 5621
11fdf7f2
TL
5622 // shared device
5623 bfn = path + "/block";
5624 // never trim here
9f95a23c 5625 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
11fdf7f2
TL
5626 true /* shared with bluestore */);
5627 if (r < 0) {
5628 derr << __func__ << " add block device(" << bfn << ") returned: "
5629 << cpp_strerror(r) << dendl;
5630 goto free_bluefs;
5631 }
5632 if (create) {
5633 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5634 uint64_t initial =
5635 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5636 cct->_conf->bluestore_bluefs_gift_ratio);
5637 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
eafe8130
TL
5638 uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
5639 if (alloc_size % min_alloc_size) {
5640 derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
5641 << alloc_size << " is not a multiple of "
11fdf7f2
TL
5642 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5643 r = -EINVAL;
5644 goto free_bluefs;
7c673cae 5645 }
11fdf7f2 5646 // align to bluefs's alloc_size
eafe8130 5647 initial = p2roundup(initial, alloc_size);
11fdf7f2 5648 // put bluefs in the middle of the device in case it is an HDD
eafe8130 5649 uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
11fdf7f2 5650 // avoid overwriting the superblock
eafe8130
TL
5651 start = std::max(alloc_size, start);
5652 ceph_assert(start >=_get_ondisk_reserved());
7c673cae 5653
9f95a23c 5654 bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
11fdf7f2
TL
5655 bluefs_extents.insert(start, initial);
5656 ++out_of_sync_fm;
5657 }
5658
5659 bfn = path + "/block.wal";
5660 if (::stat(bfn.c_str(), &st) == 0) {
5661 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
eafe8130 5662 create && cct->_conf->bdev_enable_discard);
11fdf7f2
TL
5663 if (r < 0) {
5664 derr << __func__ << " add block device(" << bfn << ") returned: "
5665 << cpp_strerror(r) << dendl;
5666 goto free_bluefs;
5667 }
7c673cae 5668
11fdf7f2
TL
5669 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5670 r = _check_or_set_bdev_label(
5671 bfn,
5672 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5673 "bluefs wal", create);
7c673cae 5674 if (r < 0) {
11fdf7f2
TL
5675 derr << __func__ << " check block device(" << bfn
5676 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
5677 goto free_bluefs;
5678 }
7c673cae
FG
5679 }
5680
11fdf7f2
TL
5681 if (create) {
5682 bluefs->add_block_extent(
5683 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5684 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5685 BDEV_LABEL_BLOCK_SIZE);
5686 }
9f95a23c 5687 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
5688 } else {
5689 r = 0;
5690 if (::lstat(bfn.c_str(), &st) != -1) {
5691 r = -errno;
5692 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5693 << cpp_strerror(r) << dendl;
7c673cae
FG
5694 goto free_bluefs;
5695 }
11fdf7f2
TL
5696 }
5697 return 0;
7c673cae 5698
11fdf7f2
TL
5699free_bluefs:
5700 ceph_assert(bluefs);
5701 delete bluefs;
5702 bluefs = NULL;
5703 return r;
5704}
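For the shared device the create path above sizes BlueFS's initial gift from the min and gift ratios, rounds it to the BlueFS allocation unit, and centers it on the device (a nod to HDD seek behavior) while keeping it clear of the reserved head. A worked version of that arithmetic with typical default values (p2align_/p2roundup_ are local stand-ins for the helpers in include/intarith.h):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  static uint64_t p2align_(uint64_t x, uint64_t a)   { return x & ~(a - 1); }
  static uint64_t p2roundup_(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

  int main()
  {
    uint64_t dev_size   = 4ull << 40;  // a 4 TiB main device
    uint64_t alloc_size = 64 * 1024;   // bluefs_shared_alloc_size
    double   min_ratio  = 0.02;        // bluestore_bluefs_min_ratio (typical)
    double   gift_ratio = 0.02;        // bluestore_bluefs_gift_ratio (typical)
    uint64_t bluefs_min = 1ull << 30;  // bluestore_bluefs_min (typical)

    uint64_t initial = (uint64_t)(dev_size * (min_ratio + gift_ratio));
    initial = std::max(initial, bluefs_min);
    initial = p2roundup_(initial, alloc_size);
    // center the gift, aligned down, but never inside the reserved head
    uint64_t start = p2align_((dev_size - initial) / 2, alloc_size);
    start = std::max(alloc_size, start);

    assert(start + initial <= dev_size);
    std::printf("gift 0x%llx bytes @ 0x%llx\n",
                (unsigned long long)initial, (unsigned long long)start);
    return 0;
  }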
7c673cae 5705
11fdf7f2
TL
5706int BlueStore::_open_bluefs(bool create)
5707{
5708 int r = _minimal_open_bluefs(create);
5709 if (r < 0) {
5710 return r;
5711 }
9f95a23c
TL
5712 RocksDBBlueFSVolumeSelector* vselector = nullptr;
5713 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5714
5715 string options = cct->_conf->bluestore_rocksdb_options;
5716
5717 rocksdb::Options rocks_opts;
5718 int r = RocksDBStore::ParseOptionsFromStringStatic(
5719 cct,
5720 options,
5721 rocks_opts,
5722 nullptr);
5723 if (r < 0) {
5724 return r;
5725 }
5726
5727 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5728 vselector =
5729 new RocksDBBlueFSVolumeSelector(
5730 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5731 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5732 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5733 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5734 rocks_opts.max_bytes_for_level_base,
5735 rocks_opts.max_bytes_for_level_multiplier,
5736 reserved_factor,
5737 cct->_conf->bluestore_volume_selection_reserved,
5738 cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
5739 }
11fdf7f2 5740 if (create) {
9f95a23c 5741 bluefs->mkfs(fsid, bluefs_layout);
11fdf7f2 5742 }
9f95a23c 5743 bluefs->set_volume_selector(vselector);
11fdf7f2
TL
5744 r = bluefs->mount();
5745 if (r < 0) {
5746 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5747 }
9f95a23c 5748 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
11fdf7f2
TL
5749 return r;
5750}
5751
1911f103 5752void BlueStore::_close_bluefs(bool cold_close)
11fdf7f2 5753{
1911f103 5754 bluefs->umount(cold_close);
11fdf7f2
TL
5755 _minimal_close_bluefs();
5756}
5757
5758void BlueStore::_minimal_close_bluefs()
5759{
5760 delete bluefs;
5761 bluefs = NULL;
5762}
5763
5764int BlueStore::_is_bluefs(bool create, bool* ret)
5765{
5766 if (create) {
5767 *ret = cct->_conf->bluestore_bluefs;
5768 } else {
5769 string s;
5770 int r = read_meta("bluefs", &s);
5771 if (r < 0) {
5772 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5773 return -EIO;
5774 }
5775 if (s == "1") {
5776 *ret = true;
5777 } else if (s == "0") {
5778 *ret = false;
31f18b77 5779 } else {
11fdf7f2
TL
5780 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5781 << dendl;
5782 return -EIO;
5783 }
5784 }
5785 return 0;
5786}
5787
5788/*
 5789* opens the DB and the dependent super_meta, FreelistManager and allocator
 5790* in the proper order
5791*/
5792int BlueStore::_open_db_and_around(bool read_only)
5793{
5794 int r;
5795 bool do_bluefs = false;
5796 _is_bluefs(false, &do_bluefs); // ignore err code
5797 if (do_bluefs) {
5798 // open in read-only first to read FM list and init allocator
5799 // as they might be needed for some BlueFS procedures
5800 r = _open_db(false, false, true);
5801 if (r < 0)
5802 return r;
5803
5804 r = _open_super_meta();
5805 if (r < 0) {
5806 goto out_db;
5807 }
5808
1911f103 5809 r = _open_fm(nullptr, true);
11fdf7f2
TL
5810 if (r < 0)
5811 goto out_db;
5812
5813 r = _open_alloc();
5814 if (r < 0)
5815 goto out_fm;
5816
5817 // now open in R/W mode
5818 if (!read_only) {
1911f103 5819 _close_db(true);
11fdf7f2
TL
5820
5821 r = _open_db(false, false, false);
5822 if (r < 0) {
5823 _close_alloc();
5824 _close_fm();
5825 return r;
28e407b8 5826 }
1911f103 5827 fm->sync(db);
7c673cae 5828 }
11fdf7f2
TL
5829 } else {
5830 r = _open_db(false, false);
5831 if (r < 0) {
5832 return r;
5833 }
5834 r = _open_super_meta();
5835 if (r < 0) {
5836 goto out_db;
5837 }
7c673cae 5838
1911f103 5839 r = _open_fm(nullptr, false);
11fdf7f2
TL
5840 if (r < 0)
5841 goto out_db;
5842
5843 r = _open_alloc();
5844 if (r < 0)
5845 goto out_fm;
5846 }
5847 return 0;
5848
5849 out_fm:
5850 _close_fm();
5851 out_db:
1911f103 5852 _close_db(read_only);
11fdf7f2
TL
5853 return r;
5854}
5855
1911f103 5856void BlueStore::_close_db_and_around(bool read_only)
11fdf7f2
TL
5857{
5858 if (bluefs) {
1911f103 5859 if (!read_only && out_of_sync_fm.fetch_and(0)) {
11fdf7f2
TL
5860 _sync_bluefs_and_fm();
5861 }
1911f103
TL
5862 _close_db(read_only);
5863 while(!read_only && out_of_sync_fm.fetch_and(0)) {
11fdf7f2
TL
 5864 // if we saw allocations during close, repeat: open_db, sync fm, close
5865 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5866 int r = _open_db(false, false, false);
5867 if (r < 0) {
5868 derr << __func__
5869 << " unable to open db, FreelistManager is probably out of sync"
5870 << dendl;
5871 break;
5872 }
5873 _sync_bluefs_and_fm();
1911f103 5874 _close_db(false);
7c673cae 5875 }
11fdf7f2
TL
5876 if (!_kv_only) {
5877 _close_alloc();
5878 _close_fm();
5879 }
5880 } else {
5881 _close_alloc();
5882 _close_fm();
1911f103 5883 _close_db(read_only);
11fdf7f2
TL
5884 }
5885}
5886
 5887// updates legacy bluefs-related records in the DB to a state valid for
 5888// downgrades from Nautilus.
5889void BlueStore::_sync_bluefs_and_fm()
5890{
5891 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5892 bufferlist bl;
5893 encode(bluefs_extents, bl);
5894 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5895 << std::hex << bluefs_extents << std::dec
5896 << dendl;
5897 KeyValueDB::Transaction synct = db->get_transaction();
5898 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5899 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5900
 5901 // The nice thing is that we don't need to update the FreelistManager here.
 5902 // It always has the corresponding bits set to 'Free' for both Nautilus+ and
 5903 // pre-Nautilus releases.
 5904 // So once an extent makes it into bluefs_extents, it has been freed in the
 5905 // allocator and hence is free in the FM too.
5906
5907 db->submit_transaction_sync(synct);
5908 }
5909}
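The compatibility path above writes the same encoded extent set under two keys ("bluefs_extents" and "bluefs_extents_back") so a pre-Nautilus release finds a consistent view on downgrade. A sketch of that dual-key write against a hypothetical minimal transaction type:

  #include <map>
  #include <string>

  struct MiniTxn {                   // stand-in for KeyValueDB::Transaction
    std::map<std::string, std::string> kv;
    void set(const std::string& prefix, const std::string& key,
             const std::string& val) {
      kv[prefix + "/" + key] = val;
    }
  };

  void sync_compat_extents(MiniTxn& t, const std::string& encoded_extents)
  {
    // identical payload under the primary and the backup key ("S" = PREFIX_SUPER)
    t.set("S", "bluefs_extents", encoded_extents);
    t.set("S", "bluefs_extents_back", encoded_extents);
  }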
5910
5911int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5912{
5913 int r;
5914 ceph_assert(!db);
5915 ceph_assert(!(create && read_only));
5916 string fn = path + "/db";
5917 string options;
5918 stringstream err;
5919 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5920
5921 string kv_backend;
5922 std::vector<KeyValueDB::ColumnFamily> cfs;
5923
5924 if (create) {
5925 kv_backend = cct->_conf->bluestore_kvbackend;
5926 } else {
5927 r = read_meta("kv_backend", &kv_backend);
7c673cae 5928 if (r < 0) {
11fdf7f2
TL
5929 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5930 return -EIO;
5931 }
5932 }
5933 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5934
5935 bool do_bluefs;
5936 r = _is_bluefs(create, &do_bluefs);
5937 if (r < 0) {
5938 return r;
5939 }
5940 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5941
5942 map<string,string> kv_options;
5943 // force separate wal dir for all new deployments.
5944 kv_options["separate_wal_dir"] = 1;
5945 rocksdb::Env *env = NULL;
5946 if (do_bluefs) {
5947 dout(10) << __func__ << " initializing bluefs" << dendl;
5948 if (kv_backend != "rocksdb") {
5949 derr << " backend must be rocksdb to use bluefs" << dendl;
5950 return -EINVAL;
7c673cae 5951 }
11fdf7f2
TL
5952
5953 r = _open_bluefs(create);
5954 if (r < 0) {
5955 return r;
5956 }
11fdf7f2 5957
7c673cae 5958 if (cct->_conf->bluestore_bluefs_env_mirror) {
9f95a23c
TL
5959 rocksdb::Env* a = new BlueRocksEnv(bluefs);
5960 rocksdb::Env* b = rocksdb::Env::Default();
7c673cae 5961 if (create) {
9f95a23c
TL
5962 string cmd = "rm -rf " + path + "/db " +
5963 path + "/db.slow " +
5964 path + "/db.wal";
5965 int r = system(cmd.c_str());
5966 (void)r;
7c673cae
FG
5967 }
5968 env = new rocksdb::EnvMirror(b, a, false, true);
1911f103 5969 } else {
7c673cae
FG
5970 env = new BlueRocksEnv(bluefs);
5971
5972 // simplify the dir names, too, as "seen" by rocksdb
5973 fn = "db";
5974 }
9f95a23c
TL
5975 bluefs->set_slow_device_expander(this);
5976 BlueFSVolumeSelector::paths paths;
5977 bluefs->get_vselector_paths(fn, paths);
7c673cae 5978
9f95a23c 5979 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
7c673cae
FG
5980 // we have both block.db and block; tell rocksdb!
5981 // note: the second (last) size value doesn't really matter
5982 ostringstream db_paths;
9f95a23c
TL
5983 bool first = true;
5984 for (auto& p : paths) {
5985 if (!first) {
5986 db_paths << " ";
5987 }
5988 first = false;
5989 db_paths << p.first << "," << p.second;
5990
5991 }
11fdf7f2 5992 kv_options["db_paths"] = db_paths.str();
9f95a23c 5993 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
5994 }
5995
5996 if (create) {
9f95a23c
TL
5997 for (auto& p : paths) {
5998 env->CreateDir(p.first);
5999 }
 6000 // Selectors don't provide a wal path so far, hence create it explicitly
11fdf7f2 6001 env->CreateDir(fn + ".wal");
11fdf7f2
TL
6002 } else {
6003 std::vector<std::string> res;
6004 // check for dir presence
6005 auto r = env->GetChildren(fn+".wal", &res);
6006 if (r.IsNotFound()) {
6007 kv_options.erase("separate_wal_dir");
6008 }
7c673cae 6009 }
11fdf7f2
TL
6010 } else {
6011 string walfn = path + "/db.wal";
7c673cae 6012
11fdf7f2
TL
6013 if (create) {
6014 int r = ::mkdir(fn.c_str(), 0755);
6015 if (r < 0)
6016 r = -errno;
6017 if (r < 0 && r != -EEXIST) {
6018 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
6019 << dendl;
6020 return r;
6021 }
6022
6023 // wal_dir, too!
7c673cae
FG
6024 r = ::mkdir(walfn.c_str(), 0755);
6025 if (r < 0)
6026 r = -errno;
6027 if (r < 0 && r != -EEXIST) {
6028 derr << __func__ << " failed to create " << walfn
6029 << ": " << cpp_strerror(r)
6030 << dendl;
6031 return r;
6032 }
11fdf7f2
TL
6033 } else {
6034 struct stat st;
6035 r = ::stat(walfn.c_str(), &st);
6036 if (r < 0 && errno == ENOENT) {
6037 kv_options.erase("separate_wal_dir");
6038 }
7c673cae
FG
6039 }
6040 }
6041
91327a77 6042
7c673cae
FG
6043 db = KeyValueDB::create(cct,
6044 kv_backend,
6045 fn,
11fdf7f2 6046 kv_options,
7c673cae
FG
6047 static_cast<void*>(env));
6048 if (!db) {
6049 derr << __func__ << " error creating db" << dendl;
6050 if (bluefs) {
1911f103 6051 _close_bluefs(read_only);
7c673cae
FG
6052 }
6053 // delete env manually here since we can't depend on db to do this
6054 // under this case
6055 delete env;
6056 env = NULL;
6057 return -EIO;
6058 }
6059
6060 FreelistManager::setup_merge_operators(db);
6061 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 6062 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 6063
11fdf7f2 6064 if (kv_backend == "rocksdb") {
7c673cae 6065 options = cct->_conf->bluestore_rocksdb_options;
11fdf7f2
TL
6066
6067 map<string,string> cf_map;
6068 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
6069 get_str_map,
6070 &cf_map,
6071 " \t");
6072 for (auto& i : cf_map) {
6073 dout(10) << "column family " << i.first << ": " << i.second << dendl;
6074 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
6075 }
6076 }
6077
7c673cae 6078 db->init(options);
11fdf7f2
TL
6079 if (to_repair_db)
6080 return 0;
6081 if (create) {
6082 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
6083 r = db->create_and_open(err, cfs);
6084 } else {
6085 r = db->create_and_open(err);
6086 }
6087 } else {
6088 // we pass in cf list here, but it is only used if the db already has
6089 // column families created.
6090 r = read_only ?
6091 db->open_read_only(err, cfs) :
6092 db->open(err, cfs);
6093 }
7c673cae
FG
6094 if (r) {
 6095 derr << __func__ << " error opening db: " << err.str() << dendl;
1911f103 6096 _close_db(read_only);
7c673cae
FG
6097 return -EIO;
6098 }
6099 dout(1) << __func__ << " opened " << kv_backend
6100 << " path " << fn << " options " << options << dendl;
6101 return 0;
7c673cae
FG
6102}
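When both block.db and block are present, rocksdb learns about the tiers through the db_paths option: space-separated "path,size" pairs, exactly what the loop above assembles. A sketch of that string construction (make_db_paths() is a hypothetical helper):

  #include <cstdint>
  #include <sstream>
  #include <string>
  #include <utility>
  #include <vector>

  // Build rocksdb's db_paths value: "path1,size1 path2,size2 ...".
  std::string make_db_paths(
      const std::vector<std::pair<std::string, uint64_t>>& paths)
  {
    std::ostringstream db_paths;
    bool first = true;
    for (auto& p : paths) {
      if (!first)
        db_paths << " ";
      first = false;
      db_paths << p.first << "," << p.second;
    }
    return db_paths.str();
  }

  // make_db_paths({{"db", 1ull << 30}, {"db.slow", 4ull << 30}})
  //   -> "db,1073741824 db.slow,4294967296"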
6103
1911f103 6104void BlueStore::_close_db(bool cold_close)
7c673cae 6105{
11fdf7f2 6106 ceph_assert(db);
7c673cae
FG
6107 delete db;
6108 db = NULL;
6109 if (bluefs) {
1911f103 6110 _close_bluefs(cold_close);
7c673cae
FG
6111 }
6112}
6113
11fdf7f2 6114void BlueStore::_dump_alloc_on_failure()
7c673cae 6115{
11fdf7f2
TL
6116 auto dump_interval =
6117 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
6118 if (dump_interval > 0 &&
6119 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
6120 alloc->dump();
6121 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
6122 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 6123 }
11fdf7f2 6124}
7c673cae 6125
7c673cae 6126
11fdf7f2
TL
6127int BlueStore::allocate_bluefs_freespace(
6128 uint64_t min_size,
6129 uint64_t size,
6130 PExtentVector* extents_out)
6131{
6132 ceph_assert(min_size <= size);
6133 if (size) {
6134 // round up to alloc size
9f95a23c 6135 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
eafe8130
TL
6136 min_size = p2roundup(min_size, alloc_size);
6137 size = p2roundup(size, alloc_size);
11fdf7f2
TL
6138
6139 PExtentVector extents_local;
6140 PExtentVector* extents = extents_out ? extents_out : &extents_local;
6141
6142
6143 uint64_t gift;
6144 uint64_t allocated = 0;
6145 int64_t alloc_len;
9f95a23c
TL
6146 auto need = size;
6147 auto extent_count0 = extents->size();
11fdf7f2
TL
6148 do {
6149 // hard cap to fit into 32 bits
9f95a23c 6150 gift = std::min<uint64_t>(size, 1ull << 30);
11fdf7f2
TL
6151 dout(10) << __func__ << " gifting " << gift
6152 << " (" << byte_u_t(gift) << ")" << dendl;
6153
eafe8130
TL
6154 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
6155 if (alloc_len > 0) {
11fdf7f2
TL
6156 allocated += alloc_len;
6157 size -= alloc_len;
6158 }
6159
eafe8130
TL
6160 if (alloc_len < 0 ||
6161 (alloc_len < (int64_t)gift && (min_size > allocated))) {
11fdf7f2
TL
6162 derr << __func__
6163 << " failed to allocate on 0x" << std::hex << gift
6164 << " min_size 0x" << min_size
6165 << " > allocated total 0x" << allocated
eafe8130
TL
6166 << " bluefs_shared_alloc_size 0x" << alloc_size
6167 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2
TL
6168 << " available 0x " << alloc->get_free()
6169 << std::dec << dendl;
7c673cae 6170
494da23a 6171 _dump_alloc_on_failure();
11fdf7f2
TL
6172 alloc->release(*extents);
6173 extents->clear();
6174 return -ENOSPC;
6175 }
6176 } while (size && alloc_len > 0);
9f95a23c
TL
6177 _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);
6178
11fdf7f2
TL
6179 for (auto& e : *extents) {
6180 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
6181 bluefs_extents.insert(e.offset, e.length);
6182 ++out_of_sync_fm;
6183 // apply to bluefs if not requested from outside
6184 if (!extents_out) {
9f95a23c 6185 bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
11fdf7f2 6186 }
7c673cae
FG
6187 }
6188 }
7c673cae
FG
6189 return 0;
6190}
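The loop above deliberately requests at most 1 GiB (1ull << 30) per round so each allocation fits comfortably in 32 bits, and keeps looping until the requested size is gathered or the allocator comes up short. A simplified model of that loop with a mock allocator (the real code also enforces min_size and releases everything on failure):

  #include <algorithm>
  #include <cstdint>

  struct MockAlloc {                 // stand-in for Allocator
    uint64_t avail;
    int64_t allocate(uint64_t want) {
      uint64_t got = std::min(want, avail);  // may return less than asked
      avail -= got;
      return (int64_t)got;
    }
  };

  // Gather `size` bytes in <= 1 GiB rounds; returns bytes actually obtained.
  uint64_t gather(MockAlloc& alloc, uint64_t size)
  {
    uint64_t allocated = 0;
    int64_t alloc_len = 0;
    do {
      uint64_t gift = std::min<uint64_t>(size, 1ull << 30);  // hard cap
      alloc_len = alloc.allocate(gift);
      if (alloc_len > 0) {
        allocated += alloc_len;
        size -= alloc_len;
      }
    } while (size && alloc_len > 0);
    return allocated;
  }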
6191
9f95a23c
TL
6192uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
6193 uint64_t total = 0;
6194 auto iterated_allocation = [&](uint64_t off, uint64_t len) {
eafe8130 6195 // only count space that is alloc_size-aligned
9f95a23c
TL
6196 uint64_t dist_to_alignment;
6197 uint64_t offset_in_block = off & (alloc_size - 1);
eafe8130
TL
6198 if (offset_in_block == 0)
6199 dist_to_alignment = 0;
6200 else
6201 dist_to_alignment = alloc_size - offset_in_block;
6202 if (dist_to_alignment >= len)
6203 return;
6204 len -= dist_to_alignment;
6205 total += p2align(len, alloc_size);
6206 };
6207 alloc->dump(iterated_allocation);
6208 return total;
6209}
6210
11fdf7f2 6211int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 6212{
7c673cae
FG
6213 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
6214
6215 uint64_t my_free = alloc->get_free();
6216 uint64_t total = bdev->get_size();
6217 float my_free_ratio = (float)my_free / (float)total;
6218
6219 uint64_t total_free = bluefs_free + my_free;
6220
6221 float bluefs_ratio = (float)bluefs_free / (float)total_free;
6222
6223 dout(10) << __func__
1adf2230 6224 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 6225 << " free (" << bluefs_free_ratio
1adf2230 6226 << ") bluestore " << byte_u_t(my_free)
7c673cae
FG
6227 << " free (" << my_free_ratio
6228 << "), bluefs_ratio " << bluefs_ratio
6229 << dendl;
6230
6231 uint64_t gift = 0;
6232 uint64_t reclaim = 0;
6233 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
6234 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
9f95a23c
TL
6235 if (gift >= my_free)
6236 gift = my_free / 2;
7c673cae
FG
6237 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6238 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 6239 << ", should gift " << byte_u_t(gift) << dendl;
7c673cae
FG
6240 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
6241 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
6242 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
6243 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
9f95a23c
TL
6244 if (reclaim >= bluefs_free)
6245 reclaim = bluefs_free / 2;
7c673cae
FG
6246 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
6247 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 6248 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 6249 }
3efd9988
FG
6250
6251 // don't take over too much of the freespace
6252 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 6253 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 6254 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
6255 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
6256 dout(10) << __func__ << " bluefs_total " << bluefs_total
6257 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 6258 << ", should gift " << byte_u_t(g) << dendl;
7c673cae
FG
6259 if (g > gift)
6260 gift = g;
6261 reclaim = 0;
6262 }
9f95a23c
TL
6263 uint64_t min_free =
6264 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
3efd9988
FG
6265 if (bluefs_free < min_free &&
6266 min_free < free_cap) {
6267 uint64_t g = min_free - bluefs_free;
11fdf7f2 6268 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 6269 << " < min " << min_free
1adf2230 6270 << ", should gift " << byte_u_t(g) << dendl;
3efd9988
FG
6271 if (g > gift)
6272 gift = g;
6273 reclaim = 0;
6274 }
9f95a23c
TL
6275 uint64_t max_free =
6276 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
6277 if (bluefs_free > max_free) {
6278 dout(10) << __func__ << " bluefs_free " << bluefs_free
6279 << " > max " << max_free
6280 << ", stop gifting for now" << dendl;
6281 gift = 0;
6282 }
11fdf7f2
TL
6283 ceph_assert((int64_t)gift >= 0);
6284 ceph_assert((int64_t)reclaim >= 0);
6285 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
6286}
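The sizing rule above boils down to: compute BlueFS's share of all free space, gift when it drops below min_ratio, reclaim when it exceeds max_ratio, then let the absolute bluefs_min/min_free/max_free overrides adjust the result. A worked instance of just the first branch, with illustrative numbers and typical default ratios:

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    uint64_t bluefs_free = 2ull << 30;    //   2 GiB free inside BlueFS
    uint64_t my_free     = 200ull << 30;  // 200 GiB free in BlueStore
    double min_ratio = 0.02, gift_ratio = 0.02;  // typical defaults

    uint64_t total_free = bluefs_free + my_free;
    double bluefs_ratio = (double)bluefs_free / (double)total_free;
    if (bluefs_ratio < min_ratio) {       // 0.0099 < 0.02 -> gift
      uint64_t gift = (uint64_t)(gift_ratio * total_free);
      std::printf("bluefs_ratio %.4f < %.2f, should gift %llu bytes\n",
                  bluefs_ratio, min_ratio, (unsigned long long)gift);
    }
    return 0;
  }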
7c673cae 6287
11fdf7f2
TL
6288int BlueStore::_balance_bluefs_freespace()
6289{
6290 int ret = 0;
6291 ceph_assert(bluefs);
7c673cae 6292
11fdf7f2
TL
6293 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
6294 bluefs->get_usage(&bluefs_usage);
9f95a23c 6295 ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);
7c673cae 6296
11fdf7f2 6297 bool clear_alert = true;
9f95a23c
TL
6298 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6299 auto& p = bluefs_usage[bluefs_layout.shared_bdev];
11fdf7f2
TL
6300 if (p.first != p.second) {
6301 auto& db = bluefs_usage[BlueFS::BDEV_DB];
6302 ostringstream ss;
6303 ss << "spilled over " << byte_u_t(p.second - p.first)
6304 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
6305 << " used of " << byte_u_t(db.second) << ") to slow device";
6306 _set_spillover_alert(ss.str());
6307 clear_alert = false;
6308 }
6309 }
6310 if (clear_alert) {
6311 _clear_spillover_alert();
7c673cae
FG
6312 }
6313
11fdf7f2
TL
6314 // fixme: look at primary bdev only for now
6315 int64_t delta = _get_bluefs_size_delta(
9f95a23c
TL
6316 bluefs_usage[bluefs_layout.shared_bdev].first,
6317 bluefs_usage[bluefs_layout.shared_bdev].second);
11fdf7f2 6318
7c673cae 6319 // reclaim from bluefs?
11fdf7f2 6320 if (delta < 0) {
7c673cae 6321 // round up to alloc size
9f95a23c 6322 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
eafe8130 6323 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
7c673cae
FG
6324
6325 // hard cap to fit into 32 bits
9f95a23c 6326 reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
7c673cae 6327 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 6328 << " (" << byte_u_t(reclaim) << ")" << dendl;
7c673cae
FG
6329
6330 while (reclaim > 0) {
6331 // NOTE: this will block and do IO.
a8e16298 6332 PExtentVector extents;
9f95a23c 6333 int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
7c673cae
FG
6334 &extents);
6335 if (r < 0) {
6336 derr << __func__ << " failed to reclaim space from bluefs"
6337 << dendl;
6338 break;
6339 }
6340 for (auto e : extents) {
11fdf7f2 6341 ++out_of_sync_fm;
7c673cae
FG
6342 bluefs_extents.erase(e.offset, e.length);
6343 bluefs_extents_reclaiming.insert(e.offset, e.length);
6344 reclaim -= e.length;
6345 }
6346 }
6347
6348 ret = 1;
6349 }
6350
6351 return ret;
6352}
6353
eafe8130 6354int BlueStore::_open_collections()
7c673cae 6355{
28e407b8 6356 dout(10) << __func__ << dendl;
eafe8130 6357 collections_had_errors = false;
11fdf7f2 6358 ceph_assert(coll_map.empty());
7c673cae
FG
6359 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6360 for (it->upper_bound(string());
6361 it->valid();
6362 it->next()) {
6363 coll_t cid;
6364 if (cid.parse(it->key())) {
9f95a23c 6365 auto c = ceph::make_ref<Collection>(
7c673cae 6366 this,
9f95a23c
TL
6367 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6368 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6369 cid);
7c673cae 6370 bufferlist bl = it->value();
11fdf7f2 6371 auto p = bl.cbegin();
7c673cae 6372 try {
11fdf7f2 6373 decode(c->cnode, p);
7c673cae
FG
6374 } catch (buffer::error& e) {
6375 derr << __func__ << " failed to decode cnode, key:"
6376 << pretty_binary_string(it->key()) << dendl;
6377 return -EIO;
6378 }
28e407b8
AA
6379 dout(20) << __func__ << " opened " << cid << " " << c
6380 << " " << c->cnode << dendl;
11fdf7f2 6381 _osr_attach(c.get());
7c673cae 6382 coll_map[cid] = c;
11fdf7f2 6383
7c673cae
FG
6384 } else {
6385 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 6386 collections_had_errors = true;
7c673cae
FG
6387 }
6388 }
6389 return 0;
6390}
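The scan above (upper_bound of the empty string, then valid()/next()) is the standard BlueStore idiom for walking every key under a prefix; _fsck_collections() and _open_statfs() below use the same shape. A sketch of the idiom over a hypothetical minimal iterator backed by std::map:

  #include <map>
  #include <string>
  #include <vector>

  struct MiniIter {                  // stand-in for KeyValueDB::Iterator
    const std::map<std::string, std::string>& kv;
    std::map<std::string, std::string>::const_iterator it;
    explicit MiniIter(const std::map<std::string, std::string>& m)
      : kv(m), it(m.end()) {}
    void upper_bound(const std::string& k) { it = kv.upper_bound(k); }
    bool valid() const { return it != kv.end(); }
    void next() { ++it; }
    const std::string& key() const { return it->first; }
  };

  std::vector<std::string> all_keys(const std::map<std::string, std::string>& kv)
  {
    std::vector<std::string> out;
    MiniIter it(kv);
    // upper_bound("") lands on the first key: "" sorts before any real key
    for (it.upper_bound(std::string()); it.valid(); it.next())
      out.push_back(it.key());
    return out;
  }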
6391
eafe8130
TL
6392void BlueStore::_fsck_collections(int64_t* errors)
6393{
6394 if (collections_had_errors) {
6395 dout(10) << __func__ << dendl;
6396 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6397 for (it->upper_bound(string());
6398 it->valid();
6399 it->next()) {
6400 coll_t cid;
6401 if (!cid.parse(it->key())) {
6402 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6403 if (errors) {
6404 (*errors)++;
6405 }
6406 }
6407 }
6408 }
6409}
6410
9f95a23c
TL
6411void BlueStore::_set_per_pool_omap()
6412{
6413 per_pool_omap = false;
6414 bufferlist bl;
6415 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6416 if (bl.length()) {
6417 per_pool_omap = true;
6418 dout(10) << __func__ << " per_pool_omap=1" << dendl;
6419 } else {
6420 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6421 }
6422 _check_no_per_pool_omap_alert();
6423}
6424
224ce89b 6425void BlueStore::_open_statfs()
31f18b77 6426{
11fdf7f2
TL
6427 osd_pools.clear();
6428 vstatfs.reset();
6429
31f18b77 6430 bufferlist bl;
11fdf7f2 6431 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6432 if (r >= 0) {
11fdf7f2 6433 per_pool_stat_collection = false;
31f18b77 6434 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6435 auto it = bl.cbegin();
31f18b77 6436 vstatfs.decode(it);
11fdf7f2 6437 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 6438 } else {
31f18b77
FG
6439 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6440 }
81eedcae 6441 _check_legacy_statfs_alert();
11fdf7f2
TL
6442 } else {
6443 per_pool_stat_collection = true;
6444 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6445 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
6446 for (it->upper_bound(string());
6447 it->valid();
6448 it->next()) {
6449
6450 uint64_t pool_id;
6451 int r = get_key_pool_stat(it->key(), &pool_id);
6452 ceph_assert(r == 0);
6453
6454 bufferlist bl;
6455 bl = it->value();
6456 auto p = bl.cbegin();
6457 auto& st = osd_pools[pool_id];
6458 try {
6459 st.decode(p);
6460 vstatfs += st;
6461
6462 dout(30) << __func__ << " pool " << pool_id
6463 << " statfs " << st << dendl;
6464 } catch (buffer::error& e) {
6465 derr << __func__ << " failed to decode pool stats, key:"
6466 << pretty_binary_string(it->key()) << dendl;
6467 }
6468 }
31f18b77 6469 }
11fdf7f2
TL
6470 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6471
31f18b77
FG
6472}
6473
7c673cae
FG
6474int BlueStore::_setup_block_symlink_or_file(
6475 string name,
6476 string epath,
6477 uint64_t size,
6478 bool create)
6479{
6480 dout(20) << __func__ << " name " << name << " path " << epath
6481 << " size " << size << " create=" << (int)create << dendl;
6482 int r = 0;
91327a77 6483 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6484 if (create)
6485 flags |= O_CREAT;
6486 if (epath.length()) {
6487 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6488 if (r < 0) {
6489 r = -errno;
6490 derr << __func__ << " failed to create " << name << " symlink to "
6491 << epath << ": " << cpp_strerror(r) << dendl;
6492 return r;
6493 }
6494
6495 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6496 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6497 if (fd < 0) {
6498 r = -errno;
6499 derr << __func__ << " failed to open " << epath << " file: "
6500 << cpp_strerror(r) << dendl;
6501 return r;
6502 }
11fdf7f2
TL
6503 // write the Transport ID of the NVMe device
6504 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6505 // where "0000:02:00.0" is the selector of a PCI device, see
6506 // the first column of "lspci -mm -n -D"
6507 string trid{"trtype:PCIe "};
6508 trid += "traddr:";
6509 trid += epath.substr(strlen(SPDK_PREFIX));
6510 r = ::write(fd, trid.c_str(), trid.size());
6511 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
6512 dout(1) << __func__ << " created " << name << " symlink to "
6513 << epath << dendl;
6514 VOID_TEMP_FAILURE_RETRY(::close(fd));
6515 }
6516 }
6517 if (size) {
6518 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6519 if (fd >= 0) {
6520 // block file is present
6521 struct stat st;
6522 int r = ::fstat(fd, &st);
6523 if (r == 0 &&
6524 S_ISREG(st.st_mode) && // if it is a regular file
6525 st.st_size == 0) { // and is 0 bytes
6526 r = ::ftruncate(fd, size);
6527 if (r < 0) {
6528 r = -errno;
6529 derr << __func__ << " failed to resize " << name << " file to "
6530 << size << ": " << cpp_strerror(r) << dendl;
6531 VOID_TEMP_FAILURE_RETRY(::close(fd));
6532 return r;
6533 }
6534
6535 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
6536 r = ::ceph_posix_fallocate(fd, 0, size);
6537 if (r > 0) {
7c673cae
FG
 6538 derr << __func__ << " failed to preallocate " << name << " file to "
6539 << size << ": " << cpp_strerror(r) << dendl;
6540 VOID_TEMP_FAILURE_RETRY(::close(fd));
6541 return -r;
6542 }
7c673cae
FG
6543 }
6544 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 6545 << byte_u_t(size) << dendl;
7c673cae
FG
6546 }
6547 VOID_TEMP_FAILURE_RETRY(::close(fd));
6548 } else {
6549 int r = -errno;
6550 if (r != -ENOENT) {
6551 derr << __func__ << " failed to open " << name << " file: "
6552 << cpp_strerror(r) << dendl;
6553 return r;
6554 }
6555 }
6556 }
6557 return 0;
6558}
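For an SPDK device the symlink target carries the PCI selector after the SPDK prefix, and the block file's contents become an NVMe transport id such as "trtype:PCIe traddr:0000:02:00.0". A sketch of that string assembly, assuming the prefix is "spdk:" (what SPDK_PREFIX is taken to expand to here):

  #include <cassert>
  #include <cstring>
  #include <string>

  // Build the transport id written into the block file for SPDK devices.
  std::string make_trid(const std::string& epath)
  {
    const char* prefix = "spdk:";   // assumed value of SPDK_PREFIX
    assert(epath.compare(0, strlen(prefix), prefix) == 0);
    std::string trid{"trtype:PCIe "};
    trid += "traddr:";
    trid += epath.substr(strlen(prefix));
    return trid;
  }

  // make_trid("spdk:0000:02:00.0") -> "trtype:PCIe traddr:0000:02:00.0"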
6559
6560int BlueStore::mkfs()
6561{
6562 dout(1) << __func__ << " path " << path << dendl;
6563 int r;
6564 uuid_d old_fsid;
6565
eafe8130
TL
6566 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6567 derr << __func__ << " osd_max_object_size "
6568 << cct->_conf->osd_max_object_size << " > bluestore max "
6569 << OBJECT_MAX_SIZE << dendl;
6570 return -EINVAL;
6571 }
6572
7c673cae
FG
6573 {
6574 string done;
6575 r = read_meta("mkfs_done", &done);
6576 if (r == 0) {
6577 dout(1) << __func__ << " already created" << dendl;
6578 if (cct->_conf->bluestore_fsck_on_mkfs) {
6579 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6580 if (r < 0) {
6581 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6582 << dendl;
6583 return r;
6584 }
6585 if (r > 0) {
6586 derr << __func__ << " fsck found " << r << " errors" << dendl;
6587 r = -EIO;
6588 }
6589 }
6590 return r; // idempotent
6591 }
6592 }
6593
6594 {
6595 string type;
6596 r = read_meta("type", &type);
6597 if (r == 0) {
6598 if (type != "bluestore") {
6599 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6600 return -EIO;
6601 }
6602 } else {
6603 r = write_meta("type", "bluestore");
6604 if (r < 0)
6605 return r;
6606 }
6607 }
6608
6609 freelist_type = "bitmap";
6610
6611 r = _open_path();
6612 if (r < 0)
6613 return r;
6614
6615 r = _open_fsid(true);
6616 if (r < 0)
6617 goto out_path_fd;
6618
6619 r = _lock_fsid();
6620 if (r < 0)
6621 goto out_close_fsid;
6622
6623 r = _read_fsid(&old_fsid);
6624 if (r < 0 || old_fsid.is_zero()) {
6625 if (fsid.is_zero()) {
6626 fsid.generate_random();
6627 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6628 } else {
6629 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6630 }
6631 // we'll write it later.
6632 } else {
6633 if (!fsid.is_zero() && fsid != old_fsid) {
6634 derr << __func__ << " on-disk fsid " << old_fsid
6635 << " != provided " << fsid << dendl;
6636 r = -EINVAL;
6637 goto out_close_fsid;
6638 }
6639 fsid = old_fsid;
6640 }
6641
6642 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6643 cct->_conf->bluestore_block_size,
6644 cct->_conf->bluestore_block_create);
6645 if (r < 0)
6646 goto out_close_fsid;
6647 if (cct->_conf->bluestore_bluefs) {
6648 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6649 cct->_conf->bluestore_block_wal_size,
6650 cct->_conf->bluestore_block_wal_create);
6651 if (r < 0)
6652 goto out_close_fsid;
6653 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6654 cct->_conf->bluestore_block_db_size,
6655 cct->_conf->bluestore_block_db_create);
6656 if (r < 0)
6657 goto out_close_fsid;
6658 }
6659
6660 r = _open_bdev(true);
6661 if (r < 0)
6662 goto out_close_fsid;
6663
3efd9988
FG
6664 // choose min_alloc_size
6665 if (cct->_conf->bluestore_min_alloc_size) {
6666 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6667 } else {
11fdf7f2 6668 ceph_assert(bdev);
3efd9988
FG
6669 if (bdev->is_rotational()) {
6670 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6671 } else {
6672 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6673 }
6674 }
11fdf7f2 6675 _validate_bdev();
3efd9988
FG
6676
 6677 // make sure min_alloc_size is a power of 2.
11fdf7f2 6678 if (!isp2(min_alloc_size)) {
3efd9988
FG
6679 derr << __func__ << " min_alloc_size 0x"
6680 << std::hex << min_alloc_size << std::dec
6681 << " is not power of 2 aligned!"
6682 << dendl;
6683 r = -EINVAL;
6684 goto out_close_bdev;
6685 }
6686
7c673cae
FG
6687 r = _open_db(true);
6688 if (r < 0)
6689 goto out_close_bdev;
6690
7c673cae
FG
6691 {
6692 KeyValueDB::Transaction t = db->get_transaction();
1911f103 6693 r = _open_fm(t, true);
11fdf7f2
TL
6694 if (r < 0)
6695 goto out_close_db;
7c673cae
FG
6696 {
6697 bufferlist bl;
11fdf7f2 6698 encode((uint64_t)0, bl);
7c673cae
FG
6699 t->set(PREFIX_SUPER, "nid_max", bl);
6700 t->set(PREFIX_SUPER, "blobid_max", bl);
6701 }
6702
7c673cae
FG
6703 {
6704 bufferlist bl;
11fdf7f2 6705 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
6706 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6707 }
9f95a23c
TL
6708 {
6709 bufferlist bl;
6710 bl.append("1");
6711 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6712 }
7c673cae
FG
6713 ondisk_format = latest_ondisk_format;
6714 _prepare_ondisk_format_super(t);
6715 db->submit_transaction_sync(t);
6716 }
6717
7c673cae
FG
6718 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6719 if (r < 0)
224ce89b
WB
6720 goto out_close_fm;
6721
3efd9988 6722 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6723 if (r < 0)
224ce89b 6724 goto out_close_fm;
7c673cae
FG
6725
6726 if (fsid != old_fsid) {
6727 r = _write_fsid();
6728 if (r < 0) {
6729 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6730 goto out_close_fm;
7c673cae
FG
6731 }
6732 }
6733
11fdf7f2
TL
6734 if (out_of_sync_fm.fetch_and(0)) {
6735 _sync_bluefs_and_fm();
6736 }
6737
7c673cae
FG
6738 out_close_fm:
6739 _close_fm();
6740 out_close_db:
1911f103 6741 _close_db(false);
7c673cae
FG
6742 out_close_bdev:
6743 _close_bdev();
6744 out_close_fsid:
6745 _close_fsid();
6746 out_path_fd:
6747 _close_path();
6748
6749 if (r == 0 &&
6750 cct->_conf->bluestore_fsck_on_mkfs) {
6751 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6752 if (rc < 0)
6753 return rc;
6754 if (rc > 0) {
6755 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6756 r = -EIO;
6757 }
11fdf7f2
TL
6758 }
6759
6760 if (r == 0) {
6761 // indicate success by writing the 'mkfs_done' file
6762 r = write_meta("mkfs_done", "yes");
6763 }
6764
6765 if (r < 0) {
6766 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6767 } else {
6768 dout(0) << __func__ << " success" << dendl;
6769 }
6770 return r;
6771}
6772
6773int BlueStore::_mount_for_bluefs()
6774{
6775 int r = _open_path();
6776 ceph_assert(r == 0);
6777 r = _open_fsid(false);
6778 ceph_assert(r == 0);
6779 r = _read_fsid(&fsid);
6780 ceph_assert(r == 0);
6781 r = _lock_fsid();
6782 ceph_assert(r == 0);
6783 r = _open_bluefs(false);
6784 ceph_assert(r == 0);
6785 return r;
6786}
6787
6788void BlueStore::_umount_for_bluefs()
6789{
1911f103 6790 _close_bluefs(false);
11fdf7f2
TL
6791 _close_fsid();
6792 _close_path();
6793}
6794
6795int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6796{
6797 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6798 int r;
6799 ceph_assert(path_fd < 0);
6800
6801 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6802
6803 if (!cct->_conf->bluestore_bluefs) {
6804 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6805 return -EIO;
6806 }
6807
6808 r = _mount_for_bluefs();
6809
6810 int reserved = 0;
6811 if (id == BlueFS::BDEV_NEWWAL) {
6812 string p = path + "/block.wal";
6813 r = _setup_block_symlink_or_file("block.wal", dev_path,
6814 cct->_conf->bluestore_block_wal_size,
6815 true);
6816 ceph_assert(r == 0);
6817
6818 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
eafe8130 6819 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6820 ceph_assert(r == 0);
6821
6822 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6823 r = _check_or_set_bdev_label(
6824 p,
6825 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6826 "bluefs wal",
6827 true);
6828 ceph_assert(r == 0);
6829 }
6830
6831 reserved = BDEV_LABEL_BLOCK_SIZE;
9f95a23c 6832 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
6833 } else if (id == BlueFS::BDEV_NEWDB) {
6834 string p = path + "/block.db";
6835 r = _setup_block_symlink_or_file("block.db", dev_path,
6836 cct->_conf->bluestore_block_db_size,
6837 true);
6838 ceph_assert(r == 0);
6839
6840 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
eafe8130 6841 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6842 ceph_assert(r == 0);
6843
6844 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6845 r = _check_or_set_bdev_label(
6846 p,
6847 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6848 "bluefs db",
6849 true);
6850 ceph_assert(r == 0);
6851 }
6852 reserved = SUPER_RESERVED;
9f95a23c
TL
6853 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6854 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
6855 }
6856
6857 bluefs->umount();
6858 bluefs->mount();
6859
6860 bluefs->add_block_extent(
6861 id,
6862 reserved,
1911f103 6863 bluefs->get_block_device_size(id) - reserved, true);
11fdf7f2 6864
9f95a23c 6865 r = bluefs->prepare_new_device(id, bluefs_layout);
11fdf7f2
TL
6866 ceph_assert(r == 0);
6867
6868 if (r < 0) {
6869 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6870 } else {
6871 dout(0) << __func__ << " success" << dendl;
6872 }
6873
6874 _umount_for_bluefs();
6875 return r;
6876}
6877
6878int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6879 int id)
6880{
6881 dout(10) << __func__ << " id:" << id << dendl;
6882 ceph_assert(path_fd < 0);
6883
6884 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6885
6886 if (!cct->_conf->bluestore_bluefs) {
6887 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6888 return -EIO;
6889 }
6890
6891 int r = _mount_for_bluefs();
6892
6893 // require bluestore_bluefs_min_free to be free at target device!
6894 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6895 for(auto src_id : devs_source) {
6896 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6897 }
6898 uint64_t target_free = bluefs->get_free(id);
6899 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6900 // will need to remount full BlueStore instance to allocate more space
6901 _umount_for_bluefs();
6902
6903 r = mount();
6904 ceph_assert(r == 0);
6905 dout(1) << __func__
6906 << " Allocating more space at slow device for BlueFS: +"
6907 << used_space - target_free << " bytes" << dendl;
6908 r = allocate_bluefs_freespace(
6909 used_space - target_free,
6910 used_space - target_free,
6911 nullptr);
6912
6913 umount();
6914 if (r != 0) {
6915 derr << __func__
6916 << " can't migrate, unable to allocate extra space: "
6917 << used_space - target_free << " at target:" << id
6918 << dendl;
6919 return -ENOSPC;
6920 }
6921
6922 r = _mount_for_bluefs();
6923 ceph_assert(r == 0);
6924 } else if (target_free < used_space) {
6925 derr << __func__
6926 << " can't migrate, free space at target: " << target_free
6927 << " is less than required space: " << used_space
6928 << dendl;
6929 return -ENOSPC;
6930 }
9f95a23c
TL
6931 if (devs_source.count(BlueFS::BDEV_DB)) {
6932 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6933 bluefs_layout.dedicated_db = false;
6934 }
6935 if (devs_source.count(BlueFS::BDEV_WAL)) {
6936 bluefs_layout.dedicated_wal = false;
6937 }
6938 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
11fdf7f2
TL
6939 if (r < 0) {
6940 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6941 goto shutdown;
6942 }
6943
6944 if (devs_source.count(BlueFS::BDEV_DB)) {
6945 r = unlink(string(path + "/block.db").c_str());
6946 ceph_assert(r == 0);
6947 }
6948 if (devs_source.count(BlueFS::BDEV_WAL)) {
6949 r = unlink(string(path + "/block.wal").c_str());
6950 ceph_assert(r == 0);
6951 }
6952
6953shutdown:
6954 _umount_for_bluefs();
6955 return r;
6956}
6957
6958int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6959 int id,
6960 const string& dev_path)
6961{
6962 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6963 int r;
6964 ceph_assert(path_fd < 0);
6965
6966 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6967
6968 if (!cct->_conf->bluestore_bluefs) {
6969 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6970 return -EIO;
6971 }
6972
6973 r = _mount_for_bluefs();
6974
6975 int reserved = 0;
6976 string link_db;
6977 string link_wal;
6978 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 6979 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 6980 link_db = path + "/block.db";
9f95a23c
TL
6981 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6982 bluefs_layout.dedicated_db = false;
11fdf7f2
TL
6983 }
6984 if (devs_source.count(BlueFS::BDEV_WAL)) {
6985 link_wal = path + "/block.wal";
9f95a23c 6986 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
6987 }
6988
6989 size_t target_size;
6990 string target_name;
6991 if (id == BlueFS::BDEV_NEWWAL) {
6992 target_name = "block.wal";
6993 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 6994 bluefs_layout.dedicated_wal = true;
6995
6996 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
eafe8130 6997 cct->_conf->bdev_enable_discard);
6998 ceph_assert(r == 0);
6999
7000 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
7001 r = _check_or_set_bdev_label(
7002 dev_path,
7003 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
7004 "bluefs wal",
7005 true);
7006 ceph_assert(r == 0);
7007 }
7008 reserved = BDEV_LABEL_BLOCK_SIZE;
7009 } else if (id == BlueFS::BDEV_NEWDB) {
7010 target_name = "block.db";
7011 target_size = cct->_conf->bluestore_block_db_size;
7012 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
7013 bluefs_layout.dedicated_db = true;
31f18b77 7014
11fdf7f2 7015 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
eafe8130 7016 cct->_conf->bdev_enable_discard);
7017 ceph_assert(r == 0);
7018
7019 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
7020 r = _check_or_set_bdev_label(
7021 dev_path,
7022 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
7023 "bluefs db",
7024 true);
7025 ceph_assert(r == 0);
7026 }
7027 reserved = SUPER_RESERVED;
7028 }
7029
7030 bluefs->umount();
7031 bluefs->mount();
7032
7033 bluefs->add_block_extent(
7034 id, reserved, bluefs->get_block_device_size(id) - reserved);
7035
9f95a23c 7036 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 7037
7c673cae 7038 if (r < 0) {
7039 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
7040 goto shutdown;
7041 }
7042
7043 if (!link_db.empty()) {
7044 r = unlink(link_db.c_str());
7045 ceph_assert(r == 0);
7046 }
7047 if (!link_wal.empty()) {
7048 r = unlink(link_wal.c_str());
7049 ceph_assert(r == 0);
7050 }
7051 r = _setup_block_symlink_or_file(
7052 target_name,
7053 dev_path,
7054 target_size,
7055 true);
7056 ceph_assert(r == 0);
7057 dout(0) << __func__ << " success" << dendl;
7058
7059shutdown:
7060 _umount_for_bluefs();
7061 return r;
7062}
7063
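// A hedged usage sketch: the two migrate_to_*_bluefs_device() methods above
// back ceph-bluestore-tool's device migration commands; the paths and device
// names below are illustrative only:
//
//   ceph-bluestore-tool bluefs-bdev-new-db \
//       --path /var/lib/ceph/osd/ceph-0 --dev-target /dev/vg0/new-db
//   ceph-bluestore-tool bluefs-bdev-migrate \
//       --path /var/lib/ceph/osd/ceph-0 --devs-source ... --dev-target ...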
7064string BlueStore::get_device_path(unsigned id)
7065{
7066 string res;
7067 if (id < BlueFS::MAX_BDEV) {
7068 switch (id) {
7069 case BlueFS::BDEV_WAL:
7070 res = path + "/block.wal";
7071 break;
7072 case BlueFS::BDEV_DB:
9f95a23c 7073 if (id == bluefs_layout.shared_bdev) {
7074 res = path + "/block";
7075 } else {
7076 res = path + "/block.db";
7077 }
7078 break;
7079 case BlueFS::BDEV_SLOW:
7080 res = path + "/block";
7081 break;
7082 }
7083 }
7084 return res;
7085}
7086
7087int BlueStore::expand_devices(ostream& out)
7088{
1911f103 7089 int r = cold_open();
7090 ceph_assert(r == 0);
7091 bluefs->dump_block_extents(out);
1911f103 7092 out << "Expanding DB/WAL..." << std::endl;
11fdf7f2 7093 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 7094 if (devid == bluefs_layout.shared_bdev ) {
7095 continue;
7096 }
7097 uint64_t size = bluefs->get_block_device_size(devid);
7098 if (size == 0) {
7099 // no bdev
7100 continue;
7101 }
7102
7103 interval_set<uint64_t> before;
7104 bluefs->get_block_extents(devid, &before);
7105 ceph_assert(!before.empty());
7106 uint64_t end = before.range_end();
7107 if (end < size) {
7108 out << devid
7109 <<" : expanding " << " from 0x" << std::hex
7110 << end << " to 0x" << size << std::dec << std::endl;
7111 bluefs->add_block_extent(devid, end, size-end);
7112 string p = get_device_path(devid);
7113 const char* path = p.c_str();
7114 if (p.empty()) { // c_str() never returns nullptr; guard against unknown device ids
7115 derr << devid
7116 <<": can't find device path " << dendl;
7117 continue;
7118 }
7119 bluestore_bdev_label_t label;
7120 int r = _read_bdev_label(cct, path, &label);
7121 if (r < 0) {
7122 derr << "unable to read label for " << path << ": "
7123 << cpp_strerror(r) << dendl;
7124 continue;
7125 }
7126 label.size = size;
7127 r = _write_bdev_label(cct, path, label);
7128 if (r < 0) {
7129 derr << "unable to write label for " << path << ": "
7130 << cpp_strerror(r) << dendl;
7131 continue;
7132 }
7133 out << devid
7134 <<" : size label updated to " << size
7135 << std::endl;
7136 }
7137 }
7138 uint64_t size0 = fm->get_size();
7139 uint64_t size = bdev->get_size();
7140 if (size0 < size) {
9f95a23c 7141 out << bluefs_layout.shared_bdev
7142 << " : expanding from 0x" << std::hex
7143 << size0 << " to 0x" << size << std::dec << std::endl;
7144 _write_out_fm_meta(size, true);
7145 cold_close();
7146
7147 // mount in read/write to sync expansion changes
7148 r = _mount(false);
11fdf7f2 7149 ceph_assert(r == 0);
7150 umount();
7151 } else {
7152 cold_close();
7c673cae 7153 }
7154 return r;
7155}
7156
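// A hedged usage sketch: expand_devices() is the backend of
// ceph-bluestore-tool's bluefs-bdev-expand command, run after the backing
// partition/LV has been grown and with the OSD stopped (path illustrative):
//
//   ceph-bluestore-tool bluefs-bdev-expand --path /var/lib/ceph/osd/ceph-0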
7157int BlueStore::dump_bluefs_sizes(ostream& out)
7158{
7159 int r = cold_open();
7160 ceph_assert(r == 0);
7161 bluefs->dump_block_extents(out);
7162 cold_close();
7163 return r;
7164}
7165
7166void BlueStore::set_cache_shards(unsigned num)
7167{
7168 dout(10) << __func__ << " " << num << dendl;
7169 size_t oold = onode_cache_shards.size();
7170 size_t bold = buffer_cache_shards.size();
7171 ceph_assert(num >= oold && num >= bold);
7172 onode_cache_shards.resize(num);
7173 buffer_cache_shards.resize(num);
7174 for (unsigned i = oold; i < num; ++i) {
7175 onode_cache_shards[i] =
7176 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7177 logger);
7178 }
7179 for (unsigned i = bold; i < num; ++i) {
7180 buffer_cache_shards[i] =
7181 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
7182 logger);
7183 }
7184}
7185
11fdf7f2 7186int BlueStore::_mount(bool kv_only, bool open_db)
7187{
7188 dout(1) << __func__ << " path " << path << dendl;
7189
7190 _kv_only = kv_only;
7191
7192 {
7193 string type;
7194 int r = read_meta("type", &type);
7195 if (r < 0) {
7196 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
7197 << dendl;
7198 return r;
7199 }
7200
7201 if (type != "bluestore") {
7202 derr << __func__ << " expected bluestore, but type is " << type << dendl;
7203 return -EIO;
7204 }
7205 }
7206
7207 if (cct->_conf->bluestore_fsck_on_mount) {
7208 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
7209 if (rc < 0)
7210 return rc;
7211 if (rc > 0) {
7212 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7213 return -EIO;
7214 }
7215 }
7216
7217 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
7218 derr << __func__ << " osd_max_object_size "
7219 << cct->_conf->osd_max_object_size << " > bluestore max "
7220 << OBJECT_MAX_SIZE << dendl;
7221 return -EINVAL;
7222 }
7223
7224 int r = _open_path();
7225 if (r < 0)
7226 return r;
7227 r = _open_fsid(false);
7228 if (r < 0)
7229 goto out_path;
7230
7231 r = _read_fsid(&fsid);
7232 if (r < 0)
7233 goto out_fsid;
7234
7235 r = _lock_fsid();
7236 if (r < 0)
7237 goto out_fsid;
7238
7239 r = _open_bdev(false);
7240 if (r < 0)
7241 goto out_fsid;
7242
7243 if (open_db) {
7244 r = _open_db_and_around(false);
7245 } else {
7246 // bypassing the full db open is allowed only in kv_only mode
7247 ceph_assert(kv_only);
7248 r = _open_db(false, true);
7249 }
7250 if (r < 0) {
7251 goto out_bdev;
11fdf7f2 7252 }
7253
7254 if (kv_only)
7255 return 0;
7256
7257 r = _upgrade_super();
7258 if (r < 0) {
7c673cae 7259 goto out_db;
11fdf7f2 7260 }
7261
7262 r = _open_collections();
7263 if (r < 0)
11fdf7f2 7264 goto out_db;
7265
7266 r = _reload_logger();
7267 if (r < 0)
7268 goto out_coll;
7269
31f18b77 7270 _kv_start();
7271
7272 r = _deferred_replay();
7273 if (r < 0)
7274 goto out_stop;
7275
7276 mempool_thread.init();
7277
9f95a23c 7278 if ((!per_pool_stat_collection || !per_pool_omap) &&
eafe8130 7279 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
7280
7281 bool was_per_pool_omap = per_pool_omap;
7282
7283 dout(1) << __func__ << " quick-fix on mount" << dendl;
7284 _fsck_on_open(FSCK_SHALLOW, true);
7285
7286 //reread statfs
7287 //FIXME minor: replace with actual open/close?
7288 _open_statfs();
eafe8130 7289 _check_legacy_statfs_alert();
7290
7291 //set again as hopefully it has been fixed
7292 if (!was_per_pool_omap) {
7293 _set_per_pool_omap();
7294 }
7295 }
7296
7297 mounted = true;
7298 return 0;
7299
7300 out_stop:
7301 _kv_stop();
7c673cae 7302 out_coll:
f6b5b4d7 7303 _shutdown_cache();
7c673cae 7304 out_db:
1911f103 7305 _close_db_and_around(false);
7306 out_bdev:
7307 _close_bdev();
7308 out_fsid:
7309 _close_fsid();
7310 out_path:
7311 _close_path();
7312 return r;
7313}
7314
7315int BlueStore::umount()
7316{
11fdf7f2 7317 ceph_assert(_kv_only || mounted);
7318 dout(1) << __func__ << dendl;
7319
7320 _osr_drain_all();
7c673cae 7321
7c673cae 7322 mounted = false;
7323 if (!_kv_only) {
7324 mempool_thread.shutdown();
7325 dout(20) << __func__ << " stopping kv thread" << dendl;
7326 _kv_stop();
f6b5b4d7 7327 _shutdown_cache();
7328 dout(20) << __func__ << " closing" << dendl;
7329
3efd9988 7330 }
1911f103 7331 _close_db_and_around(false);
7332 _close_bdev();
7333 _close_fsid();
7334 _close_path();
7335
7336 if (cct->_conf->bluestore_fsck_on_umount) {
7337 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7338 if (rc < 0)
7339 return rc;
7340 if (rc > 0) {
7341 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7342 return -EIO;
7343 }
7344 }
7345 return 0;
7346}
7347
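// A hedged configuration sketch: the fsck hooks in _mount()/umount() above
// are controlled by these options (names as referenced in the code; the
// values shown are illustrative, not defaults):
//
//   [osd]
//   bluestore_fsck_on_mount = true
//   bluestore_fsck_on_mount_deep = false
//   bluestore_fsck_on_umount = true
//   bluestore_fsck_on_umount_deep = false
//   bluestore_fsck_quick_fix_on_mount = true   # shallow quick-fix path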
7348int BlueStore::cold_open()
7349{
7350 int r = _open_path();
7351 if (r < 0)
7352 return r;
7353 r = _open_fsid(false);
7354 if (r < 0)
7355 goto out_path;
7356
7357 r = _read_fsid(&fsid);
7358 if (r < 0)
7359 goto out_fsid;
7360
7361 r = _lock_fsid();
7362 if (r < 0)
7363 goto out_fsid;
7364
7365 r = _open_bdev(false);
7366 if (r < 0)
7367 goto out_fsid;
7368 r = _open_db_and_around(true);
7369 if (r < 0) {
7370 goto out_bdev;
7371 }
7372 return 0;
7373 out_bdev:
7374 _close_bdev();
7375 out_fsid:
7376 _close_fsid();
7377 out_path:
7378 _close_path();
7379 return r;
7380}
7381int BlueStore::cold_close()
7382{
1911f103 7383 _close_db_and_around(true);
7384 _close_bdev();
7385 _close_fsid();
7386 _close_path();
7387 return 0;
7388}
7389
7390// derr wrapper to limit enormous output and avoid log flooding.
7391// For now it is applied only where such flooding is expected.
7392#define fsck_derr(err_cnt, threshold) \
7393 if (err_cnt <= threshold) { \
7394 bool need_skip_print = err_cnt == threshold; \
7395 derr
7396
7397#define fsck_dendl \
7398 dendl; \
7399 if (need_skip_print) \
7400 derr << "more error lines skipped..." << dendl; \
7c673cae 7401 }
7c673cae 7402
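// A minimal usage sketch for the macro pair above, mirroring the calls in
// _fsck_check_object_omap() further below ('errors' is the running error
// counter, MAX_FSCK_ERROR_LINES the per-issue line cap):
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: " << oid << " ..." << fsck_dendl;
//
// The two macros must always be used as a pair: fsck_derr opens the scope
// and stream that fsck_dendl closes.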
7403int _fsck_sum_extents(
7404 const PExtentVector& extents,
7405 bool compressed,
7406 store_statfs_t& expected_statfs)
7407{
7408 for (auto e : extents) {
7409 if (!e.is_valid())
7410 continue;
7411 expected_statfs.allocated += e.length;
7412 if (compressed) {
7413 expected_statfs.data_compressed_allocated += e.length;
7414 }
7415 }
7416 return 0;
7417}
7418
7c673cae 7419int BlueStore::_fsck_check_extents(
11fdf7f2 7420 const coll_t& cid,
7421 const ghobject_t& oid,
7422 const PExtentVector& extents,
7423 bool compressed,
7424 mempool_dynamic_bitset &used_blocks,
b32b8144 7425 uint64_t granularity,
11fdf7f2 7426 BlueStoreRepairer* repairer,
7427 store_statfs_t& expected_statfs,
7428 FSCKDepth depth)
7429{
7430 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7431 int errors = 0;
7432 for (auto e : extents) {
7433 if (!e.is_valid())
7434 continue;
7435 expected_statfs.allocated += e.length;
7436 if (compressed) {
11fdf7f2 7437 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7438 }
7439 if (depth != FSCK_SHALLOW) {
7440 bool already = false;
9f95a23c 7441 apply_for_bitset_range(
7442 e.offset, e.length, granularity, used_blocks,
7443 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
7444 if (bs.test(pos)) {
7445 if (repairer) {
7446 repairer->note_misreference(
7447 pos * min_alloc_size, min_alloc_size, !already);
7448 }
7449 if (!already) {
7450 derr << "fsck error: " << oid << " extent " << e
7451 << " or a subset is already allocated (misreferenced)" << dendl;
7452 ++errors;
7453 already = true;
7454 }
11fdf7f2 7455 }
7456 else
7457 bs.set(pos);
7458 });
7459 if (repairer) {
7460 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
7461 }
11fdf7f2 7462
7463 if (e.end() > bdev->get_size()) {
7464 derr << "fsck error: " << oid << " extent " << e
7465 << " past end of block device" << dendl;
7466 ++errors;
7467 }
7468 }
7469 }
7470 return errors;
7471}
7472
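// A hedged sketch of the position math behind the misreference check above,
// assuming apply_for_bitset_range() visits offset/granularity through
// (offset + length - 1)/granularity:
//
//   // extent 0x20000~0x10000 with a 0x10000 (64 KiB) allocation unit
//   uint64_t first = 0x20000 / 0x10000;                  // position 2
//   uint64_t last  = (0x20000 + 0x10000 - 1) / 0x10000;  // position 2
//   // bs.test(pos) already set -> misreference; otherwise bs.set(pos)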
7473void BlueStore::_fsck_check_pool_statfs(
7474 BlueStore::per_pool_statfs& expected_pool_statfs,
7475 int64_t& errors,
7476 int64_t& warnings,
7477 BlueStoreRepairer* repairer)
7478{
7479 auto it = db->get_iterator(PREFIX_STAT);
7480 if (it) {
7481 for (it->lower_bound(string()); it->valid(); it->next()) {
7482 string key = it->key();
7483 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7484 if (repairer) {
7485 ++errors;
7486 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7487 derr << "fsck error: " << "legacy statfs record found, removing"
7488 << dendl;
7489 }
7490 continue;
7491 }
7492 uint64_t pool_id;
7493 if (get_key_pool_stat(key, &pool_id) < 0) {
7494 derr << "fsck error: bad key " << key
7495 << "in statfs namespece" << dendl;
7496 if (repairer) {
7497 repairer->remove_key(db, PREFIX_STAT, key);
7498 }
7499 ++errors;
7500 continue;
7501 }
7502
7503 volatile_statfs vstatfs;
7504 bufferlist bl = it->value();
7505 auto blp = bl.cbegin();
7506 try {
7507 vstatfs.decode(blp);
7508 } catch (buffer::error& e) {
7509 derr << "fsck error: failed to decode Pool StatFS record "
7510 << pretty_binary_string(key) << dendl;
7511 if (repairer) {
7512 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7513 << pretty_binary_string(key)
7514 << "', removing" << dendl;
7515 repairer->remove_key(db, PREFIX_STAT, key);
7516 }
7517 ++errors;
7518 vstatfs.reset();
7519 }
7520 auto stat_it = expected_pool_statfs.find(pool_id);
7521 if (stat_it == expected_pool_statfs.end()) {
7522 if (vstatfs.is_empty()) {
7523 // we don't consider that as an error since empty pool statfs
7524 // are left in DB for now
7525 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7526 << std::hex << pool_id << std::dec << dendl;
7527 if (repairer) {
7528 // but we need to increment error count in case of repair
7529 // to have proper counters at the end
7530 // (as repairer increments recovery counter anyway).
7531 ++errors;
7532 }
7533 } else {
7534 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7535 << std::hex << pool_id << std::dec << dendl;
7536 ++errors;
7537 }
7538 if (repairer) {
7539 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
7540 }
7541 continue;
7542 }
7543 store_statfs_t statfs;
7544 vstatfs.publish(&statfs);
7545 if (!(stat_it->second == statfs)) {
7546 derr << "fsck error: actual " << statfs
7547 << " != expected " << stat_it->second
7548 << " for pool "
7549 << std::hex << pool_id << std::dec << dendl;
7550 if (repairer) {
7551 repairer->fix_statfs(db, key, stat_it->second);
7552 }
7553 ++errors;
7554 }
7555 expected_pool_statfs.erase(stat_it);
7556 }
7557 } // if (it)
7558 for (auto& s : expected_pool_statfs) {
7559 if (s.second.is_zero()) {
7560 // we might lack empty statfs recs in DB
7561 continue;
7562 }
7563 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7564 << std::hex << s.first << std::dec << dendl;
7565 if (repairer) {
7566 string key;
7567 get_pool_stat_key(s.first, &key);
7568 repairer->fix_statfs(db, key, s.second);
7569 }
7570 ++errors;
7571 }
eafe8130 7572 if (!per_pool_stat_collection &&
7573 repairer) {
7574 // by virtue of running this method, we correct the top-level
7575 // error of having global stats
7576 repairer->inc_repaired();
7577 }
7578}
7579
7580BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7581 BlueStore::FSCKDepth depth,
7582 int64_t pool_id,
7583 BlueStore::CollectionRef c,
7584 const ghobject_t& oid,
7585 const string& key,
7586 const bufferlist& value,
9f95a23c 7587 mempool::bluestore_fsck::list<string>* expecting_shards,
7588 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7589 const BlueStore::FSCK_ObjectCtx& ctx)
7590{
7591 auto& errors = ctx.errors;
7592 auto& num_objects = ctx.num_objects;
7593 auto& num_extents = ctx.num_extents;
7594 auto& num_blobs = ctx.num_blobs;
7595 auto& num_sharded_objects = ctx.num_sharded_objects;
7596 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7597 auto used_blocks = ctx.used_blocks;
7598 auto sb_info_lock = ctx.sb_info_lock;
7599 auto& sb_info = ctx.sb_info;
7600 auto repairer = ctx.repairer;
7601
7602 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7603 &ctx.expected_pool_statfs[pool_id] :
7604 &ctx.expected_store_statfs;
7605
7606 dout(10) << __func__ << " " << oid << dendl;
7607 OnodeRef o;
7608 o.reset(Onode::decode(c, oid, key, value));
7609 ++num_objects;
7c673cae 7610
eafe8130 7611 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 7612
7613 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7614 _dump_onode<30>(cct, *o);
7615 // shards
7616 if (!o->extent_map.shards.empty()) {
7617 ++num_sharded_objects;
7618 if (depth != FSCK_SHALLOW) {
9f95a23c 7619 ceph_assert(expecting_shards);
7620 for (auto& s : o->extent_map.shards) {
7621 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 7622 expecting_shards->push_back(string());
eafe8130 7623 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 7624 &expecting_shards->back());
7625 if (s.shard_info->offset >= o->onode.size) {
7626 derr << "fsck error: " << oid << " shard 0x" << std::hex
7627 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7628 << std::dec << dendl;
7629 ++errors;
7630 }
7631 }
7632 }
7633 }
7c673cae 7634
7635 // lextents
7636 uint64_t pos = 0;
7637 mempool::bluestore_fsck::map<BlobRef,
7638 bluestore_blob_use_tracker_t> ref_map;
7639 for (auto& l : o->extent_map.extent_map) {
7640 dout(20) << __func__ << " " << l << dendl;
7641 if (l.logical_offset < pos) {
7642 derr << "fsck error: " << oid << " lextent at 0x"
7643 << std::hex << l.logical_offset
7644 << " overlaps with the previous, which ends at 0x" << pos
7645 << std::dec << dendl;
7646 ++errors;
7647 }
7648 if (depth != FSCK_SHALLOW &&
7649 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7650 derr << "fsck error: " << oid << " lextent at 0x"
7651 << std::hex << l.logical_offset << "~" << l.length
7652 << " spans a shard boundary"
7653 << std::dec << dendl;
7654 ++errors;
7655 }
7656 pos = l.logical_offset + l.length;
7657 res_statfs->data_stored += l.length;
7658 ceph_assert(l.blob);
7659 const bluestore_blob_t& blob = l.blob->get_blob();
7660
7661 auto& ref = ref_map[l.blob];
7662 if (ref.is_empty()) {
7663 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7664 uint32_t l = blob.get_logical_length();
7665 ref.init(l, min_release_size);
7666 }
7667 ref.get(
7668 l.blob_offset,
7669 l.length);
7670 ++num_extents;
7671 if (depth != FSCK_SHALLOW &&
7672 blob.has_unused()) {
7673 ceph_assert(referenced);
7674 auto p = referenced->find(l.blob);
7675 bluestore_blob_t::unused_t* pu;
7676 if (p == referenced->end()) {
7677 pu = &(*referenced)[l.blob];
7678 }
7679 else {
7680 pu = &p->second;
7681 }
7682 uint64_t blob_len = blob.get_logical_length();
7683 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7684 ceph_assert(l.blob_offset + l.length <= blob_len);
7685 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7686 uint64_t start = l.blob_offset / chunk_size;
7687 uint64_t end =
7688 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7689 for (auto i = start; i < end; ++i) {
7690 (*pu) |= (1u << i);
7691 }
7692 }
7693 } //for (auto& l : o->extent_map.extent_map)
7694
7695 for (auto& i : ref_map) {
7696 ++num_blobs;
7697 const bluestore_blob_t& blob = i.first->get_blob();
7698 bool equal =
7699 depth == FSCK_SHALLOW ? true :
7700 i.first->get_blob_use_tracker().equal(i.second);
7701 if (!equal) {
7702 derr << "fsck error: " << oid << " blob " << *i.first
7703 << " doesn't match expected ref_map " << i.second << dendl;
7704 ++errors;
7705 }
7706 if (blob.is_compressed()) {
7707 res_statfs->data_compressed += blob.get_compressed_payload_length();
7708 res_statfs->data_compressed_original +=
7709 i.first->get_referenced_bytes();
7710 }
7711 if (blob.is_shared()) {
7712 if (i.first->shared_blob->get_sbid() > blobid_max) {
7713 derr << "fsck error: " << oid << " blob " << blob
7714 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7715 << blobid_max << dendl;
7716 ++errors;
7717 }
7718 else if (i.first->shared_blob->get_sbid() == 0) {
7719 derr << "fsck error: " << oid << " blob " << blob
7720 << " marked as shared but has uninitialized sbid"
7721 << dendl;
7722 ++errors;
7723 }
7724 // the below lock is optional and provided in multithreading mode only
7725 if (sb_info_lock) {
7726 sb_info_lock->lock();
7727 }
7728 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7729 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7730 ceph_assert(sbi.pool_id == INT64_MIN ||
7731 sbi.pool_id == oid.hobj.get_logical_pool());
7732 sbi.cid = c->cid;
7733 sbi.pool_id = oid.hobj.get_logical_pool();
7734 sbi.sb = i.first->shared_blob;
7735 sbi.oids.push_back(oid);
7736 sbi.compressed = blob.is_compressed();
7737 for (auto e : blob.get_extents()) {
7738 if (e.is_valid()) {
7739 sbi.ref_map.get(e.offset, e.length);
7740 }
7741 }
7742 if (sb_info_lock) {
7743 sb_info_lock->unlock();
7744 }
7745 } else if (depth != FSCK_SHALLOW) {
7746 ceph_assert(used_blocks);
7747 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7748 blob.is_compressed(),
7749 *used_blocks,
7750 fm->get_alloc_size(),
7751 repairer,
7752 *res_statfs,
7753 depth);
7754 } else {
7755 errors += _fsck_sum_extents(
7756 blob.get_extents(),
7757 blob.is_compressed(),
7758 *res_statfs);
7759 }
7760 } // for (auto& i : ref_map)
7761
7762 if (o->onode.has_omap()) {
7763 _fsck_check_object_omap(depth, o, ctx);
7764 }
7765
7766 return o;
7767}
7768
7769#include "common/WorkQueue.h"
7770
7771class ShallowFSCKThreadPool : public ThreadPool
7772{
7773public:
7774 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7775 ThreadPool(cct_, nm, tn, n) {
7776 }
7777 void worker(ThreadPool::WorkThread* wt) override {
7778 int next_wq = 0;
7779 while (!_stop) {
7780 next_wq %= work_queues.size();
7781 WorkQueue_ *wq = work_queues[next_wq++];
7782
7783 void* item = wq->_void_dequeue();
7784 if (item) {
7785 processing++;
7786 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7787 wq->_void_process(item, tp_handle);
7788 processing--;
7789 }
7790 }
7791 }
7792 template <size_t BatchLen>
7793 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7794 {
7795 struct Entry {
7796 int64_t pool_id;
7797 BlueStore::CollectionRef c;
7798 ghobject_t oid;
7799 string key;
7800 bufferlist value;
7801 };
7802 struct Batch {
7803 std::atomic<size_t> running = { 0 };
7804 size_t entry_count = 0;
7805 std::array<Entry, BatchLen> entries;
7806
7807 int64_t errors = 0;
7808 int64_t warnings = 0;
7809 uint64_t num_objects = 0;
7810 uint64_t num_extents = 0;
7811 uint64_t num_blobs = 0;
7812 uint64_t num_sharded_objects = 0;
7813 uint64_t num_spanning_blobs = 0;
7814 store_statfs_t expected_store_statfs;
7815 BlueStore::per_pool_statfs expected_pool_statfs;
7816 };
7817
7818 size_t batchCount;
7819 BlueStore* store = nullptr;
7820
7821 ceph::mutex* sb_info_lock = nullptr;
7822 BlueStore::sb_info_map_t* sb_info = nullptr;
7823 BlueStoreRepairer* repairer = nullptr;
7824
7825 Batch* batches = nullptr;
7826 size_t last_batch_pos = 0;
7827 bool batch_acquired = false;
7828
7829 FSCKWorkQueue(std::string n,
7830 size_t _batchCount,
7831 BlueStore* _store,
7832 ceph::mutex* _sb_info_lock,
7833 BlueStore::sb_info_map_t& _sb_info,
7834 BlueStoreRepairer* _repairer) :
7835 WorkQueue_(n, time_t(), time_t()),
7836 batchCount(_batchCount),
7837 store(_store),
7838 sb_info_lock(_sb_info_lock),
7839 sb_info(&_sb_info),
7840 repairer(_repairer)
7841 {
7842 batches = new Batch[batchCount];
7843 }
7844 ~FSCKWorkQueue() {
7845 delete[] batches;
7846 }
7847
7848 /// Remove all work items from the queue.
7849 void _clear() override {
7850 //do nothing
7851 }
7852 /// Check whether there is anything to do.
7853 bool _empty() override {
7854 ceph_assert(false);
7855 }
7856
7857 /// Get the next work item to process.
7858 void* _void_dequeue() override {
7859 size_t pos = rand() % batchCount;
7860 size_t pos0 = pos;
7861 do {
7862 auto& batch = batches[pos];
7863 if (batch.running.fetch_add(1) == 0) {
7864 if (batch.entry_count) {
7865 return &batch;
7866 }
7867 }
7868 batch.running--;
7869 pos++;
7870 pos %= batchCount;
7871 } while (pos != pos0);
7872 return nullptr;
7873 }
7874 /** @brief Process the work item.
7875 * This function will be called several times in parallel
7876 * and must therefore be thread-safe. */
7877 void _void_process(void* item, TPHandle& handle) override {
7878 Batch* batch = (Batch*)item;
7879
7880 BlueStore::FSCK_ObjectCtx ctx(
7881 batch->errors,
7882 batch->warnings,
7883 batch->num_objects,
7884 batch->num_extents,
7885 batch->num_blobs,
7886 batch->num_sharded_objects,
7887 batch->num_spanning_blobs,
7888 nullptr, // used_blocks
9f95a23c 7889 nullptr, //used_omap_head
7890 sb_info_lock,
7891 *sb_info,
7892 batch->expected_store_statfs,
7893 batch->expected_pool_statfs,
7894 repairer);
7895
7896 for (size_t i = 0; i < batch->entry_count; i++) {
7897 auto& entry = batch->entries[i];
7898
7899 store->fsck_check_objects_shallow(
7900 BlueStore::FSCK_SHALLOW,
7901 entry.pool_id,
7902 entry.c,
7903 entry.oid,
7904 entry.key,
7905 entry.value,
9f95a23c 7906 nullptr, // expecting_shards - this will need a protection if passed
7907 nullptr, // referenced
7908 ctx);
7909 }
7910 //std::cout << "processed " << batch << std::endl;
7911 batch->entry_count = 0;
7912 batch->running--;
7913 }
7914 /** @brief Synchronously finish processing a work item.
7915 * This function is called after _void_process with the global thread pool lock held,
7916 * so at most one copy will execute simultaneously for a given thread pool.
7917 * It can be used for non-thread-safe finalization. */
7918 void _void_process_finish(void*) override {
7919 ceph_assert(false);
7920 }
7921
7922 bool queue(
7923 int64_t pool_id,
7924 BlueStore::CollectionRef c,
7925 const ghobject_t& oid,
7926 const string& key,
7927 const bufferlist& value) {
7928 bool res = false;
7929 size_t pos0 = last_batch_pos;
7930 if (!batch_acquired) {
7931 do {
7932 auto& batch = batches[last_batch_pos];
7933 if (batch.running.fetch_add(1) == 0) {
7934 if (batch.entry_count < BatchLen) {
7935 batch_acquired = true;
7936 break;
7937 }
7938 }
7939 batch.running.fetch_sub(1);
7940 last_batch_pos++;
7941 last_batch_pos %= batchCount;
7942 } while (last_batch_pos != pos0);
7943 }
7944 if (batch_acquired) {
7945 auto& batch = batches[last_batch_pos];
7946 ceph_assert(batch.running);
7947 ceph_assert(batch.entry_count < BatchLen);
7948
7949 auto& entry = batch.entries[batch.entry_count];
7950 entry.pool_id = pool_id;
7951 entry.c = c;
7952 entry.oid = oid;
7953 entry.key = key;
7954 entry.value = value;
7955
7956 ++batch.entry_count;
7957 if (batch.entry_count == BatchLen) {
7958 batch_acquired = false;
7959 batch.running.fetch_sub(1);
7960 last_batch_pos++;
7961 last_batch_pos %= batchCount;
7962 }
7963 res = true;
7964 }
7965 return res;
7966 }
7967
7968 void finalize(ThreadPool& tp,
7969 BlueStore::FSCK_ObjectCtx& ctx) {
7970 if (batch_acquired) {
7971 auto& batch = batches[last_batch_pos];
7972 ceph_assert(batch.running);
7973 batch.running.fetch_sub(1);
7974 }
7975 tp.stop();
7976
7977 for (size_t i = 0; i < batchCount; i++) {
7978 auto& batch = batches[i];
7979
7980 //process leftovers if any
7981 if (batch.entry_count) {
7982 TPHandle tp_handle(store->cct,
7983 nullptr,
7984 timeout_interval,
7985 suicide_interval);
7986 ceph_assert(batch.running == 0);
7987
7988 batch.running++; // just to be on-par with the regular call
7989 _void_process(&batch, tp_handle);
7990 }
7991 ceph_assert(batch.entry_count == 0);
7992
7993 ctx.errors += batch.errors;
7994 ctx.warnings += batch.warnings;
7995 ctx.num_objects += batch.num_objects;
7996 ctx.num_extents += batch.num_extents;
7997 ctx.num_blobs += batch.num_blobs;
7998 ctx.num_sharded_objects += batch.num_sharded_objects;
7999 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 8000
8001 ctx.expected_store_statfs.add(batch.expected_store_statfs);
8002
8003 for (auto it = batch.expected_pool_statfs.begin();
8004 it != batch.expected_pool_statfs.end();
8005 it++) {
8006 ctx.expected_pool_statfs[it->first].add(it->second);
8007 }
8008 }
8009 }
8010 };
8011};
8012
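// A hedged wiring sketch for the pool/queue pair above, mirroring its use
// in _fsck_check_objects() below (the batch length 256 and the thread count
// from bluestore_fsck_quick_fix_threads are taken from that code):
//
//   typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
//   std::unique_ptr<WQ> wq(new WQ("FSCKWorkQueue", nbatches, store,
//                                 &sb_info_lock, sb_info, repairer));
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   tp.add_work_queue(wq.get());
//   tp.start();               // workers pull batches via _void_dequeue()
//   // ... wq->queue(pool_id, c, oid, key, value) per onode ...
//   wq->finalize(tp, ctx);    // stop, drain leftovers, merge per-batch stats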
8013void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
8014 OnodeRef& o,
8015 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 8016{
8017 auto& errors = ctx.errors;
8018 auto& warnings = ctx.warnings;
8019 auto repairer = ctx.repairer;
8020
8021 ceph_assert(o->onode.has_omap());
8022 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
8023 if (per_pool_omap) {
8024 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8025 << "fsck error: " << o->oid
8026 << " has omap that is not per-pool or pgmeta"
8027 << fsck_dendl;
8028 ++errors;
8029 } else {
8030 const char* w;
8031 int64_t num;
8032 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8033 ++errors;
8034 num = errors;
8035 w = "error";
8036 } else {
8037 ++warnings;
8038 num = warnings;
8039 w = "warning";
8040 }
8041 fsck_derr(num, MAX_FSCK_ERROR_LINES)
8042 << "fsck " << w << ": " << o->oid
8043 << " has omap that is not per-pool or pgmeta"
8044 << fsck_dendl;
8045 }
8046 }
8047 if (repairer &&
8048 !o->onode.is_perpool_omap() &&
8049 !o->onode.is_pgmeta_omap()) {
8050 dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
8051 bufferlist h;
8052 map<string, bufferlist> kv;
8053 int r = _onode_omap_get(o, &h, &kv);
8054 if (r < 0) {
8055 derr << " got " << r << " " << cpp_strerror(r) << dendl;
8056 } else {
8057 KeyValueDB::Transaction txn = db->get_transaction();
8058 // remove old keys
8059 const string& old_omap_prefix = o->get_omap_prefix();
8060 string old_head, old_tail;
8061 o->get_omap_header(&old_head);
8062 o->get_omap_tail(&old_tail);
8063 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
8064 txn->rmkey(old_omap_prefix, old_tail);
8065 // set flag
8066 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
8067 _record_onode(o, txn);
8068 const string& new_omap_prefix = o->get_omap_prefix();
8069 // head
8070 if (h.length()) {
8071 string new_head;
8072 o->get_omap_header(&new_head);
8073 txn->set(new_omap_prefix, new_head, h);
8074 }
8075 // tail
8076 string new_tail;
8077 o->get_omap_tail(&new_tail);
8078 bufferlist empty;
8079 txn->set(new_omap_prefix, new_tail, empty);
8080 // values
8081 string final_key;
8082 o->get_omap_key(string(), &final_key);
8083 size_t base_key_len = final_key.size();
8084 for (auto& i : kv) {
8085 final_key.resize(base_key_len);
8086 final_key += i.first;
8087 txn->set(new_omap_prefix, final_key, i.second);
8088 }
8089 db->submit_transaction_sync(txn);
8090 repairer->inc_repaired();
8091 }
eafe8130 8092 }
9f95a23c 8093}
eafe8130 8094
8095void BlueStore::_fsck_check_objects(FSCKDepth depth,
8096 BlueStore::FSCK_ObjectCtx& ctx)
8097{
eafe8130 8098 auto& errors = ctx.errors;
8099 auto sb_info_lock = ctx.sb_info_lock;
8100 auto& sb_info = ctx.sb_info;
8101 auto repairer = ctx.repairer;
8102
8103 uint64_t_btree_t used_nids;
8104
8105 size_t processed_myself = 0;
8106
8107 auto it = db->get_iterator(PREFIX_OBJ);
8108 mempool::bluestore_fsck::list<string> expecting_shards;
8109 if (it) {
8110 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
8111 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
8112 std::unique_ptr<WQ> wq(
8113 new WQ(
8114 "FSCKWorkQueue",
8115 (thread_count ? : 1) * 32,
8116 this,
8117 sb_info_lock,
8118 sb_info,
8119 repairer));
8120
8121 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
8122
8123 thread_pool.add_work_queue(wq.get());
8124 if (depth == FSCK_SHALLOW && thread_count > 0) {
8125 //not the best place but let's check anyway
8126 ceph_assert(sb_info_lock);
8127 thread_pool.start();
8128 }
8129
8130 // fill global if not overridden below
8131 CollectionRef c;
8132 int64_t pool_id = -1;
8133 spg_t pgid;
8134 for (it->lower_bound(string()); it->valid(); it->next()) {
8135 dout(30) << __func__ << " key "
8136 << pretty_binary_string(it->key()) << dendl;
8137 if (is_extent_shard_key(it->key())) {
8138 if (depth == FSCK_SHALLOW) {
8139 continue;
8140 }
8141 while (!expecting_shards.empty() &&
8142 expecting_shards.front() < it->key()) {
8143 derr << "fsck error: missing shard key "
8144 << pretty_binary_string(expecting_shards.front())
8145 << dendl;
8146 ++errors;
8147 expecting_shards.pop_front();
8148 }
8149 if (!expecting_shards.empty() &&
8150 expecting_shards.front() == it->key()) {
8151 // all good
8152 expecting_shards.pop_front();
8153 continue;
8154 }
8155
8156 uint32_t offset;
8157 string okey;
8158 get_key_extent_shard(it->key(), &okey, &offset);
8159 derr << "fsck error: stray shard 0x" << std::hex << offset
8160 << std::dec << dendl;
8161 if (expecting_shards.empty()) {
8162 derr << "fsck error: " << pretty_binary_string(it->key())
8163 << " is unexpected" << dendl;
8164 ++errors;
8165 continue;
8166 }
8167 while (expecting_shards.front() > it->key()) {
8168 derr << "fsck error: saw " << pretty_binary_string(it->key())
8169 << dendl;
8170 derr << "fsck error: exp "
8171 << pretty_binary_string(expecting_shards.front()) << dendl;
8172 ++errors;
8173 expecting_shards.pop_front();
8174 if (expecting_shards.empty()) {
8175 break;
8176 }
8177 }
8178 continue;
8179 }
8180
8181 ghobject_t oid;
8182 int r = get_key_object(it->key(), &oid);
8183 if (r < 0) {
8184 derr << "fsck error: bad object key "
8185 << pretty_binary_string(it->key()) << dendl;
8186 ++errors;
8187 continue;
8188 }
8189 if (!c ||
8190 oid.shard_id != pgid.shard ||
8191 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8192 !c->contains(oid)) {
8193 c = nullptr;
8194 for (auto& p : coll_map) {
8195 if (p.second->contains(oid)) {
8196 c = p.second;
8197 break;
8198 }
8199 }
8200 if (!c) {
8201 derr << "fsck error: stray object " << oid
8202 << " not owned by any collection" << dendl;
8203 ++errors;
8204 continue;
8205 }
8206 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8207 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
8208 << dendl;
8209 }
8210
8211 if (depth != FSCK_SHALLOW &&
8212 !expecting_shards.empty()) {
8213 for (auto& k : expecting_shards) {
8214 derr << "fsck error: missing shard key "
8215 << pretty_binary_string(k) << dendl;
8216 }
8217 ++errors;
8218 expecting_shards.clear();
8219 }
8220
8221 bool queued = false;
8222 if (depth == FSCK_SHALLOW && thread_count > 0) {
8223 queued = wq->queue(
8224 pool_id,
8225 c,
8226 oid,
8227 it->key(),
8228 it->value());
8229 }
8230 OnodeRef o;
8231 map<BlobRef, bluestore_blob_t::unused_t> referenced;
8232
8233 if (!queued) {
8234 ++processed_myself;
8235
8236 o = fsck_check_objects_shallow(
8237 depth,
8238 pool_id,
8239 c,
8240 oid,
8241 it->key(),
8242 it->value(),
9f95a23c 8243 &expecting_shards,
8244 &referenced,
8245 ctx);
8246 }
8247
8248 if (depth != FSCK_SHALLOW) {
8249 ceph_assert(o != nullptr);
8250 if (o->onode.nid) {
8251 if (o->onode.nid > nid_max) {
8252 derr << "fsck error: " << oid << " nid " << o->onode.nid
8253 << " > nid_max " << nid_max << dendl;
8254 ++errors;
8255 }
8256 if (used_nids.count(o->onode.nid)) {
8257 derr << "fsck error: " << oid << " nid " << o->onode.nid
8258 << " already in use" << dendl;
8259 ++errors;
8260 continue; // go for next object
8261 }
8262 used_nids.insert(o->onode.nid);
8263 }
8264 for (auto& i : referenced) {
8265 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
8266 << std::dec << " for " << *i.first << dendl;
8267 const bluestore_blob_t& blob = i.first->get_blob();
8268 if (i.second & blob.unused) {
8269 derr << "fsck error: " << oid << " blob claims unused 0x"
8270 << std::hex << blob.unused
8271 << " but extents reference 0x" << i.second << std::dec
8272 << " on blob " << *i.first << dendl;
8273 ++errors;
8274 }
8275 if (blob.has_csum()) {
8276 uint64_t blob_len = blob.get_logical_length();
8277 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
8278 unsigned csum_count = blob.get_csum_count();
8279 unsigned csum_chunk_size = blob.get_csum_chunk_size();
8280 for (unsigned p = 0; p < csum_count; ++p) {
8281 unsigned pos = p * csum_chunk_size;
8282 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
8283 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
8284 unsigned mask = 1u << firstbit;
8285 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8286 mask |= 1u << b;
8287 }
8288 if ((blob.unused & mask) == mask) {
8289 // this csum chunk region is marked unused
8290 if (blob.get_csum_item(p) != 0) {
8291 derr << "fsck error: " << oid
8292 << " blob claims csum chunk 0x" << std::hex << pos
8293 << "~" << csum_chunk_size
8294 << " is unused (mask 0x" << mask << " of unused 0x"
8295 << blob.unused << ") but csum is non-zero 0x"
8296 << blob.get_csum_item(p) << std::dec << " on blob "
8297 << *i.first << dendl;
8298 ++errors;
8299 }
8300 }
8301 }
8302 }
8303 }
8304 // omap
8305 if (o->onode.has_omap()) {
8306 ceph_assert(ctx.used_omap_head);
8307 if (ctx.used_omap_head->count(o->onode.nid)) {
8308 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8309 << " already in use" << dendl;
8310 ++errors;
8311 } else {
9f95a23c 8312 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8313 }
9f95a23c 8314 } // if (o->onode.has_omap())
8315 if (depth == FSCK_DEEP) {
8316 bufferlist bl;
8317 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8318 uint64_t offset = 0;
8319 do {
8320 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8321 int r = _do_read(c.get(), o, offset, l, bl,
8322 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8323 if (r < 0) {
8324 ++errors;
8325 derr << "fsck error: " << oid << std::hex
8326 << " error during read: "
8327 << " " << offset << "~" << l
8328 << " " << cpp_strerror(r) << std::dec
8329 << dendl;
8330 break;
8331 }
8332 offset += l;
8333 } while (offset < o->onode.size);
8334 } // deep
8335 } //if (depth != FSCK_SHALLOW)
8336 } // for (it->lower_bound(string()); it->valid(); it->next())
8337 if (depth == FSCK_SHALLOW && thread_count > 0) {
8338 wq->finalize(thread_pool, ctx);
8339 if (processed_myself) {
8340 // maybe it needs more threads?
8341 dout(0) << __func__ << " partial offload"
8342 << ", done myself " << processed_myself
8343 << " of " << ctx.num_objects
8344 << "objects, threads " << thread_count
8345 << dendl;
8346 }
8347 }
8348 } // if (it)
8349}
8350/**
8351An overview of the currently implemented repair logic,
8352performed by fsck in two stages: detection (+ preparation) and commit.
8353Detection stage (in processing order):
8354 (Issue -> Repair action to schedule)
8355 - Detect undecodable keys for Shared Blobs -> Remove
8356 - Detect undecodable records for Shared Blobs -> Remove
8357 (might trigger missed Shared Blob detection below)
8358 - Detect stray records for Shared Blobs -> Remove
8359 - Detect misreferenced pextents -> Fix
8360 Prepare Bloom-like filter to track cid/oid -> pextent
8361 Prepare list of extents that are improperly referenced
8362 Enumerate Onode records that might use 'misreferenced' pextents
8363 (Bloom-like filter applied to reduce computation)
8364 For each questionable Onode enumerate all blobs and identify broken ones
8365 (i.e. blobs having 'misreferences')
8366 Rewrite each broken blob's data by allocating new extents and
8367 copying the data there
8368 If blob is shared - unshare it and mark corresponding Shared Blob
8369 for removal
8370 Release previously allocated space
8371 Update Extent Map
8372 - Detect missed Shared Blobs -> Recreate
8373 - Detect undecodable deferred transaction -> Remove
8374 - Detect Freelist Manager's 'false free' entries -> Mark as used
8375 - Detect Freelist Manager's leaked entries -> Mark as free
8376 - Detect statfs inconsistency - Update
8377 Commit stage (separate DB commit per each step):
8378 - Apply leaked FM entries fix
8379 - Apply 'false free' FM entries fix
8380 - Apply 'Remove' actions
8381 - Apply fix for misreference pextents
8382 - Apply Shared Blob recreate
8383 (can be merged with the step above if misreferences were detected)
8384 - Apply StatFS update
8385*/
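// A hedged sketch of how these stages are typically driven offline via
// ceph-bluestore-tool (the OSD path is illustrative):
//
//   ceph-bluestore-tool fsck --path /var/lib/ceph/osd/ceph-0 --deep true
//   ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-0
//
// fsck performs the detection stage only; repair additionally commits the
// scheduled fixes listed above.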
8386int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8387{
8388 dout(1) << __func__
8389 << (repair ? " repair" : " check")
8390 << (depth == FSCK_DEEP ? " (deep)" :
8391 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8392 << dendl;
8393
8394 // in deep mode we need R/W write access to be able to replay deferred ops
8395 bool read_only = !(repair || depth == FSCK_DEEP);
8396
8397 int r = _open_path();
8398 if (r < 0)
8399 return r;
8400 r = _open_fsid(false);
8401 if (r < 0)
8402 goto out_path;
8403
8404 r = _read_fsid(&fsid);
8405 if (r < 0)
8406 goto out_fsid;
8407
8408 r = _lock_fsid();
8409 if (r < 0)
8410 goto out_fsid;
8411
8412 r = _open_bdev(false);
8413 if (r < 0)
8414 goto out_fsid;
8415
11fdf7f2 8416 r = _open_db_and_around(read_only);
8417 if (r < 0)
8418 goto out_bdev;
8419
8420 if (!read_only) {
8421 r = _upgrade_super();
8422 if (r < 0) {
8423 goto out_db;
8424 }
8425 }
7c673cae 8426
eafe8130 8427 r = _open_collections();
7c673cae 8428 if (r < 0)
11fdf7f2 8429 goto out_db;
8430
8431 mempool_thread.init();
8432
8433 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8434 // enable in repair or deep mode modes only
8435 if (!read_only) {
8436 _kv_start();
8437 r = _deferred_replay();
8438 _kv_stop();
8439 }
8440 if (r < 0)
8441 goto out_scan;
8442
8443 r = _fsck_on_open(depth, repair);
8444
8445out_scan:
8446 mempool_thread.shutdown();
f6b5b4d7 8447 _shutdown_cache();
eafe8130 8448out_db:
1911f103 8449 _close_db_and_around(false);
8450out_bdev:
8451 _close_bdev();
8452out_fsid:
8453 _close_fsid();
8454out_path:
8455 _close_path();
8456
8457 return r;
8458}
8459
8460int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8461{
8462 dout(1) << __func__
8463 << " <<<START>>>"
8464 << (repair ? " repair" : " check")
8465 << (depth == FSCK_DEEP ? " (deep)" :
8466 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8467 << " start" << dendl;
8468 int64_t errors = 0;
8469 int64_t warnings = 0;
8470 unsigned repaired = 0;
8471
8472 uint64_t_btree_t used_omap_head;
8473 uint64_t_btree_t used_sbids;
8474
8475 mempool_dynamic_bitset used_blocks;
8476 KeyValueDB::Iterator it;
8477 store_statfs_t expected_store_statfs, actual_statfs;
8478 per_pool_statfs expected_pool_statfs;
8479
8480 sb_info_map_t sb_info;
8481
8482 uint64_t num_objects = 0;
8483 uint64_t num_extents = 0;
8484 uint64_t num_blobs = 0;
8485 uint64_t num_spanning_blobs = 0;
8486 uint64_t num_shared_blobs = 0;
8487 uint64_t num_sharded_objects = 0;
8488 BlueStoreRepairer repairer;
8489
8490 utime_t start = ceph_clock_now();
8491
8492 _fsck_collections(&errors);
b32b8144 8493 used_blocks.resize(fm->get_alloc_units());
9f95a23c 8494 apply_for_bitset_range(
11fdf7f2 8495 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
8496 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8497 bs.set(pos);
8498 }
8499 );
8500 if (repair) {
8501 repairer.get_space_usage_tracker().init(
8502 bdev->get_size(),
8503 min_alloc_size);
8504 }
8505
8506 if (bluefs) {
8507 if( cct->_conf->bluestore_bluefs_db_compatibility) {
8508 interval_set<uint64_t> bluefs_extents_db;
8509 bufferlist bl;
8510 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
8511 auto p = bl.cbegin();
8512 auto prev_errors = errors;
8513 try {
8514 decode(bluefs_extents_db, p);
8515 bluefs_extents_db.union_of(bluefs_extents);
8516 bluefs_extents_db.subtract(bluefs_extents);
8517 if (!bluefs_extents_db.empty()) {
8518 derr << "fsck error: bluefs_extents inconsistency, "
8519 << "downgrade to previous releases might be broken."
8520 << dendl;
8521 ++errors;
8522 }
8523 }
8524 catch (buffer::error& e) {
8525 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
8526 ++errors;
8527 }
8528 if (errors != prev_errors && repair) {
8529 repairer.fix_bluefs_extents(out_of_sync_fm);
8530 }
8531 }
8532
7c673cae 8533 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9f95a23c 8534 apply_for_bitset_range(
b32b8144 8535 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
8536 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8537 bs.set(pos);
1911f103 8538 });
7c673cae 8539 }
eafe8130 8540 int r = bluefs->fsck();
7c673cae 8541 if (r < 0) {
eafe8130 8542 return r;
8543 }
8544 if (r > 0)
8545 errors += r;
8546 }
8547
8548 if (!per_pool_stat_collection) {
8549 const char *w;
8550 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8551 w = "error";
8552 ++errors;
8553 } else {
8554 w = "warning";
8555 ++warnings;
8556 }
8557 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8558 << dendl;
8559 }
8560 if (!per_pool_omap) {
8561 const char *w;
8562 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8563 w = "error";
8564 ++errors;
8565 } else {
8566 w = "warning";
8567 ++warnings;
8568 }
8569 derr << "fsck " << w << ": store not yet converted to per-pool omap"
8570 << dendl;
8571 }
8572
11fdf7f2 8573 // get expected statfs; reset unaffected fields to be able to compare
8574 // structs
8575 statfs(&actual_statfs);
8576 actual_statfs.total = 0;
8577 actual_statfs.internally_reserved = 0;
8578 actual_statfs.available = 0;
8579 actual_statfs.internal_metadata = 0;
8580 actual_statfs.omap_allocated = 0;
8581
8582 if (g_conf()->bluestore_debug_fsck_abort) {
8583 dout(1) << __func__ << " debug abort" << dendl;
8584 goto out_scan;
8585 }
7c673cae 8586 // walk PREFIX_OBJ
8587 {
8588 dout(1) << __func__ << " walking object keyspace" << dendl;
8589 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8590 BlueStore::FSCK_ObjectCtx ctx(
8591 errors,
8592 warnings,
8593 num_objects,
8594 num_extents,
8595 num_blobs,
8596 num_sharded_objects,
8597 num_spanning_blobs,
8598 &used_blocks,
8599 &used_omap_head,
8600 //no need for the below lock when in non-shallow mode as
8601 // there is no multithreading in this case
8602 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
8603 sb_info,
8604 expected_store_statfs,
8605 expected_pool_statfs,
8606 repair ? &repairer : nullptr);
8607
8608 _fsck_check_objects(depth, ctx);
eafe8130 8609 }
11fdf7f2 8610
8611 dout(1) << __func__ << " checking shared_blobs" << dendl;
8612 it = db->get_iterator(PREFIX_SHARED_BLOB);
8613 if (it) {
8614 // FIXME minor: perhaps simplify for shallow mode?
8615 // fill global if not overridden below
8616 auto expected_statfs = &expected_store_statfs;
11fdf7f2 8617
8618 for (it->lower_bound(string()); it->valid(); it->next()) {
8619 string key = it->key();
8620 uint64_t sbid;
8621 if (get_key_shared_blob(key, &sbid)) {
3efd9988 8622 derr << "fsck error: bad key '" << key
7c673cae 8623 << "' in shared blob namespace" << dendl;
8624 if (repair) {
8625 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8626 }
8627 ++errors;
8628 continue;
8629 }
8630 auto p = sb_info.find(sbid);
8631 if (p == sb_info.end()) {
3efd9988 8632 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 8633 << std::hex << sbid << std::dec << dendl;
8634 if (repair) {
8635 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8636 }
8637 ++errors;
8638 } else {
8639 ++num_shared_blobs;
8640 sb_info_t& sbi = p->second;
8641 bluestore_shared_blob_t shared_blob(sbid);
8642 bufferlist bl = it->value();
8643 auto blp = bl.cbegin();
8644 try {
8645 decode(shared_blob, blp);
8646 } catch (buffer::error& e) {
8647 ++errors;
8648 // Force update and don't report as missing
8649 sbi.updated = sbi.passed = true;
8650
8651 derr << "fsck error: failed to decode Shared Blob "
8652 << pretty_binary_string(it->key()) << dendl;
8653 if (repair) {
8654 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8655 << pretty_binary_string(it->key())
8656 << "', removing" << dendl;
8657 repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
8658 }
8659 continue;
8660 }
8661 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8662 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 8663 derr << "fsck error: shared blob 0x" << std::hex << sbid
8664 << std::dec << " ref_map " << shared_blob.ref_map
8665 << " != expected " << sbi.ref_map << dendl;
8666 sbi.updated = true; // will update later in repair mode only!
8667 ++errors;
8668 }
8669 PExtentVector extents;
8670 for (auto &r : shared_blob.ref_map.ref_map) {
8671 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8672 }
eafe8130 8673 if (per_pool_stat_collection || repair) {
8674 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8675 }
8676 errors += _fsck_check_extents(sbi.cid,
8677 p->second.oids.front(),
8678 extents,
8679 p->second.compressed,
8680 used_blocks,
8681 fm->get_alloc_size(),
11fdf7f2 8682 repair ? &repairer : nullptr,
8683 *expected_statfs,
8684 depth);
8685 sbi.passed = true;
8686 }
8687 }
8688 } // if (it)
8689
8690 if (repair && repairer.preprocess_misreference(db)) {
8691
8692 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8693 auto& space_tracker = repairer.get_space_usage_tracker();
8694 auto& misref_extents = repairer.get_misreferences();
8695 interval_set<uint64_t> to_release;
8696 it = db->get_iterator(PREFIX_OBJ);
8697 if (it) {
8698 // fill global if not overridden below
8699 auto expected_statfs = &expected_store_statfs;
8700
8701 CollectionRef c;
8702 spg_t pgid;
8703 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8704 bool bypass_rest = false;
8705 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8706 it->next()) {
8707 dout(30) << __func__ << " key "
8708 << pretty_binary_string(it->key()) << dendl;
8709 if (is_extent_shard_key(it->key())) {
8710 continue;
8711 }
8712
8713 ghobject_t oid;
8714 int r = get_key_object(it->key(), &oid);
8715 if (r < 0 || !space_tracker.is_used(oid)) {
8716 continue;
8717 }
8718
8719 if (!c ||
8720 oid.shard_id != pgid.shard ||
8721 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8722 !c->contains(oid)) {
8723 c = nullptr;
8724 for (auto& p : coll_map) {
8725 if (p.second->contains(oid)) {
8726 c = p.second;
8727 break;
8728 }
8729 }
8730 if (!c) {
8731 continue;
8732 }
8733 if (per_pool_stat_collection || repair) {
8734 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
8735 expected_statfs = &expected_pool_statfs[pool_id];
8736 }
8737 }
8738 if (!space_tracker.is_used(c->cid)) {
8739 continue;
8740 }
8741
8742 dout(20) << __func__ << " check misreference for col:" << c->cid
8743 << " obj:" << oid << dendl;
8744
8745 OnodeRef o;
8746 o.reset(Onode::decode(c, oid, it->key(), it->value()));
8747 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8748 mempool::bluestore_fsck::set<BlobRef> blobs;
8749
8750 for (auto& e : o->extent_map.extent_map) {
8751 blobs.insert(e.blob);
8752 }
8753 bool need_onode_update = false;
8754 bool first_dump = true;
8755 for(auto b : blobs) {
8756 bool broken_blob = false;
8757 auto& pextents = b->dirty_blob().dirty_extents();
8758 for (auto& e : pextents) {
8759 if (!e.is_valid()) {
8760 continue;
8761 }
8762 // for the sake of simplicity and proper shared blob handling
8763 // always rewrite the whole blob even when it's partially
8764 // misreferenced.
8765 if (misref_extents.intersects(e.offset, e.length)) {
8766 if (first_dump) {
8767 first_dump = false;
81eedcae 8768 _dump_onode<10>(cct, *o);
8769 }
8770 broken_blob = true;
8771 break;
8772 }
8773 }
8774 if (!broken_blob)
8775 continue;
8776 bool compressed = b->get_blob().is_compressed();
8777 need_onode_update = true;
8778 dout(10) << __func__
8779 << " fix misreferences in oid:" << oid
8780 << " " << *b << dendl;
8781 uint64_t b_off = 0;
8782 PExtentVector pext_to_release;
8783 pext_to_release.reserve(pextents.size());
8784 // rewriting all valid pextents
8785 for (auto e = pextents.begin(); e != pextents.end();
8786 b_off += e->length, e++) {
8787 if (!e->is_valid()) {
8788 continue;
8789 }
8790 PExtentVector exts;
8791 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
8792 0, 0, &exts);
eafe8130 8793 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
8794 derr << __func__
8795 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8796 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
8797 << " min_alloc_size 0x" << min_alloc_size
8798 << " available 0x " << alloc->get_free()
8799 << std::dec << dendl;
8800 if (alloc_len > 0) {
8801 alloc->release(exts);
8802 }
8803 bypass_rest = true;
8804 break;
8805 }
8806 expected_statfs->allocated += e->length;
8807 if (compressed) {
8808 expected_statfs->data_compressed_allocated += e->length;
8809 }
8810
8811 bufferlist bl;
8812 IOContext ioc(cct, NULL, true); // allow EIO
8813 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8814 if (r < 0) {
8815 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8816 <<"~" << e->length << std::dec << dendl;
8817 ceph_abort_msg("read failed, wtf");
8818 }
8819 pext_to_release.push_back(*e);
8820 e = pextents.erase(e);
8821 e = pextents.insert(e, exts.begin(), exts.end());
8822 b->get_blob().map_bl(
8823 b_off, bl,
8824 [&](uint64_t offset, bufferlist& t) {
8825 int r = bdev->write(offset, t, false);
8826 ceph_assert(r == 0);
8827 });
8828 e += exts.size() - 1;
8829 for (auto& p : exts) {
8830 fm->allocate(p.offset, p.length, txn);
8831 }
8832 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8833
8834 if (b->get_blob().is_shared()) {
8835 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8836
8837 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8838 ceph_assert(sb_it != sb_info.end());
8839 sb_info_t& sbi = sb_it->second;
8840
8841 for (auto& r : sbi.ref_map.ref_map) {
8842 expected_statfs->allocated -= r.second.length;
8843 if (sbi.compressed) {
8844 // NB: it's crucial to use compressed flag from sb_info_t
8845 // as we originally used that value while accumulating
8846 // expected_statfs
8847 expected_statfs->data_compressed_allocated -= r.second.length;
8848 }
8849 }
8850 sbi.updated = sbi.passed = true;
8851 sbi.ref_map.clear();
8852
8853 // relying on blob's pextents to decide what to release.
8854 for (auto& p : pext_to_release) {
8855 to_release.union_insert(p.offset, p.length);
8856 }
8857 } else {
8858 for (auto& p : pext_to_release) {
8859 expected_statfs->allocated -= p.length;
8860 if (compressed) {
8861 expected_statfs->data_compressed_allocated -= p.length;
8862 }
8863 to_release.union_insert(p.offset, p.length);
8864 }
8865 }
8866 if (bypass_rest) {
8867 break;
8868 }
8869 } // for(auto b : blobs)
8870 if (need_onode_update) {
8871 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8872 _record_onode(o, txn);
8873 }
8874 } // for (it->lower_bound(string()); it->valid(); it->next())
8875
8876 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8877 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8878 << "~" << it.get_len() << std::dec << dendl;
8879 fm->release(it.get_start(), it.get_len(), txn);
8880 }
8881 alloc->release(to_release);
8882 to_release.clear();
8883 } // if (it) {
8884 } //if (repair && repairer.preprocess_misreference()) {
8885
8886 if (depth != FSCK_SHALLOW) {
8887 for (auto &p : sb_info) {
8888 sb_info_t& sbi = p.second;
8889 if (!sbi.passed) {
8890 derr << "fsck error: missing " << *sbi.sb << dendl;
8891 ++errors;
8892 }
8893 if (repair && (!sbi.passed || sbi.updated)) {
8894 auto sbid = p.first;
8895 if (sbi.ref_map.empty()) {
8896 ceph_assert(sbi.passed);
8897 dout(20) << __func__ << " " << *sbi.sb
8898 << " is empty, removing" << dendl;
8899 repairer.fix_shared_blob(db, sbid, nullptr);
8900 } else {
8901 bufferlist bl;
8902 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8903 encode(persistent, bl);
8904 dout(20) << __func__ << " " << *sbi.sb
8905 << " is " << bl.length() << " bytes, updating" << dendl;
11fdf7f2 8906
8907 repairer.fix_shared_blob(db, sbid, &bl);
8908 }
8909 }
8910 }
8911 }
8912 sb_info.clear();
8913
8914 // check global stats only if fscking (not repairing) w/o per-pool stats
8915 if (!per_pool_stat_collection &&
8916 !repair &&
8917 !(actual_statfs == expected_store_statfs)) {
8918 derr << "fsck error: actual " << actual_statfs
8919 << " != expected " << expected_store_statfs << dendl;
8920 if (repair) {
8921 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8922 expected_store_statfs);
11fdf7f2 8923 }
eafe8130 8924 ++errors;
8925 }
8926
8927 dout(1) << __func__ << " checking pool_statfs" << dendl;
8928 _fsck_check_pool_statfs(expected_pool_statfs,
8929 errors, warnings, repair ? &repairer : nullptr);
8930
8931 if (depth != FSCK_SHALLOW) {
9f95a23c 8932 dout(1) << __func__ << " checking for stray omap data " << dendl;
8933 it = db->get_iterator(PREFIX_OMAP);
8934 if (it) {
9f95a23c 8935 uint64_t last_omap_head = 0;
8936 for (it->lower_bound(string()); it->valid(); it->next()) {
8937 uint64_t omap_head;
8938 _key_decode_u64(it->key().c_str(), &omap_head);
8939 if (used_omap_head.count(omap_head) == 0 &&
8940 omap_head != last_omap_head) {
8941 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8942 << "fsck error: found stray omap data on omap_head "
8943 << omap_head << " (last_omap_head " << last_omap_head << ")" << fsck_dendl;
eafe8130 8944 ++errors;
9f95a23c 8945 last_omap_head = omap_head;
eafe8130 8946 }
8947 }
8948 }
8949 it = db->get_iterator(PREFIX_PGMETA_OMAP);
8950 if (it) {
9f95a23c 8951 uint64_t last_omap_head = 0;
8952 for (it->lower_bound(string()); it->valid(); it->next()) {
8953 uint64_t omap_head;
8954 _key_decode_u64(it->key().c_str(), &omap_head);
8955 if (used_omap_head.count(omap_head) == 0 &&
8956 omap_head != last_omap_head) {
8957 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8958 << "fsck error: found stray (pgmeta) omap data on omap_head "
8959 << omap_head << " (last_omap_head " << last_omap_head << ")" << fsck_dendl;
8960 last_omap_head = omap_head;
8961 ++errors;
8962 }
8963 }
8964 }
8965 it = db->get_iterator(PREFIX_PERPOOL_OMAP);
8966 if (it) {
8967 uint64_t last_omap_head = 0;
8968 for (it->lower_bound(string()); it->valid(); it->next()) {
8969 uint64_t pool;
8970 uint64_t omap_head;
8971 string k = it->key();
8972 const char *c = k.c_str();
8973 c = _key_decode_u64(c, &pool);
8974 c = _key_decode_u64(c, &omap_head);
8975 if (used_omap_head.count(omap_head) == 0 &&
8976 omap_head != last_omap_head) {
8977 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8978 << "fsck error: found stray (per-pool) omap data on omap_head "
8979 << omap_head << " (last_omap_head " << last_omap_head << ")" << fsck_dendl;
8980 ++errors;
8981 last_omap_head = omap_head;
8982 }
8983 }
8984 }
8985 dout(1) << __func__ << " checking deferred events" << dendl;
8986 it = db->get_iterator(PREFIX_DEFERRED);
8987 if (it) {
8988 for (it->lower_bound(string()); it->valid(); it->next()) {
8989 bufferlist bl = it->value();
8990 auto p = bl.cbegin();
8991 bluestore_deferred_transaction_t wt;
8992 try {
8993 decode(wt, p);
8994 } catch (buffer::error& e) {
8995 derr << "fsck error: failed to decode deferred txn "
8996 << pretty_binary_string(it->key()) << dendl;
8997 if (repair) {
8998 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8999 << pretty_binary_string(it->key())
9000 << "', removing" << dendl;
9001 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
9002 }
9003 continue;
9004 }
9005 dout(20) << __func__ << " deferred " << wt.seq
9006 << " ops " << wt.ops.size()
9007 << " released 0x" << std::hex << wt.released << std::dec << dendl;
9008 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 9009 apply_for_bitset_range(
9010 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
9011 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9012 bs.set(pos);
9013 }
9014 );
9015 }
7c673cae 9016 }
9017 }
9018
9019 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
9020 {
9021 // remove bluefs_extents from used set since the freelist doesn't
9022 // know they are allocated.
9023 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9f95a23c 9024 apply_for_bitset_range(
b32b8144 9025 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 9026 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130 9027 bs.reset(pos);
9028 }
9029 );
9030 }
9031 fm->enumerate_reset();
9032 uint64_t offset, length;
9033 while (fm->enumerate_next(db, &offset, &length)) {
9034 bool intersects = false;
9f95a23c 9035 apply_for_bitset_range(
9036 offset, length, fm->get_alloc_size(), used_blocks,
9037 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
9038 if (bs.test(pos)) {
9039 if (offset == SUPER_RESERVED &&
9040 length == min_alloc_size - SUPER_RESERVED) {
9041 // this is due to the change just after luminous to min_alloc_size
9042 // granularity allocations, and our baked in assumption at the top
9043 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
9044 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
9045 // since we will never allocate this region below min_alloc_size.
9046 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
9047 << " and min_alloc_size, 0x" << std::hex << offset << "~"
9048 << length << std::dec << dendl;
9049 } else {
9050 intersects = true;
9051 if (repair) {
9052 repairer.fix_false_free(db, fm,
9053 pos * min_alloc_size,
9054 min_alloc_size);
9055 }
11fdf7f2 9056 }
9057 } else {
9058 bs.set(pos);
9059 }
7c673cae 9060 }
9061 );
9062 if (intersects) {
9063 derr << "fsck error: free extent 0x" << std::hex << offset
9064 << "~" << length << std::dec
9065 << " intersects allocated blocks" << dendl;
9066 ++errors;
7c673cae 9067 }
b5b8bbf5 9068 }
9069 fm->enumerate_reset();
9070 size_t count = used_blocks.count();
9071 if (used_blocks.size() != count) {
9072 ceph_assert(used_blocks.size() > count);
9073 used_blocks.flip();
9074 size_t start = used_blocks.find_first();
9075 while (start != decltype(used_blocks)::npos) {
9076 size_t cur = start;
9077 while (true) {
9078 size_t next = used_blocks.find_next(cur);
9079 if (next != cur + 1) {
9080 ++errors;
9081 derr << "fsck error: leaked extent 0x" << std::hex
9082 << ((uint64_t)start * fm->get_alloc_size()) << "~"
9083 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
9084 << dendl;
9085 if (repair) {
9086 repairer.fix_leaked(db,
9087 fm,
9088 start * min_alloc_size,
9089 (cur + 1 - start) * min_alloc_size);
9090 }
9091 start = next;
9092 break;
11fdf7f2 9093 }
eafe8130 9094 cur = next;
b5b8bbf5 9095 }
9096 }
9097 used_blocks.flip();
b5b8bbf5 9098 }
9099 }
9100 }
11fdf7f2 9101 if (repair) {
9102 if (!per_pool_omap) {
9103 dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
9104 repairer.fix_per_pool_omap(db);
9105 }
9106
9107 dout(5) << __func__ << " applying repair results" << dendl;
9108 repaired = repairer.apply(db);
9109 dout(5) << __func__ << " repair applied" << dendl;
9110 }
7c673cae 9111
eafe8130 9112out_scan:
9113 dout(2) << __func__ << " " << num_objects << " objects, "
9114 << num_sharded_objects << " of them sharded. "
9115 << dendl;
9116 dout(2) << __func__ << " " << num_extents << " extents to "
9117 << num_blobs << " blobs, "
9118 << num_spanning_blobs << " spanning, "
9119 << num_shared_blobs << " shared."
9120 << dendl;
9121
9122 utime_t duration = ceph_clock_now() - start;
9123 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
9124 << warnings << " warnings, "
9125 << repaired << " repaired, "
9126 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 9127 << duration << " seconds" << dendl;
9128
9129 // In non-repair mode we should return error count only as
9130 // it indicates if store status is OK.
9131 // In repair mode both errors and warnings are taken into account
9132 // since repaired counter relates to them both.
9133 return repair ? errors + warnings - (int)repaired : errors;
9134}
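// Hedged usage sketch (illustrative only): callers normally reach _fsck()
// through the ObjectStore-level wrappers, running a read-only pass first
// and a repair pass only when errors were found. The fsck(bool)/repair(bool)
// entry points below are assumptions about the surrounding API:
//
//   int errors = store->fsck(false);        // non-repair: returns error count
//   if (errors > 0) {
//     int remaining = store->repair(false); // errors + warnings - repaired
//     ceph_assert(remaining >= 0);
//   }
//
// In deployments this is usually driven via `ceph-bluestore-tool fsck` and
// `ceph-bluestore-tool repair` rather than called directly.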
9135
9136/// methods to inject various errors fsck can repair
9137void BlueStore::inject_broken_shared_blob_key(const string& key,
9138 const bufferlist& bl)
9139{
9140 KeyValueDB::Transaction txn;
9141 txn = db->get_transaction();
9142 txn->set(PREFIX_SHARED_BLOB, key, bl);
9143 db->submit_transaction_sync(txn);
9144}
9145
9146void BlueStore::inject_leaked(uint64_t len)
9147{
9148 KeyValueDB::Transaction txn;
9149 txn = db->get_transaction();
9150
9151 PExtentVector exts;
9152 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
9153 min_alloc_size * 256, 0, &exts);
9154 ceph_assert(alloc_len >= (int64_t)len);
9155 for (auto& p : exts) {
9156 fm->allocate(p.offset, p.length, txn);
9157 }
9158 db->submit_transaction_sync(txn);
9159}
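// Hedged test sketch (not part of this file): inject_leaked() marks space
// allocated without giving it any owner, so a follow-up fsck should report
// a leaked extent and a repair pass should return it to the freelist:
//
//   store->inject_leaked(0x10000);       // "leak" 64 KiB
//   ceph_assert(store->fsck(false) > 0); // expect a "leaked extent" error
//   store->repair(false);                // fix_leaked() reclaims the space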
9160
9161void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
9162{
9163 KeyValueDB::Transaction txn;
9164 OnodeRef o;
9165 CollectionRef c = _get_collection(cid);
9166 ceph_assert(c);
9167 {
9f95a23c 9168 std::unique_lock l{c->lock}; // just to avoid internal asserts
9169 o = c->get_onode(oid, false);
9170 ceph_assert(o);
9171 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
9172 }
9173
9174 bool injected = false;
9175 txn = db->get_transaction();
9176 auto& em = o->extent_map.extent_map;
9177 std::vector<const PExtentVector*> v;
9178 if (em.size()) {
9179 v.push_back(&em.begin()->blob->get_blob().get_extents());
9180 }
9181 if (em.size() > 1) {
9182 auto it = em.end();
9183 --it;
9184 v.push_back(&(it->blob->get_blob().get_extents()));
9185 }
9186 for (auto pext : v) {
9187 if (pext->size()) {
9188 auto p = pext->begin();
9189 while (p != pext->end()) {
9190 if (p->is_valid()) {
9191 dout(20) << __func__ << " release 0x" << std::hex << p->offset
9192 << "~" << p->length << std::dec << dendl;
9193 fm->release(p->offset, p->length, txn);
9194 injected = true;
9195 break;
9196 }
9197 ++p;
9198 }
9199 }
9200 }
9201 ceph_assert(injected);
9202 db->submit_transaction_sync(txn);
9203}
9204
9205void BlueStore::inject_legacy_omap()
9206{
9207 dout(1) << __func__ << dendl;
9208 per_pool_omap = false;
9209 KeyValueDB::Transaction txn;
9210 txn = db->get_transaction();
9211 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
9212 db->submit_transaction_sync(txn);
9213}
9214
9215void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
9216{
9217 dout(1) << __func__ << " "
9218 << cid << " " << oid
9219 << dendl;
9220 KeyValueDB::Transaction txn;
9221 OnodeRef o;
9222 CollectionRef c = _get_collection(cid);
9223 ceph_assert(c);
9224 {
9225 std::unique_lock l{ c->lock }; // just to avoid internal asserts
9226 o = c->get_onode(oid, false);
9227 ceph_assert(o);
9228 }
9229 o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
9230 txn = db->get_transaction();
9231 _record_onode(o, txn);
9232 db->submit_transaction_sync(txn);
9233}
9234
9235
9236void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
9237{
9238 BlueStoreRepairer repairer;
9239 repairer.fix_statfs(db, key, new_statfs);
9240 repairer.apply(db);
9241}
9242
9243void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
9244{
9245 KeyValueDB::Transaction t = db->get_transaction();
9246 volatile_statfs v;
9247 v = new_statfs;
9248 bufferlist bl;
9249 v.encode(bl);
9250 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
9251 db->submit_transaction_sync(t);
9252}
9253
9254void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
9255 coll_t cid2, ghobject_t oid2,
9256 uint64_t offset)
9257{
9258 OnodeRef o1;
9259 CollectionRef c1 = _get_collection(cid1);
9260 ceph_assert(c1);
9261 {
9f95a23c 9262 std::unique_lock l{c1->lock}; // just to avoid internal asserts
9263 o1 = c1->get_onode(oid1, false);
9264 ceph_assert(o1);
9265 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9266 }
9267 OnodeRef o2;
9268 CollectionRef c2 = _get_collection(cid2);
9269 ceph_assert(c2);
9270 {
9f95a23c 9271 std::unique_lock l{c2->lock}; // just to avoid internal asserts
9272 o2 = c2->get_onode(oid2, false);
9273 ceph_assert(o2);
9274 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
9275 }
9276 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
9277 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
9278
9279 // require onode/extent layout to be the same (and simple)
9280 // to make things easier
9281 ceph_assert(o1->onode.extent_map_shards.empty());
9282 ceph_assert(o2->onode.extent_map_shards.empty());
9283 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9284 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9285 ceph_assert(e1.logical_offset == e2.logical_offset);
9286 ceph_assert(e1.length == e2.length);
9287 ceph_assert(e1.blob_offset == e2.blob_offset);
9288
9289 KeyValueDB::Transaction txn;
9290 txn = db->get_transaction();
9291
9292 // along with misreference error this will create space leaks errors
9293 e2.blob->dirty_blob() = e1.blob->get_blob();
9294 o2->extent_map.dirty_range(offset, e2.length);
9295 o2->extent_map.update(txn, false);
9296
9297 _record_onode(o2, txn);
9298 db->submit_transaction_sync(txn);
9299}
9300
9301void BlueStore::collect_metadata(map<string,string> *pm)
9302{
9303 dout(10) << __func__ << dendl;
9304 bdev->collect_metadata("bluestore_bdev_", pm);
9305 if (bluefs) {
9306 (*pm)["bluefs"] = "1";
9307 // this value is for backward compatibility only
9308 (*pm)["bluefs_single_shared_device"] = \
9309 stringify((int)bluefs_layout.single_shared_device());
9310 (*pm)["bluefs_dedicated_db"] = \
9311 stringify((int)bluefs_layout.dedicated_db);
9312 (*pm)["bluefs_dedicated_wal"] = \
9313 stringify((int)bluefs_layout.dedicated_wal);
9314 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
9315 } else {
9316 (*pm)["bluefs"] = "0";
9317 }
9318
9319 // report numa mapping for underlying devices
9320 int node = -1;
9321 set<int> nodes;
9322 set<string> failed;
9323 int r = get_numa_node(&node, &nodes, &failed);
9324 if (r >= 0) {
9325 if (!failed.empty()) {
9326 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9327 }
9328 if (!nodes.empty()) {
9329 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9330 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9331 }
9332 if (node >= 0) {
9333 (*pm)["objectstore_numa_node"] = stringify(node);
9334 }
9335 }
9336}
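// Illustrative sketch: the populated map is plain key/value strings, so a
// caller can branch on the keys filled in above, e.g. to place OSD threads
// on the right NUMA node:
//
//   map<string,string> md;
//   store->collect_metadata(&md);
//   if (md.count("objectstore_numa_node")) {
//     int node = atoi(md["objectstore_numa_node"].c_str());
//     // ... pin worker threads to `node` ...
//   }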
9337
9338int BlueStore::get_numa_node(
9339 int *final_node,
9340 set<int> *out_nodes,
9341 set<string> *out_failed)
9342{
9343 int node = -1;
9344 set<string> devices;
9345 get_devices(&devices);
9346 set<int> nodes;
9347 set<string> failed;
9348 for (auto& devname : devices) {
9349 int n;
9350 BlkDev bdev(devname);
9351 int r = bdev.get_numa_node(&n);
9352 if (r < 0) {
9353 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9354 << dendl;
9355 failed.insert(devname);
9356 continue;
9357 }
9358 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9359 << dendl;
9360 nodes.insert(n);
9361 if (node < 0) {
9362 node = n;
9363 }
9364 }
9365 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9366 *final_node = node;
9367 }
9368 if (out_nodes) {
9369 *out_nodes = nodes;
9370 }
9371 if (out_failed) {
9372 *out_failed = failed;
9373 }
9374 return 0;
9375}
9376
9377int BlueStore::get_devices(set<string> *ls)
9378{
9379 if (bdev) {
9380 bdev->get_devices(ls);
9381 if (bluefs) {
9382 bluefs->get_devices(ls);
9383 }
9384 return 0;
9385 }
9386
9387 // grumble, we haven't started up yet.
9388 int r = _open_path();
9389 if (r < 0)
9390 goto out;
9391 r = _open_fsid(false);
9392 if (r < 0)
9393 goto out_path;
9394 r = _read_fsid(&fsid);
9395 if (r < 0)
9396 goto out_fsid;
9397 r = _lock_fsid();
9398 if (r < 0)
9399 goto out_fsid;
9400 r = _open_bdev(false);
9401 if (r < 0)
9402 goto out_fsid;
9403 r = _minimal_open_bluefs(false);
9404 if (r < 0)
9405 goto out_bdev;
9406 bdev->get_devices(ls);
9407 if (bluefs) {
9408 bluefs->get_devices(ls);
9409 }
9410 r = 0;
9411 _minimal_close_bluefs();
9412 out_bdev:
9413 _close_bdev();
9414 out_fsid:
9415 _close_fsid();
9416 out_path:
9417 _close_path();
9418 out:
9419 return r;
9420}
9421
11fdf7f2 9422void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
9423{
9424 buf->reset();
11fdf7f2 9425
9426 buf->omap_allocated =
9427 db->estimate_prefix_size(PREFIX_OMAP, string()) +
9428 db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
9429
9430 uint64_t bfree = alloc->get_free();
9431
9432 if (bluefs) {
9433 int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
9434 int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
9435 // part of our shared device is "free" according to BlueFS, but we
9436 // can't touch bluestore_bluefs_min of it.
9437 int64_t shared_available = std::min(
9438 bluefs_free,
9439 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
9440 buf->internally_reserved = bluefs_total - shared_available;
94b18763 9441 if (shared_available > 0) {
9442 bfree += shared_available;
9443 }
9444 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9445 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9446 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9447 }
9448 // call any non-omap bluefs space "internal metadata"
9449 buf->internal_metadata =
9450 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
9451 - buf->omap_allocated;
9452 }
9453
9454 uint64_t thin_total, thin_avail;
9455 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9456 buf->total += thin_total;
9457
9458 // we are limited by both the size of the virtual device and the
9459 // underlying physical device.
9460 bfree = std::min(bfree, thin_avail);
9461
9462 buf->allocated = thin_total - thin_avail;
9463 } else {
9464 buf->total += bdev->get_size();
9465 }
9466 buf->available = bfree;
9467}
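// Worked example of the shared-device accounting above (numbers invented):
// with bluefs_total = 10 GiB, bluefs_free = 6 GiB and
// bluestore_bluefs_min = 1 GiB,
//
//   shared_available    = min(6 GiB, 10 GiB - 1 GiB) = 6 GiB
//   internally_reserved = 10 GiB - 6 GiB             = 4 GiB
//
// so 6 GiB of BlueFS space is also counted as free for object data, while
// 4 GiB stays reserved for BlueFS itself.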
9468
9469int BlueStore::statfs(struct store_statfs_t *buf,
9470 osd_alert_list_t* alerts)
9471{
9472 if (alerts) {
9473 alerts->clear();
9474 _log_alerts(*alerts);
9475 }
9476 _get_statfs_overall(buf);
31f18b77 9477 {
11fdf7f2 9478 std::lock_guard l(vstatfs_lock);
31f18b77 9479 buf->allocated = vstatfs.allocated();
9480 buf->data_stored = vstatfs.stored();
9481 buf->data_compressed = vstatfs.compressed();
9482 buf->data_compressed_original = vstatfs.compressed_original();
9483 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9484 }
9485
9486 dout(20) << __func__ << " " << *buf << dendl;
9487 return 0;
9488}
9489
9490int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9491 bool *out_per_pool_omap)
9492{
9493 dout(20) << __func__ << " pool " << pool_id << dendl;
81eedcae 9494
9495 if (!per_pool_stat_collection) {
9496 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9497 return -ENOTSUP;
7c673cae 9498 }
11fdf7f2 9499 buf->reset();
7c673cae 9500
9501 {
9502 std::lock_guard l(vstatfs_lock);
9503 osd_pools[pool_id].publish(buf);
9504 }
9505
9506 string key_prefix;
9507 _key_encode_u64(pool_id, &key_prefix);
9508 buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
9509 key_prefix);
9510 *out_per_pool_omap = per_pool_omap;
9511
11fdf7f2 9512 dout(10) << __func__ << " " << *buf << dendl;
9513 return 0;
9514}
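// Hedged usage sketch: per-pool omap keys are prefixed with the encoded
// pool id, so estimate_prefix_size() above scans exactly one pool. A
// caller handles the legacy-stats case explicitly:
//
//   struct store_statfs_t st;
//   bool per_pool_omap_flag = false;
//   int r = store->pool_statfs(pool_id, &st, &per_pool_omap_flag);
//   if (r == -ENOTSUP) {
//     store->statfs(&st, nullptr);  // legacy mode: only global stats exist
//   }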
9515
9516void BlueStore::_check_legacy_statfs_alert()
9517{
9518 string s;
9519 if (!per_pool_stat_collection &&
eafe8130 9520 cct->_conf->bluestore_warn_on_legacy_statfs) {
9521 s = "legacy statfs reporting detected, "
9522 "suggest to run store repair to get consistent statistic reports";
9523 }
9524 std::lock_guard l(qlock);
9525 legacy_statfs_alert = s;
9526}
9527
9528void BlueStore::_check_no_per_pool_omap_alert()
9529{
9530 string s;
9531 if (!per_pool_omap &&
9532 cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9533 s = "legacy (not per-pool) omap detected, "
9534 "suggest to run store repair to measure per-pool omap usage";
9535 }
9536 std::lock_guard l(qlock);
9537 no_per_pool_omap_alert = s;
9538}
9539
9540// ---------------
9541// cache
9542
9543BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9544{
9f95a23c 9545 std::shared_lock l(coll_lock);
9546 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9547 if (cp == coll_map.end())
9548 return CollectionRef();
9549 return cp->second;
9550}
9551
9552void BlueStore::_queue_reap_collection(CollectionRef& c)
9553{
9554 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9555 // _reap_collections runs in the same thread as this,
9556 // so no lock is needed.
9557 removed_collections.push_back(c);
9558}
9559
9560void BlueStore::_reap_collections()
9561{
94b18763 9562
9563 list<CollectionRef> removed_colls;
9564 {
9565 // _queue_reap_collection runs in the same thread as this,
9566 // so no lock is needed.
9567 if (!removed_collections.empty())
9568 removed_colls.swap(removed_collections);
9569 else
9570 return;
9571 }
9572
9573 list<CollectionRef>::iterator p = removed_colls.begin();
9574 while (p != removed_colls.end()) {
9575 CollectionRef c = *p;
9576 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9577 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 9578 ceph_assert(!o->exists);
9579 if (o->flushing_count.load()) {
9580 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9581 << " flush_txns " << o->flushing_count << dendl;
94b18763 9582 return true;
7c673cae 9583 }
94b18763 9584 return false;
7c673cae 9585 })) {
94b18763 9586 ++p;
9587 continue;
9588 }
9589 c->onode_map.clear();
94b18763 9590 p = removed_colls.erase(p);
9591 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9592 }
94b18763 9593 if (removed_colls.empty()) {
7c673cae 9594 dout(10) << __func__ << " all reaped" << dendl;
9595 } else {
9596 removed_collections.splice(removed_collections.begin(), removed_colls);
9597 }
9598}
9599
9600void BlueStore::_update_cache_logger()
9601{
9602 uint64_t num_onodes = 0;
9f95a23c 9603 uint64_t num_pinned_onodes = 0;
9604 uint64_t num_extents = 0;
9605 uint64_t num_blobs = 0;
9606 uint64_t num_buffers = 0;
9607 uint64_t num_buffer_bytes = 0;
9608 for (auto c : onode_cache_shards) {
9609 c->add_stats(&num_onodes, &num_pinned_onodes);
9610 }
9611 for (auto c : buffer_cache_shards) {
9612 c->add_stats(&num_extents, &num_blobs,
9613 &num_buffers, &num_buffer_bytes);
9614 }
9615 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9616 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
9617 logger->set(l_bluestore_extents, num_extents);
9618 logger->set(l_bluestore_blobs, num_blobs);
9619 logger->set(l_bluestore_buffers, num_buffers);
9620 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9621}
9622
9623// ---------------
9624// read operations
9625
9626ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9627{
9628 return _get_collection(cid);
9629}
9630
9631ObjectStore::CollectionHandle BlueStore::create_new_collection(
9632 const coll_t& cid)
7c673cae 9633{
9634 std::unique_lock l{coll_lock};
9635 auto c = ceph::make_ref<Collection>(
11fdf7f2 9636 this,
9637 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9638 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
9639 cid);
9640 new_coll_map[cid] = c;
9f95a23c 9641 _osr_attach(c.get());
9642 return c;
9643}
9644
9645void BlueStore::set_collection_commit_queue(
9646 const coll_t& cid,
9647 ContextQueue *commit_queue)
9648{
9649 if (commit_queue) {
9f95a23c 9650 std::shared_lock l(coll_lock);
9651 if (coll_map.count(cid)) {
9652 coll_map[cid]->commit_queue = commit_queue;
9653 } else if (new_coll_map.count(cid)) {
9654 new_coll_map[cid]->commit_queue = commit_queue;
9655 }
9656 }
9657}
9658
11fdf7f2 9659
9660bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9661{
9662 Collection *c = static_cast<Collection *>(c_.get());
9663 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9664 if (!c->exists)
9665 return false;
9666
9667 bool r = true;
9668
9669 {
9f95a23c 9670 std::shared_lock l(c->lock);
9671 OnodeRef o = c->get_onode(oid, false);
9672 if (!o || !o->exists)
9673 r = false;
9674 }
9675
9676 return r;
9677}
9678
9679int BlueStore::stat(
9680 CollectionHandle &c_,
9681 const ghobject_t& oid,
9682 struct stat *st,
9683 bool allow_eio)
9684{
9685 Collection *c = static_cast<Collection *>(c_.get());
9686 if (!c->exists)
9687 return -ENOENT;
9688 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9689
9690 {
9f95a23c 9691 std::shared_lock l(c->lock);
9692 OnodeRef o = c->get_onode(oid, false);
9693 if (!o || !o->exists)
9694 return -ENOENT;
9695 st->st_size = o->onode.size;
9696 st->st_blksize = 4096;
9697 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9698 st->st_nlink = 1;
9699 }
9700
9701 int r = 0;
9702 if (_debug_mdata_eio(oid)) {
9703 r = -EIO;
9704 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9705 }
9706 return r;
9707}
9708int BlueStore::set_collection_opts(
11fdf7f2 9709 CollectionHandle& ch,
9710 const pool_opts_t& opts)
9711{
7c673cae 9712 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9713 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
9714 if (!c->exists)
9715 return -ENOENT;
9f95a23c 9716 std::unique_lock l{c->lock};
9717 c->pool_opts = opts;
9718 return 0;
9719}
9720
9721int BlueStore::read(
9722 CollectionHandle &c_,
9723 const ghobject_t& oid,
9724 uint64_t offset,
9725 size_t length,
9726 bufferlist& bl,
224ce89b 9727 uint32_t op_flags)
7c673cae 9728{
11fdf7f2 9729 auto start = mono_clock::now();
9730 Collection *c = static_cast<Collection *>(c_.get());
9731 const coll_t &cid = c->get_cid();
9732 dout(15) << __func__ << " " << cid << " " << oid
9733 << " 0x" << std::hex << offset << "~" << length << std::dec
9734 << dendl;
9735 if (!c->exists)
9736 return -ENOENT;
9737
9738 bl.clear();
9739 int r;
9740 {
9f95a23c 9741 std::shared_lock l(c->lock);
11fdf7f2 9742 auto start1 = mono_clock::now();
7c673cae 9743 OnodeRef o = c->get_onode(oid, false);
9744 log_latency("get_onode@read",
9745 l_bluestore_read_onode_meta_lat,
9746 mono_clock::now() - start1,
9747 cct->_conf->bluestore_log_op_age);
9748 if (!o || !o->exists) {
9749 r = -ENOENT;
9750 goto out;
9751 }
9752
9753 if (offset == length && offset == 0)
9754 length = o->onode.size;
9755
9756 r = _do_read(c, o, offset, length, bl, op_flags);
9757 if (r == -EIO) {
9758 logger->inc(l_bluestore_read_eio);
9759 }
9760 }
9761
9762 out:
28e407b8 9763 if (r >= 0 && _debug_data_eio(oid)) {
9764 r = -EIO;
9765 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9766 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9767 cct->_conf->bluestore_debug_random_read_err &&
9768 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9769 100.0)) == 0) {
9770 dout(0) << __func__ << ": inject random EIO" << dendl;
9771 r = -EIO;
9772 }
9773 dout(10) << __func__ << " " << cid << " " << oid
9774 << " 0x" << std::hex << offset << "~" << length << std::dec
9775 << " = " << r << dendl;
9776 log_latency(__func__,
9777 l_bluestore_read_lat,
9778 mono_clock::now() - start,
9779 cct->_conf->bluestore_log_op_age);
9780 return r;
9781}
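// Hedged usage sketch: callers steer caching with fadvise-style op flags.
// With no flags, data is cached only if bluestore_default_buffered_read is
// set; WILLNEED forces caching and DONTNEED/NOCACHE suppress it (see
// _do_read() below):
//
//   bufferlist bl;
//   int r = store->read(ch, oid, 0, 4096, bl,
//                       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
//   // r < 0 is an error (-ENOENT, -EIO, ...); r >= 0 is bytes returned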
9782
9f95a23c 9783void BlueStore::_read_cache(
9784 OnodeRef o,
9785 uint64_t offset,
9786 size_t length,
9787 int read_cache_policy,
9788 ready_regions_t& ready_regions,
9789 blobs2read_t& blobs2read)
7c673cae 9790{
7c673cae 9791 // build a blob-wise list of the data to read (that isn't cached)
9792 unsigned left = length;
9793 uint64_t pos = offset;
9794 auto lp = o->extent_map.seek_lextent(offset);
9795 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9796 if (pos < lp->logical_offset) {
9797 unsigned hole = lp->logical_offset - pos;
9798 if (hole >= left) {
9f95a23c 9799 break;
9800 }
9801 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9802 << std::dec << dendl;
9803 pos += hole;
9804 left -= hole;
9805 }
94b18763 9806 BlobRef& bptr = lp->blob;
9807 unsigned l_off = pos - lp->logical_offset;
9808 unsigned b_off = l_off + lp->blob_offset;
9809 unsigned b_len = std::min(left, lp->length - l_off);
9810
9811 ready_regions_t cache_res;
9812 interval_set<uint32_t> cache_interval;
9813 bptr->shared_blob->bc.read(
9814 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9815 read_cache_policy);
7c673cae 9816 dout(20) << __func__ << " blob " << *bptr << std::hex
9817 << " need 0x" << b_off << "~" << b_len
9818 << " cache has 0x" << cache_interval
9819 << std::dec << dendl;
9820
9821 auto pc = cache_res.begin();
11fdf7f2 9822 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
9823 while (b_len > 0) {
9824 unsigned l;
9825 if (pc != cache_res.end() &&
9826 pc->first == b_off) {
9827 l = pc->second.length();
9828 ready_regions[pos].claim(pc->second);
9829 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9830 << b_off << "~" << l << std::dec << dendl;
9831 ++pc;
7c673cae 9832 } else {
9833 l = b_len;
9834 if (pc != cache_res.end()) {
9835 ceph_assert(pc->first > b_off);
9836 l = pc->first - b_off;
9837 }
9838 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9839 << b_off << "~" << l << std::dec << dendl;
9840 // merge regions
9841 {
9842 uint64_t r_off = b_off;
9843 uint64_t r_len = l;
9844 uint64_t front = r_off % chunk_size;
9845 if (front) {
9846 r_off -= front;
9847 r_len += front;
9848 }
9849 unsigned tail = r_len % chunk_size;
9850 if (tail) {
9851 r_len += chunk_size - tail;
9852 }
9853 bool merged = false;
9854 regions2read_t& r2r = blobs2read[bptr];
9855 if (r2r.size()) {
9856 read_req_t& pre = r2r.back();
9857 if (r_off <= (pre.r_off + pre.r_len)) {
9858 front += (r_off - pre.r_off);
9859 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9860 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9861 merged = true;
9862 }
9863 }
9864 if (!merged) {
9865 read_req_t req(r_off, r_len);
9866 req.regs.emplace_back(region_t(pos, b_off, l, front));
9867 r2r.emplace_back(std::move(req));
9868 }
9869 }
9870 }
9871 pos += l;
9872 b_off += l;
9873 left -= l;
9874 b_len -= l;
9875 }
9876 ++lp;
9877 }
9f95a23c 9878}
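// Worked example of the chunk rounding in the merge step above (values
// invented): with chunk_size = 0x1000, a cache miss at b_off = 0x1234 of
// length l = 0x800 becomes
//
//   front = 0x1234 % 0x1000 = 0x234
//   r_off = 0x1234 - front  = 0x1000
//   r_len = 0x800 + front   = 0xa34, rounded up to 0x1000
//
// i.e. one chunk-aligned read 0x1000~0x1000, with region_t::front
// remembering where the caller's bytes start inside that chunk.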
7c673cae 9879
9880int BlueStore::_prepare_read_ioc(
9881 blobs2read_t& blobs2read,
9882 vector<bufferlist>* compressed_blob_bls,
9883 IOContext* ioc)
9884{
7c673cae 9885 for (auto& p : blobs2read) {
94b18763 9886 const BlobRef& bptr = p.first;
11fdf7f2 9887 regions2read_t& r2r = p.second;
7c673cae 9888 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9889 << " need " << r2r << std::dec << dendl;
9890 if (bptr->get_blob().is_compressed()) {
9891 // read the whole thing
9892 if (compressed_blob_bls->empty()) {
9893 // ensure we avoid any reallocation on subsequent blobs
9894 compressed_blob_bls->reserve(blobs2read.size());
9895 }
9896 compressed_blob_bls->push_back(bufferlist());
9897 bufferlist& bl = compressed_blob_bls->back();
9898 auto r = bptr->get_blob().map(
9899 0, bptr->get_blob().get_ondisk_length(),
9900 [&](uint64_t offset, uint64_t length) {
9901 int r = bdev->aio_read(offset, length, &bl, ioc);
9902 if (r < 0)
9903 return r;
9904 return 0;
9f95a23c 9905 });
9906 if (r < 0) {
9907 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9908 if (r == -EIO) {
9909 // propagate EIO to caller
9910 return r;
9911 }
11fdf7f2 9912 ceph_assert(r == 0);
b32b8144 9913 }
9914 } else {
9915 // read the pieces
11fdf7f2 9916 for (auto& req : r2r) {
9917 dout(20) << __func__ << " region 0x" << std::hex
9918 << req.regs.front().logical_offset
9919 << ": 0x" << req.regs.front().blob_xoffset
9920 << " reading 0x" << req.r_off
9921 << "~" << req.r_len << std::dec
9922 << dendl;
7c673cae 9923
9924 // read it
9925 auto r = bptr->get_blob().map(
9926 req.r_off, req.r_len,
9927 [&](uint64_t offset, uint64_t length) {
9928 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9929 if (r < 0)
9930 return r;
9931 return 0;
9f95a23c 9932 });
9933 if (r < 0) {
9934 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9935 << dendl;
9936 if (r == -EIO) {
9937 // propagate EIO to caller
9938 return r;
9939 }
11fdf7f2 9940 ceph_assert(r == 0);
b32b8144 9941 }
9f95a23c 9942 ceph_assert(req.bl.length() == req.r_len);
9943 }
9944 }
9945 }
9946 return 0;
9947}
11fdf7f2 9948
9949int BlueStore::_generate_read_result_bl(
9950 OnodeRef o,
9951 uint64_t offset,
9952 size_t length,
9953 ready_regions_t& ready_regions,
9954 vector<bufferlist>& compressed_blob_bls,
9955 blobs2read_t& blobs2read,
9956 bool buffered,
9957 bool* csum_error,
9958 bufferlist& bl)
9959{
9960 // enumerate and decompress desired blobs
9961 auto p = compressed_blob_bls.begin();
9962 blobs2read_t::iterator b2r_it = blobs2read.begin();
9963 while (b2r_it != blobs2read.end()) {
94b18763 9964 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9965 regions2read_t& r2r = b2r_it->second;
7c673cae 9966 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9967 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9968 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9969 ceph_assert(p != compressed_blob_bls.end());
9970 bufferlist& compressed_bl = *p++;
9971 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9972 r2r.front().regs.front().logical_offset) < 0) {
9973 *csum_error = true;
9974 return -EIO;
9975 }
9976 bufferlist raw_bl;
9f95a23c 9977 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9978 if (r < 0)
9f95a23c 9979 return r;
7c673cae 9980 if (buffered) {
9981 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9982 raw_bl);
7c673cae 9983 }
9984 for (auto& req : r2r) {
9985 for (auto& r : req.regs) {
9986 ready_regions[r.logical_offset].substr_of(
9987 raw_bl, r.blob_xoffset, r.length);
9988 }
9989 }
9990 } else {
11fdf7f2 9991 for (auto& req : r2r) {
9992 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9993 req.regs.front().logical_offset) < 0) {
9994 *csum_error = true;
9995 return -EIO;
9996 }
9997 if (buffered) {
9998 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9999 req.r_off, req.bl);
10000 }
7c673cae 10001
10002 // prune and keep result
10003 for (const auto& r : req.regs) {
10004 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 10005 }
10006 }
10007 }
10008 ++b2r_it;
10009 }
10010
10011 // generate a resulting buffer
10012 auto pr = ready_regions.begin();
10013 auto pr_end = ready_regions.end();
9f95a23c 10014 uint64_t pos = 0;
10015 while (pos < length) {
10016 if (pr != pr_end && pr->first == pos + offset) {
10017 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10018 << ": data from 0x" << pr->first << "~" << pr->second.length()
10019 << std::dec << dendl;
10020 pos += pr->second.length();
10021 bl.claim_append(pr->second);
10022 ++pr;
10023 } else {
10024 uint64_t l = length - pos;
10025 if (pr != pr_end) {
11fdf7f2 10026 ceph_assert(pr->first > pos + offset);
9f95a23c 10027 l = pr->first - (pos + offset);
10028 }
10029 dout(30) << __func__ << " assemble 0x" << std::hex << pos
10030 << ": zeros for 0x" << (pos + offset) << "~" << l
10031 << std::dec << dendl;
10032 bl.append_zero(l);
10033 pos += l;
10034 }
10035 }
10036 ceph_assert(bl.length() == length);
10037 ceph_assert(pos == length);
10038 ceph_assert(pr == pr_end);
10039 return 0;
10040}
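// Illustrative note on the assembly loop above: ready_regions only holds
// ranges that are actually backed by data, so gaps become zeros. Reading
// 0x0~0x3000 from an object with data only at 0x1000~0x1000 yields
//
//   bl = [ 0x1000 zeros | 0x1000 data | 0x1000 zeros ]
//
// and the closing asserts verify that exactly `length` bytes came out.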
10041
10042int BlueStore::_do_read(
10043 Collection *c,
10044 OnodeRef o,
10045 uint64_t offset,
10046 size_t length,
10047 bufferlist& bl,
10048 uint32_t op_flags,
10049 uint64_t retry_count)
10050{
10051 FUNCTRACE(cct);
10052 int r = 0;
10053 int read_cache_policy = 0; // do not bypass clean or dirty cache
10054
10055 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10056 << " size 0x" << o->onode.size << " (" << std::dec
10057 << o->onode.size << ")" << dendl;
10058 bl.clear();
10059
10060 if (offset >= o->onode.size) {
10061 return r;
10062 }
10063
10064 // generally, don't buffer anything, unless the client explicitly requests
10065 // it.
10066 bool buffered = false;
10067 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10068 dout(20) << __func__ << " will do buffered read" << dendl;
10069 buffered = true;
10070 } else if (cct->_conf->bluestore_default_buffered_read &&
10071 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10072 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10073 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10074 buffered = true;
10075 }
10076
10077 if (offset + length > o->onode.size) {
10078 length = o->onode.size - offset;
10079 }
10080
10081 auto start = mono_clock::now();
10082 o->extent_map.fault_range(db, offset, length);
10083 log_latency(__func__,
10084 l_bluestore_read_onode_meta_lat,
10085 mono_clock::now() - start,
10086 cct->_conf->bluestore_log_op_age);
10087 _dump_onode<30>(cct, *o);
10088
10089 // for deep-scrub, we only read dirty cache and bypass clean cache in
10090 // order to read underlying block device in case there are silent disk errors.
10091 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
10092 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
10093 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
10094 }
10095
10096 // build a blob-wise list of the data to read (that isn't cached)
10097 ready_regions_t ready_regions;
10098 blobs2read_t blobs2read;
10099 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
10100
10101
10102 // read raw blob data.
10103 start = mono_clock::now(); // for simplicity, measure the whole
10104 // block below as a single interval;
10105 // the resulting error is small.
10106 vector<bufferlist> compressed_blob_bls;
10107 IOContext ioc(cct, NULL, true); // allow EIO
10108 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
10109 // we always issue aio for reading, so errors other than EIO are not allowed
10110 if (r < 0)
10111 return r;
10112
10113 int64_t num_ios = length;
10114 if (ioc.has_pending_aios()) {
10115 num_ios = -ioc.get_num_ios();
10116 bdev->aio_submit(&ioc);
10117 dout(20) << __func__ << " waiting for aio" << dendl;
10118 ioc.aio_wait();
10119 r = ioc.get_return_value();
10120 if (r < 0) {
10121 ceph_assert(r == -EIO); // no other errors allowed
10122 return -EIO;
10123 }
10124 }
10125 log_latency_fn(__func__,
10126 l_bluestore_read_wait_aio_lat,
10127 mono_clock::now() - start,
10128 cct->_conf->bluestore_log_op_age,
10129 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10130 );
10131
10132 bool csum_error = false;
10133 r = _generate_read_result_bl(o, offset, length, ready_regions,
10134 compressed_blob_bls, blobs2read,
10135 buffered, &csum_error, bl);
10136 if (csum_error) {
10137 // Handles spurious read errors caused by a kernel bug.
10138 // We sometimes get all-zero pages as a result of the read under
10139 // high memory pressure. Retrying the failing read succeeds in most
10140 // cases.
10141 // See also: http://tracker.ceph.com/issues/22464
10142 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10143 return -EIO;
10144 }
10145 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
10146 }
7c673cae 10147 r = bl.length();
10148 if (retry_count) {
10149 logger->inc(l_bluestore_reads_with_retries);
10150 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
10151 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
10152 }
10153 return r;
10154}
10155
10156int BlueStore::_verify_csum(OnodeRef& o,
10157 const bluestore_blob_t* blob, uint64_t blob_xoffset,
10158 const bufferlist& bl,
10159 uint64_t logical_offset) const
10160{
10161 int bad;
10162 uint64_t bad_csum;
11fdf7f2 10163 auto start = mono_clock::now();
7c673cae 10164 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
10165 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
10166 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
10167 derr << __func__ << " injecting bluestore checksum verification error" << dendl;
10168 bad = blob_xoffset;
10169 r = -1;
10170 bad_csum = 0xDEADBEEF;
10171 }
10172 if (r < 0) {
10173 if (r == -1) {
10174 PExtentVector pex;
10175 blob->map(
10176 bad,
10177 blob->get_csum_chunk_size(),
10178 [&](uint64_t offset, uint64_t length) {
10179 pex.emplace_back(bluestore_pextent_t(offset, length));
10180 return 0;
10181 });
10182 derr << __func__ << " bad "
10183 << Checksummer::get_csum_type_string(blob->csum_type)
10184 << "/0x" << std::hex << blob->get_csum_chunk_size()
10185 << " checksum at blob offset 0x" << bad
10186 << ", got 0x" << bad_csum << ", expected 0x"
10187 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
10188 << ", device location " << pex
10189 << ", logical extent 0x" << std::hex
10190 << (logical_offset + bad - blob_xoffset) << "~"
10191 << blob->get_csum_chunk_size() << std::dec
10192 << ", object " << o->oid
10193 << dendl;
10194 } else {
10195 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
10196 }
10197 }
10198 log_latency(__func__,
10199 l_bluestore_csum_lat,
10200 mono_clock::now() - start,
10201 cct->_conf->bluestore_log_op_age);
10202 if (cct->_conf->bluestore_ignore_data_csum) {
10203 return 0;
10204 }
10205 return r;
10206}
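// Worked example of the bookkeeping above (values invented): with a 0x1000
// csum chunk size, a mismatch reported at blob offset bad = 0x3000 maps to
// csum item bad / 0x1000 = 3, and the damaged logical range logged is
// (logical_offset + bad - blob_xoffset) ~ 0x1000.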
10207
10208int BlueStore::_decompress(bufferlist& source, bufferlist* result)
10209{
10210 int r = 0;
10211 auto start = mono_clock::now();
10212 auto i = source.cbegin();
7c673cae 10213 bluestore_compression_header_t chdr;
11fdf7f2 10214 decode(chdr, i);
10215 int alg = int(chdr.type);
10216 CompressorRef cp = compressor;
10217 if (!cp || (int)cp->get_type() != alg) {
10218 cp = Compressor::create(cct, alg);
10219 }
10220
10221 if (!cp.get()) {
10222 // if compressor isn't available - error, because cannot return
10223 // decompressed data?
10224
10225 const char* alg_name = Compressor::get_comp_alg_name(alg);
10226 derr << __func__ << " can't load decompressor " << alg_name << dendl;
10227 _set_compression_alert(false, alg_name);
10228 r = -EIO;
10229 } else {
10230 r = cp->decompress(i, chdr.length, *result);
10231 if (r < 0) {
10232 derr << __func__ << " decompression failed with exit code " << r << dendl;
10233 r = -EIO;
10234 }
10235 }
10236 log_latency(__func__,
10237 l_bluestore_decompress_lat,
10238 mono_clock::now() - start,
10239 cct->_conf->bluestore_log_op_age);
10240 return r;
10241}
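// Illustrative note: the source bufferlist is a
// bluestore_compression_header_t (algorithm type plus expected length)
// followed by the compressed payload, which is why the header is decoded
// before a compressor is selected:
//
//   [ chdr (type, length) | compressed payload ... ]
//
// If no plugin matches chdr.type, the read fails with -EIO rather than
// handing compressed bytes back to the caller.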
10242
10243// this stores fiemap into interval_set, other variations
10244// use it internally
10245int BlueStore::_fiemap(
10246 CollectionHandle &c_,
10247 const ghobject_t& oid,
10248 uint64_t offset,
10249 size_t length,
10250 interval_set<uint64_t>& destset)
10251{
10252 Collection *c = static_cast<Collection *>(c_.get());
10253 if (!c->exists)
10254 return -ENOENT;
10255 {
9f95a23c 10256 std::shared_lock l(c->lock);
10257
10258 OnodeRef o = c->get_onode(oid, false);
10259 if (!o || !o->exists) {
10260 return -ENOENT;
10261 }
81eedcae 10262 _dump_onode<30>(cct, *o);
10263
10264 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10265 << " size 0x" << o->onode.size << std::dec << dendl;
10266
10267 boost::intrusive::set<Extent>::iterator ep, eend;
10268 if (offset >= o->onode.size)
10269 goto out;
10270
10271 if (offset + length > o->onode.size) {
10272 length = o->onode.size - offset;
10273 }
10274
10275 o->extent_map.fault_range(db, offset, length);
10276 eend = o->extent_map.extent_map.end();
10277 ep = o->extent_map.seek_lextent(offset);
10278 while (length > 0) {
10279 dout(20) << __func__ << " offset " << offset << dendl;
10280 if (ep != eend && ep->logical_offset + ep->length <= offset) {
10281 ++ep;
10282 continue;
10283 }
10284
10285 uint64_t x_len = length;
10286 if (ep != eend && ep->logical_offset <= offset) {
10287 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10288 x_len = std::min(x_len, ep->length - x_off);
10289 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10290 << x_len << std::dec << " blob " << ep->blob << dendl;
10291 destset.insert(offset, x_len);
10292 length -= x_len;
10293 offset += x_len;
10294 if (x_off + x_len == ep->length)
10295 ++ep;
10296 continue;
10297 }
10298 if (ep != eend &&
10299 ep->logical_offset > offset &&
10300 ep->logical_offset - offset < x_len) {
10301 x_len = ep->logical_offset - offset;
10302 }
10303 offset += x_len;
10304 length -= x_len;
10305 }
10306 }
10307
10308 out:
10309 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10310 << " size = 0x(" << destset << ")" << std::dec << dendl;
10311 return 0;
10312}
10313
10314int BlueStore::fiemap(
10315 CollectionHandle &c_,
10316 const ghobject_t& oid,
10317 uint64_t offset,
10318 size_t length,
10319 bufferlist& bl)
10320{
10321 interval_set<uint64_t> m;
10322 int r = _fiemap(c_, oid, offset, length, m);
10323 if (r >= 0) {
10324 encode(m, bl);
10325 }
10326 return r;
10327}
10328
10329int BlueStore::fiemap(
10330 CollectionHandle &c_,
10331 const ghobject_t& oid,
10332 uint64_t offset,
10333 size_t length,
10334 map<uint64_t, uint64_t>& destmap)
10335{
10336 interval_set<uint64_t> m;
10337 int r = _fiemap(c_, oid, offset, length, m);
10338 if (r >= 0) {
10339 destmap = std::move(m).detach();
10340 }
10341 return r;
10342}
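// Hedged usage sketch for the map flavour above (names assumed): the
// result maps logical offset -> length for each populated extent; holes
// simply don't appear and read back as zeros:
//
//   map<uint64_t, uint64_t> extents;
//   int r = store->fiemap(ch, oid, 0, object_size, extents);
//   for (auto& [off, len] : extents) {
//     // bytes [off, off + len) are backed by data
//   }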
10343
10344int BlueStore::readv(
10345 CollectionHandle &c_,
10346 const ghobject_t& oid,
10347 interval_set<uint64_t>& m,
10348 bufferlist& bl,
10349 uint32_t op_flags)
10350{
10351 auto start = mono_clock::now();
10352 Collection *c = static_cast<Collection *>(c_.get());
10353 const coll_t &cid = c->get_cid();
10354 dout(15) << __func__ << " " << cid << " " << oid
10355 << " fiemap " << m
10356 << dendl;
10357 if (!c->exists)
10358 return -ENOENT;
10359
10360 bl.clear();
10361 int r;
10362 {
10363 std::shared_lock l(c->lock);
10364 auto start1 = mono_clock::now();
10365 OnodeRef o = c->get_onode(oid, false);
10366 log_latency("get_onode@read",
10367 l_bluestore_read_onode_meta_lat,
10368 mono_clock::now() - start1,
10369 cct->_conf->bluestore_log_op_age);
10370 if (!o || !o->exists) {
10371 r = -ENOENT;
10372 goto out;
10373 }
10374
10375 if (m.empty()) {
10376 r = 0;
10377 goto out;
10378 }
10379
10380 r = _do_readv(c, o, m, bl, op_flags);
10381 if (r == -EIO) {
10382 logger->inc(l_bluestore_read_eio);
10383 }
10384 }
10385
10386 out:
10387 if (r >= 0 && _debug_data_eio(oid)) {
10388 r = -EIO;
10389 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10390 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10391 cct->_conf->bluestore_debug_random_read_err &&
10392 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10393 100.0)) == 0) {
10394 dout(0) << __func__ << ": inject random EIO" << dendl;
10395 r = -EIO;
10396 }
10397 dout(10) << __func__ << " " << cid << " " << oid
10398 << " fiemap " << m << std::dec
10399 << " = " << r << dendl;
10400 log_latency(__func__,
10401 l_bluestore_read_lat,
10402 mono_clock::now() - start,
10403 cct->_conf->bluestore_log_op_age);
10404 return r;
10405}
10406
10407int BlueStore::_do_readv(
10408 Collection *c,
10409 OnodeRef o,
10410 const interval_set<uint64_t>& m,
10411 bufferlist& bl,
10412 uint32_t op_flags,
10413 uint64_t retry_count)
10414{
10415 FUNCTRACE(cct);
10416 int r = 0;
10417 int read_cache_policy = 0; // do not bypass clean or dirty cache
10418
10419 dout(20) << __func__ << " fiemap " << m << std::hex
10420 << " size 0x" << o->onode.size << " (" << std::dec
10421 << o->onode.size << ")" << dendl;
10422
10423 // generally, don't buffer anything, unless the client explicitly requests
10424 // it.
10425 bool buffered = false;
10426 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10427 dout(20) << __func__ << " will do buffered read" << dendl;
10428 buffered = true;
10429 } else if (cct->_conf->bluestore_default_buffered_read &&
10430 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10431 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10432 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10433 buffered = true;
10434 }
10435 // this method must be idempotent since we may call it several times
10436 // before we finally read the expected result.
10437 bl.clear();
10438
10439 // call fiemap first!
10440 ceph_assert(m.range_start() <= o->onode.size);
10441 ceph_assert(m.range_end() <= o->onode.size);
10442 auto start = mono_clock::now();
10443 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10444 log_latency(__func__,
10445 l_bluestore_read_onode_meta_lat,
10446 mono_clock::now() - start,
10447 cct->_conf->bluestore_log_op_age);
10448 _dump_onode<30>(cct, *o);
10449
10450 IOContext ioc(cct, NULL, true); // allow EIO
10451 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10452 raw_results.reserve(m.num_intervals());
10453 int i = 0;
10454 for (auto p = m.begin(); p != m.end(); p++, i++) {
10455 raw_results.push_back({});
10456 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10457 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10458 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10459 // we always issue aio for reading, so errors other than EIO are not allowed
10460 if (r < 0)
10461 return r;
10462 }
10463
10464 auto num_ios = m.size();
10465 if (ioc.has_pending_aios()) {
10466 num_ios = ioc.get_num_ios();
10467 bdev->aio_submit(&ioc);
10468 dout(20) << __func__ << " waiting for aio" << dendl;
10469 ioc.aio_wait();
10470 r = ioc.get_return_value();
10471 if (r < 0) {
10472 ceph_assert(r == -EIO); // no other errors allowed
10473 return -EIO;
10474 }
10475 }
10476 log_latency_fn(__func__,
10477 l_bluestore_read_wait_aio_lat,
10478 mono_clock::now() - start,
10479 cct->_conf->bluestore_log_op_age,
10480 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10481 );
10482
10483 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10484 i = 0;
10485 for (auto p = m.begin(); p != m.end(); p++, i++) {
10486 bool csum_error = false;
10487 bufferlist t;
10488 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10489 std::get<0>(raw_results[i]),
10490 std::get<1>(raw_results[i]),
10491 std::get<2>(raw_results[i]),
10492 buffered, &csum_error, t);
10493 if (csum_error) {
10494 // Handles spurious read errors caused by a kernel bug.
10495 // We sometimes get all-zero pages as a result of the read under
10496 // high memory pressure. Retrying the failing read succeeds in most
10497 // cases.
10498 // See also: http://tracker.ceph.com/issues/22464
10499 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10500 return -EIO;
10501 }
10502 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10503 }
10504 bl.claim_append(t);
10505 }
10506 if (retry_count) {
10507 logger->inc(l_bluestore_reads_with_retries);
10508 dout(5) << __func__ << " read fiemap " << m
10509 << " failed " << retry_count << " times before succeeding"
10510 << dendl;
10511 }
10512 return bl.length();
10513}
10514
9f95a23c 10515int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10516 const ghobject_t& oid,
10517 const string& section_name,
10518 Formatter *f)
7c673cae 10519{
10520 Collection *c = static_cast<Collection *>(c_.get());
10521 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10522 if (!c->exists)
10523 return -ENOENT;
7c673cae 10524
10525 int r;
10526 {
10527 std::shared_lock l(c->lock);
10528
10529 OnodeRef o = c->get_onode(oid, false);
10530 if (!o || !o->exists) {
10531 r = -ENOENT;
10532 goto out;
10533 }
10534 // FIXME minor: actually the next line isn't enough to
10535 // load shared blobs. Leaving as-is for now.
10536 //
10537 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10538
10539 _dump_onode<0>(cct, *o);
10540 f->open_object_section(section_name.c_str());
10541 o->dump(f);
10542 f->close_section();
10543 r = 0;
7c673cae 10544 }
10545 out:
10546 dout(10) << __func__ << " " << c->cid << " " << oid
10547 << " = " << r << dendl;
10548 return r;
10549}
10550
10551int BlueStore::getattr(
10552 CollectionHandle &c_,
10553 const ghobject_t& oid,
10554 const char *name,
10555 bufferptr& value)
10556{
10557 Collection *c = static_cast<Collection *>(c_.get());
10558 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10559 if (!c->exists)
10560 return -ENOENT;
10561
10562 int r;
10563 {
9f95a23c 10564 std::shared_lock l(c->lock);
f91f0fd5 10565 mempool::bluestore_cache_meta::string k(name);
10566
10567 OnodeRef o = c->get_onode(oid, false);
10568 if (!o || !o->exists) {
10569 r = -ENOENT;
10570 goto out;
10571 }
10572
10573 if (!o->onode.attrs.count(k)) {
10574 r = -ENODATA;
10575 goto out;
10576 }
10577 value = o->onode.attrs[k];
10578 r = 0;
10579 }
10580 out:
10581 if (r == 0 && _debug_mdata_eio(oid)) {
10582 r = -EIO;
10583 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10584 }
10585 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10586 << " = " << r << dendl;
10587 return r;
10588}
10589
10590int BlueStore::getattrs(
10591 CollectionHandle &c_,
10592 const ghobject_t& oid,
10593 map<string,bufferptr>& aset)
10594{
10595 Collection *c = static_cast<Collection *>(c_.get());
10596 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10597 if (!c->exists)
10598 return -ENOENT;
10599
10600 int r;
10601 {
9f95a23c 10602 std::shared_lock l(c->lock);
10603
10604 OnodeRef o = c->get_onode(oid, false);
10605 if (!o || !o->exists) {
10606 r = -ENOENT;
10607 goto out;
10608 }
10609 for (auto& i : o->onode.attrs) {
10610 aset.emplace(i.first.c_str(), i.second);
10611 }
10612 r = 0;
10613 }
10614
10615 out:
7c673cae
FG
10616 if (r == 0 && _debug_mdata_eio(oid)) {
10617 r = -EIO;
10618 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10619 }
10620 dout(10) << __func__ << " " << c->cid << " " << oid
10621 << " = " << r << dendl;
10622 return r;
10623}
10624
int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}

int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection*>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}

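// Note on the listing paths below (descriptive comment, not upstream
// text): collection_list() walks onode keys with a
// SortedCollectionListIterator, while collection_list_legacy() uses a
// SimpleCollectionListIterator to honor the legacy key ordering. Both
// funnel into _collection_list(), which derives the collection's key
// ranges from cid and cnode.bits and scans the temp namespace range
// before the non-temp range:
//
//   [coll_range_temp_start, coll_range_temp_end)  temp objects
//   [coll_range_start,      coll_range_end)       regular objects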
int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, false, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}

int BlueStore::collection_list_legacy(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, true, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}

int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  auto start_time = mono_clock::now();
  int r = 0;
  ghobject_t static_next;
  std::unique_ptr<CollectionListIterator> it;
  ghobject_t coll_range_temp_start, coll_range_temp_end;
  ghobject_t coll_range_start, coll_range_end;
  bool set_next = false;
  ghobject_t pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start.is_max() || start.hobj.is_max()) {
    goto out;
  }
  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
                 &coll_range_temp_end, &coll_range_start, &coll_range_end);
  dout(20) << __func__
           << " range " << coll_range_temp_start
           << " to " << coll_range_temp_end
           << " and " << coll_range_start
           << " to " << coll_range_end
           << " start " << start << dendl;
  if (legacy) {
    it = std::make_unique<SimpleCollectionListIterator>(
      cct, db->get_iterator(PREFIX_OBJ));
  } else {
    it = std::make_unique<SortedCollectionListIterator>(
      db->get_iterator(PREFIX_OBJ));
  }
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(coll_range_temp_start);
    temp = true;
  } else {
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
    } else {
      temp = false;
      ceph_assert(start >= coll_range_start && start < coll_range_end);
    }
    dout(20) << __func__ << " temp=" << (int)temp << dendl;
    it->lower_bound(start);
  }
  if (end.hobj.is_max()) {
    pend = temp ? coll_range_temp_end : coll_range_end;
  } else {
    if (end.hobj.is_temp()) {
      if (temp)
        pend = end;
      else
        goto out;
    } else {
      pend = temp ? coll_range_temp_end : end;
    }
  }
  dout(20) << __func__ << " pend " << pend << dendl;
  while (true) {
    if (!it->valid() || it->oid() >= pend) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          if (it->valid() && it->oid() < coll_range_temp_end) {
            *pnext = it->oid();
            set_next = true;
          }
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(coll_range_start);
        if (end.hobj.is_max())
          pend = coll_range_end;
        else
          pend = end;
        dout(30) << __func__ << " pend " << pend << dendl;
        continue;
      }
      if (it->valid() && it->oid() < coll_range_end) {
        *pnext = it->oid();
        set_next = true;
      }
      break;
    }
    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = it->oid();
      set_next = true;
      break;
    }
    ls->push_back(it->oid());
    it->next();
  }
out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  log_latency_fn(
    __func__,
    l_bluestore_clist_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_collection_list_age,
    [&] (const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid = " << c->cid
           << " start " << start << " end " << end
           << " max " << max;
      return ostr.str();
    }
  );
  return r;
}

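// Descriptive note on the omap accessors below (not upstream text):
// each onode's omap lives under its own key range in one of the omap
// column families (the exact prefix -- pgmeta, per-pool, or plain --
// is chosen by the onode flags via get_omap_prefix()). Roughly, and
// treating the separator bytes as an assumption here:
//
//   <encoded id> + '-'             omap header   (get_omap_header)
//   <encoded id> + '.' + user_key  omap entries  (get_omap_key)
//   <encoded id> + '~'             tail sentinel (get_omap_tail)
//
// so a lower_bound on the header key followed by iteration until the
// tail visits exactly this object's keys, as _onode_omap_get() does.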
int BlueStore::omap_get(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}

int BlueStore::_omap_get(
  Collection *c,               ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  r = _onode_omap_get(o, header, out);
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_onode_omap_get(
  const OnodeRef &o,           ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
)
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
out:
  return r;
}

int BlueStore::omap_get_header(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  bufferlist *header,      ///< [out] omap header
  bool allow_eio           ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    o->get_omap_header(&head);
    if (db->get(o->get_omap_prefix(), head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::omap_get_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  set<string> *keys        ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      o->decode_omap_key(it->key(), &user_key);
      dout(20) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const set<string> &keys,     ///< [in] Keys to get
  map<string, bufferlist> *out ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " got " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(make_pair(*p, val));
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

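// Descriptive note (not upstream text): omap_get_values() above (and
// omap_check_keys() below) build each db key by reusing one string:
// get_omap_key(string(), &final_key) yields the onode's key prefix,
// base_key_len remembers its length, and every iteration resizes back
// to that length before appending the next user key, avoiding a fresh
// prefix encode per lookup.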
#ifdef WITH_SEASTAR
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const std::optional<string> &start_after, ///< [in] Keys to get
  map<string, bufferlist> *output ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
    if (!iter) {
      r = -ENOENT;
      goto out;
    }
    iter->upper_bound(*start_after);
    for (; iter->valid(); iter->next()) {
      output->insert(make_pair(iter->key(), iter->value()));
    }
  }

out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#endif

int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " have " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(*p);
      } else {
        dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}

ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,   ///< [in] collection
  const ghobject_t &oid   ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}

// -----------------
// write helpers

uint64_t BlueStore::_get_ondisk_reserved() const {
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}

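// Illustrative arithmetic (assumes SUPER_RESERVED is 8192 as defined in
// this file's headers; not upstream text): with min_alloc_size = 4096
// the reserved region is round_up_to(max(8192, 4096), 4096) = 8192
// bytes, while min_alloc_size = 65536 yields
// round_up_to(max(8192, 65536), 65536) = 65536.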
void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}

int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      ceph_abort_msg("Not Support extent freelist manager");
    }
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        ceph_assert(!r);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }

  _set_per_pool_omap();

  _open_statfs();
  _set_alloc_sizes();
  _set_throttle_params();

  _set_csum();
  _set_compression();
  _set_blob_size();

  _validate_bdev();
  return 0;
}

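// Descriptive summary of the on-disk format versions handled below,
// derived from the upgrade steps (not upstream text):
//   v1  kraken-era baseline (no ondisk_format key in PREFIX_SUPER)
//   v2  min_alloc_size recorded in super; min_min_alloc_size removed
//   v3  per-pool omap key format introduced (FLAG_PER_POOL_OMAP)
//   v4  FreelistManager metadata kept within the bdev label
// _upgrade_super() applies these transitions one step at a time and
// persists the result with _prepare_ondisk_format_super().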
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PER_POOL_OMAP. Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This must be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  // done
  dout(1) << __func__ << " done" << dendl;
  return 0;
}

void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}

void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}

BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}

void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  txc->ios = ios;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}

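// Worked example for the cost formula above (illustrative numbers, not
// upstream text): a txc with two pending aios and 4096 dirty bytes
// counts ios = 1 + 2 = 3 (one extra for the kv commit); with
// throttle_cost_per_io = 670000 that gives
// cost = 3 * 670000 + 4096 = 2014096.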
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; // non-persistent in this mode

  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}

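// Descriptive sketch of the txc state machine driven by
// _txc_state_proc() below, derived from its switch cases (not upstream
// text):
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
//     -> KV_DONE -> [DEFERRED_QUEUED ... DEFERRED_CLEANUP]
//     -> FINISHING -> (done)
//
// PREPARE falls through when there are no pending aios, and KV_DONE
// skips the deferred states entirely when txc->deferred_txn is null.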
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
        mono_clock::duration lat = throttle.log_state_latency(
          *txc, logger, l_bluestore_state_aio_wait_lat);
        if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
          dout(0) << __func__ << " slow aio_wait, txc = " << txc
                  << ", latency = " << lat
                  << dendl;
        }
      }

      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone. once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so. the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          _txc_apply_kv(txc, true);
        }
      }
      {
        std::lock_guard l(kv_lock);
        kv_queue.push_back(txc);
        if (!kv_sync_in_progress) {
          kv_sync_in_progress = true;
          kv_cond.notify_one();
        }
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios)
          kv_ios++;
        kv_throttle_costs += txc->cost;
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}

void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}

void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}

void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}

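// Descriptive note for _txc_finalize_kv() below (not upstream text): if
// a transaction both allocates and releases the same extent, e.g.
// allocated = {0x10000~0x8000} and released = {0x14000~0x4000}, the
// overlap {0x14000~0x4000} is subtracted from *both* sets before the
// freelist update, since the BitmapFreelistManager debug check rejects
// allocating and releasing one region within a single transaction.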
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction. The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check. But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}

void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->state = TransContext::STATE_KV_SUBMITTED;
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
        bluestore,
        transaction_kv_submit_latency,
        txc->osr->get_sequencer_id(),
        txc->seq,
        sync_submit_transaction,
        ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
        std::lock_guard l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}

void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->state = TransContext::STATE_KV_DONE;
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}

void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush! we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
               << dendl;
    }
  }
}

void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}

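// Descriptive note (not upstream text): the release path above picks
// one of three behaviors. With bdev_enable_discard and
// bdev_async_discard the extents are queued for background discard and
// the allocator release is expected to happen from the discard
// completion path (falling back to the synchronous branch if queueing
// fails); with only bdev_enable_discard each extent is discarded
// inline; otherwise the extents are handed straight back to the
// allocator.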
void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}

void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}

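// Descriptive note on OpSequencer lifetime (not upstream text): when a
// collection is removed, its osr is parked in zombie_osr_set by
// _osr_register_zombie(). If the collection is recreated before the
// zombie drains, _osr_attach() resurrects the same osr so ops on the
// new incarnation still order after ops on the old one; otherwise
// _txc_finish() or _osr_drain_all() reaps the zombie once its queue
// empties.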
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}

void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}

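// Descriptive overview of the kv sync loop below (not upstream text):
// each iteration batches everything queued on kv_queue, optionally
// flushes the block device so finished deferred writes become stable,
// applies any not-yet-submitted transactions, and then commits one
// final synchronous transaction (synct) that also bumps the nid/blobid
// maxima and trims stable deferred keys. Completed batches are handed
// off to _kv_finalize_thread() so this thread can return to committing.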
7c673cae
FG
12073void BlueStore::_kv_sync_thread()
12074{
12075 dout(10) << __func__ << " start" << dendl;
11fdf7f2 12076 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 12077 std::unique_lock l{kv_lock};
11fdf7f2 12078 ceph_assert(!kv_sync_started);
31f18b77
FG
12079 kv_sync_started = true;
12080 kv_cond.notify_all();
7c673cae 12081 while (true) {
11fdf7f2 12082 ceph_assert(kv_committing.empty());
7c673cae
FG
12083 if (kv_queue.empty() &&
12084 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 12085 !deferred_aggressive)) {
7c673cae
FG
12086 if (kv_stop)
12087 break;
12088 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 12089 kv_sync_in_progress = false;
11fdf7f2 12090 kv_cond.wait(l);
7c673cae
FG
12091 dout(20) << __func__ << " wake" << dendl;
12092 } else {
12093 deque<TransContext*> kv_submitting;
12094 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
12095 uint64_t aios = 0, costs = 0;
12096
7c673cae
FG
12097 dout(20) << __func__ << " committing " << kv_queue.size()
12098 << " submitting " << kv_queue_unsubmitted.size()
12099 << " deferred done " << deferred_done_queue.size()
12100 << " stable " << deferred_stable_queue.size()
12101 << dendl;
12102 kv_committing.swap(kv_queue);
12103 kv_submitting.swap(kv_queue_unsubmitted);
12104 deferred_done.swap(deferred_done_queue);
12105 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
12106 aios = kv_ios;
12107 costs = kv_throttle_costs;
12108 kv_ios = 0;
12109 kv_throttle_costs = 0;
7c673cae
FG
12110 l.unlock();
12111
12112 dout(30) << __func__ << " committing " << kv_committing << dendl;
12113 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
12114 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
12115 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
12116
11fdf7f2
TL
12117 auto start = mono_clock::now();
12118
7c673cae
FG
12119 bool force_flush = false;
12120 // if bluefs is sharing the same device as data (only), then we
12121 // can rely on the bluefs commit to flush the device and make
12122 // deferred aios stable. that means that if we do have done deferred
12123 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 12124 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 12125 if (aios) {
7c673cae 12126 force_flush = true;
11fdf7f2 12127 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
12128 force_flush = true; // there's nothing else to commit!
12129 } else if (deferred_aggressive) {
12130 force_flush = true;
12131 }
11fdf7f2
TL
12132 } else {
12133 if (aios || !deferred_done.empty()) {
12134 force_flush = true;
12135 } else {
12136 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
12137 }
12138 }
7c673cae
FG
12139
12140 if (force_flush) {
31f18b77 12141 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
12142 << " force_flush=" << (int)force_flush
12143 << ", flushing, deferred done->stable" << dendl;
12144 // flush/barrier on block device
12145 bdev->flush();
12146
12147 // if we flush then deferred done are now deferred stable
12148 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
12149 deferred_done.end());
12150 deferred_done.clear();
12151 }
11fdf7f2 12152 auto after_flush = mono_clock::now();
7c673cae
FG
12153
12154 // we will use one final transaction to force a sync
12155 KeyValueDB::Transaction synct = db->get_transaction();
12156
12157 // increase {nid,blobid}_max? note that this covers both the
12158 // case where we are approaching the max and the case we passed
12159 // it. in either case, we increase the max in the earlier txn
12160 // we submit.
12161 uint64_t new_nid_max = 0, new_blobid_max = 0;
12162 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
12163 KeyValueDB::Transaction t =
12164 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12165 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
12166 bufferlist bl;
11fdf7f2 12167 encode(new_nid_max, bl);
7c673cae
FG
12168 t->set(PREFIX_SUPER, "nid_max", bl);
12169 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
12170 }
12171 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
12172 KeyValueDB::Transaction t =
12173 kv_submitting.empty() ? synct : kv_submitting.front()->t;
12174 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
12175 bufferlist bl;
11fdf7f2 12176 encode(new_blobid_max, bl);
7c673cae
FG
12177 t->set(PREFIX_SUPER, "blobid_max", bl);
12178 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
12179 }
c07f9fc5
FG
12180
12181 for (auto txc : kv_committing) {
9f95a23c 12182 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
c07f9fc5 12183 if (txc->state == TransContext::STATE_KV_QUEUED) {
9f95a23c 12184 _txc_apply_kv(txc, false);
c07f9fc5 12185 --txc->osr->kv_committing_serially;
c07f9fc5 12186 } else {
11fdf7f2 12187 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 12188 }
7c673cae
FG
12189 if (txc->had_ios) {
12190 --txc->osr->txc_with_unstable_io;
12191 }
7c673cae
FG
12192 }
12193
31f18b77
FG
12194 // release throttle *before* we commit. this allows new ops
12195 // to be prepared and enter pipeline while we are waiting on
12196 // the kv commit sync/flush. then hopefully on the next
12197 // iteration there will already be ops awake. otherwise, we
12198 // end up going to sleep, and then wake up when the very first
12199 // transaction is ready for commit.
9f95a23c 12200 throttle.release_kv_throttle(costs);
31f18b77 12201
7c673cae
FG
12202 if (bluefs &&
12203 after_flush - bluefs_last_balance >
11fdf7f2 12204 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
7c673cae 12205 bluefs_last_balance = after_flush;
11fdf7f2
TL
12206 int r = _balance_bluefs_freespace();
12207 ceph_assert(r >= 0);
7c673cae
FG
12208 }
12209
12210 // cleanup sync deferred keys
12211 for (auto b : deferred_stable) {
12212 for (auto& txc : b->txcs) {
12213 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 12214 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
12215 string key;
12216 get_deferred_key(wt.seq, &key);
12217 synct->rm_single_key(PREFIX_DEFERRED, key);
12218 }
12219 }
12220
9f95a23c
TL
12221#if defined(WITH_LTTNG)
12222 auto sync_start = mono_clock::now();
12223#endif
7c673cae 12224 // submit synct synchronously (block and wait for it to commit)
31f18b77 12225 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
12226 ceph_assert(r == 0);
12227
9f95a23c
TL
12228 int committing_size = kv_committing.size();
12229 int deferred_size = deferred_stable.size();
12230
12231#if defined(WITH_LTTNG)
12232 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
12233 for (auto txc: kv_committing) {
12234 if (txc->tracing) {
12235 tracepoint(
12236 bluestore,
12237 transaction_kv_sync_latency,
12238 txc->osr->get_sequencer_id(),
12239 txc->seq,
12240 kv_committing.size(),
12241 deferred_done.size(),
12242 deferred_stable.size(),
12243 sync_latency);
12244 }
12245 }
12246#endif
12247
11fdf7f2 12248 {
9f95a23c 12249 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
12250 if (kv_committing_to_finalize.empty()) {
12251 kv_committing_to_finalize.swap(kv_committing);
12252 } else {
12253 kv_committing_to_finalize.insert(
12254 kv_committing_to_finalize.end(),
12255 kv_committing.begin(),
12256 kv_committing.end());
12257 kv_committing.clear();
12258 }
12259 if (deferred_stable_to_finalize.empty()) {
12260 deferred_stable_to_finalize.swap(deferred_stable);
12261 } else {
12262 deferred_stable_to_finalize.insert(
12263 deferred_stable_to_finalize.end(),
12264 deferred_stable.begin(),
12265 deferred_stable.end());
12266 deferred_stable.clear();
12267 }
12268 if (!kv_finalize_in_progress) {
12269 kv_finalize_in_progress = true;
12270 kv_finalize_cond.notify_one();
12271 }
11fdf7f2 12272 }
12273
12274 if (new_nid_max) {
12275 nid_max = new_nid_max;
12276 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
12277 }
12278 if (new_blobid_max) {
12279 blobid_max = new_blobid_max;
12280 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
12281 }
12282
224ce89b 12283 {
12284 auto finish = mono_clock::now();
12285 ceph::timespan dur_flush = after_flush - start;
12286 ceph::timespan dur_kv = finish - after_flush;
12287 ceph::timespan dur = finish - start;
12288 dout(20) << __func__ << " committed " << committing_size
12289 << " cleaned " << deferred_size
12290 << " in " << dur
12291 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
12292 << dendl;
12293 log_latency("kv_flush",
12294 l_bluestore_kv_flush_lat,
12295 dur_flush,
12296 cct->_conf->bluestore_log_op_age);
12297 log_latency("kv_commit",
12298 l_bluestore_kv_commit_lat,
12299 dur_kv,
12300 cct->_conf->bluestore_log_op_age);
12301 log_latency("kv_sync",
12302 l_bluestore_kv_sync_lat,
12303 dur,
12304 cct->_conf->bluestore_log_op_age);
7c673cae 12305 }
12306
12307 if (bluefs) {
12308 if (!bluefs_extents_reclaiming.empty()) {
12309 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
12310 << bluefs_extents_reclaiming << std::dec << dendl;
12311 int r = 0;
12312 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12313 r = bdev->queue_discard(bluefs_extents_reclaiming);
12314 if (r == 0) {
12315 goto clear;
12316 }
12317 } else if (cct->_conf->bdev_enable_discard) {
12318 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
12319 bdev->discard(p.get_start(), p.get_len());
12320 }
12321 }
12322
11fdf7f2 12323 alloc->release(bluefs_extents_reclaiming);
81eedcae 12324clear:
11fdf7f2 12325 bluefs_extents_reclaiming.clear();
31f18b77 12326 }
12327 }
12328
12329 l.lock();
12330 // previously deferred "done" are now "stable" by virtue of this
12331 // commit cycle.
12332 deferred_stable_queue.swap(deferred_done);
12333 }
12334 }
12335 dout(10) << __func__ << " finish" << dendl;
12336 kv_sync_started = false;
12337}
12338
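// _kv_finalize_thread: drains the work queued above by _kv_sync_thread.
// It advances each committed txc through _txc_state_proc, retires stable
// deferred batches (which may destroy their txcs), and opportunistically
// submits pending deferred writes once the batch-size or throttle
// thresholds are reached.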
12339void BlueStore::_kv_finalize_thread()
12340{
12341 deque<TransContext*> kv_committed;
12342 deque<DeferredBatch*> deferred_stable;
12343 dout(10) << __func__ << " start" << dendl;
12344 std::unique_lock l(kv_finalize_lock);
12345 ceph_assert(!kv_finalize_started);
12346 kv_finalize_started = true;
12347 kv_finalize_cond.notify_all();
12348 while (true) {
12349 ceph_assert(kv_committed.empty());
12350 ceph_assert(deferred_stable.empty());
12351 if (kv_committing_to_finalize.empty() &&
12352 deferred_stable_to_finalize.empty()) {
12353 if (kv_finalize_stop)
12354 break;
12355 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 12356 kv_finalize_in_progress = false;
12357 kv_finalize_cond.wait(l);
12358 dout(20) << __func__ << " wake" << dendl;
12359 } else {
12360 kv_committed.swap(kv_committing_to_finalize);
12361 deferred_stable.swap(deferred_stable_to_finalize);
12362 l.unlock();
12363 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12364 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12365
12366 auto start = mono_clock::now();
12367
12368 while (!kv_committed.empty()) {
12369 TransContext *txc = kv_committed.front();
11fdf7f2 12370 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 12371 _txc_state_proc(txc);
31f18b77 12372 kv_committed.pop_front();
7c673cae 12373 }
31f18b77 12374
12375 for (auto b : deferred_stable) {
12376 auto p = b->txcs.begin();
12377 while (p != b->txcs.end()) {
12378 TransContext *txc = &*p;
12379 p = b->txcs.erase(p); // unlink here because
12380 _txc_state_proc(txc); // this may destroy txc
12381 }
12382 delete b;
12383 }
31f18b77 12384 deferred_stable.clear();
12385
12386 if (!deferred_aggressive) {
31f18b77 12387 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 12388 throttle.should_submit_deferred()) {
224ce89b 12389 deferred_try_submit();
12390 }
12391 }
12392
12393 // this is as good a place as any ...
12394 _reap_collections();
12395
11fdf7f2 12396 logger->set(l_bluestore_fragmentation,
9f95a23c 12397 (uint64_t)(alloc->get_fragmentation() * 1000));
11fdf7f2 12398
12399 log_latency("kv_final",
12400 l_bluestore_kv_final_lat,
12401 mono_clock::now() - start,
12402 cct->_conf->bluestore_log_op_age);
11fdf7f2 12403
7c673cae 12404 l.lock();
12405 }
12406 }
12407 dout(10) << __func__ << " finish" << dendl;
31f18b77 12408 kv_finalize_started = false;
12409}
12410
12411bluestore_deferred_op_t *BlueStore::_get_deferred_op(
9f95a23c 12412 TransContext *txc)
12413{
12414 if (!txc->deferred_txn) {
12415 txc->deferred_txn = new bluestore_deferred_transaction_t;
12416 }
12417 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12418 return &txc->deferred_txn->ops.back();
12419}
12420
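// _deferred_queue: attach a txc's deferred writes to its sequencer's
// pending DeferredBatch, creating the batch (and enqueuing the osr) on
// first use. Each OP_WRITE extent is registered via prepare_write so that
// overlapping deferred writes within the batch can be coalesced.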
12421void BlueStore::_deferred_queue(TransContext *txc)
12422{
12423 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 12424 deferred_lock.lock();
12425 if (!txc->osr->deferred_pending &&
12426 !txc->osr->deferred_running) {
12427 deferred_queue.push_back(*txc->osr);
12428 }
12429 if (!txc->osr->deferred_pending) {
12430 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
12431 }
12432 ++deferred_queue_size;
12433 txc->osr->deferred_pending->txcs.push_back(*txc);
12434 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12435 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12436 const auto& op = *opi;
11fdf7f2 12437 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
12438 bufferlist::const_iterator p = op.data.begin();
12439 for (auto e : op.extents) {
12440 txc->osr->deferred_pending->prepare_write(
12441 cct, wt.seq, e.offset, e.length, p);
12442 }
12443 }
12444 if (deferred_aggressive &&
12445 !txc->osr->deferred_running) {
12446 _deferred_submit_unlock(txc->osr.get());
12447 } else {
12448 deferred_lock.unlock();
12449 }
12450}
12451
224ce89b 12452void BlueStore::deferred_try_submit()
12453{
12454 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12455 << deferred_queue_size << " txcs" << dendl;
11fdf7f2 12456 std::lock_guard l(deferred_lock);
12457 vector<OpSequencerRef> osrs;
12458 osrs.reserve(deferred_queue.size());
7c673cae 12459 for (auto& osr : deferred_queue) {
12460 osrs.push_back(&osr);
12461 }
12462 for (auto& osr : osrs) {
12463 if (osr->deferred_pending) {
12464 if (!osr->deferred_running) {
12465 _deferred_submit_unlock(osr.get());
12466 deferred_lock.lock();
12467 } else {
12468 dout(20) << __func__ << " osr " << osr << " already has running"
12469 << dendl;
12470 }
12471 } else {
12472 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
12473 }
12474 }
12475
12476 deferred_last_submitted = ceph_clock_now();
12477}
12478
224ce89b 12479void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
12480{
12481 dout(10) << __func__ << " osr " << osr
12482 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12483 << dendl;
12484 ceph_assert(osr->deferred_pending);
12485 ceph_assert(!osr->deferred_running);
12486
12487 auto b = osr->deferred_pending;
12488 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 12489 ceph_assert(deferred_queue_size >= 0);
12490
12491 osr->deferred_running = osr->deferred_pending;
12492 osr->deferred_pending = nullptr;
12493
12494 deferred_lock.unlock();
12495
12496 for (auto& txc : b->txcs) {
9f95a23c 12497 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
11fdf7f2 12498 }
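  // Coalesce the batch's iomap (keyed by disk offset) into the largest
  // possible contiguous writes: keep appending while the next entry is
  // adjacent, and flush the accumulated bufferlist as one aio_write
  // whenever a gap (or the end of the map) is reached.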
12499 uint64_t start = 0, pos = 0;
12500 bufferlist bl;
12501 auto i = b->iomap.begin();
12502 while (true) {
12503 if (i == b->iomap.end() || i->first != pos) {
12504 if (bl.length()) {
12505 dout(20) << __func__ << " write 0x" << std::hex
12506 << start << "~" << bl.length()
12507 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 12508 if (!g_conf()->bluestore_debug_omit_block_device_write) {
12509 logger->inc(l_bluestore_deferred_write_ops);
12510 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12511 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 12512 ceph_assert(r == 0);
12513 }
12514 }
12515 if (i == b->iomap.end()) {
12516 break;
12517 }
12518 start = 0;
12519 pos = i->first;
12520 bl.clear();
12521 }
12522 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12523 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12524 << dendl;
12525 if (!bl.length()) {
12526 start = pos;
12527 }
12528 pos += i->second.bl.length();
12529 bl.claim_append(i->second.bl);
12530 ++i;
12531 }
224ce89b 12532
12533 bdev->aio_submit(&b->ioc);
12534}
12535
12536struct C_DeferredTrySubmit : public Context {
12537 BlueStore *store;
12538 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12539 void finish(int r) {
12540 store->deferred_try_submit();
12541 }
12542};
12543
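// _deferred_aio_finish: completion callback for a batch's aio writes.
// The batch leaves deferred_running and is parked on deferred_done_queue;
// its txcs move to STATE_DEFERRED_CLEANUP and their throttle cost is
// released. The PREFIX_DEFERRED keys themselves are only removed by the
// next kv sync cycle (see the "clean up deferred keys" step above).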
12544void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12545{
12546 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 12547 ceph_assert(osr->deferred_running);
12548 DeferredBatch *b = osr->deferred_running;
12549
12550 {
9f95a23c 12551 deferred_lock.lock();
11fdf7f2 12552 ceph_assert(osr->deferred_running == b);
12553 osr->deferred_running = nullptr;
12554 if (!osr->deferred_pending) {
181888fb 12555 dout(20) << __func__ << " dequeueing" << dendl;
12556 auto q = deferred_queue.iterator_to(*osr);
12557 deferred_queue.erase(q);
9f95a23c 12558 deferred_lock.unlock();
181888fb 12559 } else {
12560 deferred_lock.unlock();
12561 if (deferred_aggressive) {
12562 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12563 finisher.queue(new C_DeferredTrySubmit(this));
12564 } else {
12565 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12566 }
12567 }
12568 }
12569
12570 {
31f18b77 12571 uint64_t costs = 0;
11fdf7f2 12572 {
12573 for (auto& i : b->txcs) {
12574 TransContext *txc = &i;
9f95a23c 12575 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
12576 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
12577 costs += txc->cost;
12578 }
7c673cae 12579 }
9f95a23c 12580 throttle.release_deferred_throttle(costs);
12581 }
12582
9f95a23c 12583 {
11fdf7f2 12584 std::lock_guard l(kv_lock);
12585 deferred_done_queue.emplace_back(b);
12586
12587 // in the normal case, do not bother waking up the kv thread; it will
12588 // catch us on the next commit anyway.
12589 if (deferred_aggressive && !kv_sync_in_progress) {
12590 kv_sync_in_progress = true;
12591 kv_cond.notify_one();
12592 }
12593 }
12594}
12595
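// _deferred_replay: called on mount to re-apply any deferred writes whose
// keys survived in PREFIX_DEFERRED. Each value decodes to a
// bluestore_deferred_transaction_t; the resulting txc starts at
// STATE_KV_DONE, so only the deferred block-device writes are redone.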
12596int BlueStore::_deferred_replay()
12597{
12598 dout(10) << __func__ << " start" << dendl;
12599 int count = 0;
12600 int r = 0;
12601 CollectionRef ch = _get_collection(coll_t::meta());
12602 bool fake_ch = false;
12603 if (!ch) {
12604 // hmm, replaying initial mkfs?
12605 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12606 fake_ch = true;
12607 }
12608 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
12609 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12610 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12611 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12612 << dendl;
12613 bluestore_deferred_transaction_t *deferred_txn =
12614 new bluestore_deferred_transaction_t;
12615 bufferlist bl = it->value();
11fdf7f2 12616 auto p = bl.cbegin();
7c673cae 12617 try {
11fdf7f2 12618 decode(*deferred_txn, p);
12619 } catch (buffer::error& e) {
12620 derr << __func__ << " failed to decode deferred txn "
12621 << pretty_binary_string(it->key()) << dendl;
12622 delete deferred_txn;
12623 r = -EIO;
12624 goto out;
12625 }
11fdf7f2 12626 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
12627 txc->deferred_txn = deferred_txn;
12628 txc->state = TransContext::STATE_KV_DONE;
12629 _txc_state_proc(txc);
12630 }
12631 out:
12632 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 12633 _osr_register_zombie(osr);
7c673cae 12634 _osr_drain_all();
12635 if (fake_ch) {
12636 new_coll_map.clear();
12637 }
12638 dout(10) << __func__ << " completed " << count << " events" << dendl;
12639 return r;
12640}
12641
12642// ---------------------------
12643// transactions
12644
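// queue_transactions: entry point for all writes. The txc is prepared
// synchronously (_txc_add_transaction / _txc_write_nodes), any deferred
// payload is journaled under PREFIX_DEFERRED, throttles are acquired, and
// the txc then enters the async state machine. on_applied callbacks can
// fire immediately because reads are served from in-memory onode state.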
12645int BlueStore::queue_transactions(
12646 CollectionHandle& ch,
12647 vector<Transaction>& tls,
12648 TrackedOpRef op,
12649 ThreadPool::TPHandle *handle)
12650{
12651 FUNCTRACE(cct);
12652 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 12653 ObjectStore::Transaction::collect_contexts(
11fdf7f2 12654 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae 12655
12656 auto start = mono_clock::now();
12657
12658 Collection *c = static_cast<Collection*>(ch.get());
12659 OpSequencer *osr = c->osr.get();
12660 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
12661
12662 // prepare
12663 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12664 &on_commit);
12665
12666 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
12667 txc->bytes += (*p).get_num_bytes();
12668 _txc_add_transaction(txc, &(*p));
12669 }
12670 _txc_calc_cost(txc);
12671
12672 _txc_write_nodes(txc, txc->t);
12673
12674 // journal deferred items
12675 if (txc->deferred_txn) {
12676 txc->deferred_txn->seq = ++deferred_seq;
12677 bufferlist bl;
11fdf7f2 12678 encode(*txc->deferred_txn, bl);
12679 string key;
12680 get_deferred_key(txc->deferred_txn->seq, &key);
12681 txc->t->set(PREFIX_DEFERRED, key, bl);
12682 }
12683
12684 _txc_finalize_kv(txc, txc->t);
12685 if (handle)
12686 handle->suspend_tp_timeout();
12687
11fdf7f2 12688 auto tstart = mono_clock::now();
12689
12690 if (!throttle.try_start_transaction(
12691 *db,
12692 *txc,
12693 tstart)) {
7c673cae 12694 // ensure we do not block here because of deferred writes
12695 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
12696 << dendl;
12697 ++deferred_aggressive;
12698 deferred_try_submit();
12699 {
12700 // wake up any previously finished deferred events
12701 std::lock_guard l(kv_lock);
12702 if (!kv_sync_in_progress) {
12703 kv_sync_in_progress = true;
12704 kv_cond.notify_one();
12705 }
12706 }
12707 throttle.finish_start_transaction(*db, *txc, tstart);
12708 --deferred_aggressive;
7c673cae 12709 }
11fdf7f2 12710 auto tend = mono_clock::now();
12711
12712 if (handle)
12713 handle->reset_tp_timeout();
12714
12715 logger->inc(l_bluestore_txc);
12716
12717 // execute (start)
12718 _txc_state_proc(txc);
12719
12720 // we're immediately readable (unlike FileStore)
12721 for (auto c : on_applied_sync) {
12722 c->complete(0);
12723 }
12724 if (!on_applied.empty()) {
12725 if (c->commit_queue) {
12726 c->commit_queue->queue(on_applied);
12727 } else {
12728 finisher.queue(on_applied);
12729 }
12730 }
12731
12732 log_latency("submit_transact",
12733 l_bluestore_submit_lat,
12734 mono_clock::now() - start,
12735 cct->_conf->bluestore_log_op_age);
12736 log_latency("throttle_transact",
12737 l_bluestore_throttle_lat,
12738 tend - tstart,
12739 cct->_conf->bluestore_log_op_age);
12740 return 0;
12741}
12742
12743void BlueStore::_txc_aio_submit(TransContext *txc)
12744{
12745 dout(10) << __func__ << " txc " << txc << dendl;
12746 bdev->aio_submit(&txc->ioc);
12747}
12748
12749void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12750{
12751 Transaction::iterator i = t->begin();
12752
81eedcae 12753 _dump_transaction<30>(cct, t);
12754
12755 vector<CollectionRef> cvec(i.colls.size());
12756 unsigned j = 0;
12757 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12758 ++p, ++j) {
12759 cvec[j] = _get_collection(*p);
7c673cae 12760 }
11fdf7f2 12761
12762 vector<OnodeRef> ovec(i.objects.size());
12763
12764 for (int pos = 0; i.have_op(); ++pos) {
12765 Transaction::Op *op = i.decode_op();
12766 int r = 0;
12767
12768 // no coll or obj
12769 if (op->op == Transaction::OP_NOP)
12770 continue;
12771
11fdf7f2 12772
12773 // collection operations
12774 CollectionRef &c = cvec[op->cid];
12775
12776 // initialize osd_pool_id and do a smoke test that all collections belong
12777 // to the same pool
12778 spg_t pgid;
12779 if (!!c ? c->cid.is_pg(&pgid) : false) {
12780 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12781 txc->osd_pool_id == pgid.pool());
12782 txc->osd_pool_id = pgid.pool();
12783 }
12784
12785 switch (op->op) {
12786 case Transaction::OP_RMCOLL:
12787 {
12788 const coll_t &cid = i.get_cid(op->cid);
12789 r = _remove_collection(txc, cid, &c);
12790 if (!r)
12791 continue;
12792 }
12793 break;
12794
12795 case Transaction::OP_MKCOLL:
12796 {
11fdf7f2 12797 ceph_assert(!c);
12798 const coll_t &cid = i.get_cid(op->cid);
12799 r = _create_collection(txc, cid, op->split_bits, &c);
12800 if (!r)
12801 continue;
12802 }
12803 break;
12804
12805 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 12806 ceph_abort_msg("deprecated");
12807 break;
12808
12809 case Transaction::OP_SPLIT_COLLECTION2:
12810 {
12811 uint32_t bits = op->split_bits;
12812 uint32_t rem = op->split_rem;
12813 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12814 if (!r)
12815 continue;
12816 }
12817 break;
12818
12819 case Transaction::OP_MERGE_COLLECTION:
12820 {
12821 uint32_t bits = op->split_bits;
12822 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12823 if (!r)
12824 continue;
12825 }
12826 break;
12827
12828 case Transaction::OP_COLL_HINT:
12829 {
12830 uint32_t type = op->hint_type;
12831 bufferlist hint;
12832 i.decode_bl(hint);
11fdf7f2 12833 auto hiter = hint.cbegin();
12834 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12835 uint32_t pg_num;
12836 uint64_t num_objs;
12837 decode(pg_num, hiter);
12838 decode(num_objs, hiter);
12839 dout(10) << __func__ << " collection hint objects is a no-op, "
12840 << " pg_num " << pg_num << " num_objects " << num_objs
12841 << dendl;
12842 } else {
12843 // Ignore the hint
12844 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12845 }
12846 continue;
12847 }
12848 break;
12849
12850 case Transaction::OP_COLL_SETATTR:
12851 r = -EOPNOTSUPP;
12852 break;
12853
12854 case Transaction::OP_COLL_RMATTR:
12855 r = -EOPNOTSUPP;
12856 break;
12857
12858 case Transaction::OP_COLL_RENAME:
11fdf7f2 12859 ceph_abort_msg("not implemented");
12860 break;
12861 }
12862 if (r < 0) {
12863 derr << __func__ << " error " << cpp_strerror(r)
12864 << " not handled on operation " << op->op
12865 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 12866 _dump_transaction<0>(cct, t);
11fdf7f2 12867 ceph_abort_msg("unexpected error");
12868 }
12869
12870 // these operations implicitly create the object
12871 bool create = false;
12872 if (op->op == Transaction::OP_TOUCH ||
9f95a23c 12873 op->op == Transaction::OP_CREATE ||
12874 op->op == Transaction::OP_WRITE ||
12875 op->op == Transaction::OP_ZERO) {
12876 create = true;
12877 }
12878
12879 // object operations
9f95a23c 12880 std::unique_lock l(c->lock);
12881 OnodeRef &o = ovec[op->oid];
12882 if (!o) {
12883 ghobject_t oid = i.get_oid(op->oid);
9f95a23c 12884 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
12885 }
12886 if (!create && (!o || !o->exists)) {
12887 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
12888 << i.get_oid(op->oid) << dendl;
12889 r = -ENOENT;
12890 goto endop;
12891 }
12892
12893 switch (op->op) {
9f95a23c 12894 case Transaction::OP_CREATE:
12895 case Transaction::OP_TOUCH:
12896 r = _touch(txc, c, o);
12897 break;
12898
12899 case Transaction::OP_WRITE:
12900 {
12901 uint64_t off = op->off;
12902 uint64_t len = op->len;
12903 uint32_t fadvise_flags = i.get_fadvise_flags();
12904 bufferlist bl;
12905 i.decode_bl(bl);
12906 r = _write(txc, c, o, off, len, bl, fadvise_flags);
12907 }
12908 break;
12909
12910 case Transaction::OP_ZERO:
12911 {
12912 uint64_t off = op->off;
12913 uint64_t len = op->len;
12914 r = _zero(txc, c, o, off, len);
12915 }
12916 break;
12917
12918 case Transaction::OP_TRIMCACHE:
12919 {
12920 // deprecated, no-op
12921 }
12922 break;
12923
12924 case Transaction::OP_TRUNCATE:
12925 {
12926 uint64_t off = op->off;
35e4c445 12927 r = _truncate(txc, c, o, off);
12928 }
12929 break;
12930
12931 case Transaction::OP_REMOVE:
12932 {
12933 r = _remove(txc, c, o);
12934 }
12935 break;
12936
12937 case Transaction::OP_SETATTR:
12938 {
12939 string name = i.decode_string();
12940 bufferptr bp;
12941 i.decode_bp(bp);
12942 r = _setattr(txc, c, o, name, bp);
12943 }
12944 break;
12945
12946 case Transaction::OP_SETATTRS:
12947 {
12948 map<string, bufferptr> aset;
12949 i.decode_attrset(aset);
12950 r = _setattrs(txc, c, o, aset);
12951 }
12952 break;
12953
12954 case Transaction::OP_RMATTR:
12955 {
12956 string name = i.decode_string();
12957 r = _rmattr(txc, c, o, name);
12958 }
12959 break;
12960
12961 case Transaction::OP_RMATTRS:
12962 {
12963 r = _rmattrs(txc, c, o);
12964 }
12965 break;
12966
12967 case Transaction::OP_CLONE:
12968 {
12969 OnodeRef& no = ovec[op->dest_oid];
12970 if (!no) {
12971 const ghobject_t& noid = i.get_oid(op->dest_oid);
12972 no = c->get_onode(noid, true);
12973 }
12974 r = _clone(txc, c, o, no);
12975 }
12976 break;
12977
12978 case Transaction::OP_CLONERANGE:
11fdf7f2 12979 ceph_abort_msg("deprecated");
12980 break;
12981
12982 case Transaction::OP_CLONERANGE2:
12983 {
12984 OnodeRef& no = ovec[op->dest_oid];
12985 if (!no) {
12986 const ghobject_t& noid = i.get_oid(op->dest_oid);
12987 no = c->get_onode(noid, true);
12988 }
12989 uint64_t srcoff = op->off;
12990 uint64_t len = op->len;
12991 uint64_t dstoff = op->dest_off;
12992 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
12993 }
12994 break;
12995
12996 case Transaction::OP_COLL_ADD:
11fdf7f2 12997 ceph_abort_msg("not implemented");
12998 break;
12999
13000 case Transaction::OP_COLL_REMOVE:
11fdf7f2 13001 ceph_abort_msg("not implemented");
13002 break;
13003
13004 case Transaction::OP_COLL_MOVE:
11fdf7f2 13005 ceph_abort_msg("deprecated");
13006 break;
13007
13008 case Transaction::OP_COLL_MOVE_RENAME:
13009 case Transaction::OP_TRY_RENAME:
13010 {
11fdf7f2 13011 ceph_assert(op->cid == op->dest_cid);
13012 const ghobject_t& noid = i.get_oid(op->dest_oid);
13013 OnodeRef& no = ovec[op->dest_oid];
13014 if (!no) {
13015 no = c->get_onode(noid, false);
13016 }
13017 r = _rename(txc, c, o, no, noid);
13018 }
13019 break;
13020
13021 case Transaction::OP_OMAP_CLEAR:
13022 {
13023 r = _omap_clear(txc, c, o);
13024 }
13025 break;
13026 case Transaction::OP_OMAP_SETKEYS:
13027 {
13028 bufferlist aset_bl;
13029 i.decode_attrset_bl(&aset_bl);
13030 r = _omap_setkeys(txc, c, o, aset_bl);
13031 }
13032 break;
13033 case Transaction::OP_OMAP_RMKEYS:
13034 {
13035 bufferlist keys_bl;
13036 i.decode_keyset_bl(&keys_bl);
13037 r = _omap_rmkeys(txc, c, o, keys_bl);
13038 }
13039 break;
13040 case Transaction::OP_OMAP_RMKEYRANGE:
13041 {
13042 string first, last;
13043 first = i.decode_string();
13044 last = i.decode_string();
13045 r = _omap_rmkey_range(txc, c, o, first, last);
13046 }
13047 break;
13048 case Transaction::OP_OMAP_SETHEADER:
13049 {
13050 bufferlist bl;
13051 i.decode_bl(bl);
13052 r = _omap_setheader(txc, c, o, bl);
13053 }
13054 break;
13055
13056 case Transaction::OP_SETALLOCHINT:
13057 {
13058 r = _set_alloc_hint(txc, c, o,
13059 op->expected_object_size,
13060 op->expected_write_size,
13061 op->alloc_hint_flags);
13062 }
13063 break;
13064
13065 default:
11fdf7f2 13066 derr << __func__ << " bad op " << op->op << dendl;
13067 ceph_abort();
13068 }
13069
13070 endop:
13071 if (r < 0) {
13072 bool ok = false;
13073
13074 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
13075 op->op == Transaction::OP_CLONE ||
13076 op->op == Transaction::OP_CLONERANGE2 ||
13077 op->op == Transaction::OP_COLL_ADD ||
13078 op->op == Transaction::OP_SETATTR ||
13079 op->op == Transaction::OP_SETATTRS ||
13080 op->op == Transaction::OP_RMATTR ||
13081 op->op == Transaction::OP_OMAP_SETKEYS ||
13082 op->op == Transaction::OP_OMAP_RMKEYS ||
13083 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
13084 op->op == Transaction::OP_OMAP_SETHEADER))
13085 // -ENOENT is usually okay
13086 ok = true;
13087 if (r == -ENODATA)
13088 ok = true;
13089
13090 if (!ok) {
13091 const char *msg = "unexpected error code";
13092
13093 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
13094 op->op == Transaction::OP_CLONE ||
13095 op->op == Transaction::OP_CLONERANGE2))
13096 msg = "ENOENT on clone suggests osd bug";
13097
13098 if (r == -ENOSPC)
13099 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13100 // by partially applying transactions.
13101 msg = "ENOSPC from bluestore, misconfigured cluster";
13102
13103 if (r == -ENOTEMPTY) {
13104 msg = "ENOTEMPTY suggests garbage data in osd data dir";
13105 }
13106
13107 derr << __func__ << " error " << cpp_strerror(r)
13108 << " not handled on operation " << op->op
13109 << " (op " << pos << ", counting from 0)"
13110 << dendl;
13111 derr << msg << dendl;
81eedcae 13112 _dump_transaction<0>(cct, t);
11fdf7f2 13113 ceph_abort_msg("unexpected error");
13114 }
13115 }
13116 }
13117}
13118
13119
13120
13121// -----------------
13122// write operations
13123
13124int BlueStore::_touch(TransContext *txc,
13125 CollectionRef& c,
13126 OnodeRef &o)
13127{
13128 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
13129 int r = 0;
13130 _assign_nid(txc, o);
13131 txc->write_onode(o);
13132 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
13133 return r;
13134}
13135
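// _pad_zeros: zero-fill *bl on both sides so that *offset and the length
// become chunk_size-aligned, adjusting *offset downward as needed. A
// worked example (values assumed for illustration): with chunk_size =
// 0x1000, a 0x100-byte write at offset 0x1080 becomes a 0x1000-byte write
// at offset 0x1000, with 0x80 front and 0xe80 back padding counted in
// l_bluestore_write_pad_bytes.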
13136void BlueStore::_pad_zeros(
13137 bufferlist *bl, uint64_t *offset,
13138 uint64_t chunk_size)
13139{
13140 auto length = bl->length();
13141 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
13142 << " chunk_size 0x" << chunk_size << std::dec << dendl;
13143 dout(40) << "before:\n";
13144 bl->hexdump(*_dout);
13145 *_dout << dendl;
13146 // front
13147 size_t front_pad = *offset % chunk_size;
13148 size_t back_pad = 0;
13149 size_t pad_count = 0;
13150 if (front_pad) {
13151 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
13152 bufferptr z = buffer::create_small_page_aligned(chunk_size);
224ce89b 13153 z.zero(0, front_pad, false);
7c673cae 13154 pad_count += front_pad;
9f95a23c 13155 bl->begin().copy(front_copy, z.c_str() + front_pad);
13156 if (front_copy + front_pad < chunk_size) {
13157 back_pad = chunk_size - (length + front_pad);
224ce89b 13158 z.zero(front_pad + length, back_pad, false);
13159 pad_count += back_pad;
13160 }
13161 bufferlist old, t;
13162 old.swap(*bl);
13163 t.substr_of(old, front_copy, length - front_copy);
13164 bl->append(z);
13165 bl->claim_append(t);
13166 *offset -= front_pad;
224ce89b 13167 length += pad_count;
13168 }
13169
13170 // back
13171 uint64_t end = *offset + length;
13172 unsigned back_copy = end % chunk_size;
13173 if (back_copy) {
11fdf7f2 13174 ceph_assert(back_pad == 0);
7c673cae 13175 back_pad = chunk_size - back_copy;
11fdf7f2 13176 ceph_assert(back_copy <= length);
7c673cae 13177 bufferptr tail(chunk_size);
9f95a23c 13178 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
224ce89b 13179 tail.zero(back_copy, back_pad, false);
13180 bufferlist old;
13181 old.swap(*bl);
13182 bl->substr_of(old, 0, length - back_copy);
13183 bl->append(tail);
13184 length += back_pad;
13185 pad_count += back_pad;
13186 }
13187 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
13188 << back_pad << " on front/back, now 0x" << *offset << "~"
13189 << length << std::dec << dendl;
13190 dout(40) << "after:\n";
13191 bl->hexdump(*_dout);
13192 *_dout << dendl;
13193 if (pad_count)
13194 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 13195 ceph_assert(bl->length() == length);
13196}
13197
13198void BlueStore::_do_write_small(
13199 TransContext *txc,
13200 CollectionRef &c,
13201 OnodeRef o,
13202 uint64_t offset, uint64_t length,
13203 bufferlist::iterator& blp,
13204 WriteContext *wctx)
13205{
13206 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13207 << std::dec << dendl;
11fdf7f2 13208 ceph_assert(length < min_alloc_size);
13209 uint64_t end_offs = offset + length;
13210
13211 logger->inc(l_bluestore_write_small);
13212 logger->inc(l_bluestore_write_small_bytes, length);
13213
13214 bufferlist bl;
13215 blp.copy(length, bl);
13216
13217 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
13218 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13219 uint32_t alloc_len = min_alloc_size;
13220 auto offset0 = p2align<uint64_t>(offset, alloc_len);
13221
13222 bool any_change;
13223
13224 // search for a suitable extent in both the forward and reverse directions
13225 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
13226 // range, then check whether the blob can be reused via can_reuse_blob() or
13227 // whether a direct/deferred write applies (the latter only for extents that
13228 // include 'offset' or lie above it).
13229 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
13230
13231 // Look for an existing mutable blob we can use.
13232 auto begin = o->extent_map.extent_map.begin();
13233 auto end = o->extent_map.extent_map.end();
13234 auto ep = o->extent_map.seek_lextent(offset);
13235 if (ep != begin) {
13236 --ep;
13237 if (ep->blob_end() <= offset) {
13238 ++ep;
13239 }
13240 }
13241 auto prev_ep = ep;
13242 if (prev_ep != begin) {
13243 --prev_ep;
13244 } else {
13245 prev_ep = end; // skip the reverse check; it would duplicate ep
13246 }
13247
13248 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
13249 // Don't inspect more blobs than the number of min-alloc units
13250 // that fit into two max-size blobs
13251 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
13252 bool above_blob_threshold = false;
13253
13254 inspected_blobs.reserve(blob_threshold);
13255
13256 uint64_t max_off = 0;
13257 auto start_ep = ep;
13258 auto end_ep = ep; // exclusive upper bound
13259 do {
13260 any_change = false;
13261
13262 if (ep != end && ep->logical_offset < offset + max_bsize) {
13263 BlobRef b = ep->blob;
13264 if (!above_blob_threshold) {
13265 inspected_blobs.insert(&b->get_blob());
13266 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13267 }
13268 max_off = ep->logical_end();
7c673cae 13269 auto bstart = ep->blob_start();
eafe8130 13270
13271 dout(20) << __func__ << " considering " << *b
13272 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
13273 if (bstart >= end_offs) {
13274 dout(20) << __func__ << " ignoring distant " << *b << dendl;
13275 } else if (!b->get_blob().is_mutable()) {
13276 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
13277 } else if (ep->logical_offset % min_alloc_size !=
13278 ep->blob_offset % min_alloc_size) {
13279 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
13280 } else {
13281 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13282 // can we pad our head/tail out with zeros?
13283 uint64_t head_pad, tail_pad;
13284 head_pad = p2phase(offset, chunk_size);
13285 tail_pad = p2nphase(end_offs, chunk_size);
13286 if (head_pad || tail_pad) {
13287 o->extent_map.fault_range(db, offset - head_pad,
13288 end_offs - offset + head_pad + tail_pad);
13289 }
13290 if (head_pad &&
13291 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
13292 head_pad = 0;
13293 }
13294 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
13295 tail_pad = 0;
13296 }
13297
13298 uint64_t b_off = offset - head_pad - bstart;
13299 uint64_t b_len = length + head_pad + tail_pad;
13300
13301 // direct write into unused blocks of an existing mutable blob?
13302 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
13303 b->get_blob().get_ondisk_length() >= b_off + b_len &&
13304 b->get_blob().is_unused(b_off, b_len) &&
13305 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 13306 _apply_padding(head_pad, tail_pad, bl);
13307
13308 dout(20) << __func__ << " write to unused 0x" << std::hex
13309 << b_off << "~" << b_len
13310 << " pad 0x" << head_pad << " + 0x" << tail_pad
13311 << std::dec << " of mutable " << *b << dendl;
224ce89b 13312 _buffer_cache_write(txc, b, b_off, bl,
13313 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13314
11fdf7f2 13315 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13316 if (b_len <= prefer_deferred_size) {
13317 dout(20) << __func__ << " deferring small 0x" << std::hex
13318 << b_len << std::dec << " unused write via deferred" << dendl;
9f95a23c 13319 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13320 op->op = bluestore_deferred_op_t::OP_WRITE;
13321 b->get_blob().map(
13322 b_off, b_len,
13323 [&](uint64_t offset, uint64_t length) {
13324 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13325 return 0;
13326 });
224ce89b 13327 op->data = bl;
13328 } else {
13329 b->get_blob().map_bl(
224ce89b 13330 b_off, bl,
13331 [&](uint64_t offset, bufferlist& t) {
13332 bdev->aio_write(offset, t,
13333 &txc->ioc, wctx->buffered);
13334 });
13335 }
13336 }
224ce89b 13337 b->dirty_blob().calc_csum(b_off, bl);
13338 dout(20) << __func__ << " lex old " << *ep << dendl;
13339 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13340 b,
13341 &wctx->old_extents);
13342 b->dirty_blob().mark_used(le->blob_offset, le->length);
13343 txc->statfs_delta.stored() += le->length;
13344 dout(20) << __func__ << " lex " << *le << dendl;
13345 logger->inc(l_bluestore_write_small_unused);
13346 return;
13347 }
13348 // read some data to fill out the chunk?
13349 uint64_t head_read = p2phase(b_off, chunk_size);
13350 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
13351 if ((head_read || tail_read) &&
13352 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13353 head_read + tail_read < min_alloc_size) {
13354 b_off -= head_read;
13355 b_len += head_read + tail_read;
13356
13357 } else {
13358 head_read = tail_read = 0;
13359 }
13360
13361 // chunk-aligned deferred overwrite?
13362 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13363 b_off % chunk_size == 0 &&
13364 b_len % chunk_size == 0 &&
13365 b->get_blob().is_allocated(b_off, b_len)) {
13366
224ce89b 13367 _apply_padding(head_pad, tail_pad, bl);
13368
13369 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13370 << " and tail 0x" << tail_read << std::dec << dendl;
13371 if (head_read) {
13372 bufferlist head_bl;
13373 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13374 head_bl, 0);
11fdf7f2 13375 ceph_assert(r >= 0 && r <= (int)head_read);
13376 size_t zlen = head_read - r;
13377 if (zlen) {
13378 head_bl.append_zero(zlen);
13379 logger->inc(l_bluestore_write_pad_bytes, zlen);
13380 }
13381 head_bl.claim_append(bl);
13382 bl.swap(head_bl);
13383 logger->inc(l_bluestore_write_penalty_read_ops);
13384 }
13385 if (tail_read) {
13386 bufferlist tail_bl;
13387 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13388 tail_bl, 0);
11fdf7f2 13389 ceph_assert(r >= 0 && r <= (int)tail_read);
13390 size_t zlen = tail_read - r;
13391 if (zlen) {
13392 tail_bl.append_zero(zlen);
13393 logger->inc(l_bluestore_write_pad_bytes, zlen);
13394 }
224ce89b 13395 bl.claim_append(tail_bl);
13396 logger->inc(l_bluestore_write_penalty_read_ops);
13397 }
13398 logger->inc(l_bluestore_write_small_pre_read);
13399
224ce89b 13400 _buffer_cache_write(txc, b, b_off, bl,
13401 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13402
7c673cae 13403 if (b->get_blob().csum_type) {
224ce89b 13404 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 13405 }
13406
13407 if (!g_conf()->bluestore_debug_omit_block_device_write) {
9f95a23c 13408 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13409 op->op = bluestore_deferred_op_t::OP_WRITE;
13410 int r = b->get_blob().map(
13411 b_off, b_len,
13412 [&](uint64_t offset, uint64_t length) {
13413 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13414 return 0;
13415 });
13416 ceph_assert(r == 0);
13417 op->data.claim(bl);
13418 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13419 << b_len << std::dec << " of mutable " << *b
13420 << " at " << op->extents << dendl;
13421 }
13422
13423 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13424 b, &wctx->old_extents);
13425 b->dirty_blob().mark_used(le->blob_offset, le->length);
13426 txc->statfs_delta.stored() += le->length;
13427 dout(20) << __func__ << " lex " << *le << dendl;
13428 logger->inc(l_bluestore_write_small_deferred);
13429 return;
13430 }
13431 // try to reuse blob if we can
13432 if (b->can_reuse_blob(min_alloc_size,
13433 max_bsize,
13434 offset0 - bstart,
13435 &alloc_len)) {
11fdf7f2 13436 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13437 // fit into reused blob
13438 // Need to check for pending writes that want to reuse the same
13439 // pextent. The rationale is that during GC two chunks from garbage
13440 // blobs (compressed?) can share logical space within the same AU.
13441 // That in turn might be caused by an unaligned len in clone_range2.
13442 // Hence the second write would fail when attempting to reuse the
13443 // blob in do_alloc_write().
13444 if (!wctx->has_conflict(b,
13445 offset0,
13446 offset0 + alloc_len,
13447 min_alloc_size)) {
13448
13449 // we can't reuse pad_head/pad_tail since they might be truncated
13450 // due to existing extents
13451 uint64_t b_off = offset - bstart;
13452 uint64_t b_off0 = b_off;
13453 _pad_zeros(&bl, &b_off0, chunk_size);
13454
13455 dout(20) << __func__ << " reuse blob " << *b << std::hex
13456 << " (0x" << b_off0 << "~" << bl.length() << ")"
13457 << " (0x" << b_off << "~" << length << ")"
13458 << std::dec << dendl;
13459
13460 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13461 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13462 false, false);
13463 logger->inc(l_bluestore_write_small_unused);
13464 return;
13465 }
13466 }
13467 }
13468 ++ep;
eafe8130 13469 end_ep = ep;
13470 any_change = true;
13471 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13472
13473 // check extent for reuse in reverse order
13474 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13475 BlobRef b = prev_ep->blob;
13476 if (!above_blob_threshold) {
13477 inspected_blobs.insert(&b->get_blob());
13478 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13479 }
13480 start_ep = prev_ep;
13481 auto bstart = prev_ep->blob_start();
13482 dout(20) << __func__ << " considering " << *b
13483 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 13484 if (b->can_reuse_blob(min_alloc_size,
13485 max_bsize,
13486 offset0 - bstart,
13487 &alloc_len)) {
11fdf7f2 13488 ceph_assert(alloc_len == min_alloc_size); // expecting data always
13489 // fit into reused blob
13490 // Need to check for pending writes that want to reuse the same
13491 // pextent. The rationale is that during GC two chunks from garbage
13492 // blobs (compressed?) can share logical space within the same AU.
13493 // That in turn might be caused by an unaligned len in clone_range2.
13494 // Hence the second write would fail when attempting to reuse the
13495 // blob in do_alloc_write().
13496 if (!wctx->has_conflict(b,
13497 offset0,
13498 offset0 + alloc_len,
13499 min_alloc_size)) {
13500
13501 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13502 uint64_t b_off = offset - bstart;
13503 uint64_t b_off0 = b_off;
13504 _pad_zeros(&bl, &b_off0, chunk_size);
13505
13506 dout(20) << __func__ << " reuse blob " << *b << std::hex
13507 << " (0x" << b_off0 << "~" << bl.length() << ")"
13508 << " (0x" << b_off << "~" << length << ")"
13509 << std::dec << dendl;
13510
13511 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13512 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13513 false, false);
13514 logger->inc(l_bluestore_write_small_unused);
13515 return;
13516 }
13517 }
13518 if (prev_ep != begin) {
13519 --prev_ep;
13520 any_change = true;
13521 } else {
13522 prev_ep = end; // to avoid useless first extent re-check
13523 }
13524 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13525 } while (any_change);
13526
13527 if (above_blob_threshold) {
13528 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13529 << " " << std::hex << min_off << "~" << max_off << std::dec
13530 << dendl;
13531 ceph_assert(start_ep != end_ep);
13532 for (auto ep = start_ep; ep != end_ep; ++ep) {
13533 dout(20) << __func__ << " inserting for GC "
13534 << std::hex << ep->logical_offset << "~" << ep->length
13535 << std::dec << dendl;
13536
13537 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13538 }
13539 // insert newly written extent to GC
13540 wctx->extents_to_gc.union_insert(offset, length);
13541 dout(20) << __func__ << " inserting (last) for GC "
13542 << std::hex << offset << "~" << length
13543 << std::dec << dendl;
13544 }
7c673cae 13545 // new blob.
7c673cae 13546 BlobRef b = c->new_blob();
11fdf7f2 13547 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
13548 uint64_t b_off0 = b_off;
13549 _pad_zeros(&bl, &b_off0, block_size);
13550 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13551 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13552 min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
13553 // doesn't match disk one only
13554 true);
13555
13556 return;
13557}
13558
13559void BlueStore::_do_write_big(
13560 TransContext *txc,
13561 CollectionRef &c,
13562 OnodeRef o,
13563 uint64_t offset, uint64_t length,
13564 bufferlist::iterator& blp,
13565 WriteContext *wctx)
13566{
13567 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13568 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13569 << " compress " << (int)wctx->compress
13570 << dendl;
13571 logger->inc(l_bluestore_write_big);
13572 logger->inc(l_bluestore_write_big_bytes, length);
13573 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11fdf7f2 13574 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
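  // Carve the remaining range into blob-sized chunks: each pass consumes
  // up to max_bsize bytes, first trying to place them in a reusable
  // mutable blob found near 'offset' (scanning forward and backward),
  // and falling back to a fresh blob otherwise.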
13575 while (length > 0) {
13576 bool new_blob = false;
11fdf7f2 13577 uint32_t l = std::min(max_bsize, length);
13578 BlobRef b;
13579 uint32_t b_off = 0;
13580
13581 // attempt to reuse an existing blob
13582 if (!wctx->compress) {
13583 // look for an existing mutable blob we can reuse
13584 auto begin = o->extent_map.extent_map.begin();
13585 auto end = o->extent_map.extent_map.end();
13586 auto ep = o->extent_map.seek_lextent(offset);
13587 auto prev_ep = ep;
13588 if (prev_ep != begin) {
13589 --prev_ep;
13590 } else {
13591 prev_ep = end; // skip the reverse check; it would duplicate ep
13592 }
13593 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13594 // search for a suitable extent in both the forward and reverse directions
13595 // within the [offset - target_max_blob_size, offset + target_max_blob_size]
224ce89b 13596 // range, then check whether the blob can be reused via can_reuse_blob().
13597 bool any_change;
13598 do {
13599 any_change = false;
13600 if (ep != end && ep->logical_offset < offset + max_bsize) {
13601 if (offset >= ep->blob_start() &&
224ce89b 13602 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13603 offset - ep->blob_start(),
13604 &l)) {
13605 b = ep->blob;
13606 b_off = offset - ep->blob_start();
13607 prev_ep = end; // to avoid check below
13608 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13609 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13610 } else {
13611 ++ep;
13612 any_change = true;
13613 }
13614 }
13615
13616 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 13617 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
13618 offset - prev_ep->blob_start(),
13619 &l)) {
13620 b = prev_ep->blob;
13621 b_off = offset - prev_ep->blob_start();
13622 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13623 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
13624 } else if (prev_ep != begin) {
13625 --prev_ep;
13626 any_change = true;
13627 } else {
13628 prev_ep = end; // to avoid useless first extent re-check
13629 }
13630 }
13631 } while (b == nullptr && any_change);
13632 }
13633 if (b == nullptr) {
13634 b = c->new_blob();
13635 b_off = 0;
13636 new_blob = true;
13637 }
13638
13639 bufferlist t;
13640 blp.copy(l, t);
13641 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13642 offset += l;
13643 length -= l;
13644 logger->inc(l_bluestore_write_big_blobs);
13645 }
13646}
13647
13648int BlueStore::_do_alloc_write(
13649 TransContext *txc,
13650 CollectionRef coll,
13651 OnodeRef o,
13652 WriteContext *wctx)
13653{
13654 dout(20) << __func__ << " txc " << txc
13655 << " " << wctx->writes.size() << " blobs"
13656 << dendl;
13657 if (wctx->writes.empty()) {
13658 return 0;
13659 }
13660
13661 CompressorRef c;
13662 double crr = 0;
13663 if (wctx->compress) {
13664 c = select_option(
13665 "compression_algorithm",
13666 compressor,
13667 [&]() {
13668 string val;
13669 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13670 CompressorRef cp = compressor;
13671 if (!cp || cp->get_type_name() != val) {
13672 cp = Compressor::create(cct, val);
13673 if (!cp) {
13674 if (_set_compression_alert(false, val.c_str())) {
13675 derr << __func__ << " unable to initialize " << val.c_str()
13676 << " compressor" << dendl;
13677 }
13678 }
13679 }
13680 return boost::optional<CompressorRef>(cp);
13681 }
13682 return boost::optional<CompressorRef>();
13683 }
13684 );
13685
13686 crr = select_option(
13687 "compression_required_ratio",
13688 cct->_conf->bluestore_compression_required_ratio,
13689 [&]() {
13690 double val;
3efd9988 13691 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
13692 return boost::optional<double>(val);
13693 }
13694 return boost::optional<double>();
13695 }
13696 );
13697 }
13698
13699 // checksum
11fdf7f2 13700 int64_t csum = csum_type.load();
13701 csum = select_option(
13702 "csum_type",
13703 csum,
13704 [&]() {
11fdf7f2 13705 int64_t val;
3efd9988 13706 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 13707 return boost::optional<int64_t>(val);
7c673cae 13708 }
11fdf7f2 13709 return boost::optional<int64_t>();
13710 }
13711 );
13712
13713 // compress (as needed) and calc needed space
13714 uint64_t need = 0;
11fdf7f2 13715 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 13716 for (auto& wi : wctx->writes) {
3efd9988 13717 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 13718 auto start = mono_clock::now();
13719
13720 // compress
13721 ceph_assert(wi.b_off == 0);
13722 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 13723
13724 // FIXME: memory alignment here is bad
13725 bufferlist t;
3efd9988 13726 int r = c->compress(wi.bl, t);
3efd9988 13727 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 13728 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
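      // Acceptance budget sketch (numbers assumed for illustration): with
      // blob_length = 0x10000, crr = 0.875 and min_alloc_size = 0x1000,
      // want_len = p2roundup(0xe000, 0x1000) = 0xe000; the compressed copy
      // is kept only if its rounded-up length fits that budget and is
      // strictly smaller than the original blob_length.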
13729 bool rejected = false;
13730 uint64_t compressed_len = t.length();
13731 // do an approximate (fast) estimate of the resulting blob size
13732 // that doesn't take header overhead into account
11fdf7f2 13733 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
13734 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13735 bluestore_compression_header_t chdr;
13736 chdr.type = c->get_type();
13737 chdr.length = t.length();
13738 encode(chdr, wi.compressed_bl);
13739 wi.compressed_bl.claim_append(t);
13740
13741 compressed_len = wi.compressed_bl.length();
11fdf7f2 13742 result_len = p2roundup(compressed_len, min_alloc_size);
13743 if (result_len <= want_len && result_len < wi.blob_length) {
13744 // Cool. We compressed at least as much as we were hoping to.
13745 // pad out to min_alloc_size
13746 wi.compressed_bl.append_zero(result_len - compressed_len);
13747 wi.compressed_len = compressed_len;
13748 wi.compressed = true;
13749 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13750 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13751 << " -> 0x" << compressed_len << " => 0x" << result_len
13752 << " with " << c->get_type()
13753 << std::dec << dendl;
13754 txc->statfs_delta.compressed() += compressed_len;
13755 txc->statfs_delta.compressed_original() += wi.blob_length;
13756 txc->statfs_delta.compressed_allocated() += result_len;
13757 logger->inc(l_bluestore_compress_success_count);
13758 need += result_len;
13759 } else {
13760 rejected = true;
13761 }
13762 } else if (r != 0) {
13763 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13764 << " bytes compressed using " << c->get_type_name()
13765 << std::dec
13766 << " failed with errcode = " << r
13767 << ", leaving uncompressed"
13768 << dendl;
13769 logger->inc(l_bluestore_compress_rejected_count);
13770 need += wi.blob_length;
7c673cae 13771 } else {
13772 rejected = true;
13773 }
13774
13775 if (rejected) {
3efd9988 13776 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 13777 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
13778 << " with " << c->get_type()
13779 << ", which is more than required 0x" << want_len_raw
7c673cae 13780 << " -> 0x" << want_len
13781 << ", leaving uncompressed"
13782 << std::dec << dendl;
13783 logger->inc(l_bluestore_compress_rejected_count);
13784 need += wi.blob_length;
7c673cae 13785 }
13786 log_latency("compress@_do_alloc_write",
13787 l_bluestore_compress_lat,
13788 mono_clock::now() - start,
13789 cct->_conf->bluestore_log_op_age );
13790 } else {
13791 need += wi.blob_length;
7c673cae 13792 }
3efd9988 13793 }
a8e16298 13794 PExtentVector prealloc;
3efd9988 13795 prealloc.reserve(2 * wctx->writes.size());
11fdf7f2 13796 int64_t prealloc_left = 0;
13797 prealloc_left = alloc->allocate(
13798 need, min_alloc_size, need,
13799 0, &prealloc);
eafe8130 13800 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 13801 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 13802 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
13803 << " min_alloc_size 0x" << min_alloc_size
13804 << " available 0x " << alloc->get_free()
13805 << std::dec << dendl;
13806 if (prealloc.size()) {
13807 alloc->release(prealloc);
13808 }
13809 return -ENOSPC;
13810 }
9f95a23c 13811 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
a8e16298 13812
13813 dout(20) << __func__ << " prealloc " << prealloc << dendl;
13814 auto prealloc_pos = prealloc.begin();
13815
13816 for (auto& wi : wctx->writes) {
13817 BlobRef b = wi.b;
13818 bluestore_blob_t& dblob = b->dirty_blob();
13819 uint64_t b_off = wi.b_off;
13820 bufferlist *l = &wi.bl;
13821 uint64_t final_length = wi.blob_length;
13822 uint64_t csum_length = wi.blob_length;
13823 if (wi.compressed) {
13824 final_length = wi.compressed_bl.length();
13825 csum_length = final_length;
13826 l = &wi.compressed_bl;
13827 dblob.set_compressed(wi.blob_length, wi.compressed_len);
13828 } else if (wi.new_blob) {
7c673cae 13829 // initialize newly created blob only
13830 ceph_assert(dblob.is_mutable());
13831 unsigned csum_order;
13832 if (l->length() != wi.blob_length) {
13833 // hrm, maybe we could do better here, but let's not bother.
13834 dout(20) << __func__ << " forcing csum_order to block_size_order "
13835 << block_size_order << dendl;
31f18b77 13836 csum_order = block_size_order;
13837 } else {
13838 csum_order = std::min(wctx->csum_order, ctz(l->length()));
13839 }
13840 // try to align blob with max_blob_size to improve
13841 // its reuse ratio, e.g. in case of reverse write
13842 uint32_t suggested_boff =
13843 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
13844 if ((suggested_boff % (1 << csum_order)) == 0 &&
13845 suggested_boff + final_length <= max_bsize &&
13846 suggested_boff > b_off) {
181888fb 13847 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 13848 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 13849 ceph_assert(suggested_boff >= b_off);
13850 csum_length += suggested_boff - b_off;
13851 b_off = suggested_boff;
13852 }
13853 if (csum != Checksummer::CSUM_NONE) {
13854 dout(20) << __func__ << " initialize csum setting for new blob " << *b
13855 << " csum_type " << Checksummer::get_csum_type_string(csum)
13856 << " csum_order " << csum_order
13857 << " csum_length 0x" << std::hex << csum_length << std::dec
13858 << dendl;
13859 dblob.init_csum(csum, csum_order, csum_length);
13860 }
13861 }
13862
a8e16298 13863 PExtentVector extents;
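    // Carve this blob's physical extents out of the shared preallocation:
    // whole prealloc segments are consumed while they fit, and the final
    // segment is split in place so its remainder stays available for the
    // next blob in wctx->writes.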
13864 int64_t left = final_length;
13865 while (left > 0) {
11fdf7f2 13866 ceph_assert(prealloc_left > 0);
13867 if (prealloc_pos->length <= left) {
13868 prealloc_left -= prealloc_pos->length;
13869 left -= prealloc_pos->length;
13870 txc->statfs_delta.allocated() += prealloc_pos->length;
13871 extents.push_back(*prealloc_pos);
13872 ++prealloc_pos;
13873 } else {
13874 extents.emplace_back(prealloc_pos->offset, left);
13875 prealloc_pos->offset += left;
13876 prealloc_pos->length -= left;
13877 prealloc_left -= left;
13878 txc->statfs_delta.allocated() += left;
13879 left = 0;
13880 break;
13881 }
13882 }
7c673cae 13883 for (auto& p : extents) {
3efd9988 13884 txc->allocated.insert(p.offset, p.length);
7c673cae 13885 }
11fdf7f2 13886 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 13887
13888 dout(20) << __func__ << " blob " << *b << dendl;
13889 if (dblob.has_csum()) {
13890 dblob.calc_csum(b_off, *l);
13891 }
181888fb 13892
7c673cae 13893 if (wi.mark_unused) {
1911f103 13894 ceph_assert(!dblob.is_compressed());
13895 auto b_end = b_off + wi.bl.length();
13896 if (b_off) {
13897 dblob.add_unused(0, b_off);
13898 }
13899 uint64_t llen = dblob.get_logical_length();
13900 if (b_end < llen) {
13901 dblob.add_unused(b_end, llen - b_end);
13902 }
13903 }
13904
13905 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
13906 b_off + (wi.b_off0 - wi.b_off),
13907 wi.length0,
13908 wi.b,
13909 nullptr);
13910 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
13911 txc->statfs_delta.stored() += le->length;
13912 dout(20) << __func__ << " lex " << *le << dendl;
13913 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
13914 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13915
13916 // queue io
11fdf7f2 13917 if (!g_conf()->bluestore_debug_omit_block_device_write) {
13918 if (l->length() <= prefer_deferred_size.load()) {
13919 dout(20) << __func__ << " deferring small 0x" << std::hex
13920 << l->length() << std::dec << " write via deferred" << dendl;
9f95a23c 13921 bluestore_deferred_op_t *op = _get_deferred_op(txc);
13922 op->op = bluestore_deferred_op_t::OP_WRITE;
13923 int r = b->get_blob().map(
13924 b_off, l->length(),
13925 [&](uint64_t offset, uint64_t length) {
13926 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13927 return 0;
13928 });
11fdf7f2 13929 ceph_assert(r == 0);
7c673cae 13930 op->data = *l;
81eedcae 13931 logger->inc(l_bluestore_write_small_deferred);
13932 } else {
13933 b->get_blob().map_bl(
13934 b_off, *l,
13935 [&](uint64_t offset, bufferlist& t) {
13936 bdev->aio_write(offset, t, &txc->ioc, false);
13937 });
81eedcae 13938 logger->inc(l_bluestore_write_small_new);
13939 }
13940 }
13941 }
13942 ceph_assert(prealloc_pos == prealloc.end());
13943 ceph_assert(prealloc_left == 0);
13944 return 0;
13945}
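// A standalone sketch of the carving step above, assuming only that
// PExtentVector holds bluestore_pextent_t{offset, length}: take `need`
// bytes from the front of `prealloc`, splitting the last extent when it
// is only partially consumed. Illustration only; the real loop above
// also updates txc->statfs_delta and txc->allocated as it goes.
[[maybe_unused]] static PExtentVector carve_prealloc_sketch(
  PExtentVector& prealloc, uint64_t need)
{
  PExtentVector out;
  auto p = prealloc.begin();
  while (need > 0 && p != prealloc.end()) {
    if (p->length <= need) {
      need -= p->length;
      out.push_back(*p++);                // consume the whole extent
    } else {
      out.emplace_back(p->offset, need);  // take just the front of it
      p->offset += need;
      p->length -= need;
      need = 0;
    }
  }
  prealloc.erase(prealloc.begin(), p);    // drop what was handed out
  return out;
}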
13946
13947void BlueStore::_wctx_finish(
13948 TransContext *txc,
13949 CollectionRef& c,
13950 OnodeRef o,
13951 WriteContext *wctx,
13952 set<SharedBlob*> *maybe_unshared_blobs)
13953{
13954 auto oep = wctx->old_extents.begin();
13955 while (oep != wctx->old_extents.end()) {
13956 auto &lo = *oep;
13957 oep = wctx->old_extents.erase(oep);
13958 dout(20) << __func__ << " lex_old " << lo.e << dendl;
13959 BlobRef b = lo.e.blob;
13960 const bluestore_blob_t& blob = b->get_blob();
13961 if (blob.is_compressed()) {
13962 if (lo.blob_empty) {
13963 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
13964 }
13965 txc->statfs_delta.compressed_original() -= lo.e.length;
13966 }
13967 auto& r = lo.r;
13968 txc->statfs_delta.stored() -= lo.e.length;
13969 if (!r.empty()) {
13970 dout(20) << __func__ << " blob release " << r << dendl;
13971 if (blob.is_shared()) {
13972 PExtentVector final;
13973 c->load_shared_blob(b->shared_blob);
13974 bool unshare = false;
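      // hand put_ref() an unshare hint only when the caller asked for one
      // (maybe_unshared_blobs) and no other live Blob still references
      // this shared blob; otherwise pass nullptr to skip the bookkeeping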
13975 bool* unshare_ptr =
13976 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 13977 for (auto e : r) {
13978 b->shared_blob->put_ref(
13979 e.offset, e.length, &final,
13980 unshare_ptr);
13981 }
13982 if (unshare) {
13983 ceph_assert(maybe_unshared_blobs);
13984 maybe_unshared_blobs->insert(b->shared_blob.get());
13985 }
13986 dout(20) << __func__ << " shared_blob release " << final
13987 << " from " << *b->shared_blob << dendl;
13988 txc->write_shared_blob(b->shared_blob);
13989 r.clear();
13990 r.swap(final);
13991 }
13992 }
13993 // we can't invalidate our logical extents as we drop them because
13994 // other lextents (either in our onode or others) may still
13995 // reference them. but we can throw out anything that is no
13996 // longer allocated. Note that this will leave behind edge bits
13997 // that are no longer referenced but not deallocated (until they
13998 // age out of the cache naturally).
13999 b->discard_unallocated(c.get());
14000 for (auto e : r) {
14001 dout(20) << __func__ << " release " << e << dendl;
14002 txc->released.insert(e.offset, e.length);
14003 txc->statfs_delta.allocated() -= e.length;
14004 if (blob.is_compressed()) {
14005 txc->statfs_delta.compressed_allocated() -= e.length;
14006 }
14007 }
14008
14009 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
14010 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
14011 << dendl;
14012 o->extent_map.spanning_blob_map.erase(b->id);
14013 }
9f95a23c 14014 delete &lo;
14015 }
14016}
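// Accounting sketch for the compressed statfs updates above, assuming a
// simple delta struct: the physical payload is uncharged once, when the
// blob empties, while the logical length is uncharged for every released
// extent.
struct CompressedDeltaSketch {
  int64_t compressed = 0;           // physical compressed payload bytes
  int64_t compressed_original = 0;  // logical bytes stored compressed
};

[[maybe_unused]] static void release_compressed_extent_sketch(
  CompressedDeltaSketch& d,
  uint64_t extent_logical_len,   // lo.e.length above
  uint64_t blob_payload_len,     // get_compressed_payload_length() above
  bool blob_now_empty)           // lo.blob_empty above
{
  if (blob_now_empty) {
    d.compressed -= blob_payload_len;
  }
  d.compressed_original -= extent_logical_len;
}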
14017
14018void BlueStore::_do_write_data(
14019 TransContext *txc,
14020 CollectionRef& c,
14021 OnodeRef o,
14022 uint64_t offset,
14023 uint64_t length,
14024 bufferlist& bl,
14025 WriteContext *wctx)
14026{
14027 uint64_t end = offset + length;
14028 bufferlist::iterator p = bl.begin();
14029
14030 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
14031 (length != min_alloc_size)) {
14032 // we fall within the same block
14033 _do_write_small(txc, c, o, offset, length, p, wctx);
14034 } else {
14035 uint64_t head_offset, head_length;
14036 uint64_t middle_offset, middle_length;
14037 uint64_t tail_offset, tail_length;
14038
14039 head_offset = offset;
11fdf7f2 14040 head_length = p2nphase(offset, min_alloc_size);
7c673cae 14041
14042 tail_offset = p2align(end, min_alloc_size);
14043 tail_length = p2phase(end, min_alloc_size);
14044
14045 middle_offset = head_offset + head_length;
14046 middle_length = length - head_length - tail_length;
14047
14048 if (head_length) {
14049 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
14050 }
14051
14052 if (middle_length) {
14053 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
14054 }
14055
14056 if (tail_length) {
14057 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
14058 }
14059 }
14060}
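// Worked sketch of the head/middle/tail split above, assuming
// min_alloc_size = 0x10000 and a range that crosses at least one block
// boundary (the single-block case takes the _do_write_small branch
// instead): a write 0x1f000~0x23000 ends at 0x42000 and splits into
//   head   0x1f000~0x1000   p2nphase(offset): run-up to the next boundary
//   middle 0x20000~0x20000  whole min_alloc_size blocks
//   tail   0x40000~0x2000   p2phase(end): overhang past the last boundary
// head and tail go through _do_write_small, the middle through
// _do_write_big.
struct WriteSplitSketch {
  uint64_t head_off, head_len;
  uint64_t mid_off, mid_len;
  uint64_t tail_off, tail_len;
};

[[maybe_unused]] static WriteSplitSketch split_write_sketch(
  uint64_t offset, uint64_t length, uint64_t min_alloc)
{
  uint64_t end = offset + length;
  WriteSplitSketch s;
  s.head_off = offset;
  s.head_len = p2nphase(offset, min_alloc);  // 0 if offset is aligned
  s.tail_off = p2align(end, min_alloc);
  s.tail_len = p2phase(end, min_alloc);      // 0 if end is aligned
  s.mid_off = s.head_off + s.head_len;
  s.mid_len = length - s.head_len - s.tail_len;
  return s;
}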
14061
14062void BlueStore::_choose_write_options(
14063 CollectionRef& c,
14064 OnodeRef o,
14065 uint32_t fadvise_flags,
14066 WriteContext *wctx)
7c673cae 14067{
14068 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
14069 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 14070 wctx->buffered = true;
14071 } else if (cct->_conf->bluestore_default_buffered_write &&
14072 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
14073 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
14074 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 14075 wctx->buffered = true;
14076 }
14077
14078 // apply basic csum block size
14079 wctx->csum_order = block_size_order;
14080
14081 // compression parameters
14082 unsigned alloc_hints = o->onode.alloc_hint_flags;
14083 auto cm = select_option(
14084 "compression_mode",
31f18b77 14085 comp_mode.load(),
14086 [&]() {
14087 string val;
11fdf7f2 14088 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
14089 return boost::optional<Compressor::CompressionMode>(
14090 Compressor::get_comp_mode_type(val));
14091 }
14092 return boost::optional<Compressor::CompressionMode>();
14093 }
14094 );
14095
14096 wctx->compress = (cm != Compressor::COMP_NONE) &&
14097 ((cm == Compressor::COMP_FORCE) ||
14098 (cm == Compressor::COMP_AGGRESSIVE &&
14099 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
14100 (cm == Compressor::COMP_PASSIVE &&
14101 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
14102
14103 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
14104 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
14105 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
14106 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 14107 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 14108
7c673cae 14109 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 14110
7c673cae 14111 if (o->onode.expected_write_size) {
224ce89b 14112 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 14113 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 14114 } else {
224ce89b 14115 wctx->csum_order = min_alloc_size_order;
14116 }
14117
14118 if (wctx->compress) {
14119 wctx->target_blob_size = select_option(
7c673cae 14120 "compression_max_blob_size",
31f18b77 14121 comp_max_blob_size.load(),
7c673cae 14122 [&]() {
14123 int64_t val;
14124 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
14125 return boost::optional<uint64_t>((uint64_t)val);
14126 }
14127 return boost::optional<uint64_t>();
14128 }
14129 );
14130 }
14131 } else {
14132 if (wctx->compress) {
14133 wctx->target_blob_size = select_option(
7c673cae 14134 "compression_min_blob_size",
31f18b77 14135 comp_min_blob_size.load(),
7c673cae 14136 [&]() {
14137 int64_t val;
14138 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
14139 return boost::optional<uint64_t>((uint64_t)val);
14140 }
14141 return boost::optional<uint64_t>();
14142 }
14143 );
14144 }
14145 }
31f18b77 14146
7c673cae 14147 uint64_t max_bsize = max_blob_size.load();
14148 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
14149 wctx->target_blob_size = max_bsize;
7c673cae 14150 }
31f18b77 14151
14152 // set the min blob size floor at 2x the min_alloc_size, or else we
14153 // won't be able to allocate a smaller extent for the compressed
14154 // data.
14155 if (wctx->compress &&
14156 wctx->target_blob_size < min_alloc_size * 2) {
14157 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 14158 }
14159
14160 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
14161 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
14162 << " compress=" << (int)wctx->compress
14163 << " buffered=" << (int)wctx->buffered
14164 << std::dec << dendl;
14165}
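// Decision-table sketch of the wctx->compress predicate above:
//   COMP_NONE        never compress
//   COMP_FORCE       always compress
//   COMP_AGGRESSIVE  compress unless the client hinted INCOMPRESSIBLE
//   COMP_PASSIVE     compress only if the client hinted COMPRESSIBLE
[[maybe_unused]] static bool should_compress_sketch(
  Compressor::CompressionMode cm, unsigned alloc_hints)
{
  switch (cm) {
  case Compressor::COMP_FORCE:
    return true;
  case Compressor::COMP_AGGRESSIVE:
    return (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0;
  case Compressor::COMP_PASSIVE:
    return (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE) != 0;
  default:  // COMP_NONE
    return false;
  }
}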
14166
14167int BlueStore::_do_gc(
14168 TransContext *txc,
14169 CollectionRef& c,
14170 OnodeRef o,
14171 const WriteContext& wctx,
14172 uint64_t *dirty_start,
14173 uint64_t *dirty_end)
14174{
31f18b77 14175
1adf2230 14176 bool dirty_range_updated = false;
31f18b77 14177 WriteContext wctx_gc;
7c673cae 14178 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 14179
eafe8130 14180 auto & extents_to_collect = wctx.extents_to_gc;
14181 for (auto it = extents_to_collect.begin();
14182 it != extents_to_collect.end();
14183 ++it) {
14184 bufferlist bl;
14185 auto offset = (*it).first;
14186 auto length = (*it).second;
14187 dout(20) << __func__ << " processing " << std::hex
14188 << offset << "~" << length << std::dec
14189 << dendl;
14190 int r = _do_read(c.get(), o, offset, length, bl, 0);
14191 ceph_assert(r == (int)length);
31f18b77 14192
14193 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
14194 logger->inc(l_bluestore_gc_merged, length);
31f18b77 14195
14196 if (*dirty_start > offset) {
14197 *dirty_start = offset;
1adf2230 14198 dirty_range_updated = true;
14199 }
14200
14201 if (*dirty_end < offset + length) {
14202 *dirty_end = offset + length;
1adf2230 14203 dirty_range_updated = true;
14204 }
14205 }
14206 if (dirty_range_updated) {
14207 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
14208 }
14209
14210 dout(30) << __func__ << " alloc write" << dendl;
14211 int r = _do_alloc_write(txc, c, o, &wctx_gc);
14212 if (r < 0) {
14213 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14214 << dendl;
14215 return r;
14216 }
14217
14218 _wctx_finish(txc, c, o, &wctx_gc);
14219 return 0;
14220}
14221
14222int BlueStore::_do_write(
14223 TransContext *txc,
14224 CollectionRef& c,
14225 OnodeRef o,
14226 uint64_t offset,
14227 uint64_t length,
14228 bufferlist& bl,
14229 uint32_t fadvise_flags)
14230{
14231 int r = 0;
14232
14233 dout(20) << __func__
14234 << " " << o->oid
14235 << " 0x" << std::hex << offset << "~" << length
14236 << " - have 0x" << o->onode.size
14237 << " (" << std::dec << o->onode.size << ")"
14238 << " bytes"
14239 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
14240 << dendl;
81eedcae 14241 _dump_onode<30>(cct, *o);
14242
14243 if (length == 0) {
14244 return 0;
14245 }
14246
14247 uint64_t end = offset + length;
14248
14249 GarbageCollector gc(c->store->cct);
eafe8130 14250 int64_t benefit = 0;
14251 auto dirty_start = offset;
14252 auto dirty_end = end;
14253
14254 WriteContext wctx;
14255 _choose_write_options(c, o, fadvise_flags, &wctx);
14256 o->extent_map.fault_range(db, offset, length);
14257 _do_write_data(txc, c, o, offset, length, bl, &wctx);
14258 r = _do_alloc_write(txc, c, o, &wctx);
14259 if (r < 0) {
14260 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
14261 << dendl;
14262 goto out;
14263 }
14264
14265 if (wctx.extents_to_gc.empty() ||
14266 wctx.extents_to_gc.range_start() > offset ||
14267 wctx.extents_to_gc.range_end() < offset + length) {
14268 benefit = gc.estimate(offset,
14269 length,
14270 o->extent_map,
14271 wctx.old_extents,
14272 min_alloc_size);
14273 }
14274
14275 // NB: _wctx_finish() will empty old_extents
14276 // so we must do gc estimation before that
14277 _wctx_finish(txc, c, o, &wctx);
14278 if (end > o->onode.size) {
14279 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 14280 << std::dec << dendl;
14281 o->onode.size = end;
14282 }
14283
11fdf7f2 14284 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
14285 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
14286 dout(20) << __func__
14287 << " perform garbage collection for compressed extents, "
14288 << "expected benefit = " << benefit << " AUs" << dendl;
14289 }
14290 if (!wctx.extents_to_gc.empty()) {
14291 dout(20) << __func__ << " perform garbage collection" << dendl;
14292
14293 r = _do_gc(txc, c, o,
14294 wctx,
14295 &dirty_start, &dirty_end);
14296 if (r < 0) {
14297 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
14298 << dendl;
14299 goto out;
7c673cae 14300 }
14301 dout(20) << __func__ << " gc range is " << std::hex << dirty_start
14302 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 14303 }
7c673cae 14304 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
14305 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
14306
14307 r = 0;
14308
14309 out:
14310 return r;
14311}
14312
14313int BlueStore::_write(TransContext *txc,
14314 CollectionRef& c,
14315 OnodeRef& o,
14316 uint64_t offset, size_t length,
14317 bufferlist& bl,
14318 uint32_t fadvise_flags)
14319{
14320 dout(15) << __func__ << " " << c->cid << " " << o->oid
14321 << " 0x" << std::hex << offset << "~" << length << std::dec
14322 << dendl;
14323 int r = 0;
14324 if (offset + length >= OBJECT_MAX_SIZE) {
14325 r = -E2BIG;
14326 } else {
14327 _assign_nid(txc, o);
14328 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14329 txc->write_onode(o);
14330 }
14331 dout(10) << __func__ << " " << c->cid << " " << o->oid
14332 << " 0x" << std::hex << offset << "~" << length << std::dec
14333 << " = " << r << dendl;
14334 return r;
14335}
14336
14337int BlueStore::_zero(TransContext *txc,
14338 CollectionRef& c,
14339 OnodeRef& o,
14340 uint64_t offset, size_t length)
14341{
14342 dout(15) << __func__ << " " << c->cid << " " << o->oid
14343 << " 0x" << std::hex << offset << "~" << length << std::dec
14344 << dendl;
14345 int r = 0;
14346 if (offset + length >= OBJECT_MAX_SIZE) {
14347 r = -E2BIG;
14348 } else {
14349 _assign_nid(txc, o);
14350 r = _do_zero(txc, c, o, offset, length);
14351 }
14352 dout(10) << __func__ << " " << c->cid << " " << o->oid
14353 << " 0x" << std::hex << offset << "~" << length << std::dec
14354 << " = " << r << dendl;
14355 return r;
14356}
14357
14358int BlueStore::_do_zero(TransContext *txc,
14359 CollectionRef& c,
14360 OnodeRef& o,
14361 uint64_t offset, size_t length)
14362{
14363 dout(15) << __func__ << " " << c->cid << " " << o->oid
14364 << " 0x" << std::hex << offset << "~" << length << std::dec
14365 << dendl;
14366 int r = 0;
14367
81eedcae 14368 _dump_onode<30>(cct, *o);
14369
14370 WriteContext wctx;
14371 o->extent_map.fault_range(db, offset, length);
14372 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14373 o->extent_map.dirty_range(offset, length);
14374 _wctx_finish(txc, c, o, &wctx);
14375
b32b8144 14376 if (length > 0 && offset + length > o->onode.size) {
14377 o->onode.size = offset + length;
14378 dout(20) << __func__ << " extending size to " << offset + length
14379 << dendl;
14380 }
14381 txc->write_onode(o);
14382
14383 dout(10) << __func__ << " " << c->cid << " " << o->oid
14384 << " 0x" << std::hex << offset << "~" << length << std::dec
14385 << " = " << r << dendl;
14386 return r;
14387}
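// _do_zero never writes zeros to the device: punch_hole() simply drops
// the logical extents (reads of the hole return zeros), and when the
// zeroed range reaches past EOF only the logical size grows. A sketch of
// the size update, mirroring the check above:
[[maybe_unused]] static uint64_t size_after_zero_sketch(
  uint64_t old_size, uint64_t offset, uint64_t length)
{
  return (length > 0 && offset + length > old_size) ? offset + length
                                                    : old_size;
}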
14388
14389void BlueStore::_do_truncate(
14390 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14391 set<SharedBlob*> *maybe_unshared_blobs)
14392{
14393 dout(15) << __func__ << " " << c->cid << " " << o->oid
14394 << " 0x" << std::hex << offset << std::dec << dendl;
14395
81eedcae 14396 _dump_onode<30>(cct, *o);
14397
14398 if (offset == o->onode.size)
31f18b77 14399 return;
14400
14401 if (offset < o->onode.size) {
14402 WriteContext wctx;
14403 uint64_t length = o->onode.size - offset;
14404 o->extent_map.fault_range(db, offset, length);
14405 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
14406 o->extent_map.dirty_range(offset, length);
14407 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
14408
14409 // if we have shards past EOF, ask for a reshard
14410 if (!o->onode.extent_map_shards.empty() &&
14411 o->onode.extent_map_shards.back().offset >= offset) {
14412 dout(10) << __func__ << " request reshard past EOF" << dendl;
14413 if (offset) {
14414 o->extent_map.request_reshard(offset - 1, offset + length);
14415 } else {
14416 o->extent_map.request_reshard(0, length);
14417 }
14418 }
14419 }
14420
14421 o->onode.size = offset;
14422
14423 txc->write_onode(o);
14424}
14425
35e4c445 14426int BlueStore::_truncate(TransContext *txc,
14427 CollectionRef& c,
14428 OnodeRef& o,
14429 uint64_t offset)
14430{
14431 dout(15) << __func__ << " " << c->cid << " " << o->oid
14432 << " 0x" << std::hex << offset << std::dec
14433 << dendl;
14434 int r = 0;
14435 if (offset >= OBJECT_MAX_SIZE) {
14436 r = -E2BIG;
14437 } else {
14438 _do_truncate(txc, c, o, offset);
14439 }
14440 dout(10) << __func__ << " " << c->cid << " " << o->oid
14441 << " 0x" << std::hex << offset << std::dec
14442 << " = " << r << dendl;
14443 return r;
14444}
14445
14446int BlueStore::_do_remove(
14447 TransContext *txc,
14448 CollectionRef& c,
14449 OnodeRef o)
14450{
31f18b77 14451 set<SharedBlob*> maybe_unshared_blobs;
14452 bool is_gen = !o->oid.is_no_gen();
14453 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
14454 if (o->onode.has_omap()) {
14455 o->flush();
9f95a23c 14456 _do_omap_clear(txc, o);
14457 }
14458 o->exists = false;
14459 string key;
14460 for (auto &s : o->extent_map.shards) {
14461 dout(20) << __func__ << " removing shard 0x" << std::hex
14462 << s.shard_info->offset << std::dec << dendl;
14463 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14464 [&](const string& final_key) {
14465 txc->t->rmkey(PREFIX_OBJ, final_key);
14466 }
14467 );
14468 }
14469 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14470 txc->note_removed_object(o);
14471 o->extent_map.clear();
14472 o->onode = bluestore_onode_t();
14473 _debug_obj_on_delete(o->oid);
31f18b77 14474
14475 if (!is_gen || maybe_unshared_blobs.empty()) {
14476 return 0;
14477 }
31f18b77 14478
14479 // see if we can unshare blobs still referenced by the head
14480 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14481 << maybe_unshared_blobs << dendl;
14482 ghobject_t nogen = o->oid;
14483 nogen.generation = ghobject_t::NO_GEN;
14484 OnodeRef h = c->onode_map.lookup(nogen);
14485
14486 if (!h || !h->exists) {
14487 return 0;
14488 }
14489
14490 dout(20) << __func__ << " checking for unshareable blobs on " << h
14491 << " " << h->oid << dendl;
14492 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14493 for (auto& e : h->extent_map.extent_map) {
14494 const bluestore_blob_t& b = e.blob->get_blob();
14495 SharedBlob *sb = e.blob->shared_blob.get();
14496 if (b.is_shared() &&
14497 sb->loaded &&
14498 maybe_unshared_blobs.count(sb)) {
14499 if (b.is_compressed()) {
14500 expect[sb].get(0, b.get_ondisk_length());
14501 } else {
14502 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14503 expect[sb].get(off, len);
14504 return 0;
14505 });
14506 }
14507 }
14508 }
31f18b77 14509
14510 vector<SharedBlob*> unshared_blobs;
14511 unshared_blobs.reserve(maybe_unshared_blobs.size());
14512 for (auto& p : expect) {
14513 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14514 if (p.first->persistent->ref_map == p.second) {
14515 SharedBlob *sb = p.first;
14516 dout(20) << __func__ << " unsharing " << *sb << dendl;
14517 unshared_blobs.push_back(sb);
14518 txc->unshare_blob(sb);
14519 uint64_t sbid = c->make_blob_unshared(sb);
14520 string key;
14521 get_shared_blob_key(sbid, &key);
14522 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14523 }
14524 }
14525
14526 if (unshared_blobs.empty()) {
14527 return 0;
14528 }
14529
14530 for (auto& e : h->extent_map.extent_map) {
14531 const bluestore_blob_t& b = e.blob->get_blob();
14532 SharedBlob *sb = e.blob->shared_blob.get();
14533 if (b.is_shared() &&
14534 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14535 sb) != unshared_blobs.end()) {
14536 dout(20) << __func__ << " unsharing " << e << dendl;
14537 bluestore_blob_t& blob = e.blob->dirty_blob();
14538 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14539 h->extent_map.dirty_range(e.logical_offset, 1);
14540 }
14541 }
14542 txc->write_onode(h);
14543
14544 return 0;
14545}
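// Sketch of the unshare test above, with plain maps standing in for
// bluestore_extent_ref_map_t: a shared blob may be reverted to a private
// one only when the references attributable to the surviving head object
// account for the blob's entire persistent ref map, i.e. no other object
// still points at any part of it.
[[maybe_unused]] static bool head_is_sole_owner_sketch(
  const std::map<uint64_t, uint64_t>& refs_from_head,   // offset -> length
  const std::map<uint64_t, uint64_t>& persistent_refs)  // offset -> length
{
  return refs_from_head == persistent_refs;
}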
14546
14547int BlueStore::_remove(TransContext *txc,
14548 CollectionRef& c,
14549 OnodeRef &o)
14550{
14551 dout(15) << __func__ << " " << c->cid << " " << o->oid
14552 << " onode " << o.get()
14553 << " txc "<< txc << dendl;
14554 int r = _do_remove(txc, c, o);
14555 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14556 return r;
14557}
14558
14559int BlueStore::_setattr(TransContext *txc,
14560 CollectionRef& c,
14561 OnodeRef& o,
14562 const string& name,
14563 bufferptr& val)
14564{
14565 dout(15) << __func__ << " " << c->cid << " " << o->oid
14566 << " " << name << " (" << val.length() << " bytes)"
14567 << dendl;
14568 int r = 0;
14569 if (val.is_partial()) {
14570 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14571 val.length());
f91f0fd5 14572 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14573 } else {
14574 auto& b = o->onode.attrs[name.c_str()] = val;
f91f0fd5 14575 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14576 }
14577 txc->write_onode(o);
14578 dout(10) << __func__ << " " << c->cid << " " << o->oid
14579 << " " << name << " (" << val.length() << " bytes)"
14580 << " = " << r << dendl;
14581 return r;
14582}
14583
14584int BlueStore::_setattrs(TransContext *txc,
14585 CollectionRef& c,
14586 OnodeRef& o,
14587 const map<string,bufferptr>& aset)
14588{
14589 dout(15) << __func__ << " " << c->cid << " " << o->oid
14590 << " " << aset.size() << " keys"
14591 << dendl;
14592 int r = 0;
14593 for (map<string,bufferptr>::const_iterator p = aset.begin();
14594 p != aset.end(); ++p) {
14595 if (p->second.is_partial()) {
14596 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 14597 bufferptr(p->second.c_str(), p->second.length());
f91f0fd5 14598 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
14599 } else {
14600 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
f91f0fd5 14601 b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
3efd9988 14602 }
14603 }
14604 txc->write_onode(o);
14605 dout(10) << __func__ << " " << c->cid << " " << o->oid
14606 << " " << aset.size() << " keys"
14607 << " = " << r << dendl;
14608 return r;
14609}
14610
14611
14612int BlueStore::_rmattr(TransContext *txc,
14613 CollectionRef& c,
14614 OnodeRef& o,
14615 const string& name)
14616{
14617 dout(15) << __func__ << " " << c->cid << " " << o->oid
14618 << " " << name << dendl;
14619 int r = 0;
14620 auto it = o->onode.attrs.find(name.c_str());
14621 if (it == o->onode.attrs.end())
14622 goto out;
14623
14624 o->onode.attrs.erase(it);
14625 txc->write_onode(o);
14626
14627 out:
14628 dout(10) << __func__ << " " << c->cid << " " << o->oid
14629 << " " << name << " = " << r << dendl;
14630 return r;
14631}
14632
14633int BlueStore::_rmattrs(TransContext *txc,
14634 CollectionRef& c,
14635 OnodeRef& o)
14636{
14637 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14638 int r = 0;
14639
14640 if (o->onode.attrs.empty())
14641 goto out;
14642
14643 o->onode.attrs.clear();
14644 txc->write_onode(o);
14645
14646 out:
14647 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14648 return r;
14649}
14650
9f95a23c 14651void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 14652{
9f95a23c 14653 const string& omap_prefix = o->get_omap_prefix();
7c673cae 14654 string prefix, tail;
14655 o->get_omap_header(&prefix);
14656 o->get_omap_tail(&tail);
11fdf7f2 14657 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 14658 txc->t->rmkey(omap_prefix, tail);
14659 dout(20) << __func__ << " remove range start: "
14660 << pretty_binary_string(prefix) << " end: "
14661 << pretty_binary_string(tail) << dendl;
14662}
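// All of an onode's omap rows live in one KV prefix and sort between its
// header key and its tail sentinel, so a single range delete plus removal
// of the sentinel clears everything. A sketch using only the calls seen
// above:
[[maybe_unused]] static void clear_omap_sketch(
  KeyValueDB::Transaction t,
  const string& kv_prefix,
  const string& header_key,  // lowest key belonging to this onode
  const string& tail_key)    // sentinel sorting after every user key
{
  t->rm_range_keys(kv_prefix, header_key, tail_key);  // [header, tail)
  t->rmkey(kv_prefix, tail_key);                      // the sentinel itself
}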
14663
14664int BlueStore::_omap_clear(TransContext *txc,
14665 CollectionRef& c,
14666 OnodeRef& o)
14667{
14668 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14669 int r = 0;
14670 if (o->onode.has_omap()) {
14671 o->flush();
9f95a23c 14672 _do_omap_clear(txc, o);
14673 o->onode.clear_omap_flag();
14674 txc->write_onode(o);
14675 }
14676 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14677 return r;
14678}
14679
14680int BlueStore::_omap_setkeys(TransContext *txc,
14681 CollectionRef& c,
14682 OnodeRef& o,
14683 bufferlist &bl)
14684{
14685 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14686 int r;
11fdf7f2 14687 auto p = bl.cbegin();
14688 __u32 num;
14689 if (!o->onode.has_omap()) {
11fdf7f2 14690 if (o->oid.is_pgmeta()) {
14691 o->onode.set_omap_flags_pgmeta();
14692 } else {
14693 o->onode.set_omap_flags();
11fdf7f2 14694 }
7c673cae 14695 txc->write_onode(o);
494da23a 14696
9f95a23c 14697 const string& prefix = o->get_omap_prefix();
14698 string key_tail;
14699 bufferlist tail;
9f95a23c 14700 o->get_omap_tail(&key_tail);
494da23a 14701 txc->t->set(prefix, key_tail, tail);
14702 } else {
14703 txc->note_modified_object(o);
14704 }
9f95a23c 14705 const string& prefix = o->get_omap_prefix();
7c673cae 14706 string final_key;
14707 o->get_omap_key(string(), &final_key);
14708 size_t base_key_len = final_key.size();
11fdf7f2 14709 decode(num, p);
14710 while (num--) {
14711 string key;
14712 bufferlist value;
14713 decode(key, p);
14714 decode(value, p);
9f95a23c 14715 final_key.resize(base_key_len); // keep prefix
7c673cae 14716 final_key += key;
11fdf7f2 14717 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 14718 << " <- " << key << dendl;
11fdf7f2 14719 txc->t->set(prefix, final_key, value);
14720 }
14721 r = 0;
14722 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14723 return r;
14724}
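// Client-side sketch of the payload _omap_setkeys() consumes: a __u32
// count followed by that many (key, value) pairs in Ceph's standard
// encoding. Hypothetical helper, for illustration only.
[[maybe_unused]] static bufferlist make_omap_setkeys_payload_sketch(
  const map<string, bufferlist>& kv)
{
  bufferlist bl;
  __u32 num = static_cast<__u32>(kv.size());
  encode(num, bl);
  for (auto& [key, value] : kv) {
    encode(key, bl);
    encode(value, bl);
  }
  return bl;
}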
14725
14726int BlueStore::_omap_setheader(TransContext *txc,
14727 CollectionRef& c,
14728 OnodeRef &o,
14729 bufferlist& bl)
14730{
14731 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14732 int r;
14733 string key;
14734 if (!o->onode.has_omap()) {
11fdf7f2 14735 if (o->oid.is_pgmeta()) {
14736 o->onode.set_omap_flags_pgmeta();
14737 } else {
14738 o->onode.set_omap_flags();
11fdf7f2 14739 }
7c673cae 14740 txc->write_onode(o);
494da23a 14741
9f95a23c 14742 const string& prefix = o->get_omap_prefix();
14743 string key_tail;
14744 bufferlist tail;
9f95a23c 14745 o->get_omap_tail(&key_tail);
494da23a 14746 txc->t->set(prefix, key_tail, tail);
14747 } else {
14748 txc->note_modified_object(o);
14749 }
14750 const string& prefix = o->get_omap_prefix();
14751 o->get_omap_header(&key);
11fdf7f2 14752 txc->t->set(prefix, key, bl);
14753 r = 0;
14754 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14755 return r;
14756}
14757
14758int BlueStore::_omap_rmkeys(TransContext *txc,
14759 CollectionRef& c,
14760 OnodeRef& o,
14761 bufferlist& bl)
14762{
14763 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14764 int r = 0;
11fdf7f2 14765 auto p = bl.cbegin();
14766 __u32 num;
14767 string final_key;
14768
14769 if (!o->onode.has_omap()) {
14770 goto out;
14771 }
11fdf7f2 14772 {
14773 const string& prefix = o->get_omap_prefix();
14774 o->get_omap_key(string(), &final_key);
14775 size_t base_key_len = final_key.size();
14776 decode(num, p);
14777 while (num--) {
14778 string key;
14779 decode(key, p);
9f95a23c 14780 final_key.resize(base_key_len); // keep prefix
14781 final_key += key;
14782 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
14783 << " <- " << key << dendl;
14784 txc->t->rmkey(prefix, final_key);
14785 }
14786 }
14787 txc->note_modified_object(o);
14788
14789 out:
14790 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14791 return r;
14792}
14793
14794int BlueStore::_omap_rmkey_range(TransContext *txc,
14795 CollectionRef& c,
14796 OnodeRef& o,
14797 const string& first, const string& last)
14798{
14799 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14800 string key_first, key_last;
14801 int r = 0;
14802 if (!o->onode.has_omap()) {
14803 goto out;
14804 }
11fdf7f2 14805 {
9f95a23c 14806 const string& prefix = o->get_omap_prefix();
11fdf7f2 14807 o->flush();
14808 o->get_omap_key(first, &key_first);
14809 o->get_omap_key(last, &key_last);
14810 txc->t->rm_range_keys(prefix, key_first, key_last);
14811 dout(20) << __func__ << " remove range start: "
14812 << pretty_binary_string(key_first) << " end: "
14813 << pretty_binary_string(key_last) << dendl;
14814 }
14815 txc->note_modified_object(o);
14816
14817 out:
14818 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14819 return r;
14820}
14821
14822int BlueStore::_set_alloc_hint(
14823 TransContext *txc,
14824 CollectionRef& c,
14825 OnodeRef& o,
14826 uint64_t expected_object_size,
14827 uint64_t expected_write_size,
14828 uint32_t flags)
14829{
14830 dout(15) << __func__ << " " << c->cid << " " << o->oid
14831 << " object_size " << expected_object_size
14832 << " write_size " << expected_write_size
14833 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14834 << dendl;
14835 int r = 0;
14836 o->onode.expected_object_size = expected_object_size;
14837 o->onode.expected_write_size = expected_write_size;
14838 o->onode.alloc_hint_flags = flags;
14839 txc->write_onode(o);
14840 dout(10) << __func__ << " " << c->cid << " " << o->oid
14841 << " object_size " << expected_object_size
14842 << " write_size " << expected_write_size
14843 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14844 << " = " << r << dendl;
14845 return r;
14846}
14847
14848int BlueStore::_clone(TransContext *txc,
14849 CollectionRef& c,
14850 OnodeRef& oldo,
14851 OnodeRef& newo)
14852{
14853 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14854 << newo->oid << dendl;
14855 int r = 0;
14856 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
14857 derr << __func__ << " mismatched hash on " << oldo->oid
14858 << " and " << newo->oid << dendl;
14859 return -EINVAL;
14860 }
14861
14862 _assign_nid(txc, newo);
14863
14864 // clone data
14865 oldo->flush();
14866 _do_truncate(txc, c, newo, 0);
14867 if (cct->_conf->bluestore_clone_cow) {
14868 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
14869 } else {
14870 bufferlist bl;
14871 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
14872 if (r < 0)
14873 goto out;
14874 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
14875 if (r < 0)
14876 goto out;
14877 }
14878
14879 // clone attrs
14880 newo->onode.attrs = oldo->onode.attrs;
14881
14882 // clone omap
14883 if (newo->onode.has_omap()) {
14884 dout(20) << __func__ << " clearing old omap data" << dendl;
14885 newo->flush();
9f95a23c 14886 _do_omap_clear(txc, newo);
494da23a 14887 newo->onode.clear_omap_flag();
7c673cae
FG
14888 }
14889 if (oldo->onode.has_omap()) {
14890 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 14891 if (newo->oid.is_pgmeta()) {
14892 newo->onode.set_omap_flags_pgmeta();
14893 } else {
14894 newo->onode.set_omap_flags();
7c673cae 14895 }
9f95a23c 14896 const string& prefix = newo->get_omap_prefix();
11fdf7f2 14897 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 14898 string head, tail;
14899 oldo->get_omap_header(&head);
14900 oldo->get_omap_tail(&tail);
14901 it->lower_bound(head);
14902 while (it->valid()) {
14903 if (it->key() >= tail) {
14904 dout(30) << __func__ << " reached tail" << dendl;
14905 break;
14906 } else {
14907 dout(30) << __func__ << " got header/data "
14908 << pretty_binary_string(it->key()) << dendl;
14909 string key;
9f95a23c 14910 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 14911 txc->t->set(prefix, key, it->value());
14912 }
14913 it->next();
14914 }
14915 string new_tail;
14916 bufferlist new_tail_value;
9f95a23c 14917 newo->get_omap_tail(&new_tail);
494da23a 14918 txc->t->set(prefix, new_tail, new_tail_value);
14919 }
14920
14921 txc->write_onode(newo);
14922 r = 0;
14923
14924 out:
14925 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14926 << newo->oid << " = " << r << dendl;
14927 return r;
14928}
14929
14930int BlueStore::_do_clone_range(
14931 TransContext *txc,
14932 CollectionRef& c,
14933 OnodeRef& oldo,
14934 OnodeRef& newo,
14935 uint64_t srcoff,
14936 uint64_t length,
14937 uint64_t dstoff)
14938{
14939 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14940 << newo->oid
14941 << " 0x" << std::hex << srcoff << "~" << length << " -> "
14942 << " 0x" << dstoff << "~" << length << std::dec << dendl;
14943 oldo->extent_map.fault_range(db, srcoff, length);
14944 newo->extent_map.fault_range(db, dstoff, length);
14945 _dump_onode<30>(cct, *oldo);
14946 _dump_onode<30>(cct, *newo);
7c673cae 14947
11fdf7f2 14948 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
14949 _dump_onode<30>(cct, *oldo);
14950 _dump_onode<30>(cct, *newo);
14951 return 0;
14952}
14953
14954int BlueStore::_clone_range(TransContext *txc,
14955 CollectionRef& c,
14956 OnodeRef& oldo,
14957 OnodeRef& newo,
14958 uint64_t srcoff, uint64_t length, uint64_t dstoff)
14959{
14960 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14961 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14962 << " to offset 0x" << dstoff << std::dec << dendl;
14963 int r = 0;
14964
14965 if (srcoff + length >= OBJECT_MAX_SIZE ||
14966 dstoff + length >= OBJECT_MAX_SIZE) {
14967 r = -E2BIG;
14968 goto out;
14969 }
14970 if (srcoff + length > oldo->onode.size) {
14971 r = -EINVAL;
14972 goto out;
14973 }
14974
14975 _assign_nid(txc, newo);
14976
14977 if (length > 0) {
14978 if (cct->_conf->bluestore_clone_cow) {
14979 _do_zero(txc, c, newo, dstoff, length);
14980 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
14981 } else {
14982 bufferlist bl;
14983 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
14984 if (r < 0)
14985 goto out;
14986 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
14987 if (r < 0)
14988 goto out;
14989 }
14990 }
14991
14992 txc->write_onode(newo);
14993 r = 0;
14994
14995 out:
14996 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14997 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14998 << " to offset 0x" << dstoff << std::dec
14999 << " = " << r << dendl;
15000 return r;
15001}
15002
15003int BlueStore::_rename(TransContext *txc,
15004 CollectionRef& c,
15005 OnodeRef& oldo,
15006 OnodeRef& newo,
15007 const ghobject_t& new_oid)
15008{
15009 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
15010 << new_oid << dendl;
15011 int r;
15012 ghobject_t old_oid = oldo->oid;
f91f0fd5 15013 mempool::bluestore_cache_meta::string new_okey;
15014
15015 if (newo) {
15016 if (newo->exists) {
15017 r = -EEXIST;
15018 goto out;
15019 }
11fdf7f2 15020 ceph_assert(txc->onodes.count(newo) == 0);
15021 }
15022
15023 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
15024
15025 // rewrite shards
15026 {
15027 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
15028 get_object_key(cct, new_oid, &new_okey);
15029 string key;
15030 for (auto &s : oldo->extent_map.shards) {
15031 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
15032 [&](const string& final_key) {
15033 txc->t->rmkey(PREFIX_OBJ, final_key);
15034 }
15035 );
15036 s.dirty = true;
15037 }
15038 }
15039
15040 newo = oldo;
15041 txc->write_onode(newo);
15042
15043 // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
15044 // Onode in the old slot
15045 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
15046 r = 0;
15047
15048 // hold a ref to new Onode in old name position, to ensure we don't drop
15049 // it from the cache before this txc commits (or else someone may come along
15050 // and read newo's metadata via the old name).
15051 txc->note_modified_object(oldo);
15052
15053 out:
15054 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
15055 << new_oid << " = " << r << dendl;
15056 return r;
15057}
15058
15059// collections
15060
15061int BlueStore::_create_collection(
15062 TransContext *txc,
15063 const coll_t &cid,
15064 unsigned bits,
15065 CollectionRef *c)
15066{
15067 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
15068 int r;
15069 bufferlist bl;
15070
15071 {
9f95a23c 15072 std::unique_lock l(coll_lock);
15073 if (*c) {
15074 r = -EEXIST;
15075 goto out;
15076 }
15077 auto p = new_coll_map.find(cid);
15078 ceph_assert(p != new_coll_map.end());
15079 *c = p->second;
15080 (*c)->cnode.bits = bits;
15081 coll_map[cid] = *c;
11fdf7f2 15082 new_coll_map.erase(p);
7c673cae 15083 }
11fdf7f2 15084 encode((*c)->cnode, bl);
15085 txc->t->set(PREFIX_COLL, stringify(cid), bl);
15086 r = 0;
15087
15088 out:
15089 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
15090 return r;
15091}
15092
15093int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
15094 CollectionRef *c)
15095{
15096 dout(15) << __func__ << " " << cid << dendl;
15097 int r;
15098
11fdf7f2 15099 (*c)->flush_all_but_last();
7c673cae 15100 {
9f95a23c 15101 std::unique_lock l(coll_lock);
15102 if (!*c) {
15103 r = -ENOENT;
15104 goto out;
15105 }
15106 size_t nonexistent_count = 0;
11fdf7f2 15107 ceph_assert((*c)->exists);
15108 if ((*c)->onode_map.map_any([&](OnodeRef o) {
15109 if (o->exists) {
15110 dout(1) << __func__ << " " << o->oid << " " << o
15111 << " exists in onode_map" << dendl;
15112 return true;
15113 }
15114 ++nonexistent_count;
15115 return false;
15116 })) {
15117 r = -ENOTEMPTY;
15118 goto out;
15119 }
15120
15121 vector<ghobject_t> ls;
15122 ghobject_t next;
15123 // Enumerate onodes in the db, up to nonexistent_count + 1,
15124 // then check that all of them are marked as non-existent.
15125 // Bypass the check if next != ghobject_t::get_max().
7c673cae 15126 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
f91f0fd5 15127 nonexistent_count + 1, false, &ls, &next);
7c673cae 15128 if (r >= 0) {
15129 // If true, the collection has more objects than nonexistent_count,
15130 // so bypass the check.
15131 bool exists = (!next.is_max());
15132 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
15133 dout(10) << __func__ << " oid " << *it << dendl;
15134 auto onode = (*c)->onode_map.lookup(*it);
15135 exists = !onode || onode->exists;
15136 if (exists) {
15137 dout(1) << __func__ << " " << *it
15138 << " exists in db, "
15139 << (!onode ? "not present in ram" : "present in ram")
15140 << dendl;
15141 }
15142 }
15143 if (!exists) {
11fdf7f2 15144 _do_remove_collection(txc, c);
15145 r = 0;
15146 } else {
15147 dout(10) << __func__ << " " << cid
15148 << " is non-empty" << dendl;
15149 r = -ENOTEMPTY;
15150 }
15151 }
15152 }
15153
15154 out:
15155 dout(10) << __func__ << " " << cid << " = " << r << dendl;
15156 return r;
15157}
15158
15159void BlueStore::_do_remove_collection(TransContext *txc,
15160 CollectionRef *c)
15161{
15162 coll_map.erase((*c)->cid);
15163 txc->removed_collections.push_back(*c);
15164 (*c)->exists = false;
15165 _osr_register_zombie((*c)->osr.get());
15166 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
15167 c->reset();
15168}
15169
15170int BlueStore::_split_collection(TransContext *txc,
15171 CollectionRef& c,
15172 CollectionRef& d,
15173 unsigned bits, int rem)
15174{
15175 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
15176 << " bits " << bits << dendl;
15177 std::unique_lock l(c->lock);
15178 std::unique_lock l2(d->lock);
15179 int r;
15180
15181 // flush all previous deferred writes on this sequencer. this is a bit
15182 // heavyweight, but we need to make sure all deferred writes complete
15183 // before we split as the new collection's sequencer may need to order
15184 // this after those writes, and we don't bother with the complexity of
15185 // moving those TransContexts over to the new osr.
15186 _osr_drain_preceding(txc);
15187
15188 // move any cached items (onodes and referenced shared blobs) that will
15189 // belong to the child collection post-split. leave everything else behind.
15190 // this may include things that don't strictly belong to the now-smaller
15191 // parent split, but the OSD will always send us a split for every new
15192 // child.
15193
15194 spg_t pgid, dest_pgid;
15195 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 15196 ceph_assert(is_pg);
7c673cae 15197 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 15198 ceph_assert(is_pg);
15199
15200 // the destination should initially be empty.
15201 ceph_assert(d->onode_map.empty());
15202 ceph_assert(d->shared_blob_set.empty());
15203 ceph_assert(d->cnode.bits == bits);
15204
15205 c->split_cache(d.get());
15206
15207 // adjust bits. note that this will be redundant for all but the first
15208 // split call for this parent (first child).
15209 c->cnode.bits = bits;
11fdf7f2 15210 ceph_assert(d->cnode.bits == bits);
15211 r = 0;
15212
15213 bufferlist bl;
11fdf7f2 15214 encode(c->cnode, bl);
15215 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
15216
15217 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
15218 << " bits " << bits << " = " << r << dendl;
15219 return r;
15220}
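// Membership sketch for a split (assumed form, mirroring the low-bits
// hash test that split_cache() applies via ghobject matching): after a
// split to `bits`, an object whose 32-bit hobject hash is h belongs to
// the child PG whose seed equals the low `bits` of h.
[[maybe_unused]] static bool belongs_to_child_sketch(
  uint32_t hobject_hash, unsigned bits, uint32_t child_seed)
{
  uint32_t mask = (bits >= 32) ? 0xffffffffu : ((1u << bits) - 1);
  return (hobject_hash & mask) == child_seed;
}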
15221
15222int BlueStore::_merge_collection(
15223 TransContext *txc,
15224 CollectionRef *c,
15225 CollectionRef& d,
15226 unsigned bits)
15227{
15228 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
15229 << " bits " << bits << dendl;
15230 std::unique_lock l((*c)->lock);
15231 std::unique_lock l2(d->lock);
15232 int r;
15233
15234 coll_t cid = (*c)->cid;
15235
15236 // flush all previous deferred writes on the source collection to ensure
15237 // that all deferred writes complete before we merge as the target collection's
15238 // sequencer may need to order new ops after those writes.
15239
15240 _osr_drain((*c)->osr.get());
15241
15242 // move any cached items (onodes and referenced shared blobs) from
15243 // the source collection into the target collection post-merge.
15244 // since the target covers the source's entire hash range after the
15245 // merge, this moves everything; the source is removed below once its
15246 // cache has been drained.
15247
15248 spg_t pgid, dest_pgid;
15249 bool is_pg = cid.is_pg(&pgid);
15250 ceph_assert(is_pg);
15251 is_pg = d->cid.is_pg(&dest_pgid);
15252 ceph_assert(is_pg);
15253
15254 // adjust bits. note that this will be redundant for all but the first
15255 // merge call for the parent/target.
15256 d->cnode.bits = bits;
15257
15258 // behavior depends on the target's (d) bits, so do this after they are updated.
15259 (*c)->split_cache(d.get());
15260
15261 // remove source collection
15262 {
9f95a23c 15263 std::unique_lock l3(coll_lock);
15264 _do_remove_collection(txc, c);
15265 }
15266
15267 r = 0;
15268
15269 bufferlist bl;
15270 encode(d->cnode, bl);
15271 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
15272
15273 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
15274 << " bits " << bits << " = " << r << dendl;
15275 return r;
15276}
15277
15278void BlueStore::log_latency(
15279 const char* name,
15280 int idx,
15281 const ceph::timespan& l,
15282 double lat_threshold,
15283 const char* info) const
15284{
15285 logger->tinc(idx, l);
15286 if (lat_threshold > 0.0 &&
15287 l >= make_timespan(lat_threshold)) {
15288 dout(0) << __func__ << " slow operation observed for " << name
15289 << ", latency = " << l
15290 << info
15291 << dendl;
15292 }
15293}
15294
11fdf7f2 15295void BlueStore::log_latency_fn(
494da23a 15296 const char* name,
15297 int idx,
15298 const ceph::timespan& l,
15299 double lat_threshold,
15300 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 15301{
15302 logger->tinc(idx, l);
15303 if (lat_threshold > 0.0 &&
15304 l >= make_timespan(lat_threshold)) {
15305 dout(0) << __func__ << " slow operation observed for " << name
15306 << ", latency = " << l
15307 << fn(l)
15308 << dendl;
15309 }
15310}
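// Usage sketch for log_latency_fn(): the lambda runs only when the
// threshold trips, so building the extra detail string costs nothing on
// the fast path. Hypothetical call site, for illustration:
//
//   log_latency_fn(
//     "submit_transact",
//     l_bluestore_submit_lat,
//     mono_clock::now() - start,
//     cct->_conf->bluestore_log_op_age,
//     [&](const ceph::timespan& lat) {
//       return ", txc = " + stringify(txc);
//     });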
15311
15312#if defined(WITH_LTTNG)
15313void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
15314 KeyValueDB &db,
15315 TransContext &txc,
15316 mono_clock::time_point start_throttle_acquire)
15317{
15318 pending_kv_ios += txc.ios;
15319 if (txc.deferred_txn) {
15320 pending_deferred_ios += txc.ios;
15321 }
15322
15323 uint64_t started = 0;
15324 uint64_t completed = 0;
15325 if (should_trace(&started, &completed)) {
15326 txc.tracing = true;
15327 uint64_t rocksdb_base_level,
15328 rocksdb_estimate_pending_compaction_bytes,
15329 rocksdb_cur_size_all_mem_tables,
15330 rocksdb_compaction_pending,
15331 rocksdb_mem_table_flush_pending,
15332 rocksdb_num_running_compactions,
15333 rocksdb_num_running_flushes,
15334 rocksdb_actual_delayed_write_rate;
15335 db.get_property(
15336 "rocksdb.base-level",
15337 &rocksdb_base_level);
15338 db.get_property(
15339 "rocksdb.estimate-pending-compaction-bytes",
15340 &rocksdb_estimate_pending_compaction_bytes);
15341 db.get_property(
15342 "rocksdb.cur-size-all-mem-tables",
15343 &rocksdb_cur_size_all_mem_tables);
15344 db.get_property(
15345 "rocksdb.compaction-pending",
15346 &rocksdb_compaction_pending);
15347 db.get_property(
15348 "rocksdb.mem-table-flush-pending",
15349 &rocksdb_mem_table_flush_pending);
15350 db.get_property(
15351 "rocksdb.num-running-compactions",
15352 &rocksdb_num_running_compactions);
15353 db.get_property(
15354 "rocksdb.num-running-flushes",
15355 &rocksdb_num_running_flushes);
15356 db.get_property(
15357 "rocksdb.actual-delayed-write-rate",
15358 &rocksdb_actual_delayed_write_rate);
15359
15360
15361 tracepoint(
15362 bluestore,
15363 transaction_initial_state,
15364 txc.osr->get_sequencer_id(),
15365 txc.seq,
15366 throttle_bytes.get_current(),
15367 throttle_deferred_bytes.get_current(),
15368 pending_kv_ios,
15369 pending_deferred_ios,
15370 started,
15371 completed,
15372 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15373
15374 tracepoint(
15375 bluestore,
15376 transaction_initial_state_rocksdb,
15377 txc.osr->get_sequencer_id(),
15378 txc.seq,
15379 rocksdb_base_level,
15380 rocksdb_estimate_pending_compaction_bytes,
15381 rocksdb_cur_size_all_mem_tables,
15382 rocksdb_compaction_pending,
15383 rocksdb_mem_table_flush_pending,
15384 rocksdb_num_running_compactions,
15385 rocksdb_num_running_flushes,
15386 rocksdb_actual_delayed_write_rate);
15387 }
15388}
15389#endif
15390
15391mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15392 TransContext &txc, PerfCounters *logger, int state)
15393{
15394 mono_clock::time_point now = mono_clock::now();
15395 mono_clock::duration lat = now - txc.last_stamp;
15396 logger->tinc(state, lat);
15397#if defined(WITH_LTTNG)
15398 if (txc.tracing &&
15399 state >= l_bluestore_state_prepare_lat &&
15400 state <= l_bluestore_state_done_lat) {
15401 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15402 tracepoint(
15403 bluestore,
15404 transaction_state_duration,
15405 txc.osr->get_sequencer_id(),
15406 txc.seq,
15407 state,
15408 ceph::to_seconds<double>(lat));
15409 }
15410#endif
15411 txc.last_stamp = now;
15412 return lat;
15413}
15414
15415bool BlueStore::BlueStoreThrottle::try_start_transaction(
15416 KeyValueDB &db,
15417 TransContext &txc,
15418 mono_clock::time_point start_throttle_acquire)
15419{
15420 throttle_bytes.get(txc.cost);
15421
15422 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15423 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15424 return true;
15425 } else {
15426 return false;
15427 }
15428}
15429
15430void BlueStore::BlueStoreThrottle::finish_start_transaction(
15431 KeyValueDB &db,
15432 TransContext &txc,
15433 mono_clock::time_point start_throttle_acquire)
15434{
15435 ceph_assert(txc.deferred_txn);
15436 throttle_deferred_bytes.get(txc.cost);
15437 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15438}
15439
15440#if defined(WITH_LTTNG)
15441void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15442{
15443 pending_kv_ios -= 1;
15444 ios_completed_since_last_traced++;
15445 if (txc.tracing) {
15446 tracepoint(
15447 bluestore,
15448 transaction_commit_latency,
15449 txc.osr->get_sequencer_id(),
15450 txc.seq,
15451 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15452 }
15453}
15454#endif
15455
15456#if defined(WITH_LTTNG)
15457void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15458{
15459 if (txc.deferred_txn) {
15460 pending_deferred_ios -= 1;
15461 }
15462 if (txc.tracing) {
15463 mono_clock::time_point now = mono_clock::now();
15464 mono_clock::duration lat = now - txc.start;
15465 tracepoint(
15466 bluestore,
15467 transaction_total_duration,
15468 txc.osr->get_sequencer_id(),
15469 txc.seq,
15470 ceph::to_seconds<double>(lat));
15471 }
15472}
15473#endif
11fdf7f2 15474
15475// DB key value Histogram
15476#define KEY_SLAB 32
15477#define VALUE_SLAB 64
15478
15479const string prefix_onode = "o";
15480const string prefix_onode_shard = "x";
15481const string prefix_other = "Z";
15482
15483int BlueStore::DBHistogram::get_key_slab(size_t sz)
15484{
15485 return (sz/KEY_SLAB);
15486}
15487
15488string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15489{
15490 int lower_bound = slab * KEY_SLAB;
15491 int upper_bound = (slab + 1) * KEY_SLAB;
15492 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15493 return ret;
15494}
15495
15496int BlueStore::DBHistogram::get_value_slab(size_t sz)
15497{
15498 return (sz/VALUE_SLAB);
15499}
15500
15501string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15502{
15503 int lower_bound = slab * VALUE_SLAB;
15504 int upper_bound = (slab + 1) * VALUE_SLAB;
15505 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15506 return ret;
15507}
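// Worked example of the slab arithmetic: with KEY_SLAB = 32 a 45-byte key
// lands in slab 45/32 = 1, reported as "[32,64)"; with VALUE_SLAB = 64 a
// 200-byte value lands in slab 3, "[192,256)".
static_assert(45 / KEY_SLAB == 1, "45-byte key -> slab [32,64)");
static_assert(200 / VALUE_SLAB == 3, "200-byte value -> slab [192,256)");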
15508
15509void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15510 const string &prefix, size_t key_size, size_t value_size)
15511{
15512 uint32_t key_slab = get_key_slab(key_size);
15513 uint32_t value_slab = get_value_slab(value_size);
15514 key_hist[prefix][key_slab].count++;
15515 key_hist[prefix][key_slab].max_len =
15516 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
15517 key_hist[prefix][key_slab].val_map[value_slab].count++;
15518 key_hist[prefix][key_slab].val_map[value_slab].max_len =
15519 std::max<size_t>(value_size,
15520 key_hist[prefix][key_slab].val_map[value_slab].max_len);
15521}
15522
15523void BlueStore::DBHistogram::dump(Formatter *f)
15524{
15525 f->open_object_section("rocksdb_value_distribution");
15526 for (auto i : value_hist) {
15527 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15528 }
15529 f->close_section();
15530
15531 f->open_object_section("rocksdb_key_value_histogram");
15532 for (auto i : key_hist) {
15533 f->dump_string("prefix", i.first);
15534 f->open_object_section("key_hist");
15535 for ( auto k : i.second) {
15536 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15537 f->dump_unsigned("max_len", k.second.max_len);
15538 f->open_object_section("value_hist");
15539 for ( auto j : k.second.val_map) {
15540 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15541 f->dump_unsigned("max_len", j.second.max_len);
15542 }
15543 f->close_section();
15544 }
15545 f->close_section();
15546 }
15547 f->close_section();
15548}
15549
15550 // Iterates through the db and collects the stats
15551void BlueStore::generate_db_histogram(Formatter *f)
15552{
15553 // globals
15554 uint64_t num_onodes = 0;
15555 uint64_t num_shards = 0;
15556 uint64_t num_super = 0;
15557 uint64_t num_coll = 0;
15558 uint64_t num_omap = 0;
11fdf7f2 15559 uint64_t num_pgmeta_omap = 0;
15560 uint64_t num_deferred = 0;
15561 uint64_t num_alloc = 0;
15562 uint64_t num_stat = 0;
15563 uint64_t num_others = 0;
15564 uint64_t num_shared_shards = 0;
15565 size_t max_key_size = 0, max_value_size = 0;
15566 uint64_t total_key_size = 0, total_value_size = 0;
15567 size_t key_size = 0, value_size = 0;
15568 DBHistogram hist;
15569
11fdf7f2 15570 auto start = coarse_mono_clock::now();
7c673cae 15571
11fdf7f2 15572 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
15573 iter->seek_to_first();
15574 while (iter->valid()) {
15575 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15576 key_size = iter->key_size();
15577 value_size = iter->value_size();
15578 hist.value_hist[hist.get_value_slab(value_size)]++;
15579 max_key_size = std::max(max_key_size, key_size);
15580 max_value_size = std::max(max_value_size, value_size);
15581 total_key_size += key_size;
15582 total_value_size += value_size;
15583
15584 pair<string,string> key(iter->raw_key());
15585
15586 if (key.first == PREFIX_SUPER) {
15587 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15588 num_super++;
15589 } else if (key.first == PREFIX_STAT) {
15590 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15591 num_stat++;
15592 } else if (key.first == PREFIX_COLL) {
15593 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15594 num_coll++;
15595 } else if (key.first == PREFIX_OBJ) {
15596 if (key.second.back() == ONODE_KEY_SUFFIX) {
15597 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15598 num_onodes++;
15599 } else {
15600 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15601 num_shards++;
15602 }
15603 } else if (key.first == PREFIX_OMAP) {
15604 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15605 num_omap++;
11fdf7f2
TL
15606 } else if (key.first == PREFIX_PGMETA_OMAP) {
15607 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15608 num_pgmeta_omap++;
7c673cae
FG
15609 } else if (key.first == PREFIX_DEFERRED) {
15610 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15611 num_deferred++;
11fdf7f2 15612 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
15613 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15614 num_alloc++;
15615 } else if (key.first == PREFIX_SHARED_BLOB) {
15616 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15617 num_shared_shards++;
15618 } else {
15619 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15620 num_others++;
15621 }
15622 iter->next();
15623 }
15624
11fdf7f2 15625 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
15626 f->open_object_section("rocksdb_key_value_stats");
15627 f->dump_unsigned("num_onodes", num_onodes);
15628 f->dump_unsigned("num_shards", num_shards);
15629 f->dump_unsigned("num_super", num_super);
15630 f->dump_unsigned("num_coll", num_coll);
15631 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 15632 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
15633 f->dump_unsigned("num_deferred", num_deferred);
15634 f->dump_unsigned("num_alloc", num_alloc);
15635 f->dump_unsigned("num_stat", num_stat);
15636 f->dump_unsigned("num_shared_shards", num_shared_shards);
15637 f->dump_unsigned("num_others", num_others);
15638 f->dump_unsigned("max_key_size", max_key_size);
15639 f->dump_unsigned("max_value_size", max_value_size);
15640 f->dump_unsigned("total_key_size", total_key_size);
15641 f->dump_unsigned("total_value_size", total_value_size);
15642 f->close_section();
15643
15644 hist.dump(f);
15645
15646 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
15647
15648}
15649
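// Hedged usage note (assumption, not in the original source): this histogram
// is typically reached through the OSD admin socket, along the lines of
//
//   ceph daemon osd.<id> calc_objectstore_db_histogram
//
// which hands a Formatter to generate_db_histogram() and returns the JSON
// built above.
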
void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For external callers.
// We use a best-effort policy: e.g., we don't care if some pinned
// onodes/data remain in the cache after this command completes.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

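// Illustrative sketch (assumption, not in the original source): padding a
// sub-block write out to block alignment. With a 0x1000-byte block, a
// 0x200-byte payload at logical offset 0x1100 would be padded as
//
//   head_pad = 0x1100 - 0x1000 = 0x100   // down to the block start
//   tail_pad = 0x2000 - 0x1300 = 0xd00   // up to the next block boundary
//   _apply_padding(head_pad, tail_pad, bl);  // bl grows 0x200 -> 0x1000
//
// and l_bluestore_write_pad_bytes grows by 0xe00.
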
void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}

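// Illustrative sketch (not part of the original source): the two-pass denc
// pattern used above. Pass one only accumulates an upper bound on the
// encoded size so that a single contiguous buffer can be reserved; pass two
// encodes into that buffer with no reallocation:
//
//   size_t bound = 0;
//   denc(obj, bound);                              // pass 1: size estimate
//   bufferlist bl;
//   {
//     auto p = bl.get_contiguous_appender(bound, true /*deep*/);
//     denc(obj, p);                                // pass 2: actual bytes
//   }
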
void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}

void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}

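// Illustrative sketch (assumption, not in the original source): each
// allocation funnels through the hook above, so within one probe window
// e.g. three allocations of 64K in 3 fragments, 16K in 1, and 4K in 1
// accumulate to cnt=3, frags=5, size=0x15000 until
// _record_allocation_stats() harvests the counters via exchange(0).
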
void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
    << probe_count << ":"
    << " cnt: " << std::get<0>(t0)
    << " frags: " << std::get<1>(t0)
    << " size: " << std::get<2>(t0)
    << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
      << base + (probe_count % base) << ": "
      << std::get<0>(t)
      << ", " << std::get<1>(t)
      << ", " << std::get<2>(t)
      << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  auto prev = probe_count++;
  auto mask = (1 << alloc_stats_history.size()) - 1;
  probe_count &= mask;

  for (size_t i = cbits(prev ^ probe_count) - 1; i > 0; --i) {
    alloc_stats_history[i] = alloc_stats_history[i - 1];
  }
  alloc_stats_history[0].swap(t0);
}

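// Illustrative sketch (assumption, not in the original source): with a
// 5-entry history, probe_count wraps modulo 32 and cbits(prev ^ probe_count)
// gives the number of history slots that age out. E.g. stepping from probe
// 3 (0b00011) to 4 (0b00100) yields prev ^ probe_count == 0b00111 and
// cbits(...) == 3, so slots [1..2] shift down by one and slot 0 receives
// the probe just printed; entry i thus holds stats from roughly 2^i probes
// ago.
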
// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // must not be called a second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
        objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}

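// Illustrative sketch (assumption, not in the original source): with
// granularity = 1 MiB (0x100000), an extent [0x180000, 0x280000) touches
// bloom-filter buckets 1 and 2:
//
//   pos     = 0x180000 / 0x100000        == 1
//   end_pos = 1 + (0x27ffff / 0x100000)  == 3
//
// so only those buckets' filters survive filter_out() and are consulted
// during later misreference checks.
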
bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append("1");
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count num of repairs properly,
  // actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}

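// Illustrative usage sketch (assumption, not in the original source): fsck
// queues repairs into the per-category transactions above and commits them
// all at once:
//
//   BlueStoreRepairer repairer;
//   repairer.fix_statfs(db, key, expected_statfs);  // queue one fix
//   // ... more fix_*() calls while scanning ...
//   unsigned n = repairer.apply(db);                // submit every txn
//   dout(5) << n << " repair(s) applied" << dendl;
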
// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}

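// Illustrative sketch (assumption, not in the original source): the level
// hint maps to a preferred BlueFS device roughly as
//
//   select_prefer_bdev((void*)LEVEL_WAL)  -> BDEV_WAL
//   select_prefer_bdev((void*)LEVEL_DB)   -> BDEV_DB
//   select_prefer_bdev((void*)LEVEL_SLOW) -> BDEV_DB while enough headroom
//                                            remains on the DB device,
//                                            otherwise BDEV_SLOW
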
void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
    << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
    << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
    << ", db_avail:" << db_avail4slow << std::endl
    << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}
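
// Illustrative sketch (assumption, not in the original source): dump()
// renders a per-device/per-level table along the lines of
//
//   RocksDBBlueFSVolumeSelector: wal_total:..., db_total:..., slow_total:..., db_avail:...
//   Usage matrix:
//   DEV/LEV     WAL         DB          SLOW        *           *           REAL        FILES
//   LOG         0 B         12 MiB      0 B         ...
//   WAL         1.0 GiB     0 B         0 B         ...
//   ...
//   MAXIMUMS:   (same layout, with per-cell high-water marks)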

// =======================================================