]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueStore.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / os / bluestore / BlueStore.cc
CommitLineData
11fdf7f2 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
7c673cae
FG
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <unistd.h>
16#include <stdlib.h>
17#include <sys/types.h>
18#include <sys/stat.h>
19#include <fcntl.h>
20
eafe8130 21#include <boost/container/flat_set.hpp>
9f95a23c 22#include "boost/algorithm/string.hpp"
eafe8130 23
31f18b77
FG
24#include "include/cpp-btree/btree_set.h"
25
9f95a23c 26#include "bluestore_common.h"
7c673cae
FG
27#include "BlueStore.h"
28#include "os/kv.h"
29#include "include/compat.h"
30#include "include/intarith.h"
31#include "include/stringify.h"
11fdf7f2
TL
32#include "include/str_map.h"
33#include "include/util.h"
7c673cae
FG
34#include "common/errno.h"
35#include "common/safe_io.h"
91327a77 36#include "common/PriorityCache.h"
9f95a23c 37#include "common/RWLock.h"
7c673cae
FG
38#include "Allocator.h"
39#include "FreelistManager.h"
40#include "BlueFS.h"
41#include "BlueRocksEnv.h"
42#include "auth/Crypto.h"
43#include "common/EventTrace.h"
91327a77 44#include "perfglue/heap_profiler.h"
11fdf7f2
TL
45#include "common/blkdev.h"
46#include "common/numa.h"
7c673cae 47
9f95a23c
TL
48#if defined(WITH_LTTNG)
49#define TRACEPOINT_DEFINE
50#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
51#include "tracing/bluestore.h"
52#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
53#undef TRACEPOINT_DEFINE
54#else
55#define tracepoint(...)
56#endif
57
7c673cae
FG
58#define dout_context cct
59#define dout_subsys ceph_subsys_bluestore
60
31f18b77
FG
61using bid_t = decltype(BlueStore::Blob::id);
62
63// bluestore_cache_onode
7c673cae 64MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
31f18b77 65 bluestore_cache_onode);
7c673cae 66
31f18b77 67// bluestore_cache_other
7c673cae 68MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
31f18b77 69 bluestore_cache_other);
7c673cae 70MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
31f18b77 71 bluestore_cache_other);
7c673cae 72MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
31f18b77 73 bluestore_cache_other);
7c673cae 74MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
31f18b77
FG
75 bluestore_cache_other);
76
77// bluestore_txc
78MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
79 bluestore_txc);
80
7c673cae
FG
81
82// kv store prefixes
11fdf7f2
TL
83const string PREFIX_SUPER = "S"; // field -> value
84const string PREFIX_STAT = "T"; // field -> value(int64 array)
85const string PREFIX_COLL = "C"; // collection name -> cnode_t
86const string PREFIX_OBJ = "O"; // object name -> onode_t
87const string PREFIX_OMAP = "M"; // u64 + keyname -> value
88const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
9f95a23c 89const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
11fdf7f2
TL
90const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
91const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
92const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
7c673cae
FG
93const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
94
11fdf7f2
TL
95const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
96
7c673cae
FG
97// write a label in the first block. always use this size. note that
98// bluefs makes a matching assumption about the location of its
99// superblock (always the second block of the device).
100#define BDEV_LABEL_BLOCK_SIZE 4096
101
102// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
103#define SUPER_RESERVED 8192
104
105#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
106
107
108/*
109 * extent map blob encoding
110 *
111 * we use the low bits of the blobid field to indicate some common scenarios
112 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
113 */
114#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
115#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
116#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
117#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
118#define BLOBID_SHIFT_BITS 4
119
120/*
121 * object name key structure
122 *
123 * encoded u8: shard + 2^7 (so that it sorts properly)
124 * encoded u64: poolid + 2^63 (so that it sorts properly)
125 * encoded u32: hash (bit reversed)
126 *
127 * escaped string: namespace
128 *
129 * escaped string: key or object name
130 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
131 * we are done. otherwise, we are followed by the object name.
132 * escaped string: object name (unless '=' above)
133 *
134 * encoded u64: snap
135 * encoded u64: generation
136 * 'o'
137 */
138#define ONODE_KEY_SUFFIX 'o'
139
140/*
141 * extent shard key
142 *
143 * object prefix key
144 * u32
145 * 'x'
146 */
147#define EXTENT_SHARD_KEY_SUFFIX 'x'
148
149/*
150 * string encoding in the key
151 *
152 * The key string needs to lexicographically sort the same way that
153 * ghobject_t does. We do this by escaping anything <= to '#' with #
154 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
155 * hex digits.
156 *
157 * We use ! as a terminator for strings; this works because it is < #
158 * and will get escaped if it is present in the string.
159 *
160 */
161template<typename S>
162static void append_escaped(const string &in, S *out)
163{
224ce89b
WB
164 char hexbyte[in.length() * 3 + 1];
165 char* ptr = &hexbyte[0];
7c673cae
FG
166 for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
167 if (*i <= '#') {
224ce89b
WB
168 *ptr++ = '#';
169 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
170 *ptr++ = "0123456789abcdef"[*i & 0x0f];
7c673cae 171 } else if (*i >= '~') {
224ce89b
WB
172 *ptr++ = '~';
173 *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
174 *ptr++ = "0123456789abcdef"[*i & 0x0f];
7c673cae 175 } else {
224ce89b 176 *ptr++ = *i;
7c673cae
FG
177 }
178 }
224ce89b
WB
179 *ptr++ = '!';
180 out->append(hexbyte, ptr - &hexbyte[0]);
181}
182
183inline unsigned h2i(char c)
184{
185 if ((c >= '0') && (c <= '9')) {
186 return c - 0x30;
187 } else if ((c >= 'a') && (c <= 'f')) {
188 return c - 'a' + 10;
189 } else if ((c >= 'A') && (c <= 'F')) {
190 return c - 'A' + 10;
191 } else {
192 return 256; // make it always larger than 255
193 }
7c673cae
FG
194}
195
196static int decode_escaped(const char *p, string *out)
197{
224ce89b
WB
198 char buff[256];
199 char* ptr = &buff[0];
200 char* max = &buff[252];
7c673cae
FG
201 const char *orig_p = p;
202 while (*p && *p != '!') {
203 if (*p == '#' || *p == '~') {
224ce89b
WB
204 unsigned hex = 0;
205 p++;
206 hex = h2i(*p++) << 4;
207 if (hex > 255) {
208 return -EINVAL;
209 }
210 hex |= h2i(*p++);
211 if (hex > 255) {
212 return -EINVAL;
213 }
214 *ptr++ = hex;
7c673cae 215 } else {
224ce89b
WB
216 *ptr++ = *p++;
217 }
218 if (ptr > max) {
219 out->append(buff, ptr-buff);
220 ptr = &buff[0];
7c673cae
FG
221 }
222 }
224ce89b
WB
223 if (ptr != buff) {
224 out->append(buff, ptr-buff);
225 }
7c673cae
FG
226 return p - orig_p;
227}
228
229// some things we encode in binary (as le32 or le64); print the
230// resulting key strings nicely
231template<typename S>
232static string pretty_binary_string(const S& in)
233{
234 char buf[10];
235 string out;
236 out.reserve(in.length() * 3);
237 enum { NONE, HEX, STRING } mode = NONE;
238 unsigned from = 0, i;
239 for (i=0; i < in.length(); ++i) {
240 if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
241 (mode == HEX && in.length() - i >= 4 &&
242 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
243 (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
244 (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
245 (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
246 if (mode == STRING) {
247 out.append(in.c_str() + from, i - from);
248 out.push_back('\'');
249 }
250 if (mode != HEX) {
251 out.append("0x");
252 mode = HEX;
253 }
254 if (in.length() - i >= 4) {
255 // print a whole u32 at once
256 snprintf(buf, sizeof(buf), "%08x",
257 (uint32_t)(((unsigned char)in[i] << 24) |
258 ((unsigned char)in[i+1] << 16) |
259 ((unsigned char)in[i+2] << 8) |
260 ((unsigned char)in[i+3] << 0)));
261 i += 3;
262 } else {
263 snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
264 }
265 out.append(buf);
266 } else {
267 if (mode != STRING) {
268 out.push_back('\'');
269 mode = STRING;
270 from = i;
271 }
272 }
273 }
274 if (mode == STRING) {
275 out.append(in.c_str() + from, i - from);
276 out.push_back('\'');
277 }
278 return out;
279}
280
281template<typename T>
282static void _key_encode_shard(shard_id_t shard, T *key)
283{
284 key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
285}
286
287static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
288{
289 pshard->id = (uint8_t)*key - (uint8_t)0x80;
290 return key + 1;
291}
292
293static void get_coll_key_range(const coll_t& cid, int bits,
294 string *temp_start, string *temp_end,
295 string *start, string *end)
296{
297 temp_start->clear();
298 temp_end->clear();
299 start->clear();
300 end->clear();
301
302 spg_t pgid;
303 if (cid.is_pg(&pgid)) {
304 _key_encode_shard(pgid.shard, start);
305 *temp_start = *start;
306
307 _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
308 _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
309
310 *end = *start;
311 *temp_end = *temp_start;
312
313 uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
314 _key_encode_u32(reverse_hash, start);
315 _key_encode_u32(reverse_hash, temp_start);
316
317 uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
318 if (end_hash > 0xffffffffull)
319 end_hash = 0xffffffffull;
320
321 _key_encode_u32(end_hash, end);
322 _key_encode_u32(end_hash, temp_end);
323 } else {
324 _key_encode_shard(shard_id_t::NO_SHARD, start);
325 _key_encode_u64(-1ull + 0x8000000000000000ull, start);
326 *end = *start;
327 _key_encode_u32(0, start);
328 _key_encode_u32(0xffffffff, end);
329
330 // no separate temp section
331 *temp_start = *end;
332 *temp_end = *end;
333 }
334}
335
336static void get_shared_blob_key(uint64_t sbid, string *key)
337{
338 key->clear();
339 _key_encode_u64(sbid, key);
340}
341
342static int get_key_shared_blob(const string& key, uint64_t *sbid)
343{
344 const char *p = key.c_str();
345 if (key.length() < sizeof(uint64_t))
346 return -1;
224ce89b 347 _key_decode_u64(p, sbid);
7c673cae
FG
348 return 0;
349}
350
351template<typename S>
352static int get_key_object(const S& key, ghobject_t *oid)
353{
354 int r;
355 const char *p = key.c_str();
356
357 if (key.length() < 1 + 8 + 4)
358 return -1;
359 p = _key_decode_shard(p, &oid->shard_id);
360
361 uint64_t pool;
362 p = _key_decode_u64(p, &pool);
363 oid->hobj.pool = pool - 0x8000000000000000ull;
364
365 unsigned hash;
366 p = _key_decode_u32(p, &hash);
367
368 oid->hobj.set_bitwise_key_u32(hash);
369
370 r = decode_escaped(p, &oid->hobj.nspace);
371 if (r < 0)
372 return -2;
373 p += r + 1;
374
375 string k;
376 r = decode_escaped(p, &k);
377 if (r < 0)
378 return -3;
379 p += r + 1;
380 if (*p == '=') {
381 // no key
382 ++p;
383 oid->hobj.oid.name = k;
384 } else if (*p == '<' || *p == '>') {
385 // key + name
386 ++p;
387 r = decode_escaped(p, &oid->hobj.oid.name);
388 if (r < 0)
389 return -5;
390 p += r + 1;
391 oid->hobj.set_key(k);
392 } else {
393 // malformed
394 return -6;
395 }
396
397 p = _key_decode_u64(p, &oid->hobj.snap.val);
398 p = _key_decode_u64(p, &oid->generation);
399
400 if (*p != ONODE_KEY_SUFFIX) {
401 return -7;
402 }
403 p++;
404 if (*p) {
405 // if we get something other than a null terminator here,
406 // something goes wrong.
407 return -8;
408 }
409
410 return 0;
411}
412
413template<typename S>
414static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
415{
416 key->clear();
417
418 size_t max_len = 1 + 8 + 4 +
419 (oid.hobj.nspace.length() * 3 + 1) +
420 (oid.hobj.get_key().length() * 3 + 1) +
421 1 + // for '<', '=', or '>'
422 (oid.hobj.oid.name.length() * 3 + 1) +
423 8 + 8 + 1;
424 key->reserve(max_len);
425
426 _key_encode_shard(oid.shard_id, key);
427 _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
428 _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
429
430 append_escaped(oid.hobj.nspace, key);
431
432 if (oid.hobj.get_key().length()) {
433 // is a key... could be < = or >.
434 append_escaped(oid.hobj.get_key(), key);
435 // (ASCII chars < = and > sort in that order, yay)
436 int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
437 if (r) {
438 key->append(r > 0 ? ">" : "<");
439 append_escaped(oid.hobj.oid.name, key);
440 } else {
441 // same as no key
442 key->append("=");
443 }
444 } else {
445 // no key
446 append_escaped(oid.hobj.oid.name, key);
447 key->append("=");
448 }
449
450 _key_encode_u64(oid.hobj.snap, key);
451 _key_encode_u64(oid.generation, key);
452
453 key->push_back(ONODE_KEY_SUFFIX);
454
455 // sanity check
456 if (true) {
457 ghobject_t t;
458 int r = get_key_object(*key, &t);
459 if (r || t != oid) {
460 derr << " r " << r << dendl;
461 derr << "key " << pretty_binary_string(*key) << dendl;
462 derr << "oid " << oid << dendl;
463 derr << " t " << t << dendl;
11fdf7f2 464 ceph_assert(r == 0 && t == oid);
7c673cae
FG
465 }
466 }
467}
468
469
470// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
471// char lets us quickly test whether it is a shard key without decoding any
472// of the prefix bytes.
473template<typename S>
474static void get_extent_shard_key(const S& onode_key, uint32_t offset,
475 string *key)
476{
477 key->clear();
478 key->reserve(onode_key.length() + 4 + 1);
479 key->append(onode_key.c_str(), onode_key.size());
480 _key_encode_u32(offset, key);
481 key->push_back(EXTENT_SHARD_KEY_SUFFIX);
482}
483
484static void rewrite_extent_shard_key(uint32_t offset, string *key)
485{
11fdf7f2
TL
486 ceph_assert(key->size() > sizeof(uint32_t) + 1);
487 ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
7c673cae
FG
488 _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
489}
490
491template<typename S>
492static void generate_extent_shard_key_and_apply(
493 const S& onode_key,
494 uint32_t offset,
495 string *key,
496 std::function<void(const string& final_key)> apply)
497{
498 if (key->empty()) { // make full key
11fdf7f2 499 ceph_assert(!onode_key.empty());
7c673cae
FG
500 get_extent_shard_key(onode_key, offset, key);
501 } else {
502 rewrite_extent_shard_key(offset, key);
503 }
504 apply(*key);
505}
506
507int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
508{
11fdf7f2
TL
509 ceph_assert(key.size() > sizeof(uint32_t) + 1);
510 ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
7c673cae
FG
511 int okey_len = key.size() - sizeof(uint32_t) - 1;
512 *onode_key = key.substr(0, okey_len);
513 const char *p = key.data() + okey_len;
224ce89b 514 _key_decode_u32(p, offset);
7c673cae
FG
515 return 0;
516}
517
518static bool is_extent_shard_key(const string& key)
519{
520 return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
521}
522
7c673cae
FG
523static void get_deferred_key(uint64_t seq, string *out)
524{
525 _key_encode_u64(seq, out);
526}
527
11fdf7f2
TL
528static void get_pool_stat_key(int64_t pool_id, string *key)
529{
530 key->clear();
531 _key_encode_u64(pool_id, key);
532}
533
534static int get_key_pool_stat(const string& key, uint64_t* pool_id)
535{
536 const char *p = key.c_str();
537 if (key.length() < sizeof(uint64_t))
538 return -1;
539 _key_decode_u64(p, pool_id);
540 return 0;
541}
7c673cae 542
81eedcae
TL
543template <int LogLevelV>
544void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
545{
546 uint64_t pos = 0;
547 for (auto& s : em.shards) {
548 dout(LogLevelV) << __func__ << " shard " << *s.shard_info
549 << (s.loaded ? " (loaded)" : "")
550 << (s.dirty ? " (dirty)" : "")
551 << dendl;
552 }
553 for (auto& e : em.extent_map) {
554 dout(LogLevelV) << __func__ << " " << e << dendl;
555 ceph_assert(e.logical_offset >= pos);
556 pos = e.logical_offset + e.length;
557 const bluestore_blob_t& blob = e.blob->get_blob();
558 if (blob.has_csum()) {
559 vector<uint64_t> v;
560 unsigned n = blob.get_csum_count();
561 for (unsigned i = 0; i < n; ++i)
562 v.push_back(blob.get_csum_item(i));
563 dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
564 << dendl;
565 }
566 std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
567 for (auto& i : e.blob->shared_blob->bc.buffer_map) {
568 dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
569 << "~" << i.second->length << std::dec
570 << " " << *i.second << dendl;
571 }
572 }
573}
574
575template <int LogLevelV>
576void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
577{
578 if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
579 return;
580 dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
581 << " nid " << o.onode.nid
582 << " size 0x" << std::hex << o.onode.size
583 << " (" << std::dec << o.onode.size << ")"
584 << " expected_object_size " << o.onode.expected_object_size
585 << " expected_write_size " << o.onode.expected_write_size
586 << " in " << o.onode.extent_map_shards.size() << " shards"
587 << ", " << o.extent_map.spanning_blob_map.size()
588 << " spanning blobs"
589 << dendl;
590 for (auto p = o.onode.attrs.begin();
591 p != o.onode.attrs.end();
592 ++p) {
593 dout(LogLevelV) << __func__ << " attr " << p->first
594 << " len " << p->second.length() << dendl;
595 }
596 _dump_extent_map<LogLevelV>(cct, o.extent_map);
597}
598
599template <int LogLevelV>
600void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
601{
602 dout(LogLevelV) << __func__ << " transaction dump:\n";
603 JSONFormatter f(true);
604 f.open_object_section("transaction");
605 t->dump(&f);
606 f.close_section();
607 f.flush(*_dout);
608 *_dout << dendl;
609}
610
7c673cae
FG
611// merge operators
612
613struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
614 void merge_nonexistent(
615 const char *rdata, size_t rlen, std::string *new_value) override {
616 *new_value = std::string(rdata, rlen);
617 }
618 void merge(
619 const char *ldata, size_t llen,
620 const char *rdata, size_t rlen,
621 std::string *new_value) override {
11fdf7f2
TL
622 ceph_assert(llen == rlen);
623 ceph_assert((rlen % 8) == 0);
7c673cae 624 new_value->resize(rlen);
eafe8130
TL
625 const ceph_le64* lv = (const ceph_le64*)ldata;
626 const ceph_le64* rv = (const ceph_le64*)rdata;
627 ceph_le64* nv = &(ceph_le64&)new_value->at(0);
7c673cae
FG
628 for (size_t i = 0; i < rlen >> 3; ++i) {
629 nv[i] = lv[i] + rv[i];
630 }
631 }
632 // We use each operator name and each prefix to construct the
633 // overall RocksDB operator name for consistency check at open time.
91327a77 634 const char *name() const override {
7c673cae
FG
635 return "int64_array";
636 }
637};
638
639
640// Buffer
641
642ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
643{
644 out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
645 << b.offset << "~" << b.length << std::dec
646 << " " << BlueStore::Buffer::get_state_name(b.state);
647 if (b.flags)
648 out << " " << BlueStore::Buffer::get_flag_name(b.flags);
649 return out << ")";
650}
651
652// Garbage Collector
653
654void BlueStore::GarbageCollector::process_protrusive_extents(
655 const BlueStore::ExtentMap& extent_map,
656 uint64_t start_offset,
657 uint64_t end_offset,
658 uint64_t start_touch_offset,
659 uint64_t end_touch_offset,
660 uint64_t min_alloc_size)
661{
11fdf7f2 662 ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset);
7c673cae 663
11fdf7f2
TL
664 uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
665 uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
7c673cae
FG
666
667 dout(30) << __func__ << " (hex): [" << std::hex
668 << lookup_start_offset << ", " << lookup_end_offset
669 << ")" << std::dec << dendl;
670
671 for (auto it = extent_map.seek_lextent(lookup_start_offset);
672 it != extent_map.extent_map.end() &&
673 it->logical_offset < lookup_end_offset;
674 ++it) {
675 uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
676 uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
677
678 dout(30) << __func__ << " " << *it
679 << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
680 << dendl;
681
682 Blob* b = it->blob.get();
683
684 if (it->logical_offset >=start_touch_offset &&
685 it->logical_end() <= end_touch_offset) {
686 // Process extents within the range affected by
687 // the current write request.
688 // Need to take into account if existing extents
689 // can be merged with them (uncompressed case)
690 if (!b->get_blob().is_compressed()) {
691 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
692 --blob_info_counted->expected_allocations; // don't need to allocate
693 // new AU for compressed
694 // data since another
695 // collocated uncompressed
696 // blob already exists
697 dout(30) << __func__ << " --expected:"
698 << alloc_unit_start << dendl;
699 }
700 used_alloc_unit = alloc_unit_end;
701 blob_info_counted = nullptr;
702 }
703 } else if (b->get_blob().is_compressed()) {
704
705 // additionally we take compressed blobs that were not impacted
706 // by the write into account too
707 BlobInfo& bi =
708 affected_blobs.emplace(
709 b, BlobInfo(b->get_referenced_bytes())).first->second;
710
711 int adjust =
712 (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
713 bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
714 dout(30) << __func__ << " expected_allocations="
715 << bi.expected_allocations << " end_au:"
716 << alloc_unit_end << dendl;
717
718 blob_info_counted = &bi;
719 used_alloc_unit = alloc_unit_end;
720
11fdf7f2 721 ceph_assert(it->length <= bi.referenced_bytes);
7c673cae
FG
722 bi.referenced_bytes -= it->length;
723 dout(30) << __func__ << " affected_blob:" << *b
724 << " unref 0x" << std::hex << it->length
725 << " referenced = 0x" << bi.referenced_bytes
726 << std::dec << dendl;
727 // NOTE: we can't move specific blob to resulting GC list here
728 // when reference counter == 0 since subsequent extents might
729 // decrement its expected_allocation.
730 // Hence need to enumerate all the extents first.
731 if (!bi.collect_candidate) {
732 bi.first_lextent = it;
733 bi.collect_candidate = true;
734 }
735 bi.last_lextent = it;
736 } else {
737 if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
738 // don't need to allocate new AU for compressed data since another
739 // collocated uncompressed blob already exists
740 --blob_info_counted->expected_allocations;
741 dout(30) << __func__ << " --expected_allocations:"
742 << alloc_unit_start << dendl;
743 }
744 used_alloc_unit = alloc_unit_end;
745 blob_info_counted = nullptr;
746 }
747 }
748
749 for (auto b_it = affected_blobs.begin();
750 b_it != affected_blobs.end();
751 ++b_it) {
752 Blob* b = b_it->first;
753 BlobInfo& bi = b_it->second;
754 if (bi.referenced_bytes == 0) {
755 uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
756 int64_t blob_expected_for_release =
11fdf7f2 757 round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
7c673cae
FG
758
759 dout(30) << __func__ << " " << *(b_it->first)
760 << " expected4release=" << blob_expected_for_release
761 << " expected_allocations=" << bi.expected_allocations
762 << dendl;
763 int64_t benefit = blob_expected_for_release - bi.expected_allocations;
11fdf7f2 764 if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
7c673cae
FG
765 if (bi.collect_candidate) {
766 auto it = bi.first_lextent;
767 bool bExit = false;
768 do {
769 if (it->blob.get() == b) {
eafe8130 770 extents_to_collect.insert(it->logical_offset, it->length);
7c673cae
FG
771 }
772 bExit = it == bi.last_lextent;
773 ++it;
31f18b77 774 } while (!bExit);
7c673cae
FG
775 }
776 expected_for_release += blob_expected_for_release;
777 expected_allocations += bi.expected_allocations;
778 }
779 }
780 }
781}
782
783int64_t BlueStore::GarbageCollector::estimate(
784 uint64_t start_offset,
785 uint64_t length,
786 const BlueStore::ExtentMap& extent_map,
787 const BlueStore::old_extent_map_t& old_extents,
788 uint64_t min_alloc_size)
789{
790
791 affected_blobs.clear();
792 extents_to_collect.clear();
793 used_alloc_unit = boost::optional<uint64_t >();
794 blob_info_counted = nullptr;
795
eafe8130
TL
796 uint64_t gc_start_offset = start_offset;
797 uint64_t gc_end_offset = start_offset + length;
7c673cae
FG
798
799 uint64_t end_offset = start_offset + length;
800
801 for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
802 Blob* b = it->e.blob.get();
803 if (b->get_blob().is_compressed()) {
804
805 // update gc_start_offset/gc_end_offset if needed
806 gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
11fdf7f2 807 gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
7c673cae
FG
808
809 auto o = it->e.logical_offset;
810 auto l = it->e.length;
811
812 uint64_t ref_bytes = b->get_referenced_bytes();
813 // micro optimization to bypass blobs that have no more references
814 if (ref_bytes != 0) {
815 dout(30) << __func__ << " affected_blob:" << *b
816 << " unref 0x" << std::hex << o << "~" << l
817 << std::dec << dendl;
818 affected_blobs.emplace(b, BlobInfo(ref_bytes));
819 }
820 }
821 }
822 dout(30) << __func__ << " gc range(hex): [" << std::hex
823 << gc_start_offset << ", " << gc_end_offset
824 << ")" << std::dec << dendl;
825
826 // enumerate preceeding extents to check if they reference affected blobs
827 if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
828 process_protrusive_extents(extent_map,
829 gc_start_offset,
830 gc_end_offset,
831 start_offset,
832 end_offset,
833 min_alloc_size);
834 }
835 return expected_for_release - expected_allocations;
836}
837
9f95a23c
TL
838// LruOnodeCacheShard
839struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
840 typedef boost::intrusive::list<
841 BlueStore::Onode,
842 boost::intrusive::member_hook<
843 BlueStore::Onode,
844 boost::intrusive::list_member_hook<>,
845 &BlueStore::Onode::lru_item> > list_t;
846 typedef boost::intrusive::list<
847 BlueStore::Onode,
848 boost::intrusive::member_hook<
849 BlueStore::Onode,
850 boost::intrusive::list_member_hook<>,
851 &BlueStore::Onode::pin_item> > pin_list_t;
7c673cae 852
9f95a23c
TL
853 list_t lru;
854 pin_list_t pin_list;
7c673cae 855
9f95a23c 856 explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
7c673cae 857
9f95a23c
TL
858 void _add(BlueStore::OnodeRef& o, int level) override
859 {
860 ceph_assert(o->s == nullptr);
861 o->s = this;
862 if (o->nref > 1) {
863 pin_list.push_front(*o);
864 o->pinned = true;
865 num_pinned = pin_list.size();
866 } else {
867 (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
868 }
869 num = lru.size();
eafe8130 870 }
9f95a23c
TL
871 void _rm(BlueStore::OnodeRef& o) override
872 {
873 o->s = nullptr;
874 if (o->pinned) {
875 o->pinned = false;
876 pin_list.erase(pin_list.iterator_to(*o));
877 } else {
878 lru.erase(lru.iterator_to(*o));
879 }
880 num = lru.size();
881 num_pinned = pin_list.size();
882 }
883 void _touch(BlueStore::OnodeRef& o) override
884 {
885 if (o->pinned) {
886 return;
887 }
888 lru.erase(lru.iterator_to(*o));
889 lru.push_front(*o);
890 num = lru.size();
891 }
892 void _pin(BlueStore::Onode& o) override
893 {
894 if (o.pinned == true) {
895 return;
896 }
897 lru.erase(lru.iterator_to(o));
898 pin_list.push_front(o);
899 o.pinned = true;
900 num = lru.size();
901 num_pinned = pin_list.size();
902 dout(30) << __func__ << " " << o.oid << " pinned" << dendl;
7c673cae 903
9f95a23c
TL
904 }
905 void _unpin(BlueStore::Onode& o) override
906 {
907 if (o.pinned == false) {
908 return;
909 }
910 pin_list.erase(pin_list.iterator_to(o));
911 lru.push_front(o);
912 o.pinned = false;
913 num = lru.size();
914 num_pinned = pin_list.size();
915 dout(30) << __func__ << " " << o.oid << " unpinned" << dendl;
916 }
917 void _trim_to(uint64_t new_size) override
918 {
919 if (new_size >= lru.size()) {
920 return; // don't even try
921 }
922 uint64_t n = lru.size() - new_size;
923 auto p = lru.end();
924 ceph_assert(p != lru.begin());
925 --p;
926 while (n > 0) {
927 BlueStore::Onode *o = &*p;
928 dout(30) << __func__ << " rm " << o->oid << dendl;
929 if (p != lru.begin()) {
930 lru.erase(p--);
931 } else {
932 lru.erase(p);
933 ceph_assert(n == 1);
934 }
935 o->s = nullptr;
936 o->get(); // paranoia
937 o->c->onode_map.remove(o->oid);
938 o->put();
939 --n;
940 }
941 num = lru.size();
942 }
943 void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
944 {
945 *onodes += num + num_pinned;
946 *pinned_onodes += num_pinned;
947 }
948};
7c673cae 949
9f95a23c
TL
950// OnodeCacheShard
951BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
952 CephContext* cct,
953 string type,
954 PerfCounters *logger)
7c673cae 955{
9f95a23c
TL
956 BlueStore::OnodeCacheShard *c = nullptr;
957 // Currently we only implement an LRU cache for onodes
958 c = new LruOnodeCacheShard(cct);
959 c->logger = logger;
960 return c;
7c673cae
FG
961}
962
9f95a23c
TL
963// LruBufferCacheShard
964struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
965 typedef boost::intrusive::list<
966 BlueStore::Buffer,
967 boost::intrusive::member_hook<
968 BlueStore::Buffer,
969 boost::intrusive::list_member_hook<>,
970 &BlueStore::Buffer::lru_item> > list_t;
971 list_t lru;
972
973 explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
974
975 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
976 if (near) {
977 auto q = lru.iterator_to(*near);
978 lru.insert(q, *b);
979 } else if (level > 0) {
980 lru.push_front(*b);
981 } else {
982 lru.push_back(*b);
7c673cae 983 }
9f95a23c
TL
984 buffer_bytes += b->length;
985 num = lru.size();
986 }
987 void _rm(BlueStore::Buffer *b) override {
988 ceph_assert(buffer_bytes >= b->length);
989 buffer_bytes -= b->length;
990 auto q = lru.iterator_to(*b);
991 lru.erase(q);
992 num = lru.size();
993 }
994 void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
995 src->_rm(b);
996 _add(b, 0, nullptr);
997 }
998 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
999 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1000 buffer_bytes += delta;
1001 }
1002 void _touch(BlueStore::Buffer *b) override {
1003 auto p = lru.iterator_to(*b);
1004 lru.erase(p);
1005 lru.push_front(*b);
1006 num = lru.size();
1007 _audit("_touch_buffer end");
1008 }
7c673cae 1009
9f95a23c
TL
1010 void _trim_to(uint64_t max) override
1011 {
1012 while (buffer_bytes > max) {
1013 auto i = lru.rbegin();
1014 if (i == lru.rend()) {
1015 // stop if lru is now empty
7c673cae
FG
1016 break;
1017 }
1018
9f95a23c
TL
1019 BlueStore::Buffer *b = &*i;
1020 ceph_assert(b->is_clean());
1021 dout(20) << __func__ << " rm " << *b << dendl;
1022 b->space->_rm_buffer(this, b);
7c673cae 1023 }
9f95a23c 1024 num = lru.size();
7c673cae 1025 }
7c673cae 1026
9f95a23c
TL
1027 void add_stats(uint64_t *extents,
1028 uint64_t *blobs,
1029 uint64_t *buffers,
1030 uint64_t *bytes) override {
1031 *extents += num_extents;
1032 *blobs += num_blobs;
1033 *buffers += num;
1034 *bytes += buffer_bytes;
7c673cae 1035 }
9f95a23c
TL
1036#ifdef DEBUG_CACHE
1037 void _audit(const char *s) override
1038 {
1039 dout(10) << __func__ << " " << when << " start" << dendl;
1040 uint64_t s = 0;
1041 for (auto i = lru.begin(); i != lru.end(); ++i) {
1042 s += i->length;
1043 }
1044 if (s != buffer_bytes) {
1045 derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
1046 << dendl;
1047 for (auto i = lru.begin(); i != lru.end(); ++i) {
1048 derr << __func__ << " " << *i << dendl;
1049 }
1050 ceph_assert(s == buffer_bytes);
7c673cae 1051 }
9f95a23c
TL
1052 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1053 << " ok" << dendl;
7c673cae 1054 }
7c673cae 1055#endif
9f95a23c 1056};
7c673cae 1057
9f95a23c
TL
1058// TwoQBufferCacheShard
1059
1060struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
1061 typedef boost::intrusive::list<
1062 BlueStore::Buffer,
1063 boost::intrusive::member_hook<
1064 BlueStore::Buffer,
1065 boost::intrusive::list_member_hook<>,
1066 &BlueStore::Buffer::lru_item> > list_t;
1067 list_t hot; ///< "Am" hot buffers
1068 list_t warm_in; ///< "A1in" newly warm buffers
1069 list_t warm_out; ///< "A1out" empty buffers we've evicted
1070 uint64_t buffer_bytes = 0; ///< bytes
1071
1072 enum {
1073 BUFFER_NEW = 0,
1074 BUFFER_WARM_IN, ///< in warm_in
1075 BUFFER_WARM_OUT, ///< in warm_out
1076 BUFFER_HOT, ///< in hot
1077 BUFFER_TYPE_MAX
1078 };
7c673cae 1079
9f95a23c 1080 uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
7c673cae 1081
9f95a23c
TL
1082public:
1083 explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
7c673cae 1084
9f95a23c
TL
1085 void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
1086 {
1087 dout(20) << __func__ << " level " << level << " near " << near
1088 << " on " << *b
1089 << " which has cache_private " << b->cache_private << dendl;
1090 if (near) {
1091 b->cache_private = near->cache_private;
1092 switch (b->cache_private) {
1093 case BUFFER_WARM_IN:
1094 warm_in.insert(warm_in.iterator_to(*near), *b);
1095 break;
1096 case BUFFER_WARM_OUT:
1097 ceph_assert(b->is_empty());
1098 warm_out.insert(warm_out.iterator_to(*near), *b);
1099 break;
1100 case BUFFER_HOT:
1101 hot.insert(hot.iterator_to(*near), *b);
1102 break;
1103 default:
1104 ceph_abort_msg("bad cache_private");
1105 }
1106 } else if (b->cache_private == BUFFER_NEW) {
1107 b->cache_private = BUFFER_WARM_IN;
1108 if (level > 0) {
1109 warm_in.push_front(*b);
1110 } else {
1111 // take caller hint to start at the back of the warm queue
1112 warm_in.push_back(*b);
1113 }
1114 } else {
1115 // we got a hint from discard
1116 switch (b->cache_private) {
1117 case BUFFER_WARM_IN:
1118 // stay in warm_in. move to front, even though 2Q doesn't actually
1119 // do this.
1120 dout(20) << __func__ << " move to front of warm " << *b << dendl;
1121 warm_in.push_front(*b);
1122 break;
1123 case BUFFER_WARM_OUT:
1124 b->cache_private = BUFFER_HOT;
1125 // move to hot. fall-thru
1126 case BUFFER_HOT:
1127 dout(20) << __func__ << " move to front of hot " << *b << dendl;
1128 hot.push_front(*b);
1129 break;
1130 default:
1131 ceph_abort_msg("bad cache_private");
1132 }
1133 }
1134 if (!b->is_empty()) {
1135 buffer_bytes += b->length;
1136 list_bytes[b->cache_private] += b->length;
1137 }
1138 num = hot.size() + warm_in.size();
1139 }
1140
1141 void _rm(BlueStore::Buffer *b) override
1142 {
1143 dout(20) << __func__ << " " << *b << dendl;
1144 if (!b->is_empty()) {
1145 ceph_assert(buffer_bytes >= b->length);
1146 buffer_bytes -= b->length;
1147 ceph_assert(list_bytes[b->cache_private] >= b->length);
1148 list_bytes[b->cache_private] -= b->length;
1149 }
7c673cae
FG
1150 switch (b->cache_private) {
1151 case BUFFER_WARM_IN:
9f95a23c 1152 warm_in.erase(warm_in.iterator_to(*b));
7c673cae
FG
1153 break;
1154 case BUFFER_WARM_OUT:
9f95a23c 1155 warm_out.erase(warm_out.iterator_to(*b));
7c673cae
FG
1156 break;
1157 case BUFFER_HOT:
9f95a23c 1158 hot.erase(hot.iterator_to(*b));
7c673cae
FG
1159 break;
1160 default:
11fdf7f2 1161 ceph_abort_msg("bad cache_private");
7c673cae 1162 }
9f95a23c
TL
1163 num = hot.size() + warm_in.size();
1164 }
1165
1166 void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
1167 {
1168 TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
1169 src->_rm(b);
1170
1171 // preserve which list we're on (even if we can't preserve the order!)
7c673cae
FG
1172 switch (b->cache_private) {
1173 case BUFFER_WARM_IN:
9f95a23c
TL
1174 ceph_assert(!b->is_empty());
1175 warm_in.push_back(*b);
7c673cae
FG
1176 break;
1177 case BUFFER_WARM_OUT:
9f95a23c
TL
1178 ceph_assert(b->is_empty());
1179 warm_out.push_back(*b);
1180 break;
7c673cae 1181 case BUFFER_HOT:
9f95a23c
TL
1182 ceph_assert(!b->is_empty());
1183 hot.push_back(*b);
7c673cae
FG
1184 break;
1185 default:
11fdf7f2 1186 ceph_abort_msg("bad cache_private");
7c673cae 1187 }
9f95a23c
TL
1188 if (!b->is_empty()) {
1189 buffer_bytes += b->length;
1190 list_bytes[b->cache_private] += b->length;
1191 }
1192 num = hot.size() + warm_in.size();
7c673cae 1193 }
7c673cae 1194
9f95a23c
TL
1195 void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
1196 {
1197 dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
1198 if (!b->is_empty()) {
1199 ceph_assert((int64_t)buffer_bytes + delta >= 0);
1200 buffer_bytes += delta;
1201 ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
1202 list_bytes[b->cache_private] += delta;
1203 }
7c673cae 1204 }
7c673cae 1205
9f95a23c
TL
1206 void _touch(BlueStore::Buffer *b) override {
1207 switch (b->cache_private) {
1208 case BUFFER_WARM_IN:
1209 // do nothing (somewhat counter-intuitively!)
1210 break;
1211 case BUFFER_WARM_OUT:
1212 // move from warm_out to hot LRU
1213 ceph_abort_msg("this happens via discard hint");
1214 break;
1215 case BUFFER_HOT:
1216 // move to front of hot LRU
1217 hot.erase(hot.iterator_to(*b));
1218 hot.push_front(*b);
1219 break;
1220 }
1221 num = hot.size() + warm_in.size();
1222 _audit("_touch_buffer end");
7c673cae 1223 }
7c673cae 1224
9f95a23c
TL
1225 void _trim_to(uint64_t max) override
1226 {
1227 if (buffer_bytes > max) {
1228 uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
1229 uint64_t khot = max - kin;
1230
1231 // pre-calculate kout based on average buffer size too,
1232 // which is typical(the warm_in and hot lists may change later)
1233 uint64_t kout = 0;
1234 uint64_t buffer_num = hot.size() + warm_in.size();
1235 if (buffer_num) {
1236 uint64_t avg_size = buffer_bytes / buffer_num;
1237 ceph_assert(avg_size);
1238 uint64_t calculated_num = max / avg_size;
1239 kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
1240 }
1241
1242 if (list_bytes[BUFFER_HOT] < khot) {
1243 // hot is small, give slack to warm_in
1244 kin += khot - list_bytes[BUFFER_HOT];
1245 } else if (list_bytes[BUFFER_WARM_IN] < kin) {
1246 // warm_in is small, give slack to hot
1247 khot += kin - list_bytes[BUFFER_WARM_IN];
1248 }
1249
1250 // adjust warm_in list
1251 int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
1252 uint64_t evicted = 0;
1253
1254 while (to_evict_bytes > 0) {
1255 auto p = warm_in.rbegin();
1256 if (p == warm_in.rend()) {
1257 // stop if warm_in list is now empty
1258 break;
1259 }
7c673cae 1260
9f95a23c
TL
1261 BlueStore::Buffer *b = &*p;
1262 ceph_assert(b->is_clean());
1263 dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
1264 ceph_assert(buffer_bytes >= b->length);
1265 buffer_bytes -= b->length;
1266 ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
1267 list_bytes[BUFFER_WARM_IN] -= b->length;
1268 to_evict_bytes -= b->length;
1269 evicted += b->length;
1270 b->state = BlueStore::Buffer::STATE_EMPTY;
1271 b->data.clear();
1272 warm_in.erase(warm_in.iterator_to(*b));
1273 warm_out.push_front(*b);
1274 b->cache_private = BUFFER_WARM_OUT;
1275 }
1276
1277 if (evicted > 0) {
1278 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1279 << " from warm_in list, done evicting warm_in buffers"
1280 << dendl;
1281 }
7c673cae 1282
9f95a23c
TL
1283 // adjust hot list
1284 to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
1285 evicted = 0;
7c673cae 1286
9f95a23c
TL
1287 while (to_evict_bytes > 0) {
1288 auto p = hot.rbegin();
1289 if (p == hot.rend()) {
1290 // stop if hot list is now empty
1291 break;
1292 }
7c673cae 1293
9f95a23c
TL
1294 BlueStore::Buffer *b = &*p;
1295 dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
1296 ceph_assert(b->is_clean());
1297 // adjust evict size before buffer goes invalid
1298 to_evict_bytes -= b->length;
1299 evicted += b->length;
1300 b->space->_rm_buffer(this, b);
1301 }
7c673cae 1302
9f95a23c
TL
1303 if (evicted > 0) {
1304 dout(20) << __func__ << " evicted " << byte_u_t(evicted)
1305 << " from hot list, done evicting hot buffers"
1306 << dendl;
7c673cae
FG
1307 }
1308
9f95a23c
TL
1309 // adjust warm out list too, if necessary
1310 int64_t n = warm_out.size() - kout;
1311 while (n-- > 0) {
1312 BlueStore::Buffer *b = &*warm_out.rbegin();
1313 ceph_assert(b->is_empty());
1314 dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
1315 b->space->_rm_buffer(this, b);
1316 }
7c673cae 1317 }
9f95a23c
TL
1318 num = hot.size() + warm_in.size();
1319 }
7c673cae 1320
9f95a23c
TL
1321 void add_stats(uint64_t *extents,
1322 uint64_t *blobs,
1323 uint64_t *buffers,
1324 uint64_t *bytes) override {
1325 *extents += num_extents;
1326 *blobs += num_blobs;
1327 *buffers += num;
1328 *bytes += buffer_bytes;
1329 }
7c673cae 1330
9f95a23c
TL
1331#ifdef DEBUG_CACHE
1332 void _audit(const char *s) override
1333 {
1334 dout(10) << __func__ << " " << when << " start" << dendl;
1335 uint64_t s = 0;
1336 for (auto i = hot.begin(); i != hot.end(); ++i) {
1337 s += i->length;
7c673cae
FG
1338 }
1339
9f95a23c
TL
1340 uint64_t hot_bytes = s;
1341 if (hot_bytes != list_bytes[BUFFER_HOT]) {
1342 derr << __func__ << " hot_list_bytes "
1343 << list_bytes[BUFFER_HOT]
1344 << " != actual " << hot_bytes
1345 << dendl;
1346 ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
7c673cae
FG
1347 }
1348
9f95a23c
TL
1349 for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
1350 s += i->length;
7c673cae 1351 }
7c673cae 1352
9f95a23c
TL
1353 uint64_t warm_in_bytes = s - hot_bytes;
1354 if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
1355 derr << __func__ << " warm_in_list_bytes "
1356 << list_bytes[BUFFER_WARM_IN]
1357 << " != actual " << warm_in_bytes
1358 << dendl;
1359 ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
7c673cae 1360 }
7c673cae 1361
9f95a23c
TL
1362 if (s != buffer_bytes) {
1363 derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
1364 << dendl;
1365 ceph_assert(s == buffer_bytes);
1366 }
7c673cae 1367
9f95a23c
TL
1368 dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
1369 << " ok" << dendl;
7c673cae 1370 }
9f95a23c
TL
1371#endif
1372};
7c673cae 1373
9f95a23c 1374// BuferCacheShard
7c673cae 1375
9f95a23c
TL
1376BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
1377 CephContext* cct,
1378 string type,
1379 PerfCounters *logger)
1380{
1381 BufferCacheShard *c = nullptr;
1382 if (type == "lru")
1383 c = new LruBufferCacheShard(cct);
1384 else if (type == "2q")
1385 c = new TwoQBufferCacheShard(cct);
1386 else
1387 ceph_abort_msg("unrecognized cache type");
1388 c->logger = logger;
1389 return c;
7c673cae 1390}
7c673cae
FG
1391
1392// BufferSpace
1393
1394#undef dout_prefix
1395#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1396
9f95a23c 1397void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
7c673cae
FG
1398{
1399 // note: we already hold cache->lock
1400 ldout(cache->cct, 20) << __func__ << dendl;
1401 while (!buffer_map.empty()) {
1402 _rm_buffer(cache, buffer_map.begin());
1403 }
1404}
1405
9f95a23c 1406int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
7c673cae
FG
1407{
1408 // note: we already hold cache->lock
1409 ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
1410 << std::dec << dendl;
1411 int cache_private = 0;
1412 cache->_audit("discard start");
1413 auto i = _data_lower_bound(offset);
1414 uint32_t end = offset + length;
1415 while (i != buffer_map.end()) {
1416 Buffer *b = i->second.get();
1417 if (b->offset >= end) {
1418 break;
1419 }
1420 if (b->cache_private > cache_private) {
1421 cache_private = b->cache_private;
1422 }
1423 if (b->offset < offset) {
1424 int64_t front = offset - b->offset;
1425 if (b->end() > end) {
1426 // drop middle (split)
1427 uint32_t tail = b->end() - end;
1428 if (b->data.length()) {
1429 bufferlist bl;
1430 bl.substr_of(b->data, b->length - tail, tail);
31f18b77
FG
1431 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1432 nb->maybe_rebuild();
1433 _add_buffer(cache, nb, 0, b);
7c673cae 1434 } else {
31f18b77
FG
1435 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
1436 0, b);
7c673cae
FG
1437 }
1438 if (!b->is_writing()) {
9f95a23c 1439 cache->_adjust_size(b, front - (int64_t)b->length);
7c673cae
FG
1440 }
1441 b->truncate(front);
31f18b77 1442 b->maybe_rebuild();
7c673cae
FG
1443 cache->_audit("discard end 1");
1444 break;
1445 } else {
1446 // drop tail
1447 if (!b->is_writing()) {
9f95a23c 1448 cache->_adjust_size(b, front - (int64_t)b->length);
7c673cae
FG
1449 }
1450 b->truncate(front);
31f18b77 1451 b->maybe_rebuild();
7c673cae
FG
1452 ++i;
1453 continue;
1454 }
1455 }
1456 if (b->end() <= end) {
1457 // drop entire buffer
1458 _rm_buffer(cache, i++);
1459 continue;
1460 }
1461 // drop front
1462 uint32_t keep = b->end() - end;
1463 if (b->data.length()) {
1464 bufferlist bl;
1465 bl.substr_of(b->data, b->length - keep, keep);
31f18b77
FG
1466 Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
1467 nb->maybe_rebuild();
1468 _add_buffer(cache, nb, 0, b);
7c673cae
FG
1469 } else {
1470 _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
1471 }
1472 _rm_buffer(cache, i);
1473 cache->_audit("discard end 2");
1474 break;
1475 }
1476 return cache_private;
1477}
1478
1479void BlueStore::BufferSpace::read(
9f95a23c 1480 BufferCacheShard* cache,
224ce89b
WB
1481 uint32_t offset,
1482 uint32_t length,
7c673cae 1483 BlueStore::ready_regions_t& res,
91327a77
AA
1484 interval_set<uint32_t>& res_intervals,
1485 int flags)
7c673cae 1486{
7c673cae
FG
1487 res.clear();
1488 res_intervals.clear();
1489 uint32_t want_bytes = length;
1490 uint32_t end = offset + length;
224ce89b
WB
1491
1492 {
11fdf7f2 1493 std::lock_guard l(cache->lock);
224ce89b
WB
1494 for (auto i = _data_lower_bound(offset);
1495 i != buffer_map.end() && offset < end && i->first < end;
1496 ++i) {
1497 Buffer *b = i->second.get();
11fdf7f2 1498 ceph_assert(b->end() > offset);
91327a77
AA
1499
1500 bool val = false;
1501 if (flags & BYPASS_CLEAN_CACHE)
1502 val = b->is_writing();
1503 else
1504 val = b->is_writing() || b->is_clean();
1505 if (val) {
224ce89b
WB
1506 if (b->offset < offset) {
1507 uint32_t skip = offset - b->offset;
11fdf7f2 1508 uint32_t l = min(length, b->length - skip);
224ce89b
WB
1509 res[offset].substr_of(b->data, skip, l);
1510 res_intervals.insert(offset, l);
1511 offset += l;
1512 length -= l;
1513 if (!b->is_writing()) {
9f95a23c 1514 cache->_touch(b);
224ce89b
WB
1515 }
1516 continue;
1517 }
1518 if (b->offset > offset) {
1519 uint32_t gap = b->offset - offset;
1520 if (length <= gap) {
1521 break;
1522 }
1523 offset += gap;
1524 length -= gap;
1525 }
1526 if (!b->is_writing()) {
9f95a23c 1527 cache->_touch(b);
224ce89b
WB
1528 }
1529 if (b->length > length) {
1530 res[offset].substr_of(b->data, 0, length);
1531 res_intervals.insert(offset, length);
7c673cae 1532 break;
224ce89b
WB
1533 } else {
1534 res[offset].append(b->data);
1535 res_intervals.insert(offset, b->length);
1536 if (b->length == length)
1537 break;
1538 offset += b->length;
1539 length -= b->length;
1540 }
7c673cae
FG
1541 }
1542 }
1543 }
1544
1545 uint64_t hit_bytes = res_intervals.size();
11fdf7f2 1546 ceph_assert(hit_bytes <= want_bytes);
7c673cae
FG
1547 uint64_t miss_bytes = want_bytes - hit_bytes;
1548 cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
1549 cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
1550}
1551
9f95a23c 1552void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
7c673cae 1553{
7c673cae
FG
1554 auto i = writing.begin();
1555 while (i != writing.end()) {
1556 if (i->seq > seq) {
1557 break;
1558 }
1559 if (i->seq < seq) {
1560 ++i;
1561 continue;
1562 }
1563
1564 Buffer *b = &*i;
11fdf7f2 1565 ceph_assert(b->is_writing());
7c673cae
FG
1566
1567 if (b->flags & Buffer::FLAG_NOCACHE) {
1568 writing.erase(i++);
1569 ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
1570 buffer_map.erase(b->offset);
1571 } else {
1572 b->state = Buffer::STATE_CLEAN;
1573 writing.erase(i++);
31f18b77
FG
1574 b->maybe_rebuild();
1575 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
9f95a23c 1576 cache->_add(b, 1, nullptr);
7c673cae
FG
1577 ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
1578 }
1579 }
9f95a23c 1580 cache->_trim();
7c673cae
FG
1581 cache->_audit("finish_write end");
1582}
1583
9f95a23c 1584void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
7c673cae 1585{
11fdf7f2 1586 std::lock_guard lk(cache->lock);
7c673cae
FG
1587 if (buffer_map.empty())
1588 return;
1589
1590 auto p = --buffer_map.end();
1591 while (true) {
1592 if (p->second->end() <= pos)
1593 break;
1594
1595 if (p->second->offset < pos) {
1596 ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
1597 size_t left = pos - p->second->offset;
1598 size_t right = p->second->length - left;
1599 if (p->second->data.length()) {
1600 bufferlist bl;
1601 bl.substr_of(p->second->data, left, right);
1602 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
1603 0, p->second.get());
1604 } else {
1605 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
1606 0, p->second.get());
1607 }
9f95a23c 1608 cache->_adjust_size(p->second.get(), -right);
7c673cae
FG
1609 p->second->truncate(left);
1610 break;
1611 }
1612
11fdf7f2 1613 ceph_assert(p->second->end() > pos);
7c673cae
FG
1614 ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
1615 if (p->second->data.length()) {
1616 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1617 p->second->offset - pos, p->second->data),
1618 0, p->second.get());
1619 } else {
1620 r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
1621 p->second->offset - pos, p->second->length),
1622 0, p->second.get());
1623 }
1624 if (p == buffer_map.begin()) {
1625 _rm_buffer(cache, p);
1626 break;
1627 } else {
1628 _rm_buffer(cache, p--);
1629 }
1630 }
11fdf7f2 1631 ceph_assert(writing.empty());
9f95a23c 1632 cache->_trim();
7c673cae
FG
1633}
1634
1635// OnodeSpace
1636
1637#undef dout_prefix
1638#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1639
1640BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
1641{
11fdf7f2 1642 std::lock_guard l(cache->lock);
7c673cae
FG
1643 auto p = onode_map.find(oid);
1644 if (p != onode_map.end()) {
1645 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
1646 << " raced, returning existing " << p->second
1647 << dendl;
1648 return p->second;
1649 }
1650 ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
1651 onode_map[oid] = o;
9f95a23c
TL
1652 cache->_add(o, 1);
1653 cache->_trim();
7c673cae
FG
1654 return o;
1655}
1656
1657BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
1658{
7c673cae 1659 ldout(cache->cct, 30) << __func__ << dendl;
224ce89b
WB
1660 OnodeRef o;
1661 bool hit = false;
1662
1663 {
11fdf7f2 1664 std::lock_guard l(cache->lock);
224ce89b
WB
1665 ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
1666 if (p == onode_map.end()) {
1667 ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
1668 } else {
1669 ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
1670 << dendl;
9f95a23c 1671 cache->_touch(p->second);
224ce89b
WB
1672 hit = true;
1673 o = p->second;
1674 }
1675 }
1676
1677 if (hit) {
1678 cache->logger->inc(l_bluestore_onode_hits);
1679 } else {
7c673cae 1680 cache->logger->inc(l_bluestore_onode_misses);
7c673cae 1681 }
224ce89b 1682 return o;
7c673cae
FG
1683}
1684
1685void BlueStore::OnodeSpace::clear()
1686{
11fdf7f2 1687 std::lock_guard l(cache->lock);
7c673cae
FG
1688 ldout(cache->cct, 10) << __func__ << dendl;
1689 for (auto &p : onode_map) {
9f95a23c 1690 cache->_rm(p.second);
7c673cae
FG
1691 }
1692 onode_map.clear();
1693}
1694
1695bool BlueStore::OnodeSpace::empty()
1696{
11fdf7f2 1697 std::lock_guard l(cache->lock);
7c673cae
FG
1698 return onode_map.empty();
1699}
1700
1701void BlueStore::OnodeSpace::rename(
1702 OnodeRef& oldo,
1703 const ghobject_t& old_oid,
1704 const ghobject_t& new_oid,
31f18b77 1705 const mempool::bluestore_cache_other::string& new_okey)
7c673cae 1706{
11fdf7f2 1707 std::lock_guard l(cache->lock);
7c673cae
FG
1708 ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
1709 << dendl;
1710 ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
1711 po = onode_map.find(old_oid);
1712 pn = onode_map.find(new_oid);
11fdf7f2 1713 ceph_assert(po != pn);
7c673cae 1714
11fdf7f2 1715 ceph_assert(po != onode_map.end());
7c673cae
FG
1716 if (pn != onode_map.end()) {
1717 ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
1718 << dendl;
9f95a23c 1719 cache->_rm(pn->second);
7c673cae
FG
1720 onode_map.erase(pn);
1721 }
1722 OnodeRef o = po->second;
1723
1724 // install a non-existent onode at old location
1725 oldo.reset(new Onode(o->c, old_oid, o->key));
1726 po->second = oldo;
9f95a23c 1727 cache->_add(po->second, 1);
7c673cae
FG
1728 // add at new position and fix oid, key
1729 onode_map.insert(make_pair(new_oid, o));
9f95a23c 1730 cache->_touch(o);
7c673cae
FG
1731 o->oid = new_oid;
1732 o->key = new_okey;
9f95a23c 1733 cache->_trim();
7c673cae
FG
1734}
1735
1736bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
1737{
11fdf7f2 1738 std::lock_guard l(cache->lock);
7c673cae
FG
1739 ldout(cache->cct, 20) << __func__ << dendl;
1740 for (auto& i : onode_map) {
1741 if (f(i.second)) {
1742 return true;
1743 }
1744 }
1745 return false;
1746}
1747
11fdf7f2
TL
1748template <int LogLevelV = 30>
1749void BlueStore::OnodeSpace::dump(CephContext *cct)
3efd9988
FG
1750{
1751 for (auto& i : onode_map) {
11fdf7f2 1752 ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
3efd9988
FG
1753 }
1754}
7c673cae
FG
1755
1756// SharedBlob
1757
1758#undef dout_prefix
1759#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
9f95a23c
TL
1760#undef dout_context
1761#define dout_context coll->store->cct
7c673cae 1762
9f95a23c 1763void BlueStore::SharedBlob::dump(Formatter* f) const
7c673cae 1764{
9f95a23c
TL
1765 f->dump_bool("loaded", loaded);
1766 if (loaded) {
1767 persistent->dump(f);
1768 } else {
1769 f->dump_unsigned("sbid_unloaded", sbid_unloaded);
1770 }
1771}
1772
1773ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
1774{
1775 out << "SharedBlob(" << &sb;
1776
7c673cae
FG
1777 if (sb.loaded) {
1778 out << " loaded " << *sb.persistent;
1779 } else {
1780 out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
1781 }
1782 return out << ")";
1783}
1784
1785BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
1786 : coll(_coll), sbid_unloaded(i)
1787{
11fdf7f2 1788 ceph_assert(sbid_unloaded > 0);
7c673cae
FG
1789 if (get_cache()) {
1790 get_cache()->add_blob();
1791 }
1792}
1793
1794BlueStore::SharedBlob::~SharedBlob()
1795{
7c673cae
FG
1796 if (loaded && persistent) {
1797 delete persistent;
1798 }
1799}
1800
1801void BlueStore::SharedBlob::put()
1802{
1803 if (--nref == 0) {
9f95a23c
TL
1804 dout(20) << __func__ << " " << this
1805 << " removing self from set " << get_parent()
1806 << dendl;
1adf2230
AA
1807 again:
1808 auto coll_snap = coll;
1809 if (coll_snap) {
11fdf7f2 1810 std::lock_guard l(coll_snap->cache->lock);
1adf2230
AA
1811 if (coll_snap != coll) {
1812 goto again;
1813 }
91327a77
AA
1814 if (!coll_snap->shared_blob_set.remove(this, true)) {
1815 // race with lookup
1816 return;
1817 }
1adf2230
AA
1818 bc._clear(coll_snap->cache);
1819 coll_snap->cache->rm_blob();
7c673cae 1820 }
28e407b8 1821 delete this;
7c673cae
FG
1822 }
1823}
1824
1825void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
1826{
11fdf7f2 1827 ceph_assert(persistent);
7c673cae
FG
1828 persistent->ref_map.get(offset, length);
1829}
1830
1831void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
31f18b77 1832 PExtentVector *r,
11fdf7f2 1833 bool *unshare)
7c673cae 1834{
11fdf7f2
TL
1835 ceph_assert(persistent);
1836 persistent->ref_map.put(offset, length, r,
1837 unshare && !*unshare ? unshare : nullptr);
7c673cae
FG
1838}
1839
f64942e4
AA
1840void BlueStore::SharedBlob::finish_write(uint64_t seq)
1841{
1842 while (true) {
9f95a23c 1843 BufferCacheShard *cache = coll->cache;
11fdf7f2 1844 std::lock_guard l(cache->lock);
f64942e4 1845 if (coll->cache != cache) {
9f95a23c
TL
1846 dout(20) << __func__
1847 << " raced with sb cache update, was " << cache
1848 << ", now " << coll->cache << ", retrying"
1849 << dendl;
f64942e4
AA
1850 continue;
1851 }
1852 bc._finish_write(cache, seq);
1853 break;
1854 }
1855}
1856
3efd9988
FG
1857// SharedBlobSet
1858
1859#undef dout_prefix
1860#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
1861
11fdf7f2
TL
1862template <int LogLevelV = 30>
1863void BlueStore::SharedBlobSet::dump(CephContext *cct)
3efd9988 1864{
11fdf7f2 1865 std::lock_guard l(lock);
3efd9988 1866 for (auto& i : sb_map) {
11fdf7f2 1867 ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
3efd9988
FG
1868 }
1869}
1870
7c673cae
FG
1871// Blob
1872
1873#undef dout_prefix
1874#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1875
9f95a23c
TL
1876void BlueStore::Blob::dump(Formatter* f) const
1877{
1878 if (is_spanning()) {
1879 f->dump_unsigned("spanning_id ", id);
1880 }
1881 blob.dump(f);
1882 if (shared_blob) {
1883 f->dump_object("shared", *shared_blob);
1884 }
1885}
1886
7c673cae
FG
1887ostream& operator<<(ostream& out, const BlueStore::Blob& b)
1888{
1889 out << "Blob(" << &b;
1890 if (b.is_spanning()) {
1891 out << " spanning " << b.id;
1892 }
35e4c445
FG
1893 out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
1894 if (b.shared_blob) {
1895 out << " " << *b.shared_blob;
1896 } else {
1897 out << " (shared_blob=NULL)";
1898 }
1899 out << ")";
7c673cae
FG
1900 return out;
1901}
1902
1903void BlueStore::Blob::discard_unallocated(Collection *coll)
1904{
224ce89b 1905 if (get_blob().is_shared()) {
7c673cae
FG
1906 return;
1907 }
224ce89b 1908 if (get_blob().is_compressed()) {
7c673cae
FG
1909 bool discard = false;
1910 bool all_invalid = true;
224ce89b 1911 for (auto e : get_blob().get_extents()) {
7c673cae
FG
1912 if (!e.is_valid()) {
1913 discard = true;
1914 } else {
1915 all_invalid = false;
1916 }
1917 }
11fdf7f2 1918 ceph_assert(discard == all_invalid); // in case of compressed blob all
7c673cae
FG
1919 // or none pextents are invalid.
1920 if (discard) {
224ce89b
WB
1921 shared_blob->bc.discard(shared_blob->get_cache(), 0,
1922 get_blob().get_logical_length());
7c673cae
FG
1923 }
1924 } else {
1925 size_t pos = 0;
224ce89b 1926 for (auto e : get_blob().get_extents()) {
7c673cae 1927 if (!e.is_valid()) {
9f95a23c
TL
1928 dout(20) << __func__ << " 0x" << std::hex << pos
1929 << "~" << e.length
1930 << std::dec << dendl;
7c673cae
FG
1931 shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
1932 }
1933 pos += e.length;
1934 }
224ce89b
WB
1935 if (get_blob().can_prune_tail()) {
1936 dirty_blob().prune_tail();
1937 used_in_blob.prune_tail(get_blob().get_ondisk_length());
224ce89b 1938 dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
7c673cae
FG
1939 }
1940 }
1941}
1942
1943void BlueStore::Blob::get_ref(
1944 Collection *coll,
1945 uint32_t offset,
1946 uint32_t length)
1947{
1948 // Caller has to initialize Blob's logical length prior to increment
1949 // references. Otherwise one is neither unable to determine required
1950 // amount of counters in case of per-au tracking nor obtain min_release_size
1951 // for single counter mode.
11fdf7f2 1952 ceph_assert(get_blob().get_logical_length() != 0);
7c673cae
FG
1953 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1954 << std::dec << " " << *this << dendl;
1955
1956 if (used_in_blob.is_empty()) {
1957 uint32_t min_release_size =
224ce89b
WB
1958 get_blob().get_release_size(coll->store->min_alloc_size);
1959 uint64_t l = get_blob().get_logical_length();
1960 dout(20) << __func__ << " init 0x" << std::hex << l << ", "
1961 << min_release_size << std::dec << dendl;
7c673cae
FG
1962 used_in_blob.init(l, min_release_size);
1963 }
1964 used_in_blob.get(
1965 offset,
1966 length);
1967}
1968
1969bool BlueStore::Blob::put_ref(
1970 Collection *coll,
1971 uint32_t offset,
1972 uint32_t length,
1973 PExtentVector *r)
1974{
1975 PExtentVector logical;
1976
7c673cae
FG
1977 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
1978 << std::dec << " " << *this << dendl;
1979
1980 bool empty = used_in_blob.put(
1981 offset,
1982 length,
1983 &logical);
1984 r->clear();
1985 // nothing to release
1986 if (!empty && logical.empty()) {
1987 return false;
1988 }
1989
1990 bluestore_blob_t& b = dirty_blob();
1991 return b.release_extents(empty, logical, r);
1992}
1993
224ce89b 1994bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
1995 uint32_t target_blob_size,
1996 uint32_t b_offset,
1997 uint32_t *length0) {
11fdf7f2
TL
1998 ceph_assert(min_alloc_size);
1999 ceph_assert(target_blob_size);
7c673cae
FG
2000 if (!get_blob().is_mutable()) {
2001 return false;
2002 }
2003
2004 uint32_t length = *length0;
2005 uint32_t end = b_offset + length;
2006
2007 // Currently for the sake of simplicity we omit blob reuse if data is
2008 // unaligned with csum chunk. Later we can perform padding if needed.
2009 if (get_blob().has_csum() &&
2010 ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
2011 (end % get_blob().get_csum_chunk_size()) != 0)) {
2012 return false;
2013 }
2014
2015 auto blen = get_blob().get_logical_length();
2016 uint32_t new_blen = blen;
2017
2018 // make sure target_blob_size isn't less than current blob len
11fdf7f2 2019 target_blob_size = std::max(blen, target_blob_size);
7c673cae
FG
2020
2021 if (b_offset >= blen) {
224ce89b
WB
2022 // new data totally stands out of the existing blob
2023 new_blen = end;
7c673cae 2024 } else {
224ce89b 2025 // new data overlaps with the existing blob
11fdf7f2 2026 new_blen = std::max(blen, end);
224ce89b
WB
2027
2028 uint32_t overlap = 0;
2029 if (new_blen > blen) {
2030 overlap = blen - b_offset;
2031 } else {
2032 overlap = length;
2033 }
2034
2035 if (!get_blob().is_unallocated(b_offset, overlap)) {
2036 // abort if any piece of the overlap has already been allocated
2037 return false;
7c673cae
FG
2038 }
2039 }
224ce89b 2040
7c673cae
FG
2041 if (new_blen > blen) {
2042 int64_t overflow = int64_t(new_blen) - target_blob_size;
2043 // Unable to decrease the provided length to fit into max_blob_size
2044 if (overflow >= length) {
2045 return false;
2046 }
2047
2048 // FIXME: in some cases we could reduce unused resolution
2049 if (get_blob().has_unused()) {
2050 return false;
2051 }
2052
2053 if (overflow > 0) {
2054 new_blen -= overflow;
2055 length -= overflow;
2056 *length0 = length;
2057 }
224ce89b 2058
7c673cae
FG
2059 if (new_blen > blen) {
2060 dirty_blob().add_tail(new_blen);
2061 used_in_blob.add_tail(new_blen,
224ce89b 2062 get_blob().get_release_size(min_alloc_size));
7c673cae
FG
2063 }
2064 }
2065 return true;
2066}
2067
2068void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
2069{
7c673cae
FG
2070 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2071 << " start " << *this << dendl;
11fdf7f2
TL
2072 ceph_assert(blob.can_split());
2073 ceph_assert(used_in_blob.can_split());
7c673cae
FG
2074 bluestore_blob_t &lb = dirty_blob();
2075 bluestore_blob_t &rb = r->dirty_blob();
2076
2077 used_in_blob.split(
2078 blob_offset,
2079 &(r->used_in_blob));
2080
2081 lb.split(blob_offset, rb);
2082 shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
2083
2084 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2085 << " finish " << *this << dendl;
2086 dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
2087 << " and " << *r << dendl;
2088}
2089
2090#ifndef CACHE_BLOB_BL
2091void BlueStore::Blob::decode(
2092 Collection *coll,
11fdf7f2 2093 bufferptr::const_iterator& p,
7c673cae
FG
2094 uint64_t struct_v,
2095 uint64_t* sbid,
2096 bool include_ref_map)
2097{
2098 denc(blob, p, struct_v);
2099 if (blob.is_shared()) {
2100 denc(*sbid, p);
2101 }
2102 if (include_ref_map) {
2103 if (struct_v > 1) {
2104 used_in_blob.decode(p);
2105 } else {
2106 used_in_blob.clear();
2107 bluestore_extent_ref_map_t legacy_ref_map;
2108 legacy_ref_map.decode(p);
2109 for (auto r : legacy_ref_map.ref_map) {
2110 get_ref(
2111 coll,
2112 r.first,
2113 r.second.refs * r.second.length);
2114 }
2115 }
2116 }
2117}
2118#endif
2119
2120// Extent
2121
9f95a23c
TL
2122void BlueStore::Extent::dump(Formatter* f) const
2123{
2124 f->dump_unsigned("logical_offset", logical_offset);
2125 f->dump_unsigned("length", length);
2126 f->dump_unsigned("blob_offset", blob_offset);
2127 f->dump_object("blob", *blob);
2128}
2129
7c673cae
FG
2130ostream& operator<<(ostream& out, const BlueStore::Extent& e)
2131{
2132 return out << std::hex << "0x" << e.logical_offset << "~" << e.length
2133 << ": 0x" << e.blob_offset << "~" << e.length << std::dec
2134 << " " << *e.blob;
2135}
2136
2137// OldExtent
2138BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
2139 uint32_t lo,
2140 uint32_t o,
2141 uint32_t l,
2142 BlobRef& b) {
2143 OldExtent* oe = new OldExtent(lo, o, l, b);
2144 b->put_ref(c.get(), o, l, &(oe->r));
2145 oe->blob_empty = b->get_referenced_bytes() == 0;
2146 return oe;
2147}
2148
2149// ExtentMap
2150
2151#undef dout_prefix
2152#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
9f95a23c
TL
2153#undef dout_context
2154#define dout_context onode->c->store->cct
7c673cae
FG
2155
2156BlueStore::ExtentMap::ExtentMap(Onode *o)
2157 : onode(o),
2158 inline_bl(
2159 o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
2160}
2161
9f95a23c
TL
2162void BlueStore::ExtentMap::dump(Formatter* f) const
2163{
2164 f->open_array_section("extents");
2165
2166 for (auto& e : extent_map) {
2167 f->dump_object("extent", e);
2168 }
2169 f->close_section();
2170}
2171
11fdf7f2
TL
2172void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
2173 CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
2174 uint64_t& length, uint64_t& dstoff) {
2175
2176 auto cct = onode->c->store->cct;
2177 bool inject_21040 =
2178 cct->_conf->bluestore_debug_inject_bug21040;
2179 vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
2180 for (auto& e : oldo->extent_map.extent_map) {
2181 e.blob->last_encoded_id = -1;
2182 }
2183
2184 int n = 0;
2185 uint64_t end = srcoff + length;
2186 uint32_t dirty_range_begin = 0;
2187 uint32_t dirty_range_end = 0;
2188 bool src_dirty = false;
2189 for (auto ep = oldo->extent_map.seek_lextent(srcoff);
2190 ep != oldo->extent_map.extent_map.end();
2191 ++ep) {
2192 auto& e = *ep;
2193 if (e.logical_offset >= end) {
2194 break;
2195 }
2196 dout(20) << __func__ << " src " << e << dendl;
2197 BlobRef cb;
2198 bool blob_duped = true;
2199 if (e.blob->last_encoded_id >= 0) {
2200 cb = id_to_blob[e.blob->last_encoded_id];
2201 blob_duped = false;
2202 } else {
2203 // dup the blob
2204 const bluestore_blob_t& blob = e.blob->get_blob();
2205 // make sure it is shared
2206 if (!blob.is_shared()) {
2207 c->make_blob_shared(b->_assign_blobid(txc), e.blob);
2208 if (!inject_21040 && !src_dirty) {
2209 src_dirty = true;
2210 dirty_range_begin = e.logical_offset;
2211 } else if (inject_21040 &&
2212 dirty_range_begin == 0 && dirty_range_end == 0) {
2213 dirty_range_begin = e.logical_offset;
2214 }
2215 ceph_assert(e.logical_end() > 0);
2216 // -1 to exclude next potential shard
2217 dirty_range_end = e.logical_end() - 1;
2218 } else {
2219 c->load_shared_blob(e.blob->shared_blob);
2220 }
2221 cb = new Blob();
2222 e.blob->last_encoded_id = n;
2223 id_to_blob[n] = cb;
2224 e.blob->dup(*cb);
2225 // bump the extent refs on the copied blob's extents
2226 for (auto p : blob.get_extents()) {
2227 if (p.is_valid()) {
2228 e.blob->shared_blob->get_ref(p.offset, p.length);
2229 }
2230 }
2231 txc->write_shared_blob(e.blob->shared_blob);
2232 dout(20) << __func__ << " new " << *cb << dendl;
2233 }
2234
2235 int skip_front, skip_back;
2236 if (e.logical_offset < srcoff) {
2237 skip_front = srcoff - e.logical_offset;
2238 } else {
2239 skip_front = 0;
2240 }
2241 if (e.logical_end() > end) {
2242 skip_back = e.logical_end() - end;
2243 } else {
2244 skip_back = 0;
2245 }
2246
2247 Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
2248 e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
2249 newo->extent_map.extent_map.insert(*ne);
2250 ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
2251 // fixme: we may leave parts of new blob unreferenced that could
2252 // be freed (relative to the shared_blob).
2253 txc->statfs_delta.stored() += ne->length;
2254 if (e.blob->get_blob().is_compressed()) {
2255 txc->statfs_delta.compressed_original() += ne->length;
2256 if (blob_duped) {
2257 txc->statfs_delta.compressed() +=
2258 cb->get_blob().get_compressed_payload_length();
2259 }
2260 }
2261 dout(20) << __func__ << " dst " << *ne << dendl;
2262 ++n;
2263 }
2264 if ((!inject_21040 && src_dirty) ||
2265 (inject_21040 && dirty_range_end > dirty_range_begin)) {
2266 oldo->extent_map.dirty_range(dirty_range_begin,
2267 dirty_range_end - dirty_range_begin);
2268 txc->write_onode(oldo);
2269 }
2270 txc->write_onode(newo);
2271
2272 if (dstoff + length > newo->onode.size) {
2273 newo->onode.size = dstoff + length;
2274 }
2275 newo->extent_map.dirty_range(dstoff, length);
2276}
7c673cae
FG
2277void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
2278 bool force)
2279{
2280 auto cct = onode->c->store->cct; //used by dout
2281 dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
2282 if (onode->onode.extent_map_shards.empty()) {
2283 if (inline_bl.length() == 0) {
2284 unsigned n;
2285 // we need to encode inline_bl to measure encoded length
2286 bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
3efd9988 2287 inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
11fdf7f2 2288 ceph_assert(!never_happen);
7c673cae
FG
2289 size_t len = inline_bl.length();
2290 dout(20) << __func__ << " inline shard " << len << " bytes from " << n
2291 << " extents" << dendl;
2292 if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
2293 request_reshard(0, OBJECT_MAX_SIZE);
2294 return;
2295 }
2296 }
2297 // will persist in the onode key.
2298 } else {
2299 // pending shard update
2300 struct dirty_shard_t {
2301 Shard *shard;
2302 bufferlist bl;
2303 dirty_shard_t(Shard *s) : shard(s) {}
2304 };
2305 vector<dirty_shard_t> encoded_shards;
2306 // allocate slots for all shards in a single call instead of
2307 // doing multiple allocations - one per each dirty shard
2308 encoded_shards.reserve(shards.size());
2309
2310 auto p = shards.begin();
2311 auto prev_p = p;
2312 while (p != shards.end()) {
11fdf7f2 2313 ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
7c673cae
FG
2314 auto n = p;
2315 ++n;
2316 if (p->dirty) {
2317 uint32_t endoff;
2318 if (n == shards.end()) {
2319 endoff = OBJECT_MAX_SIZE;
2320 } else {
2321 endoff = n->shard_info->offset;
2322 }
2323 encoded_shards.emplace_back(dirty_shard_t(&(*p)));
2324 bufferlist& bl = encoded_shards.back().bl;
2325 if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
2326 bl, &p->extents)) {
2327 if (force) {
2328 derr << __func__ << " encode_some needs reshard" << dendl;
11fdf7f2 2329 ceph_assert(!force);
7c673cae
FG
2330 }
2331 }
2332 size_t len = bl.length();
2333
2334 dout(20) << __func__ << " shard 0x" << std::hex
2335 << p->shard_info->offset << std::dec << " is " << len
2336 << " bytes (was " << p->shard_info->bytes << ") from "
2337 << p->extents << " extents" << dendl;
2338
2339 if (!force) {
2340 if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
2341 // we are big; reshard ourselves
2342 request_reshard(p->shard_info->offset, endoff);
2343 }
2344 // avoid resharding the trailing shard, even if it is small
2345 else if (n != shards.end() &&
11fdf7f2
TL
2346 len < g_conf()->bluestore_extent_map_shard_min_size) {
2347 ceph_assert(endoff != OBJECT_MAX_SIZE);
31f18b77
FG
2348 if (p == shards.begin()) {
2349 // we are the first shard, combine with next shard
7c673cae 2350 request_reshard(p->shard_info->offset, endoff + 1);
7c673cae 2351 } else {
31f18b77
FG
2352 // combine either with the previous shard or the next,
2353 // whichever is smaller
7c673cae
FG
2354 if (prev_p->shard_info->bytes > n->shard_info->bytes) {
2355 request_reshard(p->shard_info->offset, endoff + 1);
2356 } else {
2357 request_reshard(prev_p->shard_info->offset, endoff);
2358 }
2359 }
2360 }
2361 }
2362 }
2363 prev_p = p;
2364 p = n;
2365 }
2366 if (needs_reshard()) {
2367 return;
2368 }
2369
2370 // schedule DB update for dirty shards
2371 string key;
2372 for (auto& it : encoded_shards) {
2373 it.shard->dirty = false;
2374 it.shard->shard_info->bytes = it.bl.length();
2375 generate_extent_shard_key_and_apply(
2376 onode->key,
2377 it.shard->shard_info->offset,
2378 &key,
2379 [&](const string& final_key) {
2380 t->set(PREFIX_OBJ, final_key, it.bl);
2381 }
2382 );
2383 }
2384 }
2385}
2386
31f18b77
FG
2387bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
2388{
2389 if (spanning_blob_map.empty())
2390 return 0;
2391 bid_t bid = spanning_blob_map.rbegin()->first + 1;
2392 // bid is valid and available.
2393 if (bid >= 0)
2394 return bid;
2395 // Find next unused bid;
2396 bid = rand() % (numeric_limits<bid_t>::max() + 1);
2397 const auto begin_bid = bid;
2398 do {
2399 if (!spanning_blob_map.count(bid))
2400 return bid;
2401 else {
2402 bid++;
2403 if (bid < 0) bid = 0;
2404 }
2405 } while (bid != begin_bid);
81eedcae
TL
2406 auto cct = onode->c->store->cct; // used by dout
2407 _dump_onode<0>(cct, *onode);
11fdf7f2 2408 ceph_abort_msg("no available blob id");
31f18b77
FG
2409}
2410
7c673cae
FG
2411void BlueStore::ExtentMap::reshard(
2412 KeyValueDB *db,
2413 KeyValueDB::Transaction t)
2414{
2415 auto cct = onode->c->store->cct; // used by dout
2416
2417 dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
2418 << needs_reshard_end << ")" << std::dec
2419 << " of " << onode->onode.extent_map_shards.size()
2420 << " shards on " << onode->oid << dendl;
2421 for (auto& p : spanning_blob_map) {
2422 dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
2423 << dendl;
2424 }
2425 // determine shard index range
2426 unsigned si_begin = 0, si_end = 0;
2427 if (!shards.empty()) {
2428 while (si_begin + 1 < shards.size() &&
2429 shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
2430 ++si_begin;
2431 }
2432 needs_reshard_begin = shards[si_begin].shard_info->offset;
2433 for (si_end = si_begin; si_end < shards.size(); ++si_end) {
2434 if (shards[si_end].shard_info->offset >= needs_reshard_end) {
2435 needs_reshard_end = shards[si_end].shard_info->offset;
2436 break;
2437 }
2438 }
2439 if (si_end == shards.size()) {
2440 needs_reshard_end = OBJECT_MAX_SIZE;
2441 }
2442 dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
2443 << " over 0x[" << std::hex << needs_reshard_begin << ","
2444 << needs_reshard_end << ")" << std::dec << dendl;
2445 }
2446
181888fb 2447 fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
7c673cae
FG
2448
2449 // we may need to fault in a larger interval later must have all
2450 // referring extents for spanning blobs loaded in order to have
2451 // accurate use_tracker values.
2452 uint32_t spanning_scan_begin = needs_reshard_begin;
2453 uint32_t spanning_scan_end = needs_reshard_end;
2454
2455 // remove old keys
2456 string key;
2457 for (unsigned i = si_begin; i < si_end; ++i) {
2458 generate_extent_shard_key_and_apply(
2459 onode->key, shards[i].shard_info->offset, &key,
2460 [&](const string& final_key) {
2461 t->rmkey(PREFIX_OBJ, final_key);
2462 }
2463 );
2464 }
2465
2466 // calculate average extent size
2467 unsigned bytes = 0;
2468 unsigned extents = 0;
2469 if (onode->onode.extent_map_shards.empty()) {
2470 bytes = inline_bl.length();
2471 extents = extent_map.size();
2472 } else {
2473 for (unsigned i = si_begin; i < si_end; ++i) {
2474 bytes += shards[i].shard_info->bytes;
2475 extents += shards[i].extents;
2476 }
2477 }
2478 unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
2479 unsigned slop = target *
2480 cct->_conf->bluestore_extent_map_shard_target_size_slop;
11fdf7f2 2481 unsigned extent_avg = bytes / std::max(1u, extents);
7c673cae
FG
2482 dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
2483 << ", slop " << slop << dendl;
2484
2485 // reshard
2486 unsigned estimate = 0;
31f18b77 2487 unsigned offset = needs_reshard_begin;
7c673cae
FG
2488 vector<bluestore_onode_t::shard_info> new_shard_info;
2489 unsigned max_blob_end = 0;
2490 Extent dummy(needs_reshard_begin);
2491 for (auto e = extent_map.lower_bound(dummy);
2492 e != extent_map.end();
2493 ++e) {
2494 if (e->logical_offset >= needs_reshard_end) {
2495 break;
2496 }
2497 dout(30) << " extent " << *e << dendl;
2498
2499 // disfavor shard boundaries that span a blob
2500 bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
2501 if (estimate &&
2502 estimate + extent_avg > target + (would_span ? slop : 0)) {
2503 // new shard
31f18b77 2504 if (offset == needs_reshard_begin) {
7c673cae
FG
2505 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2506 new_shard_info.back().offset = offset;
2507 dout(20) << __func__ << " new shard 0x" << std::hex << offset
31f18b77 2508 << std::dec << dendl;
7c673cae
FG
2509 }
2510 offset = e->logical_offset;
2511 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2512 new_shard_info.back().offset = offset;
2513 dout(20) << __func__ << " new shard 0x" << std::hex << offset
2514 << std::dec << dendl;
2515 estimate = 0;
2516 }
2517 estimate += extent_avg;
31f18b77
FG
2518 unsigned bs = e->blob_start();
2519 if (bs < spanning_scan_begin) {
2520 spanning_scan_begin = bs;
7c673cae
FG
2521 }
2522 uint32_t be = e->blob_end();
2523 if (be > max_blob_end) {
2524 max_blob_end = be;
2525 }
2526 if (be > spanning_scan_end) {
2527 spanning_scan_end = be;
2528 }
2529 }
2530 if (new_shard_info.empty() && (si_begin > 0 ||
2531 si_end < shards.size())) {
2532 // we resharded a partial range; we must produce at least one output
2533 // shard
2534 new_shard_info.emplace_back(bluestore_onode_t::shard_info());
2535 new_shard_info.back().offset = needs_reshard_begin;
2536 dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
2537 << std::dec << " (singleton degenerate case)" << dendl;
2538 }
2539
2540 auto& sv = onode->onode.extent_map_shards;
2541 dout(20) << __func__ << " new " << new_shard_info << dendl;
2542 dout(20) << __func__ << " old " << sv << dendl;
2543 if (sv.empty()) {
2544 // no old shards to keep
2545 sv.swap(new_shard_info);
2546 init_shards(true, true);
2547 } else {
2548 // splice in new shards
2549 sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
2550 shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
2551 sv.insert(
2552 sv.begin() + si_begin,
2553 new_shard_info.begin(),
2554 new_shard_info.end());
2555 shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
7c673cae 2556 si_end = si_begin + new_shard_info.size();
31f18b77 2557
11fdf7f2 2558 ceph_assert(sv.size() == shards.size());
31f18b77
FG
2559
2560 // note that we need to update every shard_info of shards here,
2561 // as sv might have been totally re-allocated above
2562 for (unsigned i = 0; i < shards.size(); i++) {
7c673cae 2563 shards[i].shard_info = &sv[i];
31f18b77
FG
2564 }
2565
2566 // mark newly added shards as dirty
2567 for (unsigned i = si_begin; i < si_end; ++i) {
7c673cae
FG
2568 shards[i].loaded = true;
2569 shards[i].dirty = true;
2570 }
7c673cae
FG
2571 }
2572 dout(20) << __func__ << " fin " << sv << dendl;
2573 inline_bl.clear();
2574
2575 if (sv.empty()) {
2576 // no more shards; unspan all previously spanning blobs
2577 auto p = spanning_blob_map.begin();
2578 while (p != spanning_blob_map.end()) {
2579 p->second->id = -1;
2580 dout(30) << __func__ << " un-spanning " << *p->second << dendl;
2581 p = spanning_blob_map.erase(p);
2582 }
2583 } else {
2584 // identify new spanning blobs
2585 dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
2586 << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
2587 if (spanning_scan_begin < needs_reshard_begin) {
2588 fault_range(db, spanning_scan_begin,
2589 needs_reshard_begin - spanning_scan_begin);
2590 }
2591 if (spanning_scan_end > needs_reshard_end) {
2592 fault_range(db, needs_reshard_end,
31f18b77 2593 spanning_scan_end - needs_reshard_end);
7c673cae
FG
2594 }
2595 auto sp = sv.begin() + si_begin;
2596 auto esp = sv.end();
2597 unsigned shard_start = sp->offset;
2598 unsigned shard_end;
2599 ++sp;
2600 if (sp == esp) {
2601 shard_end = OBJECT_MAX_SIZE;
2602 } else {
2603 shard_end = sp->offset;
2604 }
7c673cae 2605 Extent dummy(needs_reshard_begin);
9f95a23c
TL
2606
2607 bool was_too_many_blobs_check = false;
2608 auto too_many_blobs_threshold =
2609 g_conf()->bluestore_debug_too_many_blobs_threshold;
2610 auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
2611 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
2612 decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
2613
7c673cae
FG
2614 for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
2615 if (e->logical_offset >= needs_reshard_end) {
2616 break;
2617 }
2618 dout(30) << " extent " << *e << dendl;
2619 while (e->logical_offset >= shard_end) {
2620 shard_start = shard_end;
11fdf7f2 2621 ceph_assert(sp != esp);
7c673cae
FG
2622 ++sp;
2623 if (sp == esp) {
2624 shard_end = OBJECT_MAX_SIZE;
2625 } else {
2626 shard_end = sp->offset;
2627 }
2628 dout(30) << __func__ << " shard 0x" << std::hex << shard_start
2629 << " to 0x" << shard_end << std::dec << dendl;
2630 }
9f95a23c 2631
7c673cae
FG
2632 if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
2633 if (!e->blob->is_spanning()) {
2634 // We have two options: (1) split the blob into pieces at the
2635 // shard boundaries (and adjust extents accordingly), or (2)
2636 // mark it spanning. We prefer to cut the blob if we can. Note that
2637 // we may have to split it multiple times--potentially at every
2638 // shard boundary.
2639 bool must_span = false;
2640 BlobRef b = e->blob;
2641 if (b->can_split()) {
2642 uint32_t bstart = e->blob_start();
2643 uint32_t bend = e->blob_end();
2644 for (const auto& sh : shards) {
2645 if (bstart < sh.shard_info->offset &&
2646 bend > sh.shard_info->offset) {
2647 uint32_t blob_offset = sh.shard_info->offset - bstart;
2648 if (b->can_split_at(blob_offset)) {
2649 dout(20) << __func__ << " splitting blob, bstart 0x"
2650 << std::hex << bstart << " blob_offset 0x"
2651 << blob_offset << std::dec << " " << *b << dendl;
2652 b = split_blob(b, blob_offset, sh.shard_info->offset);
2653 // switch b to the new right-hand side, in case it
2654 // *also* has to get split.
2655 bstart += blob_offset;
2656 onode->c->store->logger->inc(l_bluestore_blob_split);
2657 } else {
2658 must_span = true;
2659 break;
2660 }
2661 }
2662 }
2663 } else {
2664 must_span = true;
2665 }
2666 if (must_span) {
31f18b77
FG
2667 auto bid = allocate_spanning_blob_id();
2668 b->id = bid;
7c673cae
FG
2669 spanning_blob_map[b->id] = b;
2670 dout(20) << __func__ << " adding spanning " << *b << dendl;
9f95a23c
TL
2671 if (!was_too_many_blobs_check &&
2672 too_many_blobs_threshold &&
2673 spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
2674
2675 was_too_many_blobs_check = true;
2676 for (size_t i = 0; i < dumped_onodes.size(); ++i) {
2677 if (dumped_onodes[i].first == onode->oid) {
2678 oid_slot = &dumped_onodes[i];
2679 break;
2680 }
2681 if (!oldest_slot || (oldest_slot &&
2682 dumped_onodes[i].second < oldest_slot->second)) {
2683 oldest_slot = &dumped_onodes[i];
2684 }
2685 }
2686 }
7c673cae
FG
2687 }
2688 }
2689 } else {
2690 if (e->blob->is_spanning()) {
2691 spanning_blob_map.erase(e->blob->id);
2692 e->blob->id = -1;
2693 dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
2694 }
2695 }
2696 }
9f95a23c
TL
2697 bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
2698 (oid_slot &&
2699 (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
2700 if (do_dump) {
2701 dout(0) << __func__
2702 << " spanning blob count exceeds threshold, "
2703 << spanning_blob_map.size() << " spanning blobs"
2704 << dendl;
2705 _dump_onode<0>(cct, *onode);
2706 if (oid_slot) {
2707 oid_slot->second = mono_clock::now();
2708 } else {
2709 ceph_assert(oldest_slot);
2710 oldest_slot->first = onode->oid;
2711 oldest_slot->second = mono_clock::now();
2712 }
2713 }
7c673cae
FG
2714 }
2715
2716 clear_needs_reshard();
2717}
2718
2719bool BlueStore::ExtentMap::encode_some(
2720 uint32_t offset,
2721 uint32_t length,
2722 bufferlist& bl,
2723 unsigned *pn)
2724{
7c673cae
FG
2725 Extent dummy(offset);
2726 auto start = extent_map.lower_bound(dummy);
2727 uint32_t end = offset + length;
2728
2729 __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
2730 // serialization only. Hence there is no specific
2731 // handling at ExtentMap level.
2732
2733 unsigned n = 0;
2734 size_t bound = 0;
7c673cae
FG
2735 bool must_reshard = false;
2736 for (auto p = start;
2737 p != extent_map.end() && p->logical_offset < end;
2738 ++p, ++n) {
11fdf7f2 2739 ceph_assert(p->logical_offset >= offset);
7c673cae
FG
2740 p->blob->last_encoded_id = -1;
2741 if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
2742 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2743 << std::dec << " hit new spanning blob " << *p << dendl;
2744 request_reshard(p->blob_start(), p->blob_end());
2745 must_reshard = true;
2746 }
31f18b77
FG
2747 if (!must_reshard) {
2748 denc_varint(0, bound); // blobid
2749 denc_varint(0, bound); // logical_offset
2750 denc_varint(0, bound); // len
2751 denc_varint(0, bound); // blob_offset
7c673cae 2752
31f18b77
FG
2753 p->blob->bound_encode(
2754 bound,
2755 struct_v,
2756 p->blob->shared_blob->get_sbid(),
2757 false);
2758 }
7c673cae
FG
2759 }
2760 if (must_reshard) {
2761 return true;
2762 }
2763
31f18b77
FG
2764 denc(struct_v, bound);
2765 denc_varint(0, bound); // number of extents
2766
7c673cae
FG
2767 {
2768 auto app = bl.get_contiguous_appender(bound);
2769 denc(struct_v, app);
2770 denc_varint(n, app);
2771 if (pn) {
2772 *pn = n;
2773 }
2774
2775 n = 0;
2776 uint64_t pos = 0;
2777 uint64_t prev_len = 0;
2778 for (auto p = start;
2779 p != extent_map.end() && p->logical_offset < end;
2780 ++p, ++n) {
2781 unsigned blobid;
2782 bool include_blob = false;
2783 if (p->blob->is_spanning()) {
2784 blobid = p->blob->id << BLOBID_SHIFT_BITS;
2785 blobid |= BLOBID_FLAG_SPANNING;
2786 } else if (p->blob->last_encoded_id < 0) {
2787 p->blob->last_encoded_id = n + 1; // so it is always non-zero
2788 include_blob = true;
2789 blobid = 0; // the decoder will infer the id from n
2790 } else {
2791 blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
2792 }
2793 if (p->logical_offset == pos) {
2794 blobid |= BLOBID_FLAG_CONTIGUOUS;
2795 }
2796 if (p->blob_offset == 0) {
2797 blobid |= BLOBID_FLAG_ZEROOFFSET;
2798 }
2799 if (p->length == prev_len) {
2800 blobid |= BLOBID_FLAG_SAMELENGTH;
2801 } else {
2802 prev_len = p->length;
2803 }
2804 denc_varint(blobid, app);
2805 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2806 denc_varint_lowz(p->logical_offset - pos, app);
2807 }
2808 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2809 denc_varint_lowz(p->blob_offset, app);
2810 }
2811 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2812 denc_varint_lowz(p->length, app);
2813 }
2814 pos = p->logical_end();
2815 if (include_blob) {
2816 p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
2817 }
2818 }
2819 }
2820 /*derr << __func__ << bl << dendl;
2821 derr << __func__ << ":";
2822 bl.hexdump(*_dout);
2823 *_dout << dendl;
2824 */
2825 return false;
2826}
2827
2828unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
2829{
7c673cae
FG
2830 /*
2831 derr << __func__ << ":";
2832 bl.hexdump(*_dout);
2833 *_dout << dendl;
2834 */
2835
11fdf7f2 2836 ceph_assert(bl.get_num_buffers() <= 1);
7c673cae
FG
2837 auto p = bl.front().begin_deep();
2838 __u8 struct_v;
2839 denc(struct_v, p);
2840 // Version 2 differs from v1 in blob's ref_map
2841 // serialization only. Hence there is no specific
2842 // handling at ExtentMap level below.
11fdf7f2 2843 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2844
2845 uint32_t num;
2846 denc_varint(num, p);
2847 vector<BlobRef> blobs(num);
2848 uint64_t pos = 0;
2849 uint64_t prev_len = 0;
2850 unsigned n = 0;
2851
2852 while (!p.end()) {
2853 Extent *le = new Extent();
2854 uint64_t blobid;
2855 denc_varint(blobid, p);
2856 if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
2857 uint64_t gap;
2858 denc_varint_lowz(gap, p);
2859 pos += gap;
2860 }
2861 le->logical_offset = pos;
2862 if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
2863 denc_varint_lowz(le->blob_offset, p);
2864 } else {
2865 le->blob_offset = 0;
2866 }
2867 if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
2868 denc_varint_lowz(prev_len, p);
2869 }
2870 le->length = prev_len;
2871
2872 if (blobid & BLOBID_FLAG_SPANNING) {
2873 dout(30) << __func__ << " getting spanning blob "
2874 << (blobid >> BLOBID_SHIFT_BITS) << dendl;
2875 le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
2876 } else {
2877 blobid >>= BLOBID_SHIFT_BITS;
2878 if (blobid) {
2879 le->assign_blob(blobs[blobid - 1]);
11fdf7f2 2880 ceph_assert(le->blob);
7c673cae
FG
2881 } else {
2882 Blob *b = new Blob();
2883 uint64_t sbid = 0;
2884 b->decode(onode->c, p, struct_v, &sbid, false);
2885 blobs[n] = b;
2886 onode->c->open_shared_blob(sbid, b);
2887 le->assign_blob(b);
2888 }
2889 // we build ref_map dynamically for non-spanning blobs
2890 le->blob->get_ref(
2891 onode->c,
2892 le->blob_offset,
2893 le->length);
2894 }
2895 pos += prev_len;
2896 ++n;
2897 extent_map.insert(*le);
2898 }
2899
11fdf7f2 2900 ceph_assert(n == num);
7c673cae
FG
2901 return num;
2902}
2903
2904void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
2905{
2906 // Version 2 differs from v1 in blob's ref_map
2907 // serialization only. Hence there is no specific
2908 // handling at ExtentMap level.
2909 __u8 struct_v = 2;
2910
2911 denc(struct_v, p);
2912 denc_varint((uint32_t)0, p);
2913 size_t key_size = 0;
2914 denc_varint((uint32_t)0, key_size);
2915 p += spanning_blob_map.size() * key_size;
2916 for (const auto& i : spanning_blob_map) {
2917 i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2918 }
2919}
2920
2921void BlueStore::ExtentMap::encode_spanning_blobs(
2922 bufferlist::contiguous_appender& p)
2923{
2924 // Version 2 differs from v1 in blob's ref_map
2925 // serialization only. Hence there is no specific
2926 // handling at ExtentMap level.
2927 __u8 struct_v = 2;
2928
2929 denc(struct_v, p);
2930 denc_varint(spanning_blob_map.size(), p);
2931 for (auto& i : spanning_blob_map) {
2932 denc_varint(i.second->id, p);
2933 i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
2934 }
2935}
2936
2937void BlueStore::ExtentMap::decode_spanning_blobs(
11fdf7f2 2938 bufferptr::const_iterator& p)
7c673cae
FG
2939{
2940 __u8 struct_v;
2941 denc(struct_v, p);
2942 // Version 2 differs from v1 in blob's ref_map
2943 // serialization only. Hence there is no specific
2944 // handling at ExtentMap level.
11fdf7f2 2945 ceph_assert(struct_v == 1 || struct_v == 2);
7c673cae
FG
2946
2947 unsigned n;
2948 denc_varint(n, p);
2949 while (n--) {
2950 BlobRef b(new Blob());
2951 denc_varint(b->id, p);
2952 spanning_blob_map[b->id] = b;
2953 uint64_t sbid = 0;
2954 b->decode(onode->c, p, struct_v, &sbid, true);
2955 onode->c->open_shared_blob(sbid, b);
2956 }
2957}
2958
2959void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
2960{
2961 shards.resize(onode->onode.extent_map_shards.size());
2962 unsigned i = 0;
2963 for (auto &s : onode->onode.extent_map_shards) {
2964 shards[i].shard_info = &s;
2965 shards[i].loaded = loaded;
2966 shards[i].dirty = dirty;
2967 ++i;
2968 }
2969}
2970
2971void BlueStore::ExtentMap::fault_range(
2972 KeyValueDB *db,
2973 uint32_t offset,
2974 uint32_t length)
2975{
7c673cae
FG
2976 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
2977 << std::dec << dendl;
2978 auto start = seek_shard(offset);
2979 auto last = seek_shard(offset + length);
2980
2981 if (start < 0)
2982 return;
2983
11fdf7f2 2984 ceph_assert(last >= start);
7c673cae
FG
2985 string key;
2986 while (start <= last) {
11fdf7f2 2987 ceph_assert((size_t)start < shards.size());
7c673cae
FG
2988 auto p = &shards[start];
2989 if (!p->loaded) {
2990 dout(30) << __func__ << " opening shard 0x" << std::hex
2991 << p->shard_info->offset << std::dec << dendl;
2992 bufferlist v;
2993 generate_extent_shard_key_and_apply(
2994 onode->key, p->shard_info->offset, &key,
2995 [&](const string& final_key) {
2996 int r = db->get(PREFIX_OBJ, final_key, &v);
2997 if (r < 0) {
2998 derr << __func__ << " missing shard 0x" << std::hex
2999 << p->shard_info->offset << std::dec << " for " << onode->oid
3000 << dendl;
11fdf7f2 3001 ceph_assert(r >= 0);
7c673cae
FG
3002 }
3003 }
3004 );
3005 p->extents = decode_some(v);
3006 p->loaded = true;
3007 dout(20) << __func__ << " open shard 0x" << std::hex
81eedcae
TL
3008 << p->shard_info->offset
3009 << " for range 0x" << offset << "~" << length << std::dec
7c673cae 3010 << " (" << v.length() << " bytes)" << dendl;
11fdf7f2
TL
3011 ceph_assert(p->dirty == false);
3012 ceph_assert(v.length() == p->shard_info->bytes);
7c673cae
FG
3013 onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
3014 } else {
3015 onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
3016 }
3017 ++start;
3018 }
3019}
3020
3021void BlueStore::ExtentMap::dirty_range(
7c673cae
FG
3022 uint32_t offset,
3023 uint32_t length)
3024{
7c673cae
FG
3025 dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
3026 << std::dec << dendl;
3027 if (shards.empty()) {
3028 dout(20) << __func__ << " mark inline shard dirty" << dendl;
3029 inline_bl.clear();
3030 return;
3031 }
3032 auto start = seek_shard(offset);
11fdf7f2
TL
3033 if (length == 0) {
3034 length = 1;
3035 }
3036 auto last = seek_shard(offset + length - 1);
7c673cae
FG
3037 if (start < 0)
3038 return;
3039
11fdf7f2 3040 ceph_assert(last >= start);
7c673cae 3041 while (start <= last) {
11fdf7f2 3042 ceph_assert((size_t)start < shards.size());
7c673cae
FG
3043 auto p = &shards[start];
3044 if (!p->loaded) {
11fdf7f2
TL
3045 derr << __func__ << "on write 0x" << std::hex << offset
3046 << "~" << length << " shard 0x" << p->shard_info->offset
3047 << std::dec << " is not loaded, can't mark dirty" << dendl;
3048 ceph_abort_msg("can't mark unloaded shard dirty");
7c673cae
FG
3049 }
3050 if (!p->dirty) {
3051 dout(20) << __func__ << " mark shard 0x" << std::hex
3052 << p->shard_info->offset << std::dec << " dirty" << dendl;
3053 p->dirty = true;
3054 }
3055 ++start;
3056 }
3057}
3058
3059BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
3060 uint64_t offset)
3061{
3062 Extent dummy(offset);
3063 return extent_map.find(dummy);
3064}
3065
7c673cae
FG
3066BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
3067 uint64_t offset)
3068{
3069 Extent dummy(offset);
3070 auto fp = extent_map.lower_bound(dummy);
3071 if (fp != extent_map.begin()) {
3072 --fp;
3073 if (fp->logical_end() <= offset) {
3074 ++fp;
3075 }
3076 }
3077 return fp;
3078}
3079
3080BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
3081 uint64_t offset) const
3082{
3083 Extent dummy(offset);
3084 auto fp = extent_map.lower_bound(dummy);
3085 if (fp != extent_map.begin()) {
3086 --fp;
3087 if (fp->logical_end() <= offset) {
3088 ++fp;
3089 }
3090 }
3091 return fp;
3092}
3093
3094bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
3095{
3096 auto fp = seek_lextent(offset);
3097 if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
3098 return false;
3099 }
3100 return true;
3101}
3102
3103int BlueStore::ExtentMap::compress_extent_map(
3104 uint64_t offset,
3105 uint64_t length)
3106{
7c673cae
FG
3107 if (extent_map.empty())
3108 return 0;
3109 int removed = 0;
3110 auto p = seek_lextent(offset);
3111 if (p != extent_map.begin()) {
3112 --p; // start to the left of offset
3113 }
3114 // the caller should have just written to this region
11fdf7f2 3115 ceph_assert(p != extent_map.end());
7c673cae
FG
3116
3117 // identify the *next* shard
3118 auto pshard = shards.begin();
3119 while (pshard != shards.end() &&
3120 p->logical_offset >= pshard->shard_info->offset) {
3121 ++pshard;
3122 }
3123 uint64_t shard_end;
3124 if (pshard != shards.end()) {
3125 shard_end = pshard->shard_info->offset;
3126 } else {
3127 shard_end = OBJECT_MAX_SIZE;
3128 }
3129
3130 auto n = p;
3131 for (++n; n != extent_map.end(); p = n++) {
3132 if (n->logical_offset > offset + length) {
3133 break; // stop after end
3134 }
3135 while (n != extent_map.end() &&
3136 p->logical_end() == n->logical_offset &&
3137 p->blob == n->blob &&
3138 p->blob_offset + p->length == n->blob_offset &&
3139 n->logical_offset < shard_end) {
3140 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3141 << " next shard 0x" << shard_end << std::dec
3142 << " merging " << *p << " and " << *n << dendl;
3143 p->length += n->length;
3144 rm(n++);
3145 ++removed;
3146 }
3147 if (n == extent_map.end()) {
3148 break;
3149 }
3150 if (n->logical_offset >= shard_end) {
11fdf7f2 3151 ceph_assert(pshard != shards.end());
7c673cae
FG
3152 ++pshard;
3153 if (pshard != shards.end()) {
3154 shard_end = pshard->shard_info->offset;
3155 } else {
3156 shard_end = OBJECT_MAX_SIZE;
3157 }
3158 }
3159 }
11fdf7f2 3160 if (removed) {
7c673cae
FG
3161 onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
3162 }
3163 return removed;
3164}
3165
3166void BlueStore::ExtentMap::punch_hole(
3167 CollectionRef &c,
3168 uint64_t offset,
3169 uint64_t length,
3170 old_extent_map_t *old_extents)
3171{
3172 auto p = seek_lextent(offset);
3173 uint64_t end = offset + length;
3174 while (p != extent_map.end()) {
3175 if (p->logical_offset >= end) {
3176 break;
3177 }
3178 if (p->logical_offset < offset) {
3179 if (p->logical_end() > end) {
3180 // split and deref middle
3181 uint64_t front = offset - p->logical_offset;
3182 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
3183 length, p->blob);
3184 old_extents->push_back(*oe);
3185 add(end,
3186 p->blob_offset + front + length,
3187 p->length - front - length,
3188 p->blob);
3189 p->length = front;
3190 break;
3191 } else {
3192 // deref tail
11fdf7f2 3193 ceph_assert(p->logical_end() > offset); // else seek_lextent bug
7c673cae
FG
3194 uint64_t keep = offset - p->logical_offset;
3195 OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
3196 p->length - keep, p->blob);
3197 old_extents->push_back(*oe);
3198 p->length = keep;
3199 ++p;
3200 continue;
3201 }
3202 }
3203 if (p->logical_offset + p->length <= end) {
3204 // deref whole lextent
3205 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3206 p->length, p->blob);
3207 old_extents->push_back(*oe);
3208 rm(p++);
3209 continue;
3210 }
3211 // deref head
3212 uint64_t keep = p->logical_end() - end;
3213 OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
3214 p->length - keep, p->blob);
3215 old_extents->push_back(*oe);
3216
3217 add(end, p->blob_offset + p->length - keep, keep, p->blob);
3218 rm(p);
3219 break;
3220 }
3221}
3222
3223BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
3224 CollectionRef &c,
3225 uint64_t logical_offset,
3226 uint64_t blob_offset, uint64_t length, BlobRef b,
3227 old_extent_map_t *old_extents)
3228{
3229 // We need to have completely initialized Blob to increment its ref counters.
11fdf7f2 3230 ceph_assert(b->get_blob().get_logical_length() != 0);
7c673cae
FG
3231
3232 // Do get_ref prior to punch_hole to prevent from putting reused blob into
3233 // old_extents list if we overwre the blob totally
3234 // This might happen during WAL overwrite.
3235 b->get_ref(onode->c, blob_offset, length);
3236
3237 if (old_extents) {
3238 punch_hole(c, logical_offset, length, old_extents);
3239 }
3240
3241 Extent *le = new Extent(logical_offset, blob_offset, length, b);
3242 extent_map.insert(*le);
3243 if (spans_shard(logical_offset, length)) {
3244 request_reshard(logical_offset, logical_offset + length);
3245 }
3246 return le;
3247}
3248
3249BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
3250 BlobRef lb,
3251 uint32_t blob_offset,
3252 uint32_t pos)
3253{
7c673cae
FG
3254 uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
3255 dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
3256 << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
3257 << dendl;
3258 BlobRef rb = onode->c->new_blob();
3259 lb->split(onode->c, blob_offset, rb.get());
3260
3261 for (auto ep = seek_lextent(pos);
3262 ep != extent_map.end() && ep->logical_offset < end_pos;
3263 ++ep) {
3264 if (ep->blob != lb) {
3265 continue;
3266 }
3267 if (ep->logical_offset < pos) {
3268 // split extent
3269 size_t left = pos - ep->logical_offset;
3270 Extent *ne = new Extent(pos, 0, ep->length - left, rb);
3271 extent_map.insert(*ne);
3272 ep->length = left;
3273 dout(30) << __func__ << " split " << *ep << dendl;
3274 dout(30) << __func__ << " to " << *ne << dendl;
3275 } else {
3276 // switch blob
11fdf7f2 3277 ceph_assert(ep->blob_offset >= blob_offset);
7c673cae
FG
3278
3279 ep->blob = rb;
3280 ep->blob_offset -= blob_offset;
3281 dout(30) << __func__ << " adjusted " << *ep << dendl;
3282 }
3283 }
3284 return rb;
3285}
3286
3287// Onode
3288
3289#undef dout_prefix
3290#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3291
eafe8130
TL
3292BlueStore::Onode* BlueStore::Onode::decode(
3293 CollectionRef c,
3294 const ghobject_t& oid,
3295 const string& key,
3296 const bufferlist& v)
3297{
3298 Onode* on = new Onode(c.get(), oid, key);
3299 on->exists = true;
3300 auto p = v.front().begin_deep();
3301 on->onode.decode(p);
3302 for (auto& i : on->onode.attrs) {
3303 i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
3304 }
3305
3306 // initialize extent_map
3307 on->extent_map.decode_spanning_blobs(p);
3308 if (on->onode.extent_map_shards.empty()) {
3309 denc(on->extent_map.inline_bl, p);
3310 on->extent_map.decode_some(on->extent_map.inline_bl);
3311 on->extent_map.inline_bl.reassign_to_mempool(
3312 mempool::mempool_bluestore_cache_other);
3313 }
3314 else {
3315 on->extent_map.init_shards(false, false);
3316 }
3317 return on;
3318}
3319
7c673cae
FG
3320void BlueStore::Onode::flush()
3321{
3322 if (flushing_count.load()) {
3323 ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
9f95a23c 3324 waiting_count++;
11fdf7f2 3325 std::unique_lock l(flush_lock);
7c673cae
FG
3326 while (flushing_count.load()) {
3327 flush_cond.wait(l);
3328 }
9f95a23c 3329 waiting_count--;
7c673cae
FG
3330 }
3331 ldout(c->store->cct, 20) << __func__ << " done" << dendl;
3332}
3333
9f95a23c
TL
3334void BlueStore::Onode::dump(Formatter* f) const
3335{
3336 onode.dump(f);
3337 extent_map.dump(f);
3338}
3339
3340
3341const string& BlueStore::Onode::get_omap_prefix()
3342{
3343 if (onode.is_pgmeta_omap()) {
3344 return PREFIX_PGMETA_OMAP;
3345 }
3346 if (onode.is_perpool_omap()) {
3347 return PREFIX_PERPOOL_OMAP;
3348 }
3349 return PREFIX_OMAP;
3350}
3351
3352// '-' < '.' < '~'
3353
3354void BlueStore::Onode::get_omap_header(string *out)
3355{
3356 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3357 _key_encode_u64(c->pool(), out);
3358 }
3359 _key_encode_u64(onode.nid, out);
3360 out->push_back('-');
3361}
3362
3363void BlueStore::Onode::get_omap_key(const string& key, string *out)
3364{
3365 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3366 _key_encode_u64(c->pool(), out);
3367 }
3368 _key_encode_u64(onode.nid, out);
3369 out->push_back('.');
3370 out->append(key);
3371}
3372
3373void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
3374{
3375 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3376 _key_encode_u64(c->pool(), out);
3377 }
3378 _key_encode_u64(onode.nid, out);
3379 out->append(old.c_str() + out->length(), old.size() - out->length());
3380}
3381
3382void BlueStore::Onode::get_omap_tail(string *out)
3383{
3384 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3385 _key_encode_u64(c->pool(), out);
3386 }
3387 _key_encode_u64(onode.nid, out);
3388 out->push_back('~');
3389}
3390
3391void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
3392{
3393 if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
3394 *user_key = key.substr(sizeof(uint64_t)*2 + 1);
3395 } else {
3396 *user_key = key.substr(sizeof(uint64_t) + 1);
3397 }
3398}
3399
3400
7c673cae
FG
3401// =======================================================
3402// WriteContext
3403
3404/// Checks for writes to the same pextent within a blob
3405bool BlueStore::WriteContext::has_conflict(
3406 BlobRef b,
3407 uint64_t loffs,
3408 uint64_t loffs_end,
3409 uint64_t min_alloc_size)
3410{
11fdf7f2
TL
3411 ceph_assert((loffs % min_alloc_size) == 0);
3412 ceph_assert((loffs_end % min_alloc_size) == 0);
7c673cae
FG
3413 for (auto w : writes) {
3414 if (b == w.b) {
11fdf7f2
TL
3415 auto loffs2 = p2align(w.logical_offset, min_alloc_size);
3416 auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
7c673cae 3417 if ((loffs <= loffs2 && loffs_end > loffs2) ||
224ce89b 3418 (loffs >= loffs2 && loffs < loffs2_end)) {
7c673cae
FG
3419 return true;
3420 }
3421 }
3422 }
3423 return false;
3424}
3425
3426// =======================================================
3427
3428// DeferredBatch
3429#undef dout_prefix
3430#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
9f95a23c
TL
3431#undef dout_context
3432#define dout_context cct
7c673cae
FG
3433
3434void BlueStore::DeferredBatch::prepare_write(
3435 CephContext *cct,
3436 uint64_t seq, uint64_t offset, uint64_t length,
3437 bufferlist::const_iterator& blp)
3438{
3439 _discard(cct, offset, length);
3440 auto i = iomap.insert(make_pair(offset, deferred_io()));
11fdf7f2 3441 ceph_assert(i.second); // this should be a new insertion
7c673cae
FG
3442 i.first->second.seq = seq;
3443 blp.copy(length, i.first->second.bl);
31f18b77
FG
3444 i.first->second.bl.reassign_to_mempool(
3445 mempool::mempool_bluestore_writing_deferred);
7c673cae
FG
3446 dout(20) << __func__ << " seq " << seq
3447 << " 0x" << std::hex << offset << "~" << length
3448 << " crc " << i.first->second.bl.crc32c(-1)
3449 << std::dec << dendl;
3450 seq_bytes[seq] += length;
3451#ifdef DEBUG_DEFERRED
3452 _audit(cct);
3453#endif
3454}
3455
3456void BlueStore::DeferredBatch::_discard(
3457 CephContext *cct, uint64_t offset, uint64_t length)
3458{
3459 generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
3460 << std::dec << dendl;
3461 auto p = iomap.lower_bound(offset);
3462 if (p != iomap.begin()) {
3463 --p;
3464 auto end = p->first + p->second.bl.length();
3465 if (end > offset) {
3466 bufferlist head;
3467 head.substr_of(p->second.bl, 0, offset - p->first);
3468 dout(20) << __func__ << " keep head " << p->second.seq
3469 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3470 << " -> 0x" << head.length() << std::dec << dendl;
3471 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3472 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3473 if (end > offset + length) {
3474 bufferlist tail;
3475 tail.substr_of(p->second.bl, offset + length - p->first,
3476 end - (offset + length));
3477 dout(20) << __func__ << " keep tail " << p->second.seq
3478 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3479 << " -> 0x" << tail.length() << std::dec << dendl;
3480 auto &n = iomap[offset + length];
3481 n.bl.swap(tail);
3482 n.seq = p->second.seq;
3483 i->second -= length;
3484 } else {
3485 i->second -= end - offset;
3486 }
11fdf7f2 3487 ceph_assert(i->second >= 0);
7c673cae
FG
3488 p->second.bl.swap(head);
3489 }
3490 ++p;
3491 }
3492 while (p != iomap.end()) {
3493 if (p->first >= offset + length) {
3494 break;
3495 }
3496 auto i = seq_bytes.find(p->second.seq);
11fdf7f2 3497 ceph_assert(i != seq_bytes.end());
7c673cae
FG
3498 auto end = p->first + p->second.bl.length();
3499 if (end > offset + length) {
3500 unsigned drop_front = offset + length - p->first;
3501 unsigned keep_tail = end - (offset + length);
3502 dout(20) << __func__ << " truncate front " << p->second.seq
3503 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3504 << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
3505 << " to 0x" << (offset + length) << "~" << keep_tail
3506 << std::dec << dendl;
3507 auto &s = iomap[offset + length];
3508 s.seq = p->second.seq;
3509 s.bl.substr_of(p->second.bl, drop_front, keep_tail);
3510 i->second -= drop_front;
3511 } else {
3512 dout(20) << __func__ << " drop " << p->second.seq
3513 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
3514 << std::dec << dendl;
3515 i->second -= p->second.bl.length();
3516 }
11fdf7f2 3517 ceph_assert(i->second >= 0);
7c673cae
FG
3518 p = iomap.erase(p);
3519 }
3520}
3521
3522void BlueStore::DeferredBatch::_audit(CephContext *cct)
3523{
3524 map<uint64_t,int> sb;
3525 for (auto p : seq_bytes) {
3526 sb[p.first] = 0; // make sure we have the same set of keys
3527 }
3528 uint64_t pos = 0;
3529 for (auto& p : iomap) {
11fdf7f2 3530 ceph_assert(p.first >= pos);
7c673cae
FG
3531 sb[p.second.seq] += p.second.bl.length();
3532 pos = p.first + p.second.bl.length();
3533 }
11fdf7f2 3534 ceph_assert(sb == seq_bytes);
7c673cae
FG
3535}
3536
3537
3538// Collection
3539
3540#undef dout_prefix
3541#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
3542
9f95a23c
TL
3543BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
3544 : CollectionImpl(store_->cct, cid),
11fdf7f2 3545 store(store_),
9f95a23c 3546 cache(bc),
7c673cae 3547 exists(true),
9f95a23c 3548 onode_map(oc),
11fdf7f2
TL
3549 commit_queue(nullptr)
3550{
3551}
3552
3553bool BlueStore::Collection::flush_commit(Context *c)
3554{
3555 return osr->flush_commit(c);
3556}
3557
3558void BlueStore::Collection::flush()
3559{
3560 osr->flush();
3561}
3562
3563void BlueStore::Collection::flush_all_but_last()
7c673cae 3564{
11fdf7f2 3565 osr->flush_all_but_last();
7c673cae
FG
3566}
3567
3568void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
3569{
11fdf7f2 3570 ceph_assert(!b->shared_blob);
7c673cae
FG
3571 const bluestore_blob_t& blob = b->get_blob();
3572 if (!blob.is_shared()) {
3573 b->shared_blob = new SharedBlob(this);
3574 return;
3575 }
3576
3577 b->shared_blob = shared_blob_set.lookup(sbid);
3578 if (b->shared_blob) {
3579 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3580 << std::dec << " had " << *b->shared_blob << dendl;
3581 } else {
3582 b->shared_blob = new SharedBlob(sbid, this);
3583 shared_blob_set.add(this, b->shared_blob.get());
3584 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3585 << std::dec << " opened " << *b->shared_blob
3586 << dendl;
3587 }
3588}
3589
3590void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
3591{
3592 if (!sb->is_loaded()) {
3593
3594 bufferlist v;
3595 string key;
3596 auto sbid = sb->get_sbid();
3597 get_shared_blob_key(sbid, &key);
3598 int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
3599 if (r < 0) {
3600 lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
3601 << std::dec << " not found at key "
3602 << pretty_binary_string(key) << dendl;
11fdf7f2 3603 ceph_abort_msg("uh oh, missing shared_blob");
7c673cae
FG
3604 }
3605
3606 sb->loaded = true;
3607 sb->persistent = new bluestore_shared_blob_t(sbid);
11fdf7f2
TL
3608 auto p = v.cbegin();
3609 decode(*(sb->persistent), p);
7c673cae
FG
3610 ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
3611 << std::dec << " loaded shared_blob " << *sb << dendl;
3612 }
3613}
3614
3615void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
3616{
7c673cae 3617 ldout(store->cct, 10) << __func__ << " " << *b << dendl;
11fdf7f2 3618 ceph_assert(!b->shared_blob->is_loaded());
7c673cae
FG
3619
3620 // update blob
31f18b77 3621 bluestore_blob_t& blob = b->dirty_blob();
7c673cae 3622 blob.set_flag(bluestore_blob_t::FLAG_SHARED);
7c673cae
FG
3623
3624 // update shared blob
3625 b->shared_blob->loaded = true;
3626 b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
3627 shared_blob_set.add(this, b->shared_blob.get());
3628 for (auto p : blob.get_extents()) {
3629 if (p.is_valid()) {
3630 b->shared_blob->get_ref(
3631 p.offset,
3632 p.length);
3633 }
3634 }
3635 ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
3636}
3637
31f18b77
FG
3638uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
3639{
3640 ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
11fdf7f2 3641 ceph_assert(sb->is_loaded());
31f18b77
FG
3642
3643 uint64_t sbid = sb->get_sbid();
3644 shared_blob_set.remove(sb);
3645 sb->loaded = false;
3646 delete sb->persistent;
3647 sb->sbid_unloaded = 0;
3648 ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
3649 return sbid;
3650}
3651
7c673cae
FG
3652BlueStore::OnodeRef BlueStore::Collection::get_onode(
3653 const ghobject_t& oid,
9f95a23c
TL
3654 bool create,
3655 bool is_createop)
7c673cae 3656{
9f95a23c 3657 ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
7c673cae
FG
3658
3659 spg_t pgid;
3660 if (cid.is_pg(&pgid)) {
3661 if (!oid.match(cnode.bits, pgid.ps())) {
3662 lderr(store->cct) << __func__ << " oid " << oid << " not part of "
3663 << pgid << " bits " << cnode.bits << dendl;
3664 ceph_abort();
3665 }
3666 }
3667
3668 OnodeRef o = onode_map.lookup(oid);
3669 if (o)
3670 return o;
3671
eafe8130 3672 string key;
7c673cae
FG
3673 get_object_key(store->cct, oid, &key);
3674
3675 ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
3676 << pretty_binary_string(key) << dendl;
3677
3678 bufferlist v;
9f95a23c 3679 int r = -ENOENT;
7c673cae 3680 Onode *on;
9f95a23c
TL
3681 if (!is_createop) {
3682 r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
3683 ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
3684 }
7c673cae 3685 if (v.length() == 0) {
11fdf7f2 3686 ceph_assert(r == -ENOENT);
7c673cae
FG
3687 if (!store->cct->_conf->bluestore_debug_misc &&
3688 !create)
3689 return OnodeRef();
3690
3691 // new object, new onode
3692 on = new Onode(this, oid, key);
3693 } else {
3694 // loaded
11fdf7f2 3695 ceph_assert(r >= 0);
eafe8130 3696 on = Onode::decode(this, oid, key, v);
7c673cae
FG
3697 }
3698 o.reset(on);
3699 return onode_map.add(oid, o);
3700}
3701
3702void BlueStore::Collection::split_cache(
3703 Collection *dest)
3704{
3705 ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
3706
3707 // lock (one or both) cache shards
3708 std::lock(cache->lock, dest->cache->lock);
11fdf7f2
TL
3709 std::lock_guard l(cache->lock, std::adopt_lock);
3710 std::lock_guard l2(dest->cache->lock, std::adopt_lock);
7c673cae
FG
3711
3712 int destbits = dest->cnode.bits;
3713 spg_t destpg;
3714 bool is_pg = dest->cid.is_pg(&destpg);
11fdf7f2 3715 ceph_assert(is_pg);
7c673cae
FG
3716
3717 auto p = onode_map.onode_map.begin();
3718 while (p != onode_map.onode_map.end()) {
11fdf7f2 3719 OnodeRef o = p->second;
7c673cae
FG
3720 if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
3721 // onode does not belong to this child
11fdf7f2
TL
3722 ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
3723 << dendl;
7c673cae
FG
3724 ++p;
3725 } else {
7c673cae
FG
3726 ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
3727 << dendl;
3728
9f95a23c
TL
3729 // move the onode to the new map before futzing with the cache
3730 // shard, ensuring that nref is always >= 2, and no racing
3731 // thread can trigger a pin or unpin (which does *not* behave
3732 // well when we are clearing and resetting the 's' shard
3733 // pointer!).
7c673cae 3734 p = onode_map.onode_map.erase(p);
7c673cae 3735 dest->onode_map.onode_map[o->oid] = o;
9f95a23c
TL
3736
3737 if (onode_map.cache != dest->onode_map.cache) {
3738 // move onode to a different cache shard
3739 onode_map.cache->_rm(o);
3740 o->c = dest;
3741 dest->onode_map.cache->_add(o, 1);
3742 } else {
3743 // the onode is in the same cache shard, making our move simpler.
3744 o->c = dest;
3745 }
7c673cae
FG
3746
3747 // move over shared blobs and buffers. cover shared blobs from
3748 // both extent map and spanning blob map (the full extent map
3749 // may not be faulted in)
3750 vector<SharedBlob*> sbvec;
3751 for (auto& e : o->extent_map.extent_map) {
3752 sbvec.push_back(e.blob->shared_blob.get());
3753 }
3754 for (auto& b : o->extent_map.spanning_blob_map) {
3755 sbvec.push_back(b.second->shared_blob.get());
3756 }
3757 for (auto sb : sbvec) {
3758 if (sb->coll == dest) {
3759 ldout(store->cct, 20) << __func__ << " already moved " << *sb
3760 << dendl;
3761 continue;
3762 }
3763 ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
31f18b77
FG
3764 if (sb->get_sbid()) {
3765 ldout(store->cct, 20) << __func__
3766 << " moving registration " << *sb << dendl;
3767 shared_blob_set.remove(sb);
3768 dest->shared_blob_set.add(dest, sb);
3769 }
3efd9988 3770 sb->coll = dest;
7c673cae 3771 if (dest->cache != cache) {
7c673cae
FG
3772 for (auto& i : sb->bc.buffer_map) {
3773 if (!i.second->is_writing()) {
3774 ldout(store->cct, 20) << __func__ << " moving " << *i.second
3775 << dendl;
9f95a23c 3776 dest->cache->_move(cache, i.second.get());
7c673cae
FG
3777 }
3778 }
3779 }
3780 }
7c673cae
FG
3781 }
3782 }
9f95a23c 3783 dest->cache->_trim();
7c673cae
FG
3784}
3785
7c673cae
FG
3786// =======================================================
3787
91327a77
AA
3788// MempoolThread
3789
3790#undef dout_prefix
3791#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
9f95a23c
TL
3792#undef dout_context
3793#define dout_context store->cct
91327a77 3794
7c673cae
FG
3795void *BlueStore::MempoolThread::entry()
3796{
9f95a23c 3797 std::unique_lock l{lock};
11fdf7f2 3798
92f5a8d4 3799 uint32_t prev_config_change = store->config_changed.load();
eafe8130
TL
3800 uint64_t base = store->osd_memory_base;
3801 double fragmentation = store->osd_memory_expected_fragmentation;
3802 uint64_t target = store->osd_memory_target;
3803 uint64_t min = store->osd_memory_cache_min;
3804 uint64_t max = min;
3805
3806 // When setting the maximum amount of memory to use for cache, first
3807 // assume some base amount of memory for the OSD and then fudge in
3808 // some overhead for fragmentation that scales with cache usage.
3809 uint64_t ltarget = (1.0 - fragmentation) * target;
3810 if (ltarget > base + min) {
3811 max = ltarget - base;
11fdf7f2 3812 }
31f18b77 3813
eafe8130
TL
3814 binned_kv_cache = store->db->get_priority_cache();
3815 if (store->cache_autotune && binned_kv_cache != nullptr) {
3816 pcm = std::make_shared<PriorityCache::Manager>(
3817 store->cct, min, max, target, true);
3818 pcm->insert("kv", binned_kv_cache, true);
3819 pcm->insert("meta", meta_cache, true);
3820 pcm->insert("data", data_cache, true);
3821 }
91327a77
AA
3822
3823 utime_t next_balance = ceph_clock_now();
3824 utime_t next_resize = ceph_clock_now();
9f95a23c
TL
3825 utime_t next_deferred_force_submit = ceph_clock_now();
3826 utime_t alloc_stats_dump_clock = ceph_clock_now();
31f18b77 3827
91327a77 3828 bool interval_stats_trim = false;
91327a77 3829 while (!stop) {
92f5a8d4
TL
3830 // Update pcm cache settings if related configuration was changed
3831 uint32_t cur_config_change = store->config_changed.load();
3832 if (cur_config_change != prev_config_change) {
3833 _update_cache_settings();
3834 prev_config_change = cur_config_change;
3835 }
3836
91327a77
AA
3837 // Before we trim, check and see if it's time to rebalance/resize.
3838 double autotune_interval = store->cache_autotune_interval;
3839 double resize_interval = store->osd_memory_cache_resize_interval;
9f95a23c
TL
3840 double max_defer_interval = store->max_defer_interval;
3841
3842 double alloc_stats_dump_interval =
3843 store->cct->_conf->bluestore_alloc_stats_dump_interval;
91327a77 3844
9f95a23c
TL
3845 if (alloc_stats_dump_interval > 0 &&
3846 alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
3847 store->_record_allocation_stats();
3848 alloc_stats_dump_clock = ceph_clock_now();
3849 }
91327a77 3850 if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
11fdf7f2
TL
3851 _adjust_cache_settings();
3852
91327a77 3853 // Log events at 5 instead of 20 when balance happens.
91327a77 3854 interval_stats_trim = true;
eafe8130
TL
3855
3856 if (pcm != nullptr) {
3857 pcm->balance();
91327a77 3858 }
31f18b77 3859
91327a77
AA
3860 next_balance = ceph_clock_now();
3861 next_balance += autotune_interval;
3862 }
3863 if (resize_interval > 0 && next_resize < ceph_clock_now()) {
eafe8130
TL
3864 if (ceph_using_tcmalloc() && pcm != nullptr) {
3865 pcm->tune_memory();
91327a77
AA
3866 }
3867 next_resize = ceph_clock_now();
3868 next_resize += resize_interval;
31f18b77
FG
3869 }
3870
9f95a23c
TL
3871 if (max_defer_interval > 0 &&
3872 next_deferred_force_submit < ceph_clock_now()) {
3873 if (store->get_deferred_last_submitted() + max_defer_interval <
3874 ceph_clock_now()) {
3875 store->deferred_try_submit();
3876 }
3877 next_deferred_force_submit = ceph_clock_now();
3878 next_deferred_force_submit += max_defer_interval/3;
3879 }
3880
3881 // Now Resize the shards
3882 _resize_shards(interval_stats_trim);
91327a77 3883 interval_stats_trim = false;
31f18b77 3884
91327a77 3885 store->_update_cache_logger();
11fdf7f2
TL
3886 auto wait = ceph::make_timespan(
3887 store->cct->_conf->bluestore_cache_trim_interval);
3888 cond.wait_for(l, wait);
7c673cae 3889 }
9f95a23c
TL
3890 // do final dump
3891 store->_record_allocation_stats();
7c673cae
FG
3892 stop = false;
3893 return NULL;
3894}
3895
91327a77
AA
3896void BlueStore::MempoolThread::_adjust_cache_settings()
3897{
11fdf7f2
TL
3898 if (binned_kv_cache != nullptr) {
3899 binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
3900 }
3901 meta_cache->set_cache_ratio(store->cache_meta_ratio);
3902 data_cache->set_cache_ratio(store->cache_data_ratio);
91327a77
AA
3903}
3904
9f95a23c 3905void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
91327a77 3906{
9f95a23c
TL
3907 size_t onode_shards = store->onode_cache_shards.size();
3908 size_t buffer_shards = store->buffer_cache_shards.size();
91327a77 3909 int64_t kv_used = store->db->get_cache_usage();
11fdf7f2
TL
3910 int64_t meta_used = meta_cache->_get_used_bytes();
3911 int64_t data_used = data_cache->_get_used_bytes();
91327a77
AA
3912
3913 uint64_t cache_size = store->cache_size;
3914 int64_t kv_alloc =
11fdf7f2 3915 static_cast<int64_t>(store->cache_kv_ratio * cache_size);
91327a77 3916 int64_t meta_alloc =
11fdf7f2 3917 static_cast<int64_t>(store->cache_meta_ratio * cache_size);
91327a77 3918 int64_t data_alloc =
11fdf7f2 3919 static_cast<int64_t>(store->cache_data_ratio * cache_size);
91327a77 3920
eafe8130
TL
3921 if (pcm != nullptr && binned_kv_cache != nullptr) {
3922 cache_size = pcm->get_tuned_mem();
11fdf7f2
TL
3923 kv_alloc = binned_kv_cache->get_committed_size();
3924 meta_alloc = meta_cache->get_committed_size();
3925 data_alloc = data_cache->get_committed_size();
91327a77
AA
3926 }
3927
3928 if (interval_stats) {
9f95a23c 3929 dout(5) << __func__ << " cache_size: " << cache_size
91327a77
AA
3930 << " kv_alloc: " << kv_alloc
3931 << " kv_used: " << kv_used
3932 << " meta_alloc: " << meta_alloc
3933 << " meta_used: " << meta_used
3934 << " data_alloc: " << data_alloc
3935 << " data_used: " << data_used << dendl;
3936 } else {
9f95a23c 3937 dout(20) << __func__ << " cache_size: " << cache_size
91327a77
AA
3938 << " kv_alloc: " << kv_alloc
3939 << " kv_used: " << kv_used
3940 << " meta_alloc: " << meta_alloc
3941 << " meta_used: " << meta_used
3942 << " data_alloc: " << data_alloc
3943 << " data_used: " << data_used << dendl;
3944 }
3945
3946 uint64_t max_shard_onodes = static_cast<uint64_t>(
9f95a23c
TL
3947 (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
3948 uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
91327a77 3949
9f95a23c 3950 dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
91327a77
AA
3951 << " max_shard_buffer: " << max_shard_buffer << dendl;
3952
9f95a23c
TL
3953 for (auto i : store->onode_cache_shards) {
3954 i->set_max(max_shard_onodes);
3955 }
3956 for (auto i : store->buffer_cache_shards) {
3957 i->set_max(max_shard_buffer);
91327a77
AA
3958 }
3959}
3960
92f5a8d4
TL
3961void BlueStore::MempoolThread::_update_cache_settings()
3962{
3963 // Nothing to do if pcm is not used.
3964 if (pcm == nullptr) {
3965 return;
3966 }
3967
92f5a8d4
TL
3968 uint64_t target = store->osd_memory_target;
3969 uint64_t base = store->osd_memory_base;
3970 uint64_t min = store->osd_memory_cache_min;
3971 uint64_t max = min;
3972 double fragmentation = store->osd_memory_expected_fragmentation;
3973
3974 uint64_t ltarget = (1.0 - fragmentation) * target;
3975 if (ltarget > base + min) {
3976 max = ltarget - base;
3977 }
3978
3979 // set pcm cache levels
3980 pcm->set_target_memory(target);
3981 pcm->set_min_memory(min);
3982 pcm->set_max_memory(max);
3983
9f95a23c 3984 dout(5) << __func__ << " updated pcm target: " << target
92f5a8d4
TL
3985 << " pcm min: " << min
3986 << " pcm max: " << max
3987 << dendl;
3988}
3989
7c673cae
FG
3990// =======================================================
3991
31f18b77
FG
3992// OmapIteratorImpl
3993
3994#undef dout_prefix
3995#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
3996
3997BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3998 CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
3999 : c(c), o(o), it(it)
4000{
9f95a23c 4001 std::shared_lock l(c->lock);
31f18b77 4002 if (o->onode.has_omap()) {
9f95a23c
TL
4003 o->get_omap_key(string(), &head);
4004 o->get_omap_tail(&tail);
31f18b77
FG
4005 it->lower_bound(head);
4006 }
4007}
4008
11fdf7f2
TL
4009string BlueStore::OmapIteratorImpl::_stringify() const
4010{
4011 stringstream s;
4012 s << " omap_iterator(cid = " << c->cid
4013 <<", oid = " << o->oid << ")";
4014 return s.str();
4015}
4016
31f18b77
FG
4017int BlueStore::OmapIteratorImpl::seek_to_first()
4018{
9f95a23c 4019 std::shared_lock l(c->lock);
11fdf7f2 4020 auto start1 = mono_clock::now();
31f18b77
FG
4021 if (o->onode.has_omap()) {
4022 it->lower_bound(head);
4023 } else {
4024 it = KeyValueDB::Iterator();
4025 }
494da23a
TL
4026 c->store->log_latency(
4027 __func__,
11fdf7f2
TL
4028 l_bluestore_omap_seek_to_first_lat,
4029 mono_clock::now() - start1,
494da23a 4030 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2 4031
31f18b77
FG
4032 return 0;
4033}
4034
4035int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
4036{
9f95a23c 4037 std::shared_lock l(c->lock);
11fdf7f2 4038 auto start1 = mono_clock::now();
31f18b77
FG
4039 if (o->onode.has_omap()) {
4040 string key;
9f95a23c 4041 o->get_omap_key(after, &key);
31f18b77
FG
4042 ldout(c->store->cct,20) << __func__ << " after " << after << " key "
4043 << pretty_binary_string(key) << dendl;
4044 it->upper_bound(key);
4045 } else {
4046 it = KeyValueDB::Iterator();
4047 }
11fdf7f2 4048 c->store->log_latency_fn(
494da23a 4049 __func__,
11fdf7f2
TL
4050 l_bluestore_omap_upper_bound_lat,
4051 mono_clock::now() - start1,
494da23a 4052 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4053 [&] (const ceph::timespan& lat) {
494da23a 4054 return ", after = " + after +
11fdf7f2
TL
4055 _stringify();
4056 }
4057 );
31f18b77
FG
4058 return 0;
4059}
4060
4061int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
4062{
9f95a23c 4063 std::shared_lock l(c->lock);
11fdf7f2 4064 auto start1 = mono_clock::now();
31f18b77
FG
4065 if (o->onode.has_omap()) {
4066 string key;
9f95a23c 4067 o->get_omap_key(to, &key);
31f18b77
FG
4068 ldout(c->store->cct,20) << __func__ << " to " << to << " key "
4069 << pretty_binary_string(key) << dendl;
4070 it->lower_bound(key);
4071 } else {
4072 it = KeyValueDB::Iterator();
4073 }
11fdf7f2 4074 c->store->log_latency_fn(
494da23a 4075 __func__,
11fdf7f2
TL
4076 l_bluestore_omap_lower_bound_lat,
4077 mono_clock::now() - start1,
494da23a 4078 c->store->cct->_conf->bluestore_log_omap_iterator_age,
11fdf7f2 4079 [&] (const ceph::timespan& lat) {
494da23a 4080 return ", to = " + to +
11fdf7f2
TL
4081 _stringify();
4082 }
4083 );
31f18b77
FG
4084 return 0;
4085}
4086
4087bool BlueStore::OmapIteratorImpl::valid()
4088{
9f95a23c 4089 std::shared_lock l(c->lock);
31f18b77 4090 bool r = o->onode.has_omap() && it && it->valid() &&
494da23a 4091 it->raw_key().second < tail;
31f18b77
FG
4092 if (it && it->valid()) {
4093 ldout(c->store->cct,20) << __func__ << " is at "
4094 << pretty_binary_string(it->raw_key().second)
4095 << dendl;
4096 }
4097 return r;
4098}
4099
11fdf7f2 4100int BlueStore::OmapIteratorImpl::next()
31f18b77 4101{
11fdf7f2 4102 int r = -1;
9f95a23c 4103 std::shared_lock l(c->lock);
11fdf7f2 4104 auto start1 = mono_clock::now();
31f18b77
FG
4105 if (o->onode.has_omap()) {
4106 it->next();
11fdf7f2 4107 r = 0;
31f18b77 4108 }
494da23a
TL
4109 c->store->log_latency(
4110 __func__,
11fdf7f2
TL
4111 l_bluestore_omap_next_lat,
4112 mono_clock::now() - start1,
494da23a 4113 c->store->cct->_conf->bluestore_log_omap_iterator_age);
11fdf7f2
TL
4114
4115 return r;
31f18b77
FG
4116}
4117
4118string BlueStore::OmapIteratorImpl::key()
4119{
9f95a23c 4120 std::shared_lock l(c->lock);
11fdf7f2 4121 ceph_assert(it->valid());
31f18b77
FG
4122 string db_key = it->raw_key().second;
4123 string user_key;
9f95a23c 4124 o->decode_omap_key(db_key, &user_key);
494da23a 4125
31f18b77
FG
4126 return user_key;
4127}
4128
4129bufferlist BlueStore::OmapIteratorImpl::value()
4130{
9f95a23c 4131 std::shared_lock l(c->lock);
11fdf7f2 4132 ceph_assert(it->valid());
31f18b77
FG
4133 return it->value();
4134}
4135
4136
4137// =====================================
4138
7c673cae
FG
4139#undef dout_prefix
4140#define dout_prefix *_dout << "bluestore(" << path << ") "
9f95a23c
TL
4141#undef dout_context
4142#define dout_context cct
7c673cae
FG
4143
4144
4145static void aio_cb(void *priv, void *priv2)
4146{
4147 BlueStore *store = static_cast<BlueStore*>(priv);
4148 BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
4149 c->aio_finish(store);
4150}
4151
11fdf7f2
TL
4152static void discard_cb(void *priv, void *priv2)
4153{
4154 BlueStore *store = static_cast<BlueStore*>(priv);
4155 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
4156 store->handle_discard(*tmp);
4157}
4158
4159void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
4160{
4161 dout(10) << __func__ << dendl;
4162 ceph_assert(alloc);
4163 alloc->release(to_release);
4164}
4165
7c673cae 4166BlueStore::BlueStore(CephContext *cct, const string& path)
9f95a23c 4167 : BlueStore(cct, path, 0) {}
7c673cae
FG
4168
4169BlueStore::BlueStore(CephContext *cct,
4170 const string& path,
4171 uint64_t _min_alloc_size)
4172 : ObjectStore(cct, path),
9f95a23c 4173 throttle(cct),
11fdf7f2 4174 finisher(cct, "commit_finisher", "cfin"),
7c673cae 4175 kv_sync_thread(this),
31f18b77 4176 kv_finalize_thread(this),
7c673cae
FG
4177 min_alloc_size(_min_alloc_size),
4178 min_alloc_size_order(ctz(_min_alloc_size)),
4179 mempool_thread(this)
4180{
4181 _init_logger();
11fdf7f2 4182 cct->_conf.add_observer(this);
7c673cae 4183 set_cache_shards(1);
7c673cae
FG
4184}
4185
4186BlueStore::~BlueStore()
4187{
11fdf7f2 4188 cct->_conf.remove_observer(this);
7c673cae 4189 _shutdown_logger();
11fdf7f2
TL
4190 ceph_assert(!mounted);
4191 ceph_assert(db == NULL);
4192 ceph_assert(bluefs == NULL);
4193 ceph_assert(fsid_fd < 0);
4194 ceph_assert(path_fd < 0);
9f95a23c
TL
4195 for (auto i : onode_cache_shards) {
4196 delete i;
4197 }
4198 for (auto i : buffer_cache_shards) {
7c673cae
FG
4199 delete i;
4200 }
9f95a23c
TL
4201 onode_cache_shards.clear();
4202 buffer_cache_shards.clear();
7c673cae
FG
4203}
4204
4205const char **BlueStore::get_tracked_conf_keys() const
4206{
4207 static const char* KEYS[] = {
4208 "bluestore_csum_type",
4209 "bluestore_compression_mode",
4210 "bluestore_compression_algorithm",
4211 "bluestore_compression_min_blob_size",
4212 "bluestore_compression_min_blob_size_ssd",
4213 "bluestore_compression_min_blob_size_hdd",
4214 "bluestore_compression_max_blob_size",
4215 "bluestore_compression_max_blob_size_ssd",
4216 "bluestore_compression_max_blob_size_hdd",
c07f9fc5 4217 "bluestore_compression_required_ratio",
7c673cae
FG
4218 "bluestore_max_alloc_size",
4219 "bluestore_prefer_deferred_size",
181888fb
FG
4220 "bluestore_prefer_deferred_size_hdd",
4221 "bluestore_prefer_deferred_size_ssd",
31f18b77
FG
4222 "bluestore_deferred_batch_ops",
4223 "bluestore_deferred_batch_ops_hdd",
4224 "bluestore_deferred_batch_ops_ssd",
7c673cae
FG
4225 "bluestore_throttle_bytes",
4226 "bluestore_throttle_deferred_bytes",
4227 "bluestore_throttle_cost_per_io_hdd",
4228 "bluestore_throttle_cost_per_io_ssd",
4229 "bluestore_throttle_cost_per_io",
4230 "bluestore_max_blob_size",
4231 "bluestore_max_blob_size_ssd",
4232 "bluestore_max_blob_size_hdd",
11fdf7f2
TL
4233 "osd_memory_target",
4234 "osd_memory_target_cgroup_limit_ratio",
4235 "osd_memory_base",
4236 "osd_memory_cache_min",
92f5a8d4 4237 "osd_memory_expected_fragmentation",
11fdf7f2
TL
4238 "bluestore_cache_autotune",
4239 "bluestore_cache_autotune_interval",
81eedcae 4240 "bluestore_warn_on_legacy_statfs",
9f95a23c
TL
4241 "bluestore_warn_on_no_per_pool_omap",
4242 "bluestore_max_defer_interval",
7c673cae
FG
4243 NULL
4244 };
4245 return KEYS;
4246}
4247
11fdf7f2 4248void BlueStore::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
4249 const std::set<std::string> &changed)
4250{
eafe8130 4251 if (changed.count("bluestore_warn_on_legacy_statfs")) {
81eedcae
TL
4252 _check_legacy_statfs_alert();
4253 }
9f95a23c
TL
4254 if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
4255 _check_no_per_pool_omap_alert();
4256 }
81eedcae 4257
7c673cae
FG
4258 if (changed.count("bluestore_csum_type")) {
4259 _set_csum();
4260 }
4261 if (changed.count("bluestore_compression_mode") ||
4262 changed.count("bluestore_compression_algorithm") ||
4263 changed.count("bluestore_compression_min_blob_size") ||
4264 changed.count("bluestore_compression_max_blob_size")) {
4265 if (bdev) {
4266 _set_compression();
4267 }
4268 }
4269 if (changed.count("bluestore_max_blob_size") ||
4270 changed.count("bluestore_max_blob_size_ssd") ||
4271 changed.count("bluestore_max_blob_size_hdd")) {
4272 if (bdev) {
4273 // only after startup
4274 _set_blob_size();
4275 }
4276 }
4277 if (changed.count("bluestore_prefer_deferred_size") ||
181888fb
FG
4278 changed.count("bluestore_prefer_deferred_size_hdd") ||
4279 changed.count("bluestore_prefer_deferred_size_ssd") ||
7c673cae
FG
4280 changed.count("bluestore_max_alloc_size") ||
4281 changed.count("bluestore_deferred_batch_ops") ||
4282 changed.count("bluestore_deferred_batch_ops_hdd") ||
4283 changed.count("bluestore_deferred_batch_ops_ssd")) {
4284 if (bdev) {
4285 // only after startup
4286 _set_alloc_sizes();
4287 }
4288 }
4289 if (changed.count("bluestore_throttle_cost_per_io") ||
4290 changed.count("bluestore_throttle_cost_per_io_hdd") ||
4291 changed.count("bluestore_throttle_cost_per_io_ssd")) {
4292 if (bdev) {
4293 _set_throttle_params();
4294 }
4295 }
9f95a23c
TL
4296 if (changed.count("bluestore_throttle_bytes") ||
4297 changed.count("bluestore_throttle_deferred_bytes") ||
4298 changed.count("bluestore_throttle_trace_rate")) {
4299 throttle.reset_throttle(conf);
7c673cae 4300 }
9f95a23c
TL
4301 if (changed.count("bluestore_max_defer_interval")) {
4302 if (bdev) {
4303 _set_max_defer_interval();
4304 }
7c673cae 4305 }
92f5a8d4
TL
4306 if (changed.count("osd_memory_target") ||
4307 changed.count("osd_memory_base") ||
4308 changed.count("osd_memory_cache_min") ||
4309 changed.count("osd_memory_expected_fragmentation")) {
4310 _update_osd_memory_options();
4311 }
7c673cae
FG
4312}
4313
4314void BlueStore::_set_compression()
4315{
224ce89b
WB
4316 auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
4317 if (m) {
11fdf7f2 4318 _clear_compression_alert();
224ce89b
WB
4319 comp_mode = *m;
4320 } else {
4321 derr << __func__ << " unrecognized value '"
4322 << cct->_conf->bluestore_compression_mode
4323 << "' for bluestore_compression_mode, reverting to 'none'"
4324 << dendl;
4325 comp_mode = Compressor::COMP_NONE;
11fdf7f2
TL
4326 string s("unknown mode: ");
4327 s += cct->_conf->bluestore_compression_mode;
4328 _set_compression_alert(true, s.c_str());
224ce89b
WB
4329 }
4330
4331 compressor = nullptr;
4332
3efd9988
FG
4333 if (cct->_conf->bluestore_compression_min_blob_size) {
4334 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
7c673cae 4335 } else {
11fdf7f2 4336 ceph_assert(bdev);
9f95a23c 4337 if (_use_rotational_settings()) {
7c673cae
FG
4338 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
4339 } else {
4340 comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
4341 }
4342 }
4343
4344 if (cct->_conf->bluestore_compression_max_blob_size) {
4345 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
4346 } else {
11fdf7f2 4347 ceph_assert(bdev);
9f95a23c 4348 if (_use_rotational_settings()) {
7c673cae
FG
4349 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
4350 } else {
4351 comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
4352 }
4353 }
4354
7c673cae
FG
4355 auto& alg_name = cct->_conf->bluestore_compression_algorithm;
4356 if (!alg_name.empty()) {
4357 compressor = Compressor::create(cct, alg_name);
4358 if (!compressor) {
4359 derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
4360 << dendl;
11fdf7f2 4361 _set_compression_alert(false, alg_name.c_str());
7c673cae
FG
4362 }
4363 }
4364
4365 dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
4366 << " alg " << (compressor ? compressor->get_type_name() : "(none)")
11fdf7f2
TL
4367 << " min_blob " << comp_min_blob_size
4368 << " max_blob " << comp_max_blob_size
7c673cae
FG
4369 << dendl;
4370}
4371
4372void BlueStore::_set_csum()
4373{
4374 csum_type = Checksummer::CSUM_NONE;
4375 int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
4376 if (t > Checksummer::CSUM_NONE)
4377 csum_type = t;
4378
4379 dout(10) << __func__ << " csum_type "
4380 << Checksummer::get_csum_type_string(csum_type)
4381 << dendl;
4382}
4383
4384void BlueStore::_set_throttle_params()
4385{
4386 if (cct->_conf->bluestore_throttle_cost_per_io) {
4387 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
4388 } else {
11fdf7f2 4389 ceph_assert(bdev);
9f95a23c 4390 if (_use_rotational_settings()) {
7c673cae
FG
4391 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
4392 } else {
4393 throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
4394 }
4395 }
4396
4397 dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
4398 << dendl;
4399}
4400void BlueStore::_set_blob_size()
4401{
4402 if (cct->_conf->bluestore_max_blob_size) {
4403 max_blob_size = cct->_conf->bluestore_max_blob_size;
4404 } else {
11fdf7f2 4405 ceph_assert(bdev);
9f95a23c 4406 if (_use_rotational_settings()) {
7c673cae
FG
4407 max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
4408 } else {
4409 max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
4410 }
4411 }
4412 dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
4413 << std::dec << dendl;
4414}
4415
92f5a8d4
TL
4416void BlueStore::_update_osd_memory_options()
4417{
4418 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4419 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
4420 osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4421 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
4422 config_changed++;
4423 dout(10) << __func__
4424 << " osd_memory_target " << osd_memory_target
4425 << " osd_memory_base " << osd_memory_base
4426 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4427 << " osd_memory_cache_min " << osd_memory_cache_min
4428 << dendl;
4429}
4430
11fdf7f2 4431int BlueStore::_set_cache_sizes()
1adf2230 4432{
11fdf7f2
TL
4433 ceph_assert(bdev);
4434 cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
91327a77 4435 cache_autotune_interval =
11fdf7f2
TL
4436 cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
4437 osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
4438 osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
91327a77 4439 osd_memory_expected_fragmentation =
11fdf7f2
TL
4440 cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
4441 osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
91327a77 4442 osd_memory_cache_resize_interval =
11fdf7f2 4443 cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
91327a77 4444
224ce89b
WB
4445 if (cct->_conf->bluestore_cache_size) {
4446 cache_size = cct->_conf->bluestore_cache_size;
4447 } else {
4448 // choose global cache size based on backend type
9f95a23c 4449 if (_use_rotational_settings()) {
224ce89b
WB
4450 cache_size = cct->_conf->bluestore_cache_size_hdd;
4451 } else {
4452 cache_size = cct->_conf->bluestore_cache_size_ssd;
4453 }
4454 }
31f18b77 4455
91327a77 4456 cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
224ce89b 4457 if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
d2e6a577 4458 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77 4459 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4460 return -EINVAL;
4461 }
91327a77
AA
4462
4463 cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
224ce89b 4464 if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
d2e6a577 4465 derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
91327a77 4466 << ") must be in range [0,1.0]" << dendl;
31f18b77
FG
4467 return -EINVAL;
4468 }
91327a77 4469
31f18b77 4470 if (cache_meta_ratio + cache_kv_ratio > 1.0) {
d2e6a577 4471 derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
91327a77
AA
4472 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4473 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
4474 << dendl;
31f18b77
FG
4475 return -EINVAL;
4476 }
91327a77
AA
4477
4478 cache_data_ratio =
4479 (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
31f18b77
FG
4480 if (cache_data_ratio < 0) {
4481 // deal with floating point imprecision
4482 cache_data_ratio = 0;
4483 }
91327a77 4484
224ce89b
WB
4485 dout(1) << __func__ << " cache_size " << cache_size
4486 << " meta " << cache_meta_ratio
31f18b77
FG
4487 << " kv " << cache_kv_ratio
4488 << " data " << cache_data_ratio
4489 << dendl;
4490 return 0;
4491}
4492
3efd9988
FG
4493int BlueStore::write_meta(const std::string& key, const std::string& value)
4494{
4495 bluestore_bdev_label_t label;
4496 string p = path + "/block";
4497 int r = _read_bdev_label(cct, p, &label);
4498 if (r < 0) {
4499 return ObjectStore::write_meta(key, value);
4500 }
4501 label.meta[key] = value;
4502 r = _write_bdev_label(cct, p, label);
11fdf7f2 4503 ceph_assert(r == 0);
3efd9988
FG
4504 return ObjectStore::write_meta(key, value);
4505}
4506
4507int BlueStore::read_meta(const std::string& key, std::string *value)
4508{
4509 bluestore_bdev_label_t label;
4510 string p = path + "/block";
4511 int r = _read_bdev_label(cct, p, &label);
4512 if (r < 0) {
4513 return ObjectStore::read_meta(key, value);
4514 }
4515 auto i = label.meta.find(key);
4516 if (i == label.meta.end()) {
4517 return ObjectStore::read_meta(key, value);
4518 }
4519 *value = i->second;
4520 return 0;
4521}
4522
7c673cae
FG
4523void BlueStore::_init_logger()
4524{
4525 PerfCountersBuilder b(cct, "bluestore",
4526 l_bluestore_first, l_bluestore_last);
4527 b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
4528 "Average kv_thread flush latency",
4529 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
4530 b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
4531 "Average kv_thread commit latency");
11fdf7f2
TL
4532 b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
4533 "Average kv_sync thread latency",
4534 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
4535 b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
4536 "Average kv_finalize thread latency",
4537 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
4538 b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
4539 "Average prepare state latency");
4540 b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
4541 "Average aio_wait state latency",
4542 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
4543 b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
4544 "Average io_done state latency");
4545 b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
4546 "Average kv_queued state latency");
4547 b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
4548 "Average kv_commiting state latency");
4549 b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
4550 "Average kv_done state latency");
4551 b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
4552 "Average deferred_queued state latency");
4553 b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
4554 "Average aio_wait state latency");
4555 b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
4556 "Average cleanup state latency");
4557 b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
4558 "Average finishing state latency");
4559 b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
4560 "Average done state latency");
4561 b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
4562 "Average submit throttle latency",
4563 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
4564 b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
4565 "Average submit latency",
4566 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
4567 b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
4568 "Average commit latency",
4569 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
4570 b.add_time_avg(l_bluestore_read_lat, "read_lat",
4571 "Average read latency",
4572 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
4573 b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
4574 "Average read onode metadata latency");
4575 b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
4576 "Average read latency");
4577 b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
4578 "Average compress latency");
4579 b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
4580 "Average decompress latency");
4581 b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
4582 "Average checksum latency");
4583 b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
4584 "Sum for beneficial compress ops");
4585 b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
4586 "Sum for compress ops rejected due to low net gain of space");
4587 b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
11fdf7f2 4588 "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4589 b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
4590 "Sum for deferred write op");
4591 b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
11fdf7f2 4592 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
7c673cae
FG
4593 b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
4594 "Sum for write penalty read ops");
4595 b.add_u64(l_bluestore_allocated, "bluestore_allocated",
4596 "Sum for allocated bytes");
4597 b.add_u64(l_bluestore_stored, "bluestore_stored",
4598 "Sum for stored bytes");
4599 b.add_u64(l_bluestore_compressed, "bluestore_compressed",
92f5a8d4
TL
4600 "Sum for stored compressed bytes",
4601 "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae 4602 b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
92f5a8d4
TL
4603 "Sum for bytes allocated for compressed data",
4604 "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae 4605 b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
92f5a8d4
TL
4606 "Sum for original bytes that were compressed",
4607 "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
4608 b.add_u64(l_bluestore_onodes, "bluestore_onodes",
4609 "Number of onodes in cache");
9f95a23c
TL
4610 b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
4611 "Number of pinned onodes in cache");
7c673cae
FG
4612 b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
4613 "Sum for onode-lookups hit in the cache");
4614 b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
4615 "Sum for onode-lookups missed in the cache");
4616 b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
4617 "Sum for onode-shard lookups hit in the cache");
4618 b.add_u64_counter(l_bluestore_onode_shard_misses,
4619 "bluestore_onode_shard_misses",
4620 "Sum for onode-shard lookups missed in the cache");
4621 b.add_u64(l_bluestore_extents, "bluestore_extents",
4622 "Number of extents in cache");
4623 b.add_u64(l_bluestore_blobs, "bluestore_blobs",
4624 "Number of blobs in cache");
4625 b.add_u64(l_bluestore_buffers, "bluestore_buffers",
4626 "Number of buffers in cache");
4627 b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
11fdf7f2 4628 "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4629 b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
11fdf7f2 4630 "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
91327a77 4631 b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
11fdf7f2 4632 "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4633
4634 b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
4635 "Large aligned writes into fresh blobs");
4636 b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
11fdf7f2 4637 "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4638 b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
4639 "Large aligned writes into fresh blobs (blobs)");
4640 b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
4641 "Small writes into existing or sparse small blobs");
4642 b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
11fdf7f2 4643 "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
4644 b.add_u64_counter(l_bluestore_write_small_unused,
4645 "bluestore_write_small_unused",
4646 "Small writes into unused portion of existing blob");
4647 b.add_u64_counter(l_bluestore_write_small_deferred,
4648 "bluestore_write_small_deferred",
4649 "Small overwrites using deferred");
4650 b.add_u64_counter(l_bluestore_write_small_pre_read,
4651 "bluestore_write_small_pre_read",
4652 "Small writes that required we read some data (possibly "
4653 "cached) to fill out the block");
4654 b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
4655 "Small write into new (sparse) blob");
4656
4657 b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
4658 b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
4659 "Onode extent map reshard events");
4660 b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
4661 "Sum for blob splitting due to resharding");
4662 b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
4663 "Sum for extents that have been removed due to compression");
4664 b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
4665 "Sum for extents that have been merged due to garbage "
4666 "collection");
b32b8144
FG
4667 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
4668 "Read EIO errors propagated to high level callers");
f64942e4
AA
4669 b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
4670 "Read operations that required at least one retry due to failed checksum validation");
a8e16298
TL
4671 b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
4672 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
11fdf7f2
TL
4673 b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
4674 "Average omap iterator seek_to_first call latency");
4675 b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
4676 "Average omap iterator upper_bound call latency");
4677 b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
4678 "Average omap iterator lower_bound call latency");
4679 b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
4680 "Average omap iterator next call latency");
494da23a
TL
4681 b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
4682 "Average collection listing latency");
7c673cae
FG
4683 logger = b.create_perf_counters();
4684 cct->get_perfcounters_collection()->add(logger);
4685}
4686
4687int BlueStore::_reload_logger()
4688{
4689 struct store_statfs_t store_statfs;
7c673cae 4690 int r = statfs(&store_statfs);
11fdf7f2 4691 if (r >= 0) {
7c673cae 4692 logger->set(l_bluestore_allocated, store_statfs.allocated);
11fdf7f2
TL
4693 logger->set(l_bluestore_stored, store_statfs.data_stored);
4694 logger->set(l_bluestore_compressed, store_statfs.data_compressed);
4695 logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
4696 logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
7c673cae
FG
4697 }
4698 return r;
4699}
4700
4701void BlueStore::_shutdown_logger()
4702{
4703 cct->get_perfcounters_collection()->remove(logger);
4704 delete logger;
4705}
4706
4707int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
4708 uuid_d *fsid)
4709{
4710 bluestore_bdev_label_t label;
4711 int r = _read_bdev_label(cct, path, &label);
4712 if (r < 0)
4713 return r;
4714 *fsid = label.osd_uuid;
4715 return 0;
4716}
4717
4718int BlueStore::_open_path()
4719{
b32b8144 4720 // sanity check(s)
11fdf7f2 4721 ceph_assert(path_fd < 0);
91327a77 4722 path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
7c673cae
FG
4723 if (path_fd < 0) {
4724 int r = -errno;
4725 derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
4726 << dendl;
4727 return r;
4728 }
4729 return 0;
4730}
4731
4732void BlueStore::_close_path()
4733{
4734 VOID_TEMP_FAILURE_RETRY(::close(path_fd));
4735 path_fd = -1;
4736}
4737
3efd9988
FG
4738int BlueStore::_write_bdev_label(CephContext *cct,
4739 string path, bluestore_bdev_label_t label)
7c673cae
FG
4740{
4741 dout(10) << __func__ << " path " << path << " label " << label << dendl;
4742 bufferlist bl;
11fdf7f2 4743 encode(label, bl);
7c673cae 4744 uint32_t crc = bl.crc32c(-1);
11fdf7f2
TL
4745 encode(crc, bl);
4746 ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
7c673cae
FG
4747 bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
4748 z.zero();
4749 bl.append(std::move(z));
4750
91327a77 4751 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
7c673cae
FG
4752 if (fd < 0) {
4753 fd = -errno;
4754 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4755 << dendl;
4756 return fd;
4757 }
4758 int r = bl.write_fd(fd);
4759 if (r < 0) {
4760 derr << __func__ << " failed to write to " << path
4761 << ": " << cpp_strerror(r) << dendl;
11fdf7f2 4762 goto out;
7c673cae 4763 }
3efd9988
FG
4764 r = ::fsync(fd);
4765 if (r < 0) {
4766 derr << __func__ << " failed to fsync " << path
4767 << ": " << cpp_strerror(r) << dendl;
4768 }
11fdf7f2 4769out:
7c673cae
FG
4770 VOID_TEMP_FAILURE_RETRY(::close(fd));
4771 return r;
4772}
4773
4774int BlueStore::_read_bdev_label(CephContext* cct, string path,
4775 bluestore_bdev_label_t *label)
4776{
4777 dout(10) << __func__ << dendl;
91327a77 4778 int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
4779 if (fd < 0) {
4780 fd = -errno;
4781 derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
4782 << dendl;
4783 return fd;
4784 }
4785 bufferlist bl;
4786 int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
4787 VOID_TEMP_FAILURE_RETRY(::close(fd));
4788 if (r < 0) {
4789 derr << __func__ << " failed to read from " << path
4790 << ": " << cpp_strerror(r) << dendl;
4791 return r;
4792 }
4793
4794 uint32_t crc, expected_crc;
11fdf7f2 4795 auto p = bl.cbegin();
7c673cae 4796 try {
11fdf7f2 4797 decode(*label, p);
7c673cae
FG
4798 bufferlist t;
4799 t.substr_of(bl, 0, p.get_off());
4800 crc = t.crc32c(-1);
11fdf7f2 4801 decode(expected_crc, p);
7c673cae
FG
4802 }
4803 catch (buffer::error& e) {
b32b8144 4804 dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
7c673cae
FG
4805 << ": " << e.what()
4806 << dendl;
b32b8144 4807 return -ENOENT;
7c673cae
FG
4808 }
4809 if (crc != expected_crc) {
4810 derr << __func__ << " bad crc on label, expected " << expected_crc
4811 << " != actual " << crc << dendl;
4812 return -EIO;
4813 }
4814 dout(10) << __func__ << " got " << *label << dendl;
4815 return 0;
4816}
4817
4818int BlueStore::_check_or_set_bdev_label(
4819 string path, uint64_t size, string desc, bool create)
4820{
4821 bluestore_bdev_label_t label;
4822 if (create) {
4823 label.osd_uuid = fsid;
4824 label.size = size;
4825 label.btime = ceph_clock_now();
4826 label.description = desc;
3efd9988 4827 int r = _write_bdev_label(cct, path, label);
7c673cae
FG
4828 if (r < 0)
4829 return r;
4830 } else {
4831 int r = _read_bdev_label(cct, path, &label);
4832 if (r < 0)
4833 return r;
31f18b77
FG
4834 if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
4835 dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4836 << " and fsid " << fsid << " check bypassed" << dendl;
4837 }
4838 else if (label.osd_uuid != fsid) {
7c673cae
FG
4839 derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
4840 << " does not match our fsid " << fsid << dendl;
4841 return -EIO;
4842 }
4843 }
4844 return 0;
4845}
4846
4847void BlueStore::_set_alloc_sizes(void)
4848{
7c673cae
FG
4849 max_alloc_size = cct->_conf->bluestore_max_alloc_size;
4850
4851 if (cct->_conf->bluestore_prefer_deferred_size) {
4852 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
4853 } else {
11fdf7f2 4854 ceph_assert(bdev);
9f95a23c 4855 if (_use_rotational_settings()) {
7c673cae
FG
4856 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
4857 } else {
4858 prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
4859 }
4860 }
4861
4862 if (cct->_conf->bluestore_deferred_batch_ops) {
4863 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
4864 } else {
11fdf7f2 4865 ceph_assert(bdev);
9f95a23c 4866 if (_use_rotational_settings()) {
7c673cae
FG
4867 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
4868 } else {
4869 deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
4870 }
4871 }
4872
4873 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
11fdf7f2 4874 << std::dec << " order " << (int)min_alloc_size_order
7c673cae
FG
4875 << " max_alloc_size 0x" << std::hex << max_alloc_size
4876 << " prefer_deferred_size 0x" << prefer_deferred_size
4877 << std::dec
4878 << " deferred_batch_ops " << deferred_batch_ops
4879 << dendl;
4880}
4881
4882int BlueStore::_open_bdev(bool create)
4883{
11fdf7f2 4884 ceph_assert(bdev == NULL);
7c673cae 4885 string p = path + "/block";
11fdf7f2 4886 bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
7c673cae
FG
4887 int r = bdev->open(p);
4888 if (r < 0)
4889 goto fail;
4890
11fdf7f2
TL
4891 if (create && cct->_conf->bdev_enable_discard) {
4892 bdev->discard(0, bdev->get_size());
4893 }
4894
7c673cae
FG
4895 if (bdev->supported_bdev_label()) {
4896 r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
4897 if (r < 0)
4898 goto fail_close;
4899 }
4900
4901 // initialize global block parameters
4902 block_size = bdev->get_block_size();
4903 block_mask = ~(block_size - 1);
4904 block_size_order = ctz(block_size);
11fdf7f2 4905 ceph_assert(block_size == 1u << block_size_order);
9f95a23c 4906 _set_max_defer_interval();
224ce89b
WB
4907 // and set cache_size based on device type
4908 r = _set_cache_sizes();
4909 if (r < 0) {
4910 goto fail_close;
4911 }
7c673cae
FG
4912 return 0;
4913
4914 fail_close:
4915 bdev->close();
4916 fail:
4917 delete bdev;
4918 bdev = NULL;
4919 return r;
4920}
4921
11fdf7f2
TL
4922void BlueStore::_validate_bdev()
4923{
4924 ceph_assert(bdev);
4925 ceph_assert(min_alloc_size); // _get_odisk_reserved depends on that
4926 uint64_t dev_size = bdev->get_size();
4927 if (dev_size <
4928 _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
4929 dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
4930 << " is too small, disable bluestore_bluefs_min for now"
4931 << dendl;
4932 ceph_assert(dev_size >= _get_ondisk_reserved());
4933
4934 int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
4935 ceph_assert(r == 0);
4936 }
4937}
4938
7c673cae
FG
4939void BlueStore::_close_bdev()
4940{
11fdf7f2 4941 ceph_assert(bdev);
7c673cae
FG
4942 bdev->close();
4943 delete bdev;
4944 bdev = NULL;
4945}
4946
11fdf7f2 4947int BlueStore::_open_fm(KeyValueDB::Transaction t)
7c673cae 4948{
11fdf7f2
TL
4949 ceph_assert(fm == NULL);
4950 fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
4951 ceph_assert(fm);
4952 if (t) {
4953 // create mode. initialize freespace
7c673cae 4954 dout(20) << __func__ << " initializing freespace" << dendl;
7c673cae
FG
4955 {
4956 bufferlist bl;
4957 bl.append(freelist_type);
4958 t->set(PREFIX_SUPER, "freelist_type", bl);
4959 }
b32b8144
FG
4960 // being able to allocate in units less than bdev block size
4961 // seems to be a bad idea.
11fdf7f2 4962 ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
b32b8144 4963 fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
7c673cae
FG
4964
4965 // allocate superblock reserved space. note that we do not mark
4966 // bluefs space as allocated in the freelist; we instead rely on
4967 // bluefs_extents.
11fdf7f2 4968 auto reserved = _get_ondisk_reserved();
3efd9988 4969 fm->allocate(0, reserved, t);
7c673cae 4970
7c673cae 4971 if (cct->_conf->bluestore_bluefs) {
11fdf7f2 4972 ceph_assert(bluefs_extents.num_intervals() == 1);
7c673cae 4973 interval_set<uint64_t>::iterator p = bluefs_extents.begin();
11fdf7f2 4974 reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
7c673cae
FG
4975 dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
4976 << " for bluefs" << dendl;
7c673cae
FG
4977 }
4978
4979 if (cct->_conf->bluestore_debug_prefill > 0) {
4980 uint64_t end = bdev->get_size() - reserved;
4981 dout(1) << __func__ << " pre-fragmenting freespace, using "
4982 << cct->_conf->bluestore_debug_prefill << " with max free extent "
4983 << cct->_conf->bluestore_debug_prefragment_max << dendl;
11fdf7f2 4984 uint64_t start = p2roundup(reserved, min_alloc_size);
7c673cae
FG
4985 uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
4986 float r = cct->_conf->bluestore_debug_prefill;
4987 r /= 1.0 - r;
4988 bool stop = false;
4989
4990 while (!stop && start < end) {
4991 uint64_t l = (rand() % max_b + 1) * min_alloc_size;
4992 if (start + l > end) {
4993 l = end - start;
11fdf7f2 4994 l = p2align(l, min_alloc_size);
7c673cae 4995 }
11fdf7f2 4996 ceph_assert(start + l <= end);
7c673cae
FG
4997
4998 uint64_t u = 1 + (uint64_t)(r * (double)l);
11fdf7f2 4999 u = p2roundup(u, min_alloc_size);
7c673cae
FG
5000 if (start + l + u > end) {
5001 u = end - (start + l);
5002 // trim to align so we don't overflow again
11fdf7f2 5003 u = p2align(u, min_alloc_size);
7c673cae
FG
5004 stop = true;
5005 }
11fdf7f2 5006 ceph_assert(start + l + u <= end);
7c673cae 5007
11fdf7f2 5008 dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
7c673cae
FG
5009 << " use 0x" << u << std::dec << dendl;
5010
5011 if (u == 0) {
5012 // break if u has been trimmed to nothing
5013 break;
5014 }
5015
5016 fm->allocate(start + l, u, t);
5017 start += l + u;
5018 }
5019 }
7c673cae
FG
5020 }
5021
11fdf7f2 5022 int r = fm->init(db);
7c673cae
FG
5023 if (r < 0) {
5024 derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
5025 delete fm;
5026 fm = NULL;
5027 return r;
5028 }
81eedcae
TL
5029 // if space size tracked by free list manager is that higher than actual
5030 // dev size one can hit out-of-space allocation which will result
5031 // in data loss and/or assertions
5032 // Probably user altered the device size somehow.
5033 // The only fix for now is to redeploy OSD.
5034 if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
5035 ostringstream ss;
5036 ss << "slow device size mismatch detected, "
5037 << " fm size(" << fm->get_size()
5038 << ") > slow device size(" << bdev->get_size()
5039 << "), Please stop using this OSD as it might cause data loss.";
5040 _set_disk_size_mismatch_alert(ss.str());
5041 }
7c673cae
FG
5042 return 0;
5043}
5044
5045void BlueStore::_close_fm()
5046{
5047 dout(10) << __func__ << dendl;
11fdf7f2 5048 ceph_assert(fm);
7c673cae
FG
5049 fm->shutdown();
5050 delete fm;
5051 fm = NULL;
5052}
5053
5054int BlueStore::_open_alloc()
5055{
11fdf7f2
TL
5056 ceph_assert(alloc == NULL);
5057 ceph_assert(bdev->get_size());
5058
5059 if (bluefs) {
5060 bluefs_extents.clear();
9f95a23c
TL
5061 auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
5062 &bluefs_extents);
11fdf7f2
TL
5063 if (r < 0) {
5064 lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
5065 << cpp_strerror(r) << dendl;
5066
5067 return r;
5068 }
5069 dout(10) << __func__ << " bluefs extents 0x"
5070 << std::hex << bluefs_extents << std::dec
5071 << dendl;
5072 }
5073
7c673cae
FG
5074 alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
5075 bdev->get_size(),
eafe8130 5076 min_alloc_size, "block");
7c673cae
FG
5077 if (!alloc) {
5078 lderr(cct) << __func__ << " Allocator::unknown alloc type "
5079 << cct->_conf->bluestore_allocator
5080 << dendl;
5081 return -EINVAL;
5082 }
5083
5084 uint64_t num = 0, bytes = 0;
5085
5086 dout(1) << __func__ << " opening allocation metadata" << dendl;
5087 // initialize from freelist
5088 fm->enumerate_reset();
5089 uint64_t offset, length;
11fdf7f2 5090 while (fm->enumerate_next(db, &offset, &length)) {
7c673cae
FG
5091 alloc->init_add_free(offset, length);
5092 ++num;
5093 bytes += length;
5094 }
224ce89b 5095 fm->enumerate_reset();
1adf2230 5096 dout(1) << __func__ << " loaded " << byte_u_t(bytes)
7c673cae
FG
5097 << " in " << num << " extents"
5098 << dendl;
5099
5100 // also mark bluefs space as allocated
5101 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
5102 alloc->init_rm_free(e.get_start(), e.get_len());
5103 }
7c673cae
FG
5104
5105 return 0;
5106}
5107
5108void BlueStore::_close_alloc()
5109{
11fdf7f2
TL
5110 ceph_assert(bdev);
5111 bdev->discard_drain();
5112
5113 ceph_assert(alloc);
7c673cae
FG
5114 alloc->shutdown();
5115 delete alloc;
5116 alloc = NULL;
11fdf7f2 5117 bluefs_extents.clear();
7c673cae
FG
5118}
5119
5120int BlueStore::_open_fsid(bool create)
5121{
11fdf7f2 5122 ceph_assert(fsid_fd < 0);
91327a77 5123 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
5124 if (create)
5125 flags |= O_CREAT;
5126 fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
5127 if (fsid_fd < 0) {
5128 int err = -errno;
5129 derr << __func__ << " " << cpp_strerror(err) << dendl;
5130 return err;
5131 }
5132 return 0;
5133}
5134
5135int BlueStore::_read_fsid(uuid_d *uuid)
5136{
5137 char fsid_str[40];
5138 memset(fsid_str, 0, sizeof(fsid_str));
5139 int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
5140 if (ret < 0) {
5141 derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
5142 return ret;
5143 }
5144 if (ret > 36)
5145 fsid_str[36] = 0;
5146 else
5147 fsid_str[ret] = 0;
5148 if (!uuid->parse(fsid_str)) {
5149 derr << __func__ << " unparsable uuid " << fsid_str << dendl;
5150 return -EINVAL;
5151 }
5152 return 0;
5153}
5154
5155int BlueStore::_write_fsid()
5156{
5157 int r = ::ftruncate(fsid_fd, 0);
5158 if (r < 0) {
5159 r = -errno;
5160 derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
5161 return r;
5162 }
5163 string str = stringify(fsid) + "\n";
5164 r = safe_write(fsid_fd, str.c_str(), str.length());
5165 if (r < 0) {
5166 derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
5167 return r;
5168 }
5169 r = ::fsync(fsid_fd);
5170 if (r < 0) {
5171 r = -errno;
5172 derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
5173 return r;
5174 }
5175 return 0;
5176}
5177
5178void BlueStore::_close_fsid()
5179{
5180 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
5181 fsid_fd = -1;
5182}
5183
5184int BlueStore::_lock_fsid()
5185{
5186 struct flock l;
5187 memset(&l, 0, sizeof(l));
5188 l.l_type = F_WRLCK;
5189 l.l_whence = SEEK_SET;
5190 int r = ::fcntl(fsid_fd, F_SETLK, &l);
5191 if (r < 0) {
5192 int err = errno;
5193 derr << __func__ << " failed to lock " << path << "/fsid"
5194 << " (is another ceph-osd still running?)"
5195 << cpp_strerror(err) << dendl;
5196 return -err;
5197 }
5198 return 0;
5199}
5200
31f18b77
FG
5201bool BlueStore::is_rotational()
5202{
5203 if (bdev) {
5204 return bdev->is_rotational();
5205 }
5206
5207 bool rotational = true;
5208 int r = _open_path();
5209 if (r < 0)
5210 goto out;
5211 r = _open_fsid(false);
5212 if (r < 0)
5213 goto out_path;
5214 r = _read_fsid(&fsid);
5215 if (r < 0)
5216 goto out_fsid;
5217 r = _lock_fsid();
5218 if (r < 0)
5219 goto out_fsid;
5220 r = _open_bdev(false);
5221 if (r < 0)
5222 goto out_fsid;
5223 rotational = bdev->is_rotational();
5224 _close_bdev();
5225 out_fsid:
5226 _close_fsid();
5227 out_path:
5228 _close_path();
5229 out:
5230 return rotational;
5231}
5232
d2e6a577
FG
5233bool BlueStore::is_journal_rotational()
5234{
5235 if (!bluefs) {
5236 dout(5) << __func__ << " bluefs disabled, default to store media type"
5237 << dendl;
5238 return is_rotational();
5239 }
5240 dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
5241 return bluefs->wal_is_rotational();
5242}
5243
9f95a23c
TL
5244bool BlueStore::_use_rotational_settings()
5245{
5246 if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
5247 return true;
5248 }
5249 if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
5250 return false;
5251 }
5252 return bdev->is_rotational();
5253}
5254
7c673cae
FG
5255bool BlueStore::test_mount_in_use()
5256{
5257 // most error conditions mean the mount is not in use (e.g., because
5258 // it doesn't exist). only if we fail to lock do we conclude it is
5259 // in use.
5260 bool ret = false;
5261 int r = _open_path();
5262 if (r < 0)
5263 return false;
5264 r = _open_fsid(false);
5265 if (r < 0)
5266 goto out_path;
5267 r = _lock_fsid();
5268 if (r < 0)
5269 ret = true; // if we can't lock, it is in use
5270 _close_fsid();
5271 out_path:
5272 _close_path();
5273 return ret;
5274}
5275
11fdf7f2 5276int BlueStore::_minimal_open_bluefs(bool create)
7c673cae
FG
5277{
5278 int r;
11fdf7f2 5279 bluefs = new BlueFS(cct);
7c673cae 5280
11fdf7f2
TL
5281 string bfn;
5282 struct stat st;
5283
5284 bfn = path + "/block.db";
5285 if (::stat(bfn.c_str(), &st) == 0) {
eafe8130
TL
5286 r = bluefs->add_block_device(
5287 BlueFS::BDEV_DB, bfn,
5288 create && cct->_conf->bdev_enable_discard);
7c673cae 5289 if (r < 0) {
11fdf7f2
TL
5290 derr << __func__ << " add block device(" << bfn << ") returned: "
5291 << cpp_strerror(r) << dendl;
5292 goto free_bluefs;
7c673cae 5293 }
7c673cae 5294
11fdf7f2
TL
5295 if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
5296 r = _check_or_set_bdev_label(
5297 bfn,
5298 bluefs->get_block_device_size(BlueFS::BDEV_DB),
5299 "bluefs db", create);
5300 if (r < 0) {
5301 derr << __func__
5302 << " check block device(" << bfn << ") label returned: "
5303 << cpp_strerror(r) << dendl;
5304 goto free_bluefs;
5305 }
7c673cae 5306 }
11fdf7f2
TL
5307 if (create) {
5308 bluefs->add_block_extent(
5309 BlueFS::BDEV_DB,
5310 SUPER_RESERVED,
5311 bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
5312 }
9f95a23c
TL
5313 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
5314 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
5315 } else {
5316 r = -errno;
5317 if (::lstat(bfn.c_str(), &st) == -1) {
5318 r = 0;
9f95a23c 5319 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
7c673cae 5320 } else {
11fdf7f2
TL
5321 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5322 << cpp_strerror(r) << dendl;
5323 goto free_bluefs;
7c673cae
FG
5324 }
5325 }
7c673cae 5326
11fdf7f2
TL
5327 // shared device
5328 bfn = path + "/block";
5329 // never trim here
9f95a23c 5330 r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
11fdf7f2
TL
5331 true /* shared with bluestore */);
5332 if (r < 0) {
5333 derr << __func__ << " add block device(" << bfn << ") returned: "
5334 << cpp_strerror(r) << dendl;
5335 goto free_bluefs;
5336 }
5337 if (create) {
5338 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
5339 uint64_t initial =
5340 bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
5341 cct->_conf->bluestore_bluefs_gift_ratio);
5342 initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
eafe8130
TL
5343 uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
5344 if (alloc_size % min_alloc_size) {
5345 derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
5346 << alloc_size << " is not a multiple of "
11fdf7f2
TL
5347 << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
5348 r = -EINVAL;
5349 goto free_bluefs;
7c673cae 5350 }
11fdf7f2 5351 // align to bluefs's alloc_size
eafe8130 5352 initial = p2roundup(initial, alloc_size);
11fdf7f2 5353 // put bluefs in the middle of the device in case it is an HDD
eafe8130 5354 uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
11fdf7f2 5355 //avoiding superblock overwrite
eafe8130
TL
5356 start = std::max(alloc_size, start);
5357 ceph_assert(start >=_get_ondisk_reserved());
7c673cae 5358
9f95a23c 5359 bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
11fdf7f2
TL
5360 bluefs_extents.insert(start, initial);
5361 ++out_of_sync_fm;
5362 }
5363
5364 bfn = path + "/block.wal";
5365 if (::stat(bfn.c_str(), &st) == 0) {
5366 r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
eafe8130 5367 create && cct->_conf->bdev_enable_discard);
11fdf7f2
TL
5368 if (r < 0) {
5369 derr << __func__ << " add block device(" << bfn << ") returned: "
5370 << cpp_strerror(r) << dendl;
5371 goto free_bluefs;
5372 }
7c673cae 5373
11fdf7f2
TL
5374 if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
5375 r = _check_or_set_bdev_label(
5376 bfn,
5377 bluefs->get_block_device_size(BlueFS::BDEV_WAL),
5378 "bluefs wal", create);
7c673cae 5379 if (r < 0) {
11fdf7f2
TL
5380 derr << __func__ << " check block device(" << bfn
5381 << ") label returned: " << cpp_strerror(r) << dendl;
7c673cae
FG
5382 goto free_bluefs;
5383 }
7c673cae
FG
5384 }
5385
11fdf7f2
TL
5386 if (create) {
5387 bluefs->add_block_extent(
5388 BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
5389 bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
5390 BDEV_LABEL_BLOCK_SIZE);
5391 }
9f95a23c 5392 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
5393 } else {
5394 r = 0;
5395 if (::lstat(bfn.c_str(), &st) != -1) {
5396 r = -errno;
5397 derr << __func__ << " " << bfn << " symlink exists but target unusable: "
5398 << cpp_strerror(r) << dendl;
7c673cae
FG
5399 goto free_bluefs;
5400 }
11fdf7f2
TL
5401 }
5402 return 0;
7c673cae 5403
11fdf7f2
TL
5404free_bluefs:
5405 ceph_assert(bluefs);
5406 delete bluefs;
5407 bluefs = NULL;
5408 return r;
5409}
7c673cae 5410
11fdf7f2
TL
5411int BlueStore::_open_bluefs(bool create)
5412{
5413 int r = _minimal_open_bluefs(create);
5414 if (r < 0) {
5415 return r;
5416 }
9f95a23c
TL
5417 RocksDBBlueFSVolumeSelector* vselector = nullptr;
5418 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
5419
5420 string options = cct->_conf->bluestore_rocksdb_options;
5421
5422 rocksdb::Options rocks_opts;
5423 int r = RocksDBStore::ParseOptionsFromStringStatic(
5424 cct,
5425 options,
5426 rocks_opts,
5427 nullptr);
5428 if (r < 0) {
5429 return r;
5430 }
5431
5432 double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
5433 vselector =
5434 new RocksDBBlueFSVolumeSelector(
5435 bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
5436 bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
5437 bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
5438 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5439 rocks_opts.max_bytes_for_level_base,
5440 rocks_opts.max_bytes_for_level_multiplier,
5441 reserved_factor,
5442 cct->_conf->bluestore_volume_selection_reserved,
5443 cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
5444 }
11fdf7f2 5445 if (create) {
9f95a23c 5446 bluefs->mkfs(fsid, bluefs_layout);
11fdf7f2 5447 }
9f95a23c 5448 bluefs->set_volume_selector(vselector);
11fdf7f2
TL
5449 r = bluefs->mount();
5450 if (r < 0) {
5451 derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
5452 }
9f95a23c 5453 ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
11fdf7f2
TL
5454 return r;
5455}
5456
5457void BlueStore::_close_bluefs()
5458{
5459 bluefs->umount();
5460 _minimal_close_bluefs();
5461}
5462
5463void BlueStore::_minimal_close_bluefs()
5464{
5465 delete bluefs;
5466 bluefs = NULL;
5467}
5468
5469int BlueStore::_is_bluefs(bool create, bool* ret)
5470{
5471 if (create) {
5472 *ret = cct->_conf->bluestore_bluefs;
5473 } else {
5474 string s;
5475 int r = read_meta("bluefs", &s);
5476 if (r < 0) {
5477 derr << __func__ << " unable to read 'bluefs' meta" << dendl;
5478 return -EIO;
5479 }
5480 if (s == "1") {
5481 *ret = true;
5482 } else if (s == "0") {
5483 *ret = false;
31f18b77 5484 } else {
11fdf7f2
TL
5485 derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
5486 << dendl;
5487 return -EIO;
5488 }
5489 }
5490 return 0;
5491}
5492
5493/*
5494* opens both DB and dependant super_meta, FreelistManager and allocator
5495* in the proper order
5496*/
5497int BlueStore::_open_db_and_around(bool read_only)
5498{
5499 int r;
5500 bool do_bluefs = false;
5501 _is_bluefs(false, &do_bluefs); // ignore err code
5502 if (do_bluefs) {
5503 // open in read-only first to read FM list and init allocator
5504 // as they might be needed for some BlueFS procedures
5505 r = _open_db(false, false, true);
5506 if (r < 0)
5507 return r;
5508
5509 r = _open_super_meta();
5510 if (r < 0) {
5511 goto out_db;
5512 }
5513
5514 r = _open_fm(nullptr);
5515 if (r < 0)
5516 goto out_db;
5517
5518 r = _open_alloc();
5519 if (r < 0)
5520 goto out_fm;
5521
5522 // now open in R/W mode
5523 if (!read_only) {
5524 _close_db();
5525
5526 r = _open_db(false, false, false);
5527 if (r < 0) {
5528 _close_alloc();
5529 _close_fm();
5530 return r;
28e407b8 5531 }
7c673cae 5532 }
11fdf7f2
TL
5533 } else {
5534 r = _open_db(false, false);
5535 if (r < 0) {
5536 return r;
5537 }
5538 r = _open_super_meta();
5539 if (r < 0) {
5540 goto out_db;
5541 }
7c673cae 5542
11fdf7f2
TL
5543 r = _open_fm(nullptr);
5544 if (r < 0)
5545 goto out_db;
5546
5547 r = _open_alloc();
5548 if (r < 0)
5549 goto out_fm;
5550 }
5551 return 0;
5552
5553 out_fm:
5554 _close_fm();
5555 out_db:
5556 _close_db();
5557 return r;
5558}
5559
5560void BlueStore::_close_db_and_around()
5561{
5562 if (bluefs) {
5563 if (out_of_sync_fm.fetch_and(0)) {
5564 _sync_bluefs_and_fm();
5565 }
5566 _close_db();
5567 while(out_of_sync_fm.fetch_and(0)) {
5568 // if seen some allocations during close - repeat open_db, sync fm, close
5569 dout(0) << __func__ << " syncing FreelistManager" << dendl;
5570 int r = _open_db(false, false, false);
5571 if (r < 0) {
5572 derr << __func__
5573 << " unable to open db, FreelistManager is probably out of sync"
5574 << dendl;
5575 break;
5576 }
5577 _sync_bluefs_and_fm();
5578 _close_db();
7c673cae 5579 }
11fdf7f2
TL
5580 if (!_kv_only) {
5581 _close_alloc();
5582 _close_fm();
5583 }
5584 } else {
5585 _close_alloc();
5586 _close_fm();
5587 _close_db();
5588 }
5589}
5590
5591// updates legacy bluefs related recs in DB to a state valid for
5592// downgrades from nautilus.
5593void BlueStore::_sync_bluefs_and_fm()
5594{
5595 if (cct->_conf->bluestore_bluefs_db_compatibility) {
5596 bufferlist bl;
5597 encode(bluefs_extents, bl);
5598 dout(20) << __func__ << " bluefs_extents at KV is now 0x"
5599 << std::hex << bluefs_extents << std::dec
5600 << dendl;
5601 KeyValueDB::Transaction synct = db->get_transaction();
5602 synct->set(PREFIX_SUPER, "bluefs_extents", bl);
5603 synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
5604
5605 // Nice thing is that we don't need to update FreelistManager here.
5606 // It always has corresponding bits set to 'Free' for both Nautilus+ and
5607 // pre-Nautilis releases.
5608 // So once we get an extent to bluefs_extents this means it's
5609 // been free in allocator and hence it's free in FM too.
5610
5611 db->submit_transaction_sync(synct);
5612 }
5613}
5614
5615int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
5616{
5617 int r;
5618 ceph_assert(!db);
5619 ceph_assert(!(create && read_only));
5620 string fn = path + "/db";
5621 string options;
5622 stringstream err;
5623 std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
5624
5625 string kv_backend;
5626 std::vector<KeyValueDB::ColumnFamily> cfs;
5627
5628 if (create) {
5629 kv_backend = cct->_conf->bluestore_kvbackend;
5630 } else {
5631 r = read_meta("kv_backend", &kv_backend);
7c673cae 5632 if (r < 0) {
11fdf7f2
TL
5633 derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
5634 return -EIO;
5635 }
5636 }
5637 dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
5638
5639 bool do_bluefs;
5640 r = _is_bluefs(create, &do_bluefs);
5641 if (r < 0) {
5642 return r;
5643 }
5644 dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
5645
5646 map<string,string> kv_options;
5647 // force separate wal dir for all new deployments.
5648 kv_options["separate_wal_dir"] = 1;
5649 rocksdb::Env *env = NULL;
5650 if (do_bluefs) {
5651 dout(10) << __func__ << " initializing bluefs" << dendl;
5652 if (kv_backend != "rocksdb") {
5653 derr << " backend must be rocksdb to use bluefs" << dendl;
5654 return -EINVAL;
7c673cae 5655 }
11fdf7f2
TL
5656
5657 r = _open_bluefs(create);
5658 if (r < 0) {
5659 return r;
5660 }
11fdf7f2 5661
7c673cae 5662 if (cct->_conf->bluestore_bluefs_env_mirror) {
9f95a23c
TL
5663 rocksdb::Env* a = new BlueRocksEnv(bluefs);
5664 rocksdb::Env* b = rocksdb::Env::Default();
7c673cae 5665 if (create) {
9f95a23c
TL
5666 string cmd = "rm -rf " + path + "/db " +
5667 path + "/db.slow " +
5668 path + "/db.wal";
5669 int r = system(cmd.c_str());
5670 (void)r;
7c673cae
FG
5671 }
5672 env = new rocksdb::EnvMirror(b, a, false, true);
9f95a23c
TL
5673 }
5674 else {
7c673cae
FG
5675 env = new BlueRocksEnv(bluefs);
5676
5677 // simplify the dir names, too, as "seen" by rocksdb
5678 fn = "db";
5679 }
9f95a23c
TL
5680 bluefs->set_slow_device_expander(this);
5681 BlueFSVolumeSelector::paths paths;
5682 bluefs->get_vselector_paths(fn, paths);
7c673cae 5683
9f95a23c 5684 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
7c673cae
FG
5685 // we have both block.db and block; tell rocksdb!
5686 // note: the second (last) size value doesn't really matter
5687 ostringstream db_paths;
9f95a23c
TL
5688 bool first = true;
5689 for (auto& p : paths) {
5690 if (!first) {
5691 db_paths << " ";
5692 }
5693 first = false;
5694 db_paths << p.first << "," << p.second;
5695
5696 }
11fdf7f2 5697 kv_options["db_paths"] = db_paths.str();
9f95a23c 5698 dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
7c673cae
FG
5699 }
5700
5701 if (create) {
9f95a23c
TL
5702 for (auto& p : paths) {
5703 env->CreateDir(p.first);
5704 }
5705 // Selectors don't provide wal path so far hence create explicitly
11fdf7f2 5706 env->CreateDir(fn + ".wal");
11fdf7f2
TL
5707 } else {
5708 std::vector<std::string> res;
5709 // check for dir presence
5710 auto r = env->GetChildren(fn+".wal", &res);
5711 if (r.IsNotFound()) {
5712 kv_options.erase("separate_wal_dir");
5713 }
7c673cae 5714 }
11fdf7f2
TL
5715 } else {
5716 string walfn = path + "/db.wal";
7c673cae 5717
11fdf7f2
TL
5718 if (create) {
5719 int r = ::mkdir(fn.c_str(), 0755);
5720 if (r < 0)
5721 r = -errno;
5722 if (r < 0 && r != -EEXIST) {
5723 derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
5724 << dendl;
5725 return r;
5726 }
5727
5728 // wal_dir, too!
7c673cae
FG
5729 r = ::mkdir(walfn.c_str(), 0755);
5730 if (r < 0)
5731 r = -errno;
5732 if (r < 0 && r != -EEXIST) {
5733 derr << __func__ << " failed to create " << walfn
5734 << ": " << cpp_strerror(r)
5735 << dendl;
5736 return r;
5737 }
11fdf7f2
TL
5738 } else {
5739 struct stat st;
5740 r = ::stat(walfn.c_str(), &st);
5741 if (r < 0 && errno == ENOENT) {
5742 kv_options.erase("separate_wal_dir");
5743 }
7c673cae
FG
5744 }
5745 }
5746
91327a77 5747
7c673cae
FG
5748 db = KeyValueDB::create(cct,
5749 kv_backend,
5750 fn,
11fdf7f2 5751 kv_options,
7c673cae
FG
5752 static_cast<void*>(env));
5753 if (!db) {
5754 derr << __func__ << " error creating db" << dendl;
5755 if (bluefs) {
11fdf7f2 5756 _close_bluefs();
7c673cae
FG
5757 }
5758 // delete env manually here since we can't depend on db to do this
5759 // under this case
5760 delete env;
5761 env = NULL;
5762 return -EIO;
5763 }
5764
5765 FreelistManager::setup_merge_operators(db);
5766 db->set_merge_operator(PREFIX_STAT, merge_op);
91327a77 5767 db->set_cache_size(cache_kv_ratio * cache_size);
31f18b77 5768
11fdf7f2 5769 if (kv_backend == "rocksdb") {
7c673cae 5770 options = cct->_conf->bluestore_rocksdb_options;
11fdf7f2
TL
5771
5772 map<string,string> cf_map;
5773 cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
5774 get_str_map,
5775 &cf_map,
5776 " \t");
5777 for (auto& i : cf_map) {
5778 dout(10) << "column family " << i.first << ": " << i.second << dendl;
5779 cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
5780 }
5781 }
5782
7c673cae 5783 db->init(options);
11fdf7f2
TL
5784 if (to_repair_db)
5785 return 0;
5786 if (create) {
5787 if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
5788 r = db->create_and_open(err, cfs);
5789 } else {
5790 r = db->create_and_open(err);
5791 }
5792 } else {
5793 // we pass in cf list here, but it is only used if the db already has
5794 // column families created.
5795 r = read_only ?
5796 db->open_read_only(err, cfs) :
5797 db->open(err, cfs);
5798 }
7c673cae
FG
5799 if (r) {
5800 derr << __func__ << " erroring opening db: " << err.str() << dendl;
11fdf7f2 5801 _close_db();
7c673cae
FG
5802 return -EIO;
5803 }
5804 dout(1) << __func__ << " opened " << kv_backend
5805 << " path " << fn << " options " << options << dendl;
5806 return 0;
7c673cae
FG
5807}
5808
5809void BlueStore::_close_db()
5810{
11fdf7f2 5811 ceph_assert(db);
7c673cae
FG
5812 delete db;
5813 db = NULL;
5814 if (bluefs) {
11fdf7f2 5815 _close_bluefs();
7c673cae
FG
5816 }
5817}
5818
11fdf7f2 5819void BlueStore::_dump_alloc_on_failure()
7c673cae 5820{
11fdf7f2
TL
5821 auto dump_interval =
5822 cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
5823 if (dump_interval > 0 &&
5824 next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
5825 alloc->dump();
5826 next_dump_on_bluefs_alloc_failure = ceph_clock_now();
5827 next_dump_on_bluefs_alloc_failure += dump_interval;
7c673cae 5828 }
11fdf7f2 5829}
7c673cae 5830
7c673cae 5831
11fdf7f2
TL
5832int BlueStore::allocate_bluefs_freespace(
5833 uint64_t min_size,
5834 uint64_t size,
5835 PExtentVector* extents_out)
5836{
5837 ceph_assert(min_size <= size);
5838 if (size) {
5839 // round up to alloc size
9f95a23c 5840 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
eafe8130
TL
5841 min_size = p2roundup(min_size, alloc_size);
5842 size = p2roundup(size, alloc_size);
11fdf7f2
TL
5843
5844 PExtentVector extents_local;
5845 PExtentVector* extents = extents_out ? extents_out : &extents_local;
5846
5847
5848 uint64_t gift;
5849 uint64_t allocated = 0;
5850 int64_t alloc_len;
9f95a23c
TL
5851 auto need = size;
5852 auto extent_count0 = extents->size();
11fdf7f2
TL
5853 do {
5854 // hard cap to fit into 32 bits
9f95a23c 5855 gift = std::min<uint64_t>(size, 1ull << 30);
11fdf7f2
TL
5856 dout(10) << __func__ << " gifting " << gift
5857 << " (" << byte_u_t(gift) << ")" << dendl;
5858
eafe8130
TL
5859 alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
5860 if (alloc_len > 0) {
11fdf7f2
TL
5861 allocated += alloc_len;
5862 size -= alloc_len;
5863 }
5864
eafe8130
TL
5865 if (alloc_len < 0 ||
5866 (alloc_len < (int64_t)gift && (min_size > allocated))) {
11fdf7f2
TL
5867 derr << __func__
5868 << " failed to allocate on 0x" << std::hex << gift
5869 << " min_size 0x" << min_size
5870 << " > allocated total 0x" << allocated
eafe8130
TL
5871 << " bluefs_shared_alloc_size 0x" << alloc_size
5872 << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2
TL
5873 << " available 0x " << alloc->get_free()
5874 << std::dec << dendl;
7c673cae 5875
494da23a 5876 _dump_alloc_on_failure();
11fdf7f2
TL
5877 alloc->release(*extents);
5878 extents->clear();
5879 return -ENOSPC;
5880 }
5881 } while (size && alloc_len > 0);
9f95a23c
TL
5882 _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);
5883
11fdf7f2
TL
5884 for (auto& e : *extents) {
5885 dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
5886 bluefs_extents.insert(e.offset, e.length);
5887 ++out_of_sync_fm;
5888 // apply to bluefs if not requested from outside
5889 if (!extents_out) {
9f95a23c 5890 bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
11fdf7f2 5891 }
7c673cae
FG
5892 }
5893 }
7c673cae
FG
5894 return 0;
5895}
5896
9f95a23c
TL
5897uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
5898 uint64_t total = 0;
5899 auto iterated_allocation = [&](uint64_t off, uint64_t len) {
eafe8130 5900 //only count in size that is alloc_size aligned
9f95a23c
TL
5901 uint64_t dist_to_alignment;
5902 uint64_t offset_in_block = off & (alloc_size - 1);
eafe8130
TL
5903 if (offset_in_block == 0)
5904 dist_to_alignment = 0;
5905 else
5906 dist_to_alignment = alloc_size - offset_in_block;
5907 if (dist_to_alignment >= len)
5908 return;
5909 len -= dist_to_alignment;
5910 total += p2align(len, alloc_size);
5911 };
5912 alloc->dump(iterated_allocation);
5913 return total;
5914}
5915
11fdf7f2 5916int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
f64942e4 5917{
7c673cae
FG
5918 float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;
5919
5920 uint64_t my_free = alloc->get_free();
5921 uint64_t total = bdev->get_size();
5922 float my_free_ratio = (float)my_free / (float)total;
5923
5924 uint64_t total_free = bluefs_free + my_free;
5925
5926 float bluefs_ratio = (float)bluefs_free / (float)total_free;
5927
5928 dout(10) << __func__
1adf2230 5929 << " bluefs " << byte_u_t(bluefs_free)
7c673cae 5930 << " free (" << bluefs_free_ratio
1adf2230 5931 << ") bluestore " << byte_u_t(my_free)
7c673cae
FG
5932 << " free (" << my_free_ratio
5933 << "), bluefs_ratio " << bluefs_ratio
5934 << dendl;
5935
5936 uint64_t gift = 0;
5937 uint64_t reclaim = 0;
5938 if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
5939 gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
9f95a23c
TL
5940 if (gift >= my_free)
5941 gift = my_free / 2;
7c673cae
FG
5942 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5943 << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
1adf2230 5944 << ", should gift " << byte_u_t(gift) << dendl;
7c673cae
FG
5945 } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
5946 reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
5947 if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
5948 reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
9f95a23c
TL
5949 if (reclaim >= bluefs_free)
5950 reclaim = bluefs_free / 2;
7c673cae
FG
5951 dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
5952 << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
1adf2230 5953 << ", should reclaim " << byte_u_t(reclaim) << dendl;
7c673cae 5954 }
3efd9988
FG
5955
5956 // don't take over too much of the freespace
5957 uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
7c673cae 5958 if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
3efd9988 5959 cct->_conf->bluestore_bluefs_min < free_cap) {
7c673cae
FG
5960 uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
5961 dout(10) << __func__ << " bluefs_total " << bluefs_total
5962 << " < min " << cct->_conf->bluestore_bluefs_min
1adf2230 5963 << ", should gift " << byte_u_t(g) << dendl;
7c673cae
FG
5964 if (g > gift)
5965 gift = g;
5966 reclaim = 0;
5967 }
9f95a23c
TL
5968 uint64_t min_free =
5969 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
3efd9988
FG
5970 if (bluefs_free < min_free &&
5971 min_free < free_cap) {
5972 uint64_t g = min_free - bluefs_free;
11fdf7f2 5973 dout(10) << __func__ << " bluefs_free " << bluefs_free
3efd9988 5974 << " < min " << min_free
1adf2230 5975 << ", should gift " << byte_u_t(g) << dendl;
3efd9988
FG
5976 if (g > gift)
5977 gift = g;
5978 reclaim = 0;
5979 }
9f95a23c
TL
5980 uint64_t max_free =
5981 cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
5982 if (bluefs_free > max_free) {
5983 dout(10) << __func__ << " bluefs_free " << bluefs_free
5984 << " > max " << max_free
5985 << ", stop gifting for now" << dendl;
5986 gift = 0;
5987 }
11fdf7f2
TL
5988 ceph_assert((int64_t)gift >= 0);
5989 ceph_assert((int64_t)reclaim >= 0);
5990 return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
5991}
7c673cae 5992
11fdf7f2
TL
5993int BlueStore::_balance_bluefs_freespace()
5994{
5995 int ret = 0;
5996 ceph_assert(bluefs);
7c673cae 5997
11fdf7f2
TL
5998 vector<pair<uint64_t,uint64_t>> bluefs_usage; // <free, total> ...
5999 bluefs->get_usage(&bluefs_usage);
9f95a23c 6000 ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);
7c673cae 6001
11fdf7f2 6002 bool clear_alert = true;
9f95a23c
TL
6003 if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
6004 auto& p = bluefs_usage[bluefs_layout.shared_bdev];
11fdf7f2
TL
6005 if (p.first != p.second) {
6006 auto& db = bluefs_usage[BlueFS::BDEV_DB];
6007 ostringstream ss;
6008 ss << "spilled over " << byte_u_t(p.second - p.first)
6009 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
6010 << " used of " << byte_u_t(db.second) << ") to slow device";
6011 _set_spillover_alert(ss.str());
6012 clear_alert = false;
6013 }
6014 }
6015 if (clear_alert) {
6016 _clear_spillover_alert();
7c673cae
FG
6017 }
6018
11fdf7f2
TL
6019 // fixme: look at primary bdev only for now
6020 int64_t delta = _get_bluefs_size_delta(
9f95a23c
TL
6021 bluefs_usage[bluefs_layout.shared_bdev].first,
6022 bluefs_usage[bluefs_layout.shared_bdev].second);
11fdf7f2 6023
7c673cae 6024 // reclaim from bluefs?
11fdf7f2 6025 if (delta < 0) {
7c673cae 6026 // round up to alloc size
9f95a23c 6027 uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
eafe8130 6028 auto reclaim = p2roundup(uint64_t(-delta), alloc_size);
7c673cae
FG
6029
6030 // hard cap to fit into 32 bits
9f95a23c 6031 reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
7c673cae 6032 dout(10) << __func__ << " reclaiming " << reclaim
1adf2230 6033 << " (" << byte_u_t(reclaim) << ")" << dendl;
7c673cae
FG
6034
6035 while (reclaim > 0) {
6036 // NOTE: this will block and do IO.
a8e16298 6037 PExtentVector extents;
9f95a23c 6038 int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
7c673cae
FG
6039 &extents);
6040 if (r < 0) {
6041 derr << __func__ << " failed to reclaim space from bluefs"
6042 << dendl;
6043 break;
6044 }
6045 for (auto e : extents) {
11fdf7f2 6046 ++out_of_sync_fm;
7c673cae
FG
6047 bluefs_extents.erase(e.offset, e.length);
6048 bluefs_extents_reclaiming.insert(e.offset, e.length);
6049 reclaim -= e.length;
6050 }
6051 }
6052
6053 ret = 1;
6054 }
6055
6056 return ret;
6057}
6058
eafe8130 6059int BlueStore::_open_collections()
7c673cae 6060{
28e407b8 6061 dout(10) << __func__ << dendl;
eafe8130 6062 collections_had_errors = false;
11fdf7f2 6063 ceph_assert(coll_map.empty());
7c673cae
FG
6064 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6065 for (it->upper_bound(string());
6066 it->valid();
6067 it->next()) {
6068 coll_t cid;
6069 if (cid.parse(it->key())) {
9f95a23c 6070 auto c = ceph::make_ref<Collection>(
7c673cae 6071 this,
9f95a23c
TL
6072 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
6073 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
6074 cid);
7c673cae 6075 bufferlist bl = it->value();
11fdf7f2 6076 auto p = bl.cbegin();
7c673cae 6077 try {
11fdf7f2 6078 decode(c->cnode, p);
7c673cae
FG
6079 } catch (buffer::error& e) {
6080 derr << __func__ << " failed to decode cnode, key:"
6081 << pretty_binary_string(it->key()) << dendl;
6082 return -EIO;
6083 }
28e407b8
AA
6084 dout(20) << __func__ << " opened " << cid << " " << c
6085 << " " << c->cnode << dendl;
11fdf7f2 6086 _osr_attach(c.get());
7c673cae 6087 coll_map[cid] = c;
11fdf7f2 6088
7c673cae
FG
6089 } else {
6090 derr << __func__ << " unrecognized collection " << it->key() << dendl;
eafe8130 6091 collections_had_errors = true;
7c673cae
FG
6092 }
6093 }
6094 return 0;
6095}
6096
eafe8130
TL
6097void BlueStore::_fsck_collections(int64_t* errors)
6098{
6099 if (collections_had_errors) {
6100 dout(10) << __func__ << dendl;
6101 KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
6102 for (it->upper_bound(string());
6103 it->valid();
6104 it->next()) {
6105 coll_t cid;
6106 if (!cid.parse(it->key())) {
6107 derr << __func__ << " unrecognized collection " << it->key() << dendl;
6108 if (errors) {
6109 (*errors)++;
6110 }
6111 }
6112 }
6113 }
6114}
6115
9f95a23c
TL
6116void BlueStore::_set_per_pool_omap()
6117{
6118 per_pool_omap = false;
6119 bufferlist bl;
6120 db->get(PREFIX_SUPER, "per_pool_omap", &bl);
6121 if (bl.length()) {
6122 per_pool_omap = true;
6123 dout(10) << __func__ << " per_pool_omap=1" << dendl;
6124 } else {
6125 dout(10) << __func__ << " per_pool_omap not present" << dendl;
6126 }
6127 _check_no_per_pool_omap_alert();
6128}
6129
224ce89b 6130void BlueStore::_open_statfs()
31f18b77 6131{
11fdf7f2
TL
6132 osd_pools.clear();
6133 vstatfs.reset();
6134
31f18b77 6135 bufferlist bl;
11fdf7f2 6136 int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
31f18b77 6137 if (r >= 0) {
11fdf7f2 6138 per_pool_stat_collection = false;
31f18b77 6139 if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
11fdf7f2 6140 auto it = bl.cbegin();
31f18b77 6141 vstatfs.decode(it);
11fdf7f2 6142 dout(10) << __func__ << " store_statfs is found" << dendl;
224ce89b 6143 } else {
31f18b77
FG
6144 dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
6145 }
81eedcae 6146 _check_legacy_statfs_alert();
11fdf7f2
TL
6147 } else {
6148 per_pool_stat_collection = true;
6149 dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
6150 KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
6151 for (it->upper_bound(string());
6152 it->valid();
6153 it->next()) {
6154
6155 uint64_t pool_id;
6156 int r = get_key_pool_stat(it->key(), &pool_id);
6157 ceph_assert(r == 0);
6158
6159 bufferlist bl;
6160 bl = it->value();
6161 auto p = bl.cbegin();
6162 auto& st = osd_pools[pool_id];
6163 try {
6164 st.decode(p);
6165 vstatfs += st;
6166
6167 dout(30) << __func__ << " pool " << pool_id
6168 << " statfs " << st << dendl;
6169 } catch (buffer::error& e) {
6170 derr << __func__ << " failed to decode pool stats, key:"
6171 << pretty_binary_string(it->key()) << dendl;
6172 }
6173 }
31f18b77 6174 }
11fdf7f2
TL
6175 dout(30) << __func__ << " statfs " << vstatfs << dendl;
6176
31f18b77
FG
6177}
6178
7c673cae
FG
6179int BlueStore::_setup_block_symlink_or_file(
6180 string name,
6181 string epath,
6182 uint64_t size,
6183 bool create)
6184{
6185 dout(20) << __func__ << " name " << name << " path " << epath
6186 << " size " << size << " create=" << (int)create << dendl;
6187 int r = 0;
91327a77 6188 int flags = O_RDWR|O_CLOEXEC;
7c673cae
FG
6189 if (create)
6190 flags |= O_CREAT;
6191 if (epath.length()) {
6192 r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
6193 if (r < 0) {
6194 r = -errno;
6195 derr << __func__ << " failed to create " << name << " symlink to "
6196 << epath << ": " << cpp_strerror(r) << dendl;
6197 return r;
6198 }
6199
6200 if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
6201 int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
6202 if (fd < 0) {
6203 r = -errno;
6204 derr << __func__ << " failed to open " << epath << " file: "
6205 << cpp_strerror(r) << dendl;
6206 return r;
6207 }
11fdf7f2
TL
6208 // write the Transport ID of the NVMe device
6209 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6210 // where "0000:02:00.0" is the selector of a PCI device, see
6211 // the first column of "lspci -mm -n -D"
6212 string trid{"trtype:PCIe "};
6213 trid += "traddr:";
6214 trid += epath.substr(strlen(SPDK_PREFIX));
6215 r = ::write(fd, trid.c_str(), trid.size());
6216 ceph_assert(r == static_cast<int>(trid.size()));
7c673cae
FG
6217 dout(1) << __func__ << " created " << name << " symlink to "
6218 << epath << dendl;
6219 VOID_TEMP_FAILURE_RETRY(::close(fd));
6220 }
6221 }
6222 if (size) {
6223 int fd = ::openat(path_fd, name.c_str(), flags, 0644);
6224 if (fd >= 0) {
6225 // block file is present
6226 struct stat st;
6227 int r = ::fstat(fd, &st);
6228 if (r == 0 &&
6229 S_ISREG(st.st_mode) && // if it is a regular file
6230 st.st_size == 0) { // and is 0 bytes
6231 r = ::ftruncate(fd, size);
6232 if (r < 0) {
6233 r = -errno;
6234 derr << __func__ << " failed to resize " << name << " file to "
6235 << size << ": " << cpp_strerror(r) << dendl;
6236 VOID_TEMP_FAILURE_RETRY(::close(fd));
6237 return r;
6238 }
6239
6240 if (cct->_conf->bluestore_block_preallocate_file) {
28e407b8
AA
6241 r = ::ceph_posix_fallocate(fd, 0, size);
6242 if (r > 0) {
7c673cae
FG
6243 derr << __func__ << " failed to prefallocate " << name << " file to "
6244 << size << ": " << cpp_strerror(r) << dendl;
6245 VOID_TEMP_FAILURE_RETRY(::close(fd));
6246 return -r;
6247 }
7c673cae
FG
6248 }
6249 dout(1) << __func__ << " resized " << name << " file to "
1adf2230 6250 << byte_u_t(size) << dendl;
7c673cae
FG
6251 }
6252 VOID_TEMP_FAILURE_RETRY(::close(fd));
6253 } else {
6254 int r = -errno;
6255 if (r != -ENOENT) {
6256 derr << __func__ << " failed to open " << name << " file: "
6257 << cpp_strerror(r) << dendl;
6258 return r;
6259 }
6260 }
6261 }
6262 return 0;
6263}
6264
6265int BlueStore::mkfs()
6266{
6267 dout(1) << __func__ << " path " << path << dendl;
6268 int r;
6269 uuid_d old_fsid;
6270
eafe8130
TL
6271 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6272 derr << __func__ << " osd_max_object_size "
6273 << cct->_conf->osd_max_object_size << " > bluestore max "
6274 << OBJECT_MAX_SIZE << dendl;
6275 return -EINVAL;
6276 }
6277
7c673cae
FG
6278 {
6279 string done;
6280 r = read_meta("mkfs_done", &done);
6281 if (r == 0) {
6282 dout(1) << __func__ << " already created" << dendl;
6283 if (cct->_conf->bluestore_fsck_on_mkfs) {
6284 r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6285 if (r < 0) {
6286 derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
6287 << dendl;
6288 return r;
6289 }
6290 if (r > 0) {
6291 derr << __func__ << " fsck found " << r << " errors" << dendl;
6292 r = -EIO;
6293 }
6294 }
6295 return r; // idempotent
6296 }
6297 }
6298
6299 {
6300 string type;
6301 r = read_meta("type", &type);
6302 if (r == 0) {
6303 if (type != "bluestore") {
6304 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6305 return -EIO;
6306 }
6307 } else {
6308 r = write_meta("type", "bluestore");
6309 if (r < 0)
6310 return r;
6311 }
6312 }
6313
6314 freelist_type = "bitmap";
6315
6316 r = _open_path();
6317 if (r < 0)
6318 return r;
6319
6320 r = _open_fsid(true);
6321 if (r < 0)
6322 goto out_path_fd;
6323
6324 r = _lock_fsid();
6325 if (r < 0)
6326 goto out_close_fsid;
6327
6328 r = _read_fsid(&old_fsid);
6329 if (r < 0 || old_fsid.is_zero()) {
6330 if (fsid.is_zero()) {
6331 fsid.generate_random();
6332 dout(1) << __func__ << " generated fsid " << fsid << dendl;
6333 } else {
6334 dout(1) << __func__ << " using provided fsid " << fsid << dendl;
6335 }
6336 // we'll write it later.
6337 } else {
6338 if (!fsid.is_zero() && fsid != old_fsid) {
6339 derr << __func__ << " on-disk fsid " << old_fsid
6340 << " != provided " << fsid << dendl;
6341 r = -EINVAL;
6342 goto out_close_fsid;
6343 }
6344 fsid = old_fsid;
6345 }
6346
6347 r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
6348 cct->_conf->bluestore_block_size,
6349 cct->_conf->bluestore_block_create);
6350 if (r < 0)
6351 goto out_close_fsid;
6352 if (cct->_conf->bluestore_bluefs) {
6353 r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
6354 cct->_conf->bluestore_block_wal_size,
6355 cct->_conf->bluestore_block_wal_create);
6356 if (r < 0)
6357 goto out_close_fsid;
6358 r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
6359 cct->_conf->bluestore_block_db_size,
6360 cct->_conf->bluestore_block_db_create);
6361 if (r < 0)
6362 goto out_close_fsid;
6363 }
6364
6365 r = _open_bdev(true);
6366 if (r < 0)
6367 goto out_close_fsid;
6368
3efd9988
FG
6369 // choose min_alloc_size
6370 if (cct->_conf->bluestore_min_alloc_size) {
6371 min_alloc_size = cct->_conf->bluestore_min_alloc_size;
6372 } else {
11fdf7f2 6373 ceph_assert(bdev);
3efd9988
FG
6374 if (bdev->is_rotational()) {
6375 min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
6376 } else {
6377 min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
6378 }
6379 }
11fdf7f2 6380 _validate_bdev();
3efd9988
FG
6381
6382 // make sure min_alloc_size is power of 2 aligned.
11fdf7f2 6383 if (!isp2(min_alloc_size)) {
3efd9988
FG
6384 derr << __func__ << " min_alloc_size 0x"
6385 << std::hex << min_alloc_size << std::dec
6386 << " is not power of 2 aligned!"
6387 << dendl;
6388 r = -EINVAL;
6389 goto out_close_bdev;
6390 }
6391
7c673cae
FG
6392 r = _open_db(true);
6393 if (r < 0)
6394 goto out_close_bdev;
6395
7c673cae
FG
6396 {
6397 KeyValueDB::Transaction t = db->get_transaction();
11fdf7f2
TL
6398 r = _open_fm(t);
6399 if (r < 0)
6400 goto out_close_db;
7c673cae
FG
6401 {
6402 bufferlist bl;
11fdf7f2 6403 encode((uint64_t)0, bl);
7c673cae
FG
6404 t->set(PREFIX_SUPER, "nid_max", bl);
6405 t->set(PREFIX_SUPER, "blobid_max", bl);
6406 }
6407
7c673cae
FG
6408 {
6409 bufferlist bl;
11fdf7f2 6410 encode((uint64_t)min_alloc_size, bl);
7c673cae
FG
6411 t->set(PREFIX_SUPER, "min_alloc_size", bl);
6412 }
9f95a23c
TL
6413 {
6414 bufferlist bl;
6415 bl.append("1");
6416 t->set(PREFIX_SUPER, "per_pool_omap", bl);
6417 }
7c673cae
FG
6418 ondisk_format = latest_ondisk_format;
6419 _prepare_ondisk_format_super(t);
6420 db->submit_transaction_sync(t);
6421 }
6422
7c673cae
FG
6423 r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
6424 if (r < 0)
224ce89b
WB
6425 goto out_close_fm;
6426
3efd9988 6427 r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
7c673cae 6428 if (r < 0)
224ce89b 6429 goto out_close_fm;
7c673cae
FG
6430
6431 if (fsid != old_fsid) {
6432 r = _write_fsid();
6433 if (r < 0) {
6434 derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
224ce89b 6435 goto out_close_fm;
7c673cae
FG
6436 }
6437 }
6438
11fdf7f2
TL
6439 if (out_of_sync_fm.fetch_and(0)) {
6440 _sync_bluefs_and_fm();
6441 }
6442
7c673cae
FG
6443 out_close_fm:
6444 _close_fm();
6445 out_close_db:
6446 _close_db();
6447 out_close_bdev:
6448 _close_bdev();
6449 out_close_fsid:
6450 _close_fsid();
6451 out_path_fd:
6452 _close_path();
6453
6454 if (r == 0 &&
6455 cct->_conf->bluestore_fsck_on_mkfs) {
6456 int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
6457 if (rc < 0)
6458 return rc;
6459 if (rc > 0) {
6460 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6461 r = -EIO;
6462 }
11fdf7f2
TL
6463 }
6464
6465 if (r == 0) {
6466 // indicate success by writing the 'mkfs_done' file
6467 r = write_meta("mkfs_done", "yes");
6468 }
6469
6470 if (r < 0) {
6471 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6472 } else {
6473 dout(0) << __func__ << " success" << dendl;
6474 }
6475 return r;
6476}
6477
6478int BlueStore::_mount_for_bluefs()
6479{
6480 int r = _open_path();
6481 ceph_assert(r == 0);
6482 r = _open_fsid(false);
6483 ceph_assert(r == 0);
6484 r = _read_fsid(&fsid);
6485 ceph_assert(r == 0);
6486 r = _lock_fsid();
6487 ceph_assert(r == 0);
6488 r = _open_bluefs(false);
6489 ceph_assert(r == 0);
6490 return r;
6491}
6492
6493void BlueStore::_umount_for_bluefs()
6494{
6495 _close_bluefs();
6496 _close_fsid();
6497 _close_path();
6498}
6499
6500int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
6501{
6502 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6503 int r;
6504 ceph_assert(path_fd < 0);
6505
6506 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6507
6508 if (!cct->_conf->bluestore_bluefs) {
6509 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6510 return -EIO;
6511 }
6512
6513 r = _mount_for_bluefs();
6514
6515 int reserved = 0;
6516 if (id == BlueFS::BDEV_NEWWAL) {
6517 string p = path + "/block.wal";
6518 r = _setup_block_symlink_or_file("block.wal", dev_path,
6519 cct->_conf->bluestore_block_wal_size,
6520 true);
6521 ceph_assert(r == 0);
6522
6523 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
eafe8130 6524 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6525 ceph_assert(r == 0);
6526
6527 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6528 r = _check_or_set_bdev_label(
6529 p,
6530 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6531 "bluefs wal",
6532 true);
6533 ceph_assert(r == 0);
6534 }
6535
6536 reserved = BDEV_LABEL_BLOCK_SIZE;
9f95a23c 6537 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
6538 } else if (id == BlueFS::BDEV_NEWDB) {
6539 string p = path + "/block.db";
6540 r = _setup_block_symlink_or_file("block.db", dev_path,
6541 cct->_conf->bluestore_block_db_size,
6542 true);
6543 ceph_assert(r == 0);
6544
6545 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
eafe8130 6546 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6547 ceph_assert(r == 0);
6548
6549 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6550 r = _check_or_set_bdev_label(
6551 p,
6552 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6553 "bluefs db",
6554 true);
6555 ceph_assert(r == 0);
6556 }
6557 reserved = SUPER_RESERVED;
9f95a23c
TL
6558 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6559 bluefs_layout.dedicated_db = true;
11fdf7f2
TL
6560 }
6561
6562 bluefs->umount();
6563 bluefs->mount();
6564
6565 bluefs->add_block_extent(
6566 id,
6567 reserved,
6568 bluefs->get_block_device_size(id) - reserved);
6569
9f95a23c 6570 r = bluefs->prepare_new_device(id, bluefs_layout);
11fdf7f2
TL
6571 ceph_assert(r == 0);
6572
6573 if (r < 0) {
6574 derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
6575 } else {
6576 dout(0) << __func__ << " success" << dendl;
6577 }
6578
6579 _umount_for_bluefs();
6580 return r;
6581}
6582
6583int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
6584 int id)
6585{
6586 dout(10) << __func__ << " id:" << id << dendl;
6587 ceph_assert(path_fd < 0);
6588
6589 ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
6590
6591 if (!cct->_conf->bluestore_bluefs) {
6592 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6593 return -EIO;
6594 }
6595
6596 int r = _mount_for_bluefs();
6597
6598 // require bluestore_bluefs_min_free to be free at target device!
6599 uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
6600 for(auto src_id : devs_source) {
6601 used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
6602 }
6603 uint64_t target_free = bluefs->get_free(id);
6604 if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
6605 // will need to remount full BlueStore instance to allocate more space
6606 _umount_for_bluefs();
6607
6608 r = mount();
6609 ceph_assert(r == 0);
6610 dout(1) << __func__
6611 << " Allocating more space at slow device for BlueFS: +"
6612 << used_space - target_free << " bytes" << dendl;
6613 r = allocate_bluefs_freespace(
6614 used_space - target_free,
6615 used_space - target_free,
6616 nullptr);
6617
6618 umount();
6619 if (r != 0) {
6620 derr << __func__
6621 << " can't migrate, unable to allocate extra space: "
6622 << used_space - target_free << " at target:" << id
6623 << dendl;
6624 return -ENOSPC;
6625 }
6626
6627 r = _mount_for_bluefs();
6628 ceph_assert(r == 0);
6629 } else if (target_free < used_space) {
6630 derr << __func__
6631 << " can't migrate, free space at target: " << target_free
6632 << " is less than required space: " << used_space
6633 << dendl;
6634 return -ENOSPC;
6635 }
9f95a23c
TL
6636 if (devs_source.count(BlueFS::BDEV_DB)) {
6637 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6638 bluefs_layout.dedicated_db = false;
6639 }
6640 if (devs_source.count(BlueFS::BDEV_WAL)) {
6641 bluefs_layout.dedicated_wal = false;
6642 }
6643 r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
11fdf7f2
TL
6644 if (r < 0) {
6645 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6646 goto shutdown;
6647 }
6648
6649 if (devs_source.count(BlueFS::BDEV_DB)) {
6650 r = unlink(string(path + "/block.db").c_str());
6651 ceph_assert(r == 0);
6652 }
6653 if (devs_source.count(BlueFS::BDEV_WAL)) {
6654 r = unlink(string(path + "/block.wal").c_str());
6655 ceph_assert(r == 0);
6656 }
6657
6658shutdown:
6659 _umount_for_bluefs();
6660 return r;
6661}
6662
6663int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
6664 int id,
6665 const string& dev_path)
6666{
6667 dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
6668 int r;
6669 ceph_assert(path_fd < 0);
6670
6671 ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
6672
6673 if (!cct->_conf->bluestore_bluefs) {
6674 derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
6675 return -EIO;
6676 }
6677
6678 r = _mount_for_bluefs();
6679
6680 int reserved = 0;
6681 string link_db;
6682 string link_wal;
6683 if (devs_source.count(BlueFS::BDEV_DB) &&
9f95a23c 6684 bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 6685 link_db = path + "/block.db";
9f95a23c
TL
6686 bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
6687 bluefs_layout.dedicated_db = false;
11fdf7f2
TL
6688 }
6689 if (devs_source.count(BlueFS::BDEV_WAL)) {
6690 link_wal = path + "/block.wal";
9f95a23c 6691 bluefs_layout.dedicated_wal = false;
11fdf7f2
TL
6692 }
6693
6694 size_t target_size;
6695 string target_name;
6696 if (id == BlueFS::BDEV_NEWWAL) {
6697 target_name = "block.wal";
6698 target_size = cct->_conf->bluestore_block_wal_size;
9f95a23c 6699 bluefs_layout.dedicated_wal = true;
11fdf7f2
TL
6700
6701 r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
eafe8130 6702 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6703 ceph_assert(r == 0);
6704
6705 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
6706 r = _check_or_set_bdev_label(
6707 dev_path,
6708 bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
6709 "bluefs wal",
6710 true);
6711 ceph_assert(r == 0);
6712 }
6713 reserved = BDEV_LABEL_BLOCK_SIZE;
6714 } else if (id == BlueFS::BDEV_NEWDB) {
6715 target_name = "block.db";
6716 target_size = cct->_conf->bluestore_block_db_size;
9f95a23c
TL
6717 bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
6718 bluefs_layout.dedicated_db = true;
31f18b77 6719
11fdf7f2 6720 r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
eafe8130 6721 cct->_conf->bdev_enable_discard);
11fdf7f2
TL
6722 ceph_assert(r == 0);
6723
6724 if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
6725 r = _check_or_set_bdev_label(
6726 dev_path,
6727 bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
6728 "bluefs db",
6729 true);
6730 ceph_assert(r == 0);
6731 }
6732 reserved = SUPER_RESERVED;
31f18b77
FG
6733 }
6734
11fdf7f2
TL
6735 bluefs->umount();
6736 bluefs->mount();
6737
6738 bluefs->add_block_extent(
6739 id, reserved, bluefs->get_block_device_size(id) - reserved);
6740
9f95a23c 6741 r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
11fdf7f2 6742
7c673cae 6743 if (r < 0) {
11fdf7f2
TL
6744 derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
6745 goto shutdown;
6746 }
6747
6748 if (!link_db.empty()) {
6749 r = unlink(link_db.c_str());
6750 ceph_assert(r == 0);
6751 }
6752 if (!link_wal.empty()) {
6753 r = unlink(link_wal.c_str());
6754 ceph_assert(r == 0);
6755 }
6756 r = _setup_block_symlink_or_file(
6757 target_name,
6758 dev_path,
6759 target_size,
6760 true);
6761 ceph_assert(r == 0);
6762 dout(0) << __func__ << " success" << dendl;
6763
6764shutdown:
6765 _umount_for_bluefs();
6766 return r;
6767}
6768
6769string BlueStore::get_device_path(unsigned id)
6770{
6771 string res;
6772 if (id < BlueFS::MAX_BDEV) {
6773 switch (id) {
6774 case BlueFS::BDEV_WAL:
6775 res = path + "/block.wal";
6776 break;
6777 case BlueFS::BDEV_DB:
9f95a23c 6778 if (id == bluefs_layout.shared_bdev) {
11fdf7f2
TL
6779 res = path + "/block";
6780 } else {
6781 res = path + "/block.db";
6782 }
6783 break;
6784 case BlueFS::BDEV_SLOW:
6785 res = path + "/block";
6786 break;
6787 }
6788 }
6789 return res;
6790}
6791
6792int BlueStore::expand_devices(ostream& out)
6793{
6794 int r = _mount(false);
6795 ceph_assert(r == 0);
6796 bluefs->dump_block_extents(out);
6797 out << "Expanding..." << std::endl;
6798 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
9f95a23c 6799 if (devid == bluefs_layout.shared_bdev ) {
11fdf7f2
TL
6800 continue;
6801 }
6802 uint64_t size = bluefs->get_block_device_size(devid);
6803 if (size == 0) {
6804 // no bdev
6805 continue;
6806 }
6807
6808 interval_set<uint64_t> before;
6809 bluefs->get_block_extents(devid, &before);
6810 ceph_assert(!before.empty());
6811 uint64_t end = before.range_end();
6812 if (end < size) {
6813 out << devid
6814 <<" : expanding " << " from 0x" << std::hex
6815 << end << " to 0x" << size << std::dec << std::endl;
6816 bluefs->add_block_extent(devid, end, size-end);
6817 string p = get_device_path(devid);
6818 const char* path = p.c_str();
6819 if (path == nullptr) {
6820 derr << devid
6821 <<": can't find device path " << dendl;
6822 continue;
6823 }
6824 bluestore_bdev_label_t label;
6825 int r = _read_bdev_label(cct, path, &label);
6826 if (r < 0) {
6827 derr << "unable to read label for " << path << ": "
6828 << cpp_strerror(r) << dendl;
6829 continue;
6830 }
6831 label.size = size;
6832 r = _write_bdev_label(cct, path, label);
6833 if (r < 0) {
6834 derr << "unable to write label for " << path << ": "
6835 << cpp_strerror(r) << dendl;
6836 continue;
6837 }
6838 out << devid
6839 <<" : size label updated to " << size
6840 << std::endl;
6841 }
6842 }
6843 uint64_t size0 = fm->get_size();
6844 uint64_t size = bdev->get_size();
6845 if (size0 < size) {
9f95a23c 6846 out << bluefs_layout.shared_bdev
11fdf7f2
TL
6847 <<" : expanding " << " from 0x" << std::hex
6848 << size0 << " to 0x" << size << std::dec << std::endl;
6849 KeyValueDB::Transaction txn;
6850 txn = db->get_transaction();
6851 int r = fm->expand(size, txn);
6852 ceph_assert(r == 0);
6853 db->submit_transaction_sync(txn);
6854
6855 // always reference to slow device here
6856 string p = get_device_path(BlueFS::BDEV_SLOW);
6857 ceph_assert(!p.empty());
6858 const char* path = p.c_str();
6859 bluestore_bdev_label_t label;
6860 r = _read_bdev_label(cct, path, &label);
6861 if (r < 0) {
6862 derr << "unable to read label for " << path << ": "
6863 << cpp_strerror(r) << dendl;
6864 } else {
6865 label.size = size;
6866 r = _write_bdev_label(cct, path, label);
6867 if (r < 0) {
6868 derr << "unable to write label for " << path << ": "
6869 << cpp_strerror(r) << dendl;
6870 } else {
9f95a23c 6871 out << bluefs_layout.shared_bdev
11fdf7f2
TL
6872 <<" : size label updated to " << size
6873 << std::endl;
6874 }
6875 }
7c673cae 6876 }
11fdf7f2 6877 umount();
7c673cae
FG
6878 return r;
6879}
6880
6881void BlueStore::set_cache_shards(unsigned num)
6882{
6883 dout(10) << __func__ << " " << num << dendl;
9f95a23c
TL
6884 size_t oold = onode_cache_shards.size();
6885 size_t bold = buffer_cache_shards.size();
6886 ceph_assert(num >= oold && num >= bold);
6887 onode_cache_shards.resize(num);
6888 buffer_cache_shards.resize(num);
6889 for (unsigned i = oold; i < num; ++i) {
6890 onode_cache_shards[i] =
6891 OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6892 logger);
6893 }
6894 for (unsigned i = bold; i < num; ++i) {
6895 buffer_cache_shards[i] =
6896 BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
6897 logger);
7c673cae
FG
6898 }
6899}
6900
11fdf7f2 6901int BlueStore::_mount(bool kv_only, bool open_db)
7c673cae
FG
6902{
6903 dout(1) << __func__ << " path " << path << dendl;
6904
3efd9988
FG
6905 _kv_only = kv_only;
6906
7c673cae
FG
6907 {
6908 string type;
6909 int r = read_meta("type", &type);
6910 if (r < 0) {
6911 derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
6912 << dendl;
6913 return r;
6914 }
6915
6916 if (type != "bluestore") {
6917 derr << __func__ << " expected bluestore, but type is " << type << dendl;
6918 return -EIO;
6919 }
6920 }
6921
6922 if (cct->_conf->bluestore_fsck_on_mount) {
6923 int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
6924 if (rc < 0)
6925 return rc;
6926 if (rc > 0) {
6927 derr << __func__ << " fsck found " << rc << " errors" << dendl;
6928 return -EIO;
6929 }
6930 }
6931
eafe8130
TL
6932 if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
6933 derr << __func__ << " osd_max_object_size "
6934 << cct->_conf->osd_max_object_size << " > bluestore max "
6935 << OBJECT_MAX_SIZE << dendl;
6936 return -EINVAL;
6937 }
6938
7c673cae
FG
6939 int r = _open_path();
6940 if (r < 0)
6941 return r;
6942 r = _open_fsid(false);
6943 if (r < 0)
6944 goto out_path;
6945
6946 r = _read_fsid(&fsid);
6947 if (r < 0)
6948 goto out_fsid;
6949
6950 r = _lock_fsid();
6951 if (r < 0)
6952 goto out_fsid;
6953
6954 r = _open_bdev(false);
6955 if (r < 0)
6956 goto out_fsid;
6957
11fdf7f2
TL
6958 if (open_db) {
6959 r = _open_db_and_around(false);
6960 } else {
6961 // we can bypass db open exclusively in case of kv_only mode
6962 ceph_assert(kv_only);
6963 r = _open_db(false, true);
9f95a23c
TL
6964 }
6965 if (r < 0) {
6966 goto out_bdev;
11fdf7f2 6967 }
7c673cae
FG
6968
6969 if (kv_only)
6970 return 0;
6971
11fdf7f2
TL
6972 r = _upgrade_super();
6973 if (r < 0) {
7c673cae 6974 goto out_db;
11fdf7f2 6975 }
7c673cae
FG
6976
6977 r = _open_collections();
6978 if (r < 0)
11fdf7f2 6979 goto out_db;
7c673cae
FG
6980
6981 r = _reload_logger();
6982 if (r < 0)
6983 goto out_coll;
6984
31f18b77 6985 _kv_start();
7c673cae
FG
6986
6987 r = _deferred_replay();
6988 if (r < 0)
6989 goto out_stop;
6990
6991 mempool_thread.init();
6992
9f95a23c 6993 if ((!per_pool_stat_collection || !per_pool_omap) &&
eafe8130 6994 cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
9f95a23c
TL
6995
6996 bool was_per_pool_omap = per_pool_omap;
6997
eafe8130
TL
6998 dout(1) << __func__ << " quick-fix on mount" << dendl;
6999 _fsck_on_open(FSCK_SHALLOW, true);
7000
7001 //reread statfs
7002 //FIXME minor: replace with actual open/close?
7003 _open_statfs();
eafe8130 7004 _check_legacy_statfs_alert();
9f95a23c
TL
7005
7006 //set again as hopefully it has been fixed
7007 if (!was_per_pool_omap) {
7008 _set_per_pool_omap();
7009 }
eafe8130
TL
7010 }
7011
7c673cae
FG
7012 mounted = true;
7013 return 0;
7014
7015 out_stop:
7016 _kv_stop();
7c673cae 7017 out_coll:
31f18b77 7018 _flush_cache();
7c673cae 7019 out_db:
11fdf7f2 7020 _close_db_and_around();
7c673cae
FG
7021 out_bdev:
7022 _close_bdev();
7023 out_fsid:
7024 _close_fsid();
7025 out_path:
7026 _close_path();
7027 return r;
7028}
7029
7030int BlueStore::umount()
7031{
11fdf7f2 7032 ceph_assert(_kv_only || mounted);
7c673cae
FG
7033 dout(1) << __func__ << dendl;
7034
7035 _osr_drain_all();
7c673cae 7036
7c673cae 7037 mounted = false;
3efd9988
FG
7038 if (!_kv_only) {
7039 mempool_thread.shutdown();
7040 dout(20) << __func__ << " stopping kv thread" << dendl;
7041 _kv_stop();
3efd9988
FG
7042 _flush_cache();
7043 dout(20) << __func__ << " closing" << dendl;
7044
3efd9988 7045 }
11fdf7f2 7046 _close_db_and_around();
7c673cae
FG
7047 _close_bdev();
7048 _close_fsid();
7049 _close_path();
7050
7051 if (cct->_conf->bluestore_fsck_on_umount) {
7052 int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
7053 if (rc < 0)
7054 return rc;
7055 if (rc > 0) {
7056 derr << __func__ << " fsck found " << rc << " errors" << dendl;
7057 return -EIO;
7058 }
7059 }
7060 return 0;
7061}
7062
eafe8130
TL
7063int BlueStore::cold_open()
7064{
7065 int r = _open_path();
7066 if (r < 0)
7067 return r;
7068 r = _open_fsid(false);
7069 if (r < 0)
7070 goto out_path;
7071
7072 r = _read_fsid(&fsid);
7073 if (r < 0)
7074 goto out_fsid;
7075
7076 r = _lock_fsid();
7077 if (r < 0)
7078 goto out_fsid;
7079
7080 r = _open_bdev(false);
7081 if (r < 0)
7082 goto out_fsid;
7083 r = _open_db_and_around(true);
7084 if (r < 0) {
7085 goto out_bdev;
7086 }
7087 return 0;
7088 out_bdev:
7089 _close_bdev();
7090 out_fsid:
7091 _close_fsid();
7092 out_path:
7093 _close_path();
7094 return r;
7095}
7096int BlueStore::cold_close()
7097{
7098 _close_db_and_around();
7099 _close_bdev();
7100 _close_fsid();
7101 _close_path();
7102 return 0;
7103}
7104
9f95a23c
TL
7105// derr wrapper to limit enormous output and avoid log flooding.
7106// Of limited use where such output is expected for now
7107#define fsck_derr(err_cnt, threshold) \
7108 if (err_cnt <= threshold) { \
7109 bool need_skip_print = err_cnt == threshold; \
7110 derr
7111
7112#define fsck_dendl \
7113 dendl; \
7114 if (need_skip_print) \
7115 derr << "more error lines skipped..." << dendl; \
7c673cae 7116 }
7c673cae 7117
eafe8130
TL
7118int _fsck_sum_extents(
7119 const PExtentVector& extents,
7120 bool compressed,
7121 store_statfs_t& expected_statfs)
7122{
7123 for (auto e : extents) {
7124 if (!e.is_valid())
7125 continue;
7126 expected_statfs.allocated += e.length;
7127 if (compressed) {
7128 expected_statfs.data_compressed_allocated += e.length;
7129 }
7130 }
7131 return 0;
7132}
7133
7c673cae 7134int BlueStore::_fsck_check_extents(
11fdf7f2 7135 const coll_t& cid,
7c673cae
FG
7136 const ghobject_t& oid,
7137 const PExtentVector& extents,
7138 bool compressed,
7139 mempool_dynamic_bitset &used_blocks,
b32b8144 7140 uint64_t granularity,
11fdf7f2 7141 BlueStoreRepairer* repairer,
eafe8130
TL
7142 store_statfs_t& expected_statfs,
7143 FSCKDepth depth)
7c673cae
FG
7144{
7145 dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
7146 int errors = 0;
7147 for (auto e : extents) {
7148 if (!e.is_valid())
7149 continue;
7150 expected_statfs.allocated += e.length;
7151 if (compressed) {
11fdf7f2 7152 expected_statfs.data_compressed_allocated += e.length;
7c673cae 7153 }
eafe8130
TL
7154 if (depth != FSCK_SHALLOW) {
7155 bool already = false;
9f95a23c 7156 apply_for_bitset_range(
eafe8130
TL
7157 e.offset, e.length, granularity, used_blocks,
7158 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
7159 if (bs.test(pos)) {
7160 if (repairer) {
7161 repairer->note_misreference(
7162 pos * min_alloc_size, min_alloc_size, !already);
7163 }
7164 if (!already) {
7165 derr << "fsck error: " << oid << " extent " << e
7166 << " or a subset is already allocated (misreferenced)" << dendl;
7167 ++errors;
7168 already = true;
7169 }
11fdf7f2 7170 }
eafe8130
TL
7171 else
7172 bs.set(pos);
7173 });
7174 if (repairer) {
7175 repairer->get_space_usage_tracker().set_used( e.offset, e.length, cid, oid);
7176 }
11fdf7f2 7177
eafe8130
TL
7178 if (e.end() > bdev->get_size()) {
7179 derr << "fsck error: " << oid << " extent " << e
7180 << " past end of block device" << dendl;
7181 ++errors;
7182 }
7c673cae
FG
7183 }
7184 }
7185 return errors;
7186}
7187
11fdf7f2
TL
7188void BlueStore::_fsck_check_pool_statfs(
7189 BlueStore::per_pool_statfs& expected_pool_statfs,
eafe8130
TL
7190 int64_t& errors,
7191 int64_t& warnings,
11fdf7f2
TL
7192 BlueStoreRepairer* repairer)
7193{
7194 auto it = db->get_iterator(PREFIX_STAT);
7195 if (it) {
7196 for (it->lower_bound(string()); it->valid(); it->next()) {
7197 string key = it->key();
7198 if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
7199 if (repairer) {
eafe8130
TL
7200 ++errors;
7201 repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
7202 derr << "fsck error: " << "legacy statfs record found, removing"
11fdf7f2
TL
7203 << dendl;
7204 }
7205 continue;
7206 }
11fdf7f2
TL
7207 uint64_t pool_id;
7208 if (get_key_pool_stat(key, &pool_id) < 0) {
7209 derr << "fsck error: bad key " << key
7210 << "in statfs namespece" << dendl;
7211 if (repairer) {
7212 repairer->remove_key(db, PREFIX_STAT, key);
7213 }
7214 ++errors;
7215 continue;
7216 }
7217
7218 volatile_statfs vstatfs;
7219 bufferlist bl = it->value();
7220 auto blp = bl.cbegin();
7221 try {
7222 vstatfs.decode(blp);
7223 } catch (buffer::error& e) {
7224 derr << "fsck error: failed to decode Pool StatFS record"
7225 << pretty_binary_string(key) << dendl;
7226 if (repairer) {
7227 dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
7228 << pretty_binary_string(key)
7229 << "', removing" << dendl;
7230 repairer->remove_key(db, PREFIX_STAT, key);
7231 }
7232 ++errors;
7233 vstatfs.reset();
7234 }
7235 auto stat_it = expected_pool_statfs.find(pool_id);
7236 if (stat_it == expected_pool_statfs.end()) {
7237 if (vstatfs.is_empty()) {
7238 // we don't consider that as an error since empty pool statfs
7239 // are left in DB for now
7240 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7241 << std::hex << pool_id << std::dec << dendl;
7242 if (repairer) {
7243 // but we need to increment error count in case of repair
7244 // to have proper counters at the end
7245 // (as repairer increments recovery counter anyway).
7246 ++errors;
7247 }
7248 } else {
7249 derr << "fsck error: found stray Pool StatFS record for pool id 0x"
7250 << std::hex << pool_id << std::dec << dendl;
7251 ++errors;
7252 }
7253 if (repairer) {
7254 repairer->remove_key(db, PREFIX_SHARED_BLOB, key);
7255 }
7256 continue;
7257 }
7258 store_statfs_t statfs;
7259 vstatfs.publish(&statfs);
7260 if (!(stat_it->second == statfs)) {
7261 derr << "fsck error: actual " << statfs
7262 << " != expected " << stat_it->second
7263 << " for pool "
7264 << std::hex << pool_id << std::dec << dendl;
7265 if (repairer) {
7266 repairer->fix_statfs(db, key, stat_it->second);
7267 }
7268 ++errors;
7269 }
7270 expected_pool_statfs.erase(stat_it);
7271 }
7272 } // if (it)
eafe8130
TL
7273 for (auto& s : expected_pool_statfs) {
7274 if (s.second.is_zero()) {
11fdf7f2
TL
7275 // we might lack empty statfs recs in DB
7276 continue;
7277 }
7278 derr << "fsck error: missing Pool StatFS record for pool "
eafe8130 7279 << std::hex << s.first << std::dec << dendl;
11fdf7f2
TL
7280 if (repairer) {
7281 string key;
eafe8130
TL
7282 get_pool_stat_key(s.first, &key);
7283 repairer->fix_statfs(db, key, s.second);
11fdf7f2
TL
7284 }
7285 ++errors;
7286 }
eafe8130 7287 if (!per_pool_stat_collection &&
eafe8130
TL
7288 repairer) {
7289 // by virtue of running this method, we correct the top-level
7290 // error of having global stats
7291 repairer->inc_repaired();
7292 }
11fdf7f2
TL
7293}
7294
eafe8130
TL
7295BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
7296 BlueStore::FSCKDepth depth,
7297 int64_t pool_id,
7298 BlueStore::CollectionRef c,
7299 const ghobject_t& oid,
7300 const string& key,
7301 const bufferlist& value,
9f95a23c 7302 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
7303 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
7304 const BlueStore::FSCK_ObjectCtx& ctx)
7305{
7306 auto& errors = ctx.errors;
7307 auto& num_objects = ctx.num_objects;
7308 auto& num_extents = ctx.num_extents;
7309 auto& num_blobs = ctx.num_blobs;
7310 auto& num_sharded_objects = ctx.num_sharded_objects;
7311 auto& num_spanning_blobs = ctx.num_spanning_blobs;
7312 auto used_blocks = ctx.used_blocks;
7313 auto sb_info_lock = ctx.sb_info_lock;
7314 auto& sb_info = ctx.sb_info;
7315 auto repairer = ctx.repairer;
7316
7317 store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
7318 &ctx.expected_pool_statfs[pool_id] :
7319 &ctx.expected_store_statfs;
7320
7321 dout(10) << __func__ << " " << oid << dendl;
7322 OnodeRef o;
7323 o.reset(Onode::decode(c, oid, key, value));
7324 ++num_objects;
7c673cae 7325
eafe8130 7326 num_spanning_blobs += o->extent_map.spanning_blob_map.size();
7c673cae 7327
eafe8130
TL
7328 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
7329 _dump_onode<30>(cct, *o);
7330 // shards
7331 if (!o->extent_map.shards.empty()) {
7332 ++num_sharded_objects;
7333 if (depth != FSCK_SHALLOW) {
9f95a23c 7334 ceph_assert(expecting_shards);
eafe8130
TL
7335 for (auto& s : o->extent_map.shards) {
7336 dout(20) << __func__ << " shard " << *s.shard_info << dendl;
9f95a23c 7337 expecting_shards->push_back(string());
eafe8130 7338 get_extent_shard_key(o->key, s.shard_info->offset,
9f95a23c 7339 &expecting_shards->back());
eafe8130
TL
7340 if (s.shard_info->offset >= o->onode.size) {
7341 derr << "fsck error: " << oid << " shard 0x" << std::hex
7342 << s.shard_info->offset << " past EOF at 0x" << o->onode.size
7343 << std::dec << dendl;
7344 ++errors;
7345 }
7346 }
7347 }
7348 }
7c673cae 7349
eafe8130
TL
7350 // lextents
7351 uint64_t pos = 0;
7352 mempool::bluestore_fsck::map<BlobRef,
7353 bluestore_blob_use_tracker_t> ref_map;
7354 for (auto& l : o->extent_map.extent_map) {
7355 dout(20) << __func__ << " " << l << dendl;
7356 if (l.logical_offset < pos) {
7357 derr << "fsck error: " << oid << " lextent at 0x"
7358 << std::hex << l.logical_offset
7359 << " overlaps with the previous, which ends at 0x" << pos
7360 << std::dec << dendl;
7361 ++errors;
7362 }
7363 if (depth != FSCK_SHALLOW &&
7364 o->extent_map.spans_shard(l.logical_offset, l.length)) {
7365 derr << "fsck error: " << oid << " lextent at 0x"
7366 << std::hex << l.logical_offset << "~" << l.length
7367 << " spans a shard boundary"
7368 << std::dec << dendl;
7369 ++errors;
7370 }
7371 pos = l.logical_offset + l.length;
7372 res_statfs->data_stored += l.length;
7373 ceph_assert(l.blob);
7374 const bluestore_blob_t& blob = l.blob->get_blob();
7375
7376 auto& ref = ref_map[l.blob];
7377 if (ref.is_empty()) {
7378 uint32_t min_release_size = blob.get_release_size(min_alloc_size);
7379 uint32_t l = blob.get_logical_length();
7380 ref.init(l, min_release_size);
7381 }
7382 ref.get(
7383 l.blob_offset,
7384 l.length);
7385 ++num_extents;
7386 if (depth != FSCK_SHALLOW &&
7387 blob.has_unused()) {
7388 ceph_assert(referenced);
7389 auto p = referenced->find(l.blob);
7390 bluestore_blob_t::unused_t* pu;
7391 if (p == referenced->end()) {
7392 pu = &(*referenced)[l.blob];
7393 }
7394 else {
7395 pu = &p->second;
7396 }
7397 uint64_t blob_len = blob.get_logical_length();
7398 ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
7399 ceph_assert(l.blob_offset + l.length <= blob_len);
7400 uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
7401 uint64_t start = l.blob_offset / chunk_size;
7402 uint64_t end =
7403 round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
7404 for (auto i = start; i < end; ++i) {
7405 (*pu) |= (1u << i);
7406 }
7407 }
7408 } //for (auto& l : o->extent_map.extent_map)
7409
7410 for (auto& i : ref_map) {
7411 ++num_blobs;
7412 const bluestore_blob_t& blob = i.first->get_blob();
7413 bool equal =
7414 depth == FSCK_SHALLOW ? true :
7415 i.first->get_blob_use_tracker().equal(i.second);
7416 if (!equal) {
7417 derr << "fsck error: " << oid << " blob " << *i.first
7418 << " doesn't match expected ref_map " << i.second << dendl;
7419 ++errors;
7420 }
7421 if (blob.is_compressed()) {
7422 res_statfs->data_compressed += blob.get_compressed_payload_length();
7423 res_statfs->data_compressed_original +=
7424 i.first->get_referenced_bytes();
7425 }
7426 if (blob.is_shared()) {
7427 if (i.first->shared_blob->get_sbid() > blobid_max) {
7428 derr << "fsck error: " << oid << " blob " << blob
7429 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
7430 << blobid_max << dendl;
7431 ++errors;
7432 }
7433 else if (i.first->shared_blob->get_sbid() == 0) {
7434 derr << "fsck error: " << oid << " blob " << blob
7435 << " marked as shared but has uninitialized sbid"
7436 << dendl;
7437 ++errors;
7438 }
7439 // the below lock is optional and provided in multithreading mode only
7440 if (sb_info_lock) {
7441 sb_info_lock->lock();
7442 }
7443 sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
7444 ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
7445 ceph_assert(sbi.pool_id == INT64_MIN ||
7446 sbi.pool_id == oid.hobj.get_logical_pool());
7447 sbi.cid = c->cid;
7448 sbi.pool_id = oid.hobj.get_logical_pool();
7449 sbi.sb = i.first->shared_blob;
7450 sbi.oids.push_back(oid);
7451 sbi.compressed = blob.is_compressed();
7452 for (auto e : blob.get_extents()) {
7453 if (e.is_valid()) {
7454 sbi.ref_map.get(e.offset, e.length);
7455 }
7456 }
7457 if (sb_info_lock) {
7458 sb_info_lock->unlock();
7459 }
7460 } else if (depth != FSCK_SHALLOW) {
7461 ceph_assert(used_blocks);
7462 errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
7463 blob.is_compressed(),
7464 *used_blocks,
7465 fm->get_alloc_size(),
7466 repairer,
7467 *res_statfs,
7468 depth);
7469 } else {
7470 errors += _fsck_sum_extents(
7471 blob.get_extents(),
7472 blob.is_compressed(),
7473 *res_statfs);
7474 }
7475 } // for (auto& i : ref_map)
9f95a23c
TL
7476
7477 if (o->onode.has_omap()) {
7478 _fsck_check_object_omap(depth, o, ctx);
7479 }
7480
eafe8130
TL
7481 return o;
7482}
7483
7484#include "common/WorkQueue.h"
7485
7486class ShallowFSCKThreadPool : public ThreadPool
7487{
7488public:
7489 ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
7490 ThreadPool(cct_, nm, tn, n) {
7491 }
7492 void worker(ThreadPool::WorkThread* wt) override {
7493 int next_wq = 0;
7494 while (!_stop) {
7495 next_wq %= work_queues.size();
7496 WorkQueue_ *wq = work_queues[next_wq++];
7497
7498 void* item = wq->_void_dequeue();
7499 if (item) {
7500 processing++;
7501 TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
7502 wq->_void_process(item, tp_handle);
7503 processing--;
7504 }
7505 }
7506 }
7507 template <size_t BatchLen>
7508 struct FSCKWorkQueue : public ThreadPool::WorkQueue_
7509 {
7510 struct Entry {
7511 int64_t pool_id;
7512 BlueStore::CollectionRef c;
7513 ghobject_t oid;
7514 string key;
7515 bufferlist value;
7516 };
7517 struct Batch {
7518 std::atomic<size_t> running = { 0 };
7519 size_t entry_count = 0;
7520 std::array<Entry, BatchLen> entries;
7521
7522 int64_t errors = 0;
7523 int64_t warnings = 0;
7524 uint64_t num_objects = 0;
7525 uint64_t num_extents = 0;
7526 uint64_t num_blobs = 0;
7527 uint64_t num_sharded_objects = 0;
7528 uint64_t num_spanning_blobs = 0;
7529 store_statfs_t expected_store_statfs;
7530 BlueStore::per_pool_statfs expected_pool_statfs;
7531 };
7532
7533 size_t batchCount;
7534 BlueStore* store = nullptr;
7535
eafe8130
TL
7536 ceph::mutex* sb_info_lock = nullptr;
7537 BlueStore::sb_info_map_t* sb_info = nullptr;
7538 BlueStoreRepairer* repairer = nullptr;
7539
7540 Batch* batches = nullptr;
7541 size_t last_batch_pos = 0;
7542 bool batch_acquired = false;
7543
7544 FSCKWorkQueue(std::string n,
7545 size_t _batchCount,
7546 BlueStore* _store,
eafe8130
TL
7547 ceph::mutex* _sb_info_lock,
7548 BlueStore::sb_info_map_t& _sb_info,
7549 BlueStoreRepairer* _repairer) :
7550 WorkQueue_(n, time_t(), time_t()),
7551 batchCount(_batchCount),
7552 store(_store),
eafe8130
TL
7553 sb_info_lock(_sb_info_lock),
7554 sb_info(&_sb_info),
7555 repairer(_repairer)
7556 {
7557 batches = new Batch[batchCount];
7558 }
7559 ~FSCKWorkQueue() {
7560 delete[] batches;
7561 }
7562
7563 /// Remove all work items from the queue.
7564 void _clear() override {
7565 //do nothing
7566 }
7567 /// Check whether there is anything to do.
7568 bool _empty() override {
7569 ceph_assert(false);
7570 }
7571
7572 /// Get the next work item to process.
7573 void* _void_dequeue() override {
7574 size_t pos = rand() % batchCount;
7575 size_t pos0 = pos;
7576 do {
7577 auto& batch = batches[pos];
7578 if (batch.running.fetch_add(1) == 0) {
7579 if (batch.entry_count) {
7580 return &batch;
7581 }
7582 }
7583 batch.running--;
7584 pos++;
7585 pos %= batchCount;
7586 } while (pos != pos0);
7587 return nullptr;
7588 }
7589 /** @brief Process the work item.
7590 * This function will be called several times in parallel
7591 * and must therefore be thread-safe. */
7592 void _void_process(void* item, TPHandle& handle) override {
7593 Batch* batch = (Batch*)item;
7594
7595 BlueStore::FSCK_ObjectCtx ctx(
7596 batch->errors,
7597 batch->warnings,
7598 batch->num_objects,
7599 batch->num_extents,
7600 batch->num_blobs,
7601 batch->num_sharded_objects,
7602 batch->num_spanning_blobs,
7603 nullptr, // used_blocks
9f95a23c 7604 nullptr, //used_omap_head
eafe8130
TL
7605 sb_info_lock,
7606 *sb_info,
7607 batch->expected_store_statfs,
7608 batch->expected_pool_statfs,
7609 repairer);
7610
7611 for (size_t i = 0; i < batch->entry_count; i++) {
7612 auto& entry = batch->entries[i];
7613
7614 store->fsck_check_objects_shallow(
7615 BlueStore::FSCK_SHALLOW,
7616 entry.pool_id,
7617 entry.c,
7618 entry.oid,
7619 entry.key,
7620 entry.value,
9f95a23c 7621 nullptr, // expecting_shards - this will need a protection if passed
eafe8130
TL
7622 nullptr, // referenced
7623 ctx);
7624 }
7625 //std::cout << "processed " << batch << std::endl;
7626 batch->entry_count = 0;
7627 batch->running--;
7628 }
7629 /** @brief Synchronously finish processing a work item.
7630 * This function is called after _void_process with the global thread pool lock held,
7631 * so at most one copy will execute simultaneously for a given thread pool.
7632 * It can be used for non-thread-safe finalization. */
7633 void _void_process_finish(void*) override {
7634 ceph_assert(false);
7635 }
7636
7637 bool queue(
7638 int64_t pool_id,
7639 BlueStore::CollectionRef c,
7640 const ghobject_t& oid,
7641 const string& key,
7642 const bufferlist& value) {
7643 bool res = false;
7644 size_t pos0 = last_batch_pos;
7645 if (!batch_acquired) {
7646 do {
7647 auto& batch = batches[last_batch_pos];
7648 if (batch.running.fetch_add(1) == 0) {
7649 if (batch.entry_count < BatchLen) {
7650 batch_acquired = true;
7651 break;
7652 }
7653 }
7654 batch.running.fetch_sub(1);
7655 last_batch_pos++;
7656 last_batch_pos %= batchCount;
7657 } while (last_batch_pos != pos0);
7658 }
7659 if (batch_acquired) {
7660 auto& batch = batches[last_batch_pos];
7661 ceph_assert(batch.running);
7662 ceph_assert(batch.entry_count < BatchLen);
7663
7664 auto& entry = batch.entries[batch.entry_count];
7665 entry.pool_id = pool_id;
7666 entry.c = c;
7667 entry.oid = oid;
7668 entry.key = key;
7669 entry.value = value;
7670
7671 ++batch.entry_count;
7672 if (batch.entry_count == BatchLen) {
7673 batch_acquired = false;
7674 batch.running.fetch_sub(1);
7675 last_batch_pos++;
7676 last_batch_pos %= batchCount;
7677 }
7678 res = true;
7679 }
7680 return res;
7681 }
7682
7683 void finalize(ThreadPool& tp,
7684 BlueStore::FSCK_ObjectCtx& ctx) {
7685 if (batch_acquired) {
7686 auto& batch = batches[last_batch_pos];
7687 ceph_assert(batch.running);
7688 batch.running.fetch_sub(1);
7689 }
7690 tp.stop();
7691
7692 for (size_t i = 0; i < batchCount; i++) {
7693 auto& batch = batches[i];
7694
7695 //process leftovers if any
7696 if (batch.entry_count) {
7697 TPHandle tp_handle(store->cct,
7698 nullptr,
7699 timeout_interval,
7700 suicide_interval);
7701 ceph_assert(batch.running == 0);
7702
7703 batch.running++; // just to be on-par with the regular call
7704 _void_process(&batch, tp_handle);
7705 }
7706 ceph_assert(batch.entry_count == 0);
7707
7708 ctx.errors += batch.errors;
7709 ctx.warnings += batch.warnings;
7710 ctx.num_objects += batch.num_objects;
7711 ctx.num_extents += batch.num_extents;
7712 ctx.num_blobs += batch.num_blobs;
7713 ctx.num_sharded_objects += batch.num_sharded_objects;
7714 ctx.num_spanning_blobs += batch.num_spanning_blobs;
9f95a23c 7715
eafe8130
TL
7716 ctx.expected_store_statfs.add(batch.expected_store_statfs);
7717
7718 for (auto it = batch.expected_pool_statfs.begin();
7719 it != batch.expected_pool_statfs.end();
7720 it++) {
7721 ctx.expected_pool_statfs[it->first].add(it->second);
7722 }
7723 }
7724 }
7725 };
7726};
7727
9f95a23c
TL
7728void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
7729 OnodeRef& o,
7730 const BlueStore::FSCK_ObjectCtx& ctx)
eafe8130 7731{
9f95a23c
TL
7732 auto& errors = ctx.errors;
7733 auto& warnings = ctx.warnings;
7734 auto repairer = ctx.repairer;
7735
7736 ceph_assert(o->onode.has_omap());
7737 if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
7738 if (per_pool_omap) {
7739 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
7740 << "fsck error: " << o->oid
7741 << " has omap that is not per-pool or pgmeta"
7742 << fsck_dendl;
7743 ++errors;
7744 } else {
7745 const char* w;
7746 int64_t num;
7747 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
7748 ++errors;
7749 num = errors;
7750 w = "error";
7751 } else {
7752 ++warnings;
7753 num = warnings;
7754 w = "warning";
7755 }
7756 fsck_derr(num, MAX_FSCK_ERROR_LINES)
7757 << "fsck " << w << ": " << o->oid
7758 << " has omap that is not per-pool or pgmeta"
7759 << fsck_dendl;
7760 }
7761 }
7762 if (repairer &&
7763 !o->onode.is_perpool_omap() &&
7764 !o->onode.is_pgmeta_omap()) {
7765 dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
7766 bufferlist h;
7767 map<string, bufferlist> kv;
7768 int r = _onode_omap_get(o, &h, &kv);
7769 if (r < 0) {
7770 derr << " got " << r << " " << cpp_strerror(r) << dendl;
7771 } else {
7772 KeyValueDB::Transaction txn = db->get_transaction();
7773 // remove old keys
7774 const string& old_omap_prefix = o->get_omap_prefix();
7775 string old_head, old_tail;
7776 o->get_omap_header(&old_head);
7777 o->get_omap_tail(&old_tail);
7778 txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
7779 txn->rmkey(old_omap_prefix, old_tail);
7780 // set flag
7781 o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
7782 _record_onode(o, txn);
7783 const string& new_omap_prefix = o->get_omap_prefix();
7784 // head
7785 if (h.length()) {
7786 string new_head;
7787 o->get_omap_header(&new_head);
7788 txn->set(new_omap_prefix, new_head, h);
7789 }
7790 // tail
7791 string new_tail;
7792 o->get_omap_tail(&new_tail);
7793 bufferlist empty;
7794 txn->set(new_omap_prefix, new_tail, empty);
7795 // values
7796 string final_key;
7797 o->get_omap_key(string(), &final_key);
7798 size_t base_key_len = final_key.size();
7799 for (auto& i : kv) {
7800 final_key.resize(base_key_len);
7801 final_key += i.first;
7802 txn->set(new_omap_prefix, final_key, i.second);
7803 }
7804 db->submit_transaction_sync(txn);
7805 repairer->inc_repaired();
7806 }
eafe8130 7807 }
9f95a23c 7808}
eafe8130 7809
9f95a23c
TL
7810void BlueStore::_fsck_check_objects(FSCKDepth depth,
7811 BlueStore::FSCK_ObjectCtx& ctx)
7812{
eafe8130 7813 auto& errors = ctx.errors;
eafe8130
TL
7814 auto sb_info_lock = ctx.sb_info_lock;
7815 auto& sb_info = ctx.sb_info;
7816 auto repairer = ctx.repairer;
7817
7818 uint64_t_btree_t used_nids;
7819
7820 size_t processed_myself = 0;
7821
7822 auto it = db->get_iterator(PREFIX_OBJ);
7823 mempool::bluestore_fsck::list<string> expecting_shards;
7824 if (it) {
7825 const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
7826 typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
7827 std::unique_ptr<WQ> wq(
7828 new WQ(
7829 "FSCKWorkQueue",
7830 (thread_count ? : 1) * 32,
7831 this,
eafe8130
TL
7832 sb_info_lock,
7833 sb_info,
7834 repairer));
7835
7836 ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
7837
7838 thread_pool.add_work_queue(wq.get());
7839 if (depth == FSCK_SHALLOW && thread_count > 0) {
7840 //not the best place but let's check anyway
7841 ceph_assert(sb_info_lock);
7842 thread_pool.start();
7843 }
7844
7845 //fill global if not overriden below
7846 CollectionRef c;
7847 int64_t pool_id = -1;
7848 spg_t pgid;
7849 for (it->lower_bound(string()); it->valid(); it->next()) {
7850 dout(30) << __func__ << " key "
7851 << pretty_binary_string(it->key()) << dendl;
7852 if (is_extent_shard_key(it->key())) {
7853 if (depth == FSCK_SHALLOW) {
7854 continue;
7855 }
7856 while (!expecting_shards.empty() &&
7857 expecting_shards.front() < it->key()) {
7858 derr << "fsck error: missing shard key "
7859 << pretty_binary_string(expecting_shards.front())
7860 << dendl;
7861 ++errors;
7862 expecting_shards.pop_front();
7863 }
7864 if (!expecting_shards.empty() &&
7865 expecting_shards.front() == it->key()) {
7866 // all good
7867 expecting_shards.pop_front();
7868 continue;
7869 }
7870
7871 uint32_t offset;
7872 string okey;
7873 get_key_extent_shard(it->key(), &okey, &offset);
7874 derr << "fsck error: stray shard 0x" << std::hex << offset
7875 << std::dec << dendl;
7876 if (expecting_shards.empty()) {
7877 derr << "fsck error: " << pretty_binary_string(it->key())
7878 << " is unexpected" << dendl;
7879 ++errors;
7880 continue;
7881 }
7882 while (expecting_shards.front() > it->key()) {
7883 derr << "fsck error: saw " << pretty_binary_string(it->key())
7884 << dendl;
7885 derr << "fsck error: exp "
7886 << pretty_binary_string(expecting_shards.front()) << dendl;
7887 ++errors;
7888 expecting_shards.pop_front();
7889 if (expecting_shards.empty()) {
7890 break;
7891 }
7892 }
7893 continue;
7894 }
7895
7896 ghobject_t oid;
7897 int r = get_key_object(it->key(), &oid);
7898 if (r < 0) {
7899 derr << "fsck error: bad object key "
7900 << pretty_binary_string(it->key()) << dendl;
7901 ++errors;
7902 continue;
7903 }
7904 if (!c ||
7905 oid.shard_id != pgid.shard ||
7906 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
7907 !c->contains(oid)) {
7908 c = nullptr;
7909 for (auto& p : coll_map) {
7910 if (p.second->contains(oid)) {
7911 c = p.second;
7912 break;
7913 }
7914 }
7915 if (!c) {
7916 derr << "fsck error: stray object " << oid
7917 << " not owned by any collection" << dendl;
7918 ++errors;
7919 continue;
7920 }
7921 pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
7922 dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
7923 << dendl;
7924 }
7925
7926 if (depth != FSCK_SHALLOW &&
7927 !expecting_shards.empty()) {
7928 for (auto& k : expecting_shards) {
7929 derr << "fsck error: missing shard key "
7930 << pretty_binary_string(k) << dendl;
7931 }
7932 ++errors;
7933 expecting_shards.clear();
7934 }
7935
7936 bool queued = false;
7937 if (depth == FSCK_SHALLOW && thread_count > 0) {
7938 queued = wq->queue(
7939 pool_id,
7940 c,
7941 oid,
7942 it->key(),
7943 it->value());
7944 }
7945 OnodeRef o;
7946 map<BlobRef, bluestore_blob_t::unused_t> referenced;
7947
7948 if (!queued) {
7949 ++processed_myself;
7950
7951 o = fsck_check_objects_shallow(
7952 depth,
7953 pool_id,
7954 c,
7955 oid,
7956 it->key(),
7957 it->value(),
9f95a23c 7958 &expecting_shards,
eafe8130
TL
7959 &referenced,
7960 ctx);
7961 }
7962
7963 if (depth != FSCK_SHALLOW) {
7964 ceph_assert(o != nullptr);
7965 if (o->onode.nid) {
7966 if (o->onode.nid > nid_max) {
7967 derr << "fsck error: " << oid << " nid " << o->onode.nid
7968 << " > nid_max " << nid_max << dendl;
7969 ++errors;
7970 }
7971 if (used_nids.count(o->onode.nid)) {
7972 derr << "fsck error: " << oid << " nid " << o->onode.nid
7973 << " already in use" << dendl;
7974 ++errors;
7975 continue; // go for next object
7976 }
7977 used_nids.insert(o->onode.nid);
7978 }
7979 for (auto& i : referenced) {
7980 dout(20) << __func__ << " referenced 0x" << std::hex << i.second
7981 << std::dec << " for " << *i.first << dendl;
7982 const bluestore_blob_t& blob = i.first->get_blob();
7983 if (i.second & blob.unused) {
7984 derr << "fsck error: " << oid << " blob claims unused 0x"
7985 << std::hex << blob.unused
7986 << " but extents reference 0x" << i.second << std::dec
7987 << " on blob " << *i.first << dendl;
7988 ++errors;
7989 }
7990 if (blob.has_csum()) {
7991 uint64_t blob_len = blob.get_logical_length();
7992 uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
7993 unsigned csum_count = blob.get_csum_count();
7994 unsigned csum_chunk_size = blob.get_csum_chunk_size();
7995 for (unsigned p = 0; p < csum_count; ++p) {
7996 unsigned pos = p * csum_chunk_size;
7997 unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
7998 unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
7999 unsigned mask = 1u << firstbit;
8000 for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
8001 mask |= 1u << b;
8002 }
8003 if ((blob.unused & mask) == mask) {
8004 // this csum chunk region is marked unused
8005 if (blob.get_csum_item(p) != 0) {
8006 derr << "fsck error: " << oid
8007 << " blob claims csum chunk 0x" << std::hex << pos
8008 << "~" << csum_chunk_size
8009 << " is unused (mask 0x" << mask << " of unused 0x"
8010 << blob.unused << ") but csum is non-zero 0x"
8011 << blob.get_csum_item(p) << std::dec << " on blob "
8012 << *i.first << dendl;
8013 ++errors;
8014 }
8015 }
8016 }
8017 }
8018 }
8019 // omap
8020 if (o->onode.has_omap()) {
9f95a23c
TL
8021 ceph_assert(ctx.used_omap_head);
8022 if (ctx.used_omap_head->count(o->onode.nid)) {
8023 derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
8024 << " already in use" << dendl;
eafe8130
TL
8025 ++errors;
8026 } else {
9f95a23c 8027 ctx.used_omap_head->insert(o->onode.nid);
eafe8130 8028 }
9f95a23c 8029 } // if (o->onode.has_omap())
eafe8130
TL
8030 if (depth == FSCK_DEEP) {
8031 bufferlist bl;
8032 uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
8033 uint64_t offset = 0;
8034 do {
8035 uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
8036 int r = _do_read(c.get(), o, offset, l, bl,
8037 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
8038 if (r < 0) {
8039 ++errors;
8040 derr << "fsck error: " << oid << std::hex
8041 << " error during read: "
8042 << " " << offset << "~" << l
8043 << " " << cpp_strerror(r) << std::dec
8044 << dendl;
8045 break;
8046 }
8047 offset += l;
8048 } while (offset < o->onode.size);
8049 } // deep
8050 } //if (depth != FSCK_SHALLOW)
8051 } // for (it->lower_bound(string()); it->valid(); it->next())
8052 if (depth == FSCK_SHALLOW && thread_count > 0) {
8053 wq->finalize(thread_pool, ctx);
8054 if (processed_myself) {
8055 // may be needs more threads?
8056 dout(0) << __func__ << " partial offload"
8057 << ", done myself " << processed_myself
8058 << " of " << ctx.num_objects
8059 << "objects, threads " << thread_count
8060 << dendl;
8061 }
8062 }
8063 } // if (it)
8064}
8065/**
8066An overview for currently implemented repair logics
8067performed in fsck in two stages: detection(+preparation) and commit.
8068Detection stage (in processing order):
8069 (Issue -> Repair action to schedule)
8070 - Detect undecodable keys for Shared Blobs -> Remove
8071 - Detect undecodable records for Shared Blobs -> Remove
8072 (might trigger missed Shared Blob detection below)
8073 - Detect stray records for Shared Blobs -> Remove
8074 - Detect misreferenced pextents -> Fix
8075 Prepare Bloom-like filter to track cid/oid -> pextent
8076 Prepare list of extents that are improperly referenced
8077 Enumerate Onode records that might use 'misreferenced' pextents
8078 (Bloom-like filter applied to reduce computation)
8079 Per each questinable Onode enumerate all blobs and identify broken ones
8080 (i.e. blobs having 'misreferences')
8081 Rewrite each broken blob data by allocating another extents and
8082 copying data there
8083 If blob is shared - unshare it and mark corresponding Shared Blob
8084 for removal
8085 Release previously allocated space
8086 Update Extent Map
8087 - Detect missed Shared Blobs -> Recreate
8088 - Detect undecodable deferred transaction -> Remove
8089 - Detect Freelist Manager's 'false free' entries -> Mark as used
8090 - Detect Freelist Manager's leaked entries -> Mark as free
8091 - Detect statfs inconsistency - Update
8092 Commit stage (separate DB commit per each step):
8093 - Apply leaked FM entries fix
8094 - Apply 'false free' FM entries fix
8095 - Apply 'Remove' actions
8096 - Apply fix for misreference pextents
8097 - Apply Shared Blob recreate
8098 (can be merged with the step above if misreferences were dectected)
8099 - Apply StatFS update
8100*/
8101int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
8102{
8103 dout(1) << __func__
8104 << (repair ? " repair" : " check")
8105 << (depth == FSCK_DEEP ? " (deep)" :
8106 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8107 << dendl;
8108
8109 // in deep mode we need R/W write access to be able to replay deferred ops
8110 bool read_only = !(repair || depth == FSCK_DEEP);
8111
8112 int r = _open_path();
8113 if (r < 0)
8114 return r;
7c673cae
FG
8115 r = _open_fsid(false);
8116 if (r < 0)
8117 goto out_path;
8118
8119 r = _read_fsid(&fsid);
8120 if (r < 0)
8121 goto out_fsid;
8122
8123 r = _lock_fsid();
8124 if (r < 0)
8125 goto out_fsid;
8126
8127 r = _open_bdev(false);
8128 if (r < 0)
8129 goto out_fsid;
8130
11fdf7f2 8131 r = _open_db_and_around(read_only);
7c673cae
FG
8132 if (r < 0)
8133 goto out_bdev;
8134
11fdf7f2
TL
8135 if (!read_only) {
8136 r = _upgrade_super();
8137 if (r < 0) {
8138 goto out_db;
8139 }
8140 }
7c673cae 8141
eafe8130 8142 r = _open_collections();
7c673cae 8143 if (r < 0)
11fdf7f2 8144 goto out_db;
7c673cae
FG
8145
8146 mempool_thread.init();
8147
11fdf7f2
TL
8148 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8149 // enable in repair or deep mode modes only
8150 if (!read_only) {
8151 _kv_start();
8152 r = _deferred_replay();
8153 _kv_stop();
8154 }
7c673cae
FG
8155 if (r < 0)
8156 goto out_scan;
8157
eafe8130
TL
8158 r = _fsck_on_open(depth, repair);
8159
8160out_scan:
8161 mempool_thread.shutdown();
8162 _flush_cache();
8163out_db:
8164 _close_db_and_around();
8165out_bdev:
8166 _close_bdev();
8167out_fsid:
8168 _close_fsid();
8169out_path:
8170 _close_path();
8171
8172 return r;
8173}
8174
8175int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
8176{
8177 dout(1) << __func__
8178 << " <<<START>>>"
8179 << (repair ? " repair" : " check")
8180 << (depth == FSCK_DEEP ? " (deep)" :
8181 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
8182 << " start" << dendl;
8183 int64_t errors = 0;
8184 int64_t warnings = 0;
8185 unsigned repaired = 0;
8186
8187 uint64_t_btree_t used_omap_head;
eafe8130
TL
8188 uint64_t_btree_t used_sbids;
8189
8190 mempool_dynamic_bitset used_blocks;
8191 KeyValueDB::Iterator it;
8192 store_statfs_t expected_store_statfs, actual_statfs;
8193 per_pool_statfs expected_pool_statfs;
8194
8195 sb_info_map_t sb_info;
8196
8197 uint64_t num_objects = 0;
8198 uint64_t num_extents = 0;
8199 uint64_t num_blobs = 0;
8200 uint64_t num_spanning_blobs = 0;
8201 uint64_t num_shared_blobs = 0;
8202 uint64_t num_sharded_objects = 0;
8203 BlueStoreRepairer repairer;
8204
8205 utime_t start = ceph_clock_now();
8206
8207 _fsck_collections(&errors);
b32b8144 8208 used_blocks.resize(fm->get_alloc_units());
9f95a23c 8209 apply_for_bitset_range(
11fdf7f2 8210 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
7c673cae
FG
8211 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8212 bs.set(pos);
8213 }
8214 );
11fdf7f2
TL
8215 if (repair) {
8216 repairer.get_space_usage_tracker().init(
8217 bdev->get_size(),
8218 min_alloc_size);
8219 }
7c673cae
FG
8220
8221 if (bluefs) {
11fdf7f2
TL
8222 if( cct->_conf->bluestore_bluefs_db_compatibility) {
8223 interval_set<uint64_t> bluefs_extents_db;
8224 bufferlist bl;
8225 db->get(PREFIX_SUPER, "bluefs_extents", &bl);
8226 auto p = bl.cbegin();
8227 auto prev_errors = errors;
8228 try {
8229 decode(bluefs_extents_db, p);
8230 bluefs_extents_db.union_of(bluefs_extents);
8231 bluefs_extents_db.subtract(bluefs_extents);
8232 if (!bluefs_extents_db.empty()) {
8233 derr << "fsck error: bluefs_extents inconsistency, "
8234 << "downgrade to previous releases might be broken."
8235 << dendl;
8236 ++errors;
8237 }
8238 }
8239 catch (buffer::error& e) {
8240 derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
8241 ++errors;
8242 }
8243 if (errors != prev_errors && repair) {
8244 repairer.fix_bluefs_extents(out_of_sync_fm);
8245 }
8246 }
8247
7c673cae 8248 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9f95a23c 8249 apply_for_bitset_range(
b32b8144 8250 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae
FG
8251 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
8252 bs.set(pos);
8253 }
8254 );
8255 }
eafe8130 8256 int r = bluefs->fsck();
7c673cae 8257 if (r < 0) {
eafe8130 8258 return r;
7c673cae
FG
8259 }
8260 if (r > 0)
8261 errors += r;
8262 }
8263
eafe8130
TL
8264 if (!per_pool_stat_collection) {
8265 const char *w;
8266 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
8267 w = "error";
8268 ++errors;
8269 } else {
8270 w = "warning";
8271 ++warnings;
8272 }
8273 derr << "fsck " << w << ": store not yet converted to per-pool stats"
8274 << dendl;
8275 }
9f95a23c
TL
8276 if (!per_pool_omap) {
8277 const char *w;
8278 if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
8279 w = "error";
8280 ++errors;
8281 } else {
8282 w = "warning";
8283 ++warnings;
8284 }
8285 derr << "fsck " << w << ": store not yet converted to per-pool omap"
8286 << dendl;
8287 }
8288
11fdf7f2 8289 // get expected statfs; reset unaffected fields to be able to compare
7c673cae
FG
8290 // structs
8291 statfs(&actual_statfs);
11fdf7f2
TL
8292 actual_statfs.total = 0;
8293 actual_statfs.internally_reserved = 0;
8294 actual_statfs.available = 0;
8295 actual_statfs.internal_metadata = 0;
8296 actual_statfs.omap_allocated = 0;
8297
eafe8130
TL
8298 if (g_conf()->bluestore_debug_fsck_abort) {
8299 dout(1) << __func__ << " debug abort" << dendl;
8300 goto out_scan;
8301 }
7c673cae 8302 // walk PREFIX_OBJ
eafe8130
TL
8303 {
8304 dout(1) << __func__ << " walking object keyspace" << dendl;
8305 ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8306 BlueStore::FSCK_ObjectCtx ctx(
8307 errors,
8308 warnings,
8309 num_objects,
8310 num_extents,
8311 num_blobs,
8312 num_sharded_objects,
8313 num_spanning_blobs,
8314 &used_blocks,
8315 &used_omap_head,
9f95a23c
TL
8316 //no need for the below lock when in non-shallow mode as
8317 // there is no multithreading in this case
8318 depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
eafe8130
TL
8319 sb_info,
8320 expected_store_statfs,
8321 expected_pool_statfs,
8322 repair ? &repairer : nullptr);
9f95a23c
TL
8323
8324 _fsck_check_objects(depth, ctx);
eafe8130 8325 }
11fdf7f2 8326
7c673cae
FG
8327 dout(1) << __func__ << " checking shared_blobs" << dendl;
8328 it = db->get_iterator(PREFIX_SHARED_BLOB);
8329 if (it) {
eafe8130
TL
8330 // FIXME minor: perhaps simplify for shallow mode?
8331 // fill global if not overriden below
8332 auto expected_statfs = &expected_store_statfs;
11fdf7f2 8333
7c673cae
FG
8334 for (it->lower_bound(string()); it->valid(); it->next()) {
8335 string key = it->key();
8336 uint64_t sbid;
8337 if (get_key_shared_blob(key, &sbid)) {
3efd9988 8338 derr << "fsck error: bad key '" << key
7c673cae 8339 << "' in shared blob namespace" << dendl;
11fdf7f2
TL
8340 if (repair) {
8341 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8342 }
7c673cae
FG
8343 ++errors;
8344 continue;
8345 }
8346 auto p = sb_info.find(sbid);
8347 if (p == sb_info.end()) {
3efd9988 8348 derr << "fsck error: found stray shared blob data for sbid 0x"
7c673cae 8349 << std::hex << sbid << std::dec << dendl;
11fdf7f2
TL
8350 if (repair) {
8351 repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
8352 }
7c673cae
FG
8353 ++errors;
8354 } else {
8355 ++num_shared_blobs;
8356 sb_info_t& sbi = p->second;
8357 bluestore_shared_blob_t shared_blob(sbid);
8358 bufferlist bl = it->value();
11fdf7f2
TL
8359 auto blp = bl.cbegin();
8360 try {
8361 decode(shared_blob, blp);
8362 } catch (buffer::error& e) {
8363 ++errors;
8364 // Force update and don't report as missing
8365 sbi.updated = sbi.passed = true;
8366
8367 derr << "fsck error: failed to decode Shared Blob"
8368 << pretty_binary_string(it->key()) << dendl;
8369 if (repair) {
8370 dout(20) << __func__ << " undecodable Shared Blob, key:'"
8371 << pretty_binary_string(it->key())
8372 << "', removing" << dendl;
8373 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8374 }
8375 continue;
8376 }
7c673cae
FG
8377 dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
8378 if (shared_blob.ref_map != sbi.ref_map) {
3efd9988 8379 derr << "fsck error: shared blob 0x" << std::hex << sbid
11fdf7f2
TL
8380 << std::dec << " ref_map " << shared_blob.ref_map
8381 << " != expected " << sbi.ref_map << dendl;
8382 sbi.updated = true; // will update later in repair mode only!
7c673cae
FG
8383 ++errors;
8384 }
8385 PExtentVector extents;
8386 for (auto &r : shared_blob.ref_map.ref_map) {
8387 extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
8388 }
eafe8130 8389 if (per_pool_stat_collection || repair) {
11fdf7f2
TL
8390 expected_statfs = &expected_pool_statfs[sbi.pool_id];
8391 }
8392 errors += _fsck_check_extents(sbi.cid,
8393 p->second.oids.front(),
7c673cae
FG
8394 extents,
8395 p->second.compressed,
b32b8144
FG
8396 used_blocks,
8397 fm->get_alloc_size(),
11fdf7f2 8398 repair ? &repairer : nullptr,
eafe8130
TL
8399 *expected_statfs,
8400 depth);
11fdf7f2
TL
8401 sbi.passed = true;
8402 }
8403 }
8404 } // if (it)
8405
8406 if (repair && repairer.preprocess_misreference(db)) {
8407
8408 dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
8409 auto& space_tracker = repairer.get_space_usage_tracker();
8410 auto& misref_extents = repairer.get_misreferences();
8411 interval_set<uint64_t> to_release;
8412 it = db->get_iterator(PREFIX_OBJ);
8413 if (it) {
eafe8130
TL
8414 // fill global if not overriden below
8415 auto expected_statfs = &expected_store_statfs;
11fdf7f2
TL
8416
8417 CollectionRef c;
8418 spg_t pgid;
8419 KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
8420 bool bypass_rest = false;
8421 for (it->lower_bound(string()); it->valid() && !bypass_rest;
8422 it->next()) {
8423 dout(30) << __func__ << " key "
8424 << pretty_binary_string(it->key()) << dendl;
8425 if (is_extent_shard_key(it->key())) {
8426 continue;
8427 }
8428
8429 ghobject_t oid;
8430 int r = get_key_object(it->key(), &oid);
8431 if (r < 0 || !space_tracker.is_used(oid)) {
8432 continue;
8433 }
8434
8435 if (!c ||
8436 oid.shard_id != pgid.shard ||
8437 oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
8438 !c->contains(oid)) {
8439 c = nullptr;
8440 for (auto& p : coll_map) {
8441 if (p.second->contains(oid)) {
8442 c = p.second;
8443 break;
8444 }
8445 }
8446 if (!c) {
8447 continue;
8448 }
eafe8130
TL
8449 if (per_pool_stat_collection || repair) {
8450 auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
11fdf7f2
TL
8451 expected_statfs = &expected_pool_statfs[pool_id];
8452 }
8453 }
8454 if (!space_tracker.is_used(c->cid)) {
8455 continue;
8456 }
8457
8458 dout(20) << __func__ << " check misreference for col:" << c->cid
8459 << " obj:" << oid << dendl;
8460
eafe8130
TL
8461 OnodeRef o;
8462 o.reset(Onode::decode(c, oid, it->key(), it->value()));
11fdf7f2
TL
8463 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8464 mempool::bluestore_fsck::set<BlobRef> blobs;
8465
8466 for (auto& e : o->extent_map.extent_map) {
8467 blobs.insert(e.blob);
8468 }
8469 bool need_onode_update = false;
8470 bool first_dump = true;
8471 for(auto b : blobs) {
8472 bool broken_blob = false;
8473 auto& pextents = b->dirty_blob().dirty_extents();
8474 for (auto& e : pextents) {
8475 if (!e.is_valid()) {
8476 continue;
8477 }
8478 // for the sake of simplicity and proper shared blob handling
8479 // always rewrite the whole blob even when it's partially
8480 // misreferenced.
8481 if (misref_extents.intersects(e.offset, e.length)) {
8482 if (first_dump) {
8483 first_dump = false;
81eedcae 8484 _dump_onode<10>(cct, *o);
11fdf7f2
TL
8485 }
8486 broken_blob = true;
8487 break;
8488 }
8489 }
8490 if (!broken_blob)
8491 continue;
8492 bool compressed = b->get_blob().is_compressed();
8493 need_onode_update = true;
8494 dout(10) << __func__
8495 << " fix misreferences in oid:" << oid
8496 << " " << *b << dendl;
8497 uint64_t b_off = 0;
8498 PExtentVector pext_to_release;
8499 pext_to_release.reserve(pextents.size());
8500 // rewriting all valid pextents
8501 for (auto e = pextents.begin(); e != pextents.end();
8502 b_off += e->length, e++) {
8503 if (!e->is_valid()) {
8504 continue;
8505 }
8506 PExtentVector exts;
8507 int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
8508 0, 0, &exts);
eafe8130 8509 if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
11fdf7f2
TL
8510 derr << __func__
8511 << " failed to allocate 0x" << std::hex << e->length
eafe8130 8512 << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
11fdf7f2
TL
8513 << " min_alloc_size 0x" << min_alloc_size
8514 << " available 0x " << alloc->get_free()
8515 << std::dec << dendl;
8516 if (alloc_len > 0) {
8517 alloc->release(exts);
8518 }
8519 bypass_rest = true;
8520 break;
8521 }
8522 expected_statfs->allocated += e->length;
8523 if (compressed) {
8524 expected_statfs->data_compressed_allocated += e->length;
8525 }
8526
8527 bufferlist bl;
8528 IOContext ioc(cct, NULL, true); // allow EIO
8529 r = bdev->read(e->offset, e->length, &bl, &ioc, false);
8530 if (r < 0) {
8531 derr << __func__ << " failed to read from 0x" << std::hex << e->offset
8532 <<"~" << e->length << std::dec << dendl;
8533 ceph_abort_msg("read failed, wtf");
8534 }
8535 pext_to_release.push_back(*e);
8536 e = pextents.erase(e);
8537 e = pextents.insert(e, exts.begin(), exts.end());
8538 b->get_blob().map_bl(
8539 b_off, bl,
8540 [&](uint64_t offset, bufferlist& t) {
8541 int r = bdev->write(offset, t, false);
8542 ceph_assert(r == 0);
8543 });
8544 e += exts.size() - 1;
8545 for (auto& p : exts) {
8546 fm->allocate(p.offset, p.length, txn);
8547 }
8548 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8549
8550 if (b->get_blob().is_shared()) {
8551 b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
8552
8553 auto sb_it = sb_info.find(b->shared_blob->get_sbid());
8554 ceph_assert(sb_it != sb_info.end());
8555 sb_info_t& sbi = sb_it->second;
8556
8557 for (auto& r : sbi.ref_map.ref_map) {
8558 expected_statfs->allocated -= r.second.length;
8559 if (sbi.compressed) {
8560 // NB: it's crucial to use compressed flag from sb_info_t
8561 // as we originally used that value while accumulating
8562 // expected_statfs
8563 expected_statfs->data_compressed_allocated -= r.second.length;
8564 }
8565 }
8566 sbi.updated = sbi.passed = true;
8567 sbi.ref_map.clear();
8568
8569 // relying on blob's pextents to decide what to release.
8570 for (auto& p : pext_to_release) {
8571 to_release.union_insert(p.offset, p.length);
8572 }
8573 } else {
8574 for (auto& p : pext_to_release) {
8575 expected_statfs->allocated -= p.length;
8576 if (compressed) {
8577 expected_statfs->data_compressed_allocated -= p.length;
8578 }
8579 to_release.union_insert(p.offset, p.length);
8580 }
8581 }
8582 if (bypass_rest) {
8583 break;
8584 }
8585 } // for(auto b : blobs)
8586 if (need_onode_update) {
8587 o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
8588 _record_onode(o, txn);
8589 }
8590 } // for (it->lower_bound(string()); it->valid(); it->next())
8591
8592 for (auto it = to_release.begin(); it != to_release.end(); ++it) {
8593 dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
8594 << "~" << it.get_len() << std::dec << dendl;
8595 fm->release(it.get_start(), it.get_len(), txn);
8596 }
8597 alloc->release(to_release);
8598 to_release.clear();
8599 } // if (it) {
8600 } //if (repair && repairer.preprocess_misreference()) {
8601
eafe8130
TL
8602 if (depth != FSCK_SHALLOW) {
8603 for (auto &p : sb_info) {
8604 sb_info_t& sbi = p.second;
8605 if (!sbi.passed) {
8606 derr << "fsck error: missing " << *sbi.sb << dendl;
8607 ++errors;
8608 }
8609 if (repair && (!sbi.passed || sbi.updated)) {
8610 auto sbid = p.first;
8611 if (sbi.ref_map.empty()) {
8612 ceph_assert(sbi.passed);
8613 dout(20) << __func__ << " " << *sbi.sb
8614 << " is empty, removing" << dendl;
8615 repairer.fix_shared_blob(db, sbid, nullptr);
8616 } else {
8617 bufferlist bl;
8618 bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
8619 encode(persistent, bl);
8620 dout(20) << __func__ << " " << *sbi.sb
8621 << " is " << bl.length() << " bytes, updating" << dendl;
11fdf7f2 8622
eafe8130
TL
8623 repairer.fix_shared_blob(db, sbid, &bl);
8624 }
7c673cae
FG
8625 }
8626 }
8627 }
11fdf7f2
TL
8628 sb_info.clear();
8629
eafe8130
TL
8630 // check global stats only if fscking (not repairing) w/o per-pool stats
8631 if (!per_pool_stat_collection &&
8632 !repair &&
8633 !(actual_statfs == expected_store_statfs)) {
8634 derr << "fsck error: actual " << actual_statfs
8635 << " != expected " << expected_store_statfs << dendl;
8636 if (repair) {
8637 repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
8638 expected_store_statfs);
11fdf7f2 8639 }
eafe8130 8640 ++errors;
7c673cae
FG
8641 }
8642
eafe8130
TL
8643 dout(1) << __func__ << " checking pool_statfs" << dendl;
8644 _fsck_check_pool_statfs(expected_pool_statfs,
8645 errors, warnings, repair ? &repairer : nullptr);
8646
8647 if (depth != FSCK_SHALLOW) {
9f95a23c 8648 dout(1) << __func__ << " checking for stray omap data " << dendl;
eafe8130
TL
8649 it = db->get_iterator(PREFIX_OMAP);
8650 if (it) {
9f95a23c 8651 uint64_t last_omap_head = 0;
eafe8130
TL
8652 for (it->lower_bound(string()); it->valid(); it->next()) {
8653 uint64_t omap_head;
8654 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
8655 if (used_omap_head.count(omap_head) == 0 &&
8656 omap_head != last_omap_head) {
8657 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8658 << "fsck error: found stray omap data on omap_head "
8659 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head)<< fsck_dendl;
eafe8130 8660 ++errors;
9f95a23c 8661 last_omap_head = omap_head;
eafe8130 8662 }
7c673cae
FG
8663 }
8664 }
eafe8130
TL
8665 it = db->get_iterator(PREFIX_PGMETA_OMAP);
8666 if (it) {
9f95a23c 8667 uint64_t last_omap_head = 0;
eafe8130
TL
8668 for (it->lower_bound(string()); it->valid(); it->next()) {
8669 uint64_t omap_head;
8670 _key_decode_u64(it->key().c_str(), &omap_head);
9f95a23c
TL
8671 if (used_omap_head.count(omap_head) == 0 &&
8672 omap_head != last_omap_head) {
8673 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8674 << "fsck error: found stray (pgmeta) omap data on omap_head "
8675 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8676 last_omap_head = omap_head;
eafe8130
TL
8677 ++errors;
8678 }
11fdf7f2
TL
8679 }
8680 }
9f95a23c
TL
8681 it = db->get_iterator(PREFIX_PERPOOL_OMAP);
8682 if (it) {
8683 uint64_t last_omap_head = 0;
8684 for (it->lower_bound(string()); it->valid(); it->next()) {
8685 uint64_t pool;
8686 uint64_t omap_head;
8687 string k = it->key();
8688 const char *c = k.c_str();
8689 c = _key_decode_u64(c, &pool);
8690 c = _key_decode_u64(c, &omap_head);
8691 if (used_omap_head.count(omap_head) == 0 &&
8692 omap_head != last_omap_head) {
8693 fsck_derr(errors, MAX_FSCK_ERROR_LINES)
8694 << "fsck error: found stray (per-pool) omap data on omap_head "
8695 << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
8696 ++errors;
8697 last_omap_head = omap_head;
8698 }
8699 }
8700 }
eafe8130
TL
8701 dout(1) << __func__ << " checking deferred events" << dendl;
8702 it = db->get_iterator(PREFIX_DEFERRED);
8703 if (it) {
8704 for (it->lower_bound(string()); it->valid(); it->next()) {
8705 bufferlist bl = it->value();
8706 auto p = bl.cbegin();
8707 bluestore_deferred_transaction_t wt;
8708 try {
8709 decode(wt, p);
8710 } catch (buffer::error& e) {
8711 derr << "fsck error: failed to decode deferred txn "
8712 << pretty_binary_string(it->key()) << dendl;
8713 if (repair) {
8714 dout(20) << __func__ << " undecodable deferred TXN record, key: '"
8715 << pretty_binary_string(it->key())
8716 << "', removing" << dendl;
8717 repairer.remove_key(db, PREFIX_DEFERRED, it->key());
8718 }
8719 continue;
8720 }
8721 dout(20) << __func__ << " deferred " << wt.seq
8722 << " ops " << wt.ops.size()
8723 << " released 0x" << std::hex << wt.released << std::dec << dendl;
8724 for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
9f95a23c 8725 apply_for_bitset_range(
eafe8130
TL
8726 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
8727 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
8728 bs.set(pos);
8729 }
8730 );
8731 }
7c673cae 8732 }
eafe8130
TL
8733 }
8734
8735 dout(1) << __func__ << " checking freelist vs allocated" << dendl;
8736 {
8737 // remove bluefs_extents from used set since the freelist doesn't
8738 // know they are allocated.
8739 for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
9f95a23c 8740 apply_for_bitset_range(
b32b8144 8741 e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
7c673cae 8742 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130 8743 bs.reset(pos);
7c673cae
FG
8744 }
8745 );
8746 }
eafe8130
TL
8747 fm->enumerate_reset();
8748 uint64_t offset, length;
8749 while (fm->enumerate_next(db, &offset, &length)) {
8750 bool intersects = false;
9f95a23c 8751 apply_for_bitset_range(
eafe8130
TL
8752 offset, length, fm->get_alloc_size(), used_blocks,
8753 [&](uint64_t pos, mempool_dynamic_bitset &bs) {
eafe8130
TL
8754 if (bs.test(pos)) {
8755 if (offset == SUPER_RESERVED &&
8756 length == min_alloc_size - SUPER_RESERVED) {
8757 // this is due to the change just after luminous to min_alloc_size
8758 // granularity allocations, and our baked in assumption at the top
8759 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8760 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8761 // since we will never allocate this region below min_alloc_size.
8762 dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
8763 << " and min_alloc_size, 0x" << std::hex << offset << "~"
8764 << length << std::dec << dendl;
8765 } else {
8766 intersects = true;
8767 if (repair) {
8768 repairer.fix_false_free(db, fm,
8769 pos * min_alloc_size,
8770 min_alloc_size);
8771 }
11fdf7f2 8772 }
eafe8130
TL
8773 } else {
8774 bs.set(pos);
8775 }
7c673cae 8776 }
eafe8130
TL
8777 );
8778 if (intersects) {
8779 derr << "fsck error: free extent 0x" << std::hex << offset
8780 << "~" << length << std::dec
8781 << " intersects allocated blocks" << dendl;
8782 ++errors;
7c673cae 8783 }
b5b8bbf5 8784 }
eafe8130
TL
8785 fm->enumerate_reset();
8786 size_t count = used_blocks.count();
8787 if (used_blocks.size() != count) {
8788 ceph_assert(used_blocks.size() > count);
8789 used_blocks.flip();
8790 size_t start = used_blocks.find_first();
8791 while (start != decltype(used_blocks)::npos) {
8792 size_t cur = start;
8793 while (true) {
8794 size_t next = used_blocks.find_next(cur);
8795 if (next != cur + 1) {
8796 ++errors;
8797 derr << "fsck error: leaked extent 0x" << std::hex
8798 << ((uint64_t)start * fm->get_alloc_size()) << "~"
8799 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
8800 << dendl;
8801 if (repair) {
8802 repairer.fix_leaked(db,
8803 fm,
8804 start * min_alloc_size,
8805 (cur + 1 - start) * min_alloc_size);
8806 }
8807 start = next;
8808 break;
11fdf7f2 8809 }
eafe8130 8810 cur = next;
b5b8bbf5 8811 }
eafe8130
TL
8812 }
8813 used_blocks.flip();
b5b8bbf5 8814 }
7c673cae
FG
8815 }
8816 }
11fdf7f2 8817 if (repair) {
9f95a23c
TL
8818 if (!per_pool_omap) {
8819 dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
8820 repairer.fix_per_pool_omap(db);
8821 }
8822
11fdf7f2
TL
8823 dout(5) << __func__ << " applying repair results" << dendl;
8824 repaired = repairer.apply(db);
8825 dout(5) << __func__ << " repair applied" << dendl;
8826 }
7c673cae 8827
eafe8130 8828out_scan:
7c673cae
FG
8829 dout(2) << __func__ << " " << num_objects << " objects, "
8830 << num_sharded_objects << " of them sharded. "
8831 << dendl;
8832 dout(2) << __func__ << " " << num_extents << " extents to "
8833 << num_blobs << " blobs, "
8834 << num_spanning_blobs << " spanning, "
8835 << num_shared_blobs << " shared."
8836 << dendl;
8837
8838 utime_t duration = ceph_clock_now() - start;
9f95a23c
TL
8839 dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
8840 << warnings << " warnings, "
8841 << repaired << " repaired, "
8842 << (errors + warnings - (int)repaired) << " remaining in "
7c673cae 8843 << duration << " seconds" << dendl;
9f95a23c
TL
8844
8845 // In non-repair mode we should return error count only as
8846 // it indicates if store status is OK.
8847 // In repair mode both errors and warnings are taken into account
8848 // since repaired counter relates to them both.
8849 return repair ? errors + warnings - (int)repaired : errors;
11fdf7f2
TL
8850}
8851
8852/// methods to inject various errors fsck can repair
8853void BlueStore::inject_broken_shared_blob_key(const string& key,
8854 const bufferlist& bl)
8855{
8856 KeyValueDB::Transaction txn;
8857 txn = db->get_transaction();
8858 txn->set(PREFIX_SHARED_BLOB, key, bl);
8859 db->submit_transaction_sync(txn);
8860};
8861
8862void BlueStore::inject_leaked(uint64_t len)
8863{
8864 KeyValueDB::Transaction txn;
8865 txn = db->get_transaction();
8866
8867 PExtentVector exts;
8868 int64_t alloc_len = alloc->allocate(len, min_alloc_size,
8869 min_alloc_size * 256, 0, &exts);
8870 ceph_assert(alloc_len >= (int64_t)len);
8871 for (auto& p : exts) {
8872 fm->allocate(p.offset, p.length, txn);
8873 }
8874 db->submit_transaction_sync(txn);
8875}
8876
8877void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
8878{
8879 KeyValueDB::Transaction txn;
8880 OnodeRef o;
8881 CollectionRef c = _get_collection(cid);
8882 ceph_assert(c);
8883 {
9f95a23c 8884 std::unique_lock l{c->lock}; // just to avoid internal asserts
11fdf7f2
TL
8885 o = c->get_onode(oid, false);
8886 ceph_assert(o);
8887 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
8888 }
8889
8890 bool injected = false;
8891 txn = db->get_transaction();
8892 auto& em = o->extent_map.extent_map;
8893 std::vector<const PExtentVector*> v;
8894 if (em.size()) {
8895 v.push_back(&em.begin()->blob->get_blob().get_extents());
8896 }
8897 if (em.size() > 1) {
8898 auto it = em.end();
8899 --it;
8900 v.push_back(&(it->blob->get_blob().get_extents()));
8901 }
8902 for (auto pext : v) {
8903 if (pext->size()) {
8904 auto p = pext->begin();
8905 while (p != pext->end()) {
8906 if (p->is_valid()) {
8907 dout(20) << __func__ << " release 0x" << std::hex << p->offset
8908 << "~" << p->length << std::dec << dendl;
8909 fm->release(p->offset, p->length, txn);
8910 injected = true;
8911 break;
8912 }
8913 ++p;
8914 }
8915 }
8916 }
8917 ceph_assert(injected);
8918 db->submit_transaction_sync(txn);
8919}
8920
9f95a23c
TL
8921void BlueStore::inject_legacy_omap()
8922{
8923 dout(1) << __func__ << dendl;
8924 per_pool_omap = false;
8925 KeyValueDB::Transaction txn;
8926 txn = db->get_transaction();
8927 txn->rmkey(PREFIX_SUPER, "per_pool_omap");
8928 db->submit_transaction_sync(txn);
8929}
8930
8931void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
8932{
8933 dout(1) << __func__ << " "
8934 << cid << " " << oid
8935 <<dendl;
8936 KeyValueDB::Transaction txn;
8937 OnodeRef o;
8938 CollectionRef c = _get_collection(cid);
8939 ceph_assert(c);
8940 {
8941 std::unique_lock l{ c->lock }; // just to avoid internal asserts
8942 o = c->get_onode(oid, false);
8943 ceph_assert(o);
8944 }
8945 o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
8946 txn = db->get_transaction();
8947 _record_onode(o, txn);
8948 db->submit_transaction_sync(txn);
8949}
8950
8951
11fdf7f2
TL
8952void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
8953{
8954 BlueStoreRepairer repairer;
8955 repairer.fix_statfs(db, key, new_statfs);
8956 repairer.apply(db);
8957}
8958
eafe8130
TL
8959void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
8960{
8961 KeyValueDB::Transaction t = db->get_transaction();
8962 volatile_statfs v;
8963 v = new_statfs;
8964 bufferlist bl;
8965 v.encode(bl);
8966 t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
8967 db->submit_transaction_sync(t);
8968}
8969
11fdf7f2
TL
8970void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
8971 coll_t cid2, ghobject_t oid2,
8972 uint64_t offset)
8973{
8974 OnodeRef o1;
8975 CollectionRef c1 = _get_collection(cid1);
8976 ceph_assert(c1);
8977 {
9f95a23c 8978 std::unique_lock l{c1->lock}; // just to avoid internal asserts
11fdf7f2
TL
8979 o1 = c1->get_onode(oid1, false);
8980 ceph_assert(o1);
8981 o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8982 }
8983 OnodeRef o2;
8984 CollectionRef c2 = _get_collection(cid2);
8985 ceph_assert(c2);
8986 {
9f95a23c 8987 std::unique_lock l{c2->lock}; // just to avoid internal asserts
11fdf7f2
TL
8988 o2 = c2->get_onode(oid2, false);
8989 ceph_assert(o2);
8990 o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
8991 }
8992 Extent& e1 = *(o1->extent_map.seek_lextent(offset));
8993 Extent& e2 = *(o2->extent_map.seek_lextent(offset));
8994
8995 // require onode/extent layout to be the same (and simple)
8996 // to make things easier
8997 ceph_assert(o1->onode.extent_map_shards.empty());
8998 ceph_assert(o2->onode.extent_map_shards.empty());
8999 ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
9000 ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
9001 ceph_assert(e1.logical_offset == e2.logical_offset);
9002 ceph_assert(e1.length == e2.length);
9003 ceph_assert(e1.blob_offset == e2.blob_offset);
9004
9005 KeyValueDB::Transaction txn;
9006 txn = db->get_transaction();
9007
9008 // along with misreference error this will create space leaks errors
9009 e2.blob->dirty_blob() = e1.blob->get_blob();
9010 o2->extent_map.dirty_range(offset, e2.length);
9011 o2->extent_map.update(txn, false);
9012
9013 _record_onode(o2, txn);
9014 db->submit_transaction_sync(txn);
7c673cae
FG
9015}
9016
9017void BlueStore::collect_metadata(map<string,string> *pm)
9018{
9019 dout(10) << __func__ << dendl;
9020 bdev->collect_metadata("bluestore_bdev_", pm);
9021 if (bluefs) {
9022 (*pm)["bluefs"] = "1";
9f95a23c
TL
9023 // this value is for backward compatibility only
9024 (*pm)["bluefs_single_shared_device"] = \
9025 stringify((int)bluefs_layout.single_shared_device());
9026 (*pm)["bluefs_dedicated_db"] = \
9027 stringify((int)bluefs_layout.dedicated_db);
9028 (*pm)["bluefs_dedicated_wal"] = \
9029 stringify((int)bluefs_layout.dedicated_wal);
9030 bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
7c673cae
FG
9031 } else {
9032 (*pm)["bluefs"] = "0";
9033 }
11fdf7f2
TL
9034
9035 // report numa mapping for underlying devices
9036 int node = -1;
9037 set<int> nodes;
9038 set<string> failed;
9039 int r = get_numa_node(&node, &nodes, &failed);
9040 if (r >= 0) {
9041 if (!failed.empty()) {
9042 (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
9043 }
9044 if (!nodes.empty()) {
9045 dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
9046 (*pm)["objectstore_numa_nodes"] = stringify(nodes);
9047 }
9048 if (node >= 0) {
9049 (*pm)["objectstore_numa_node"] = stringify(node);
9050 }
9051 }
9052}
9053
9054int BlueStore::get_numa_node(
9055 int *final_node,
9056 set<int> *out_nodes,
9057 set<string> *out_failed)
9058{
9059 int node = -1;
9060 set<string> devices;
9061 get_devices(&devices);
9062 set<int> nodes;
9063 set<string> failed;
9064 for (auto& devname : devices) {
9065 int n;
9066 BlkDev bdev(devname);
9067 int r = bdev.get_numa_node(&n);
9068 if (r < 0) {
9069 dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
9070 << dendl;
9071 failed.insert(devname);
9072 continue;
9073 }
9074 dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
9075 << dendl;
9076 nodes.insert(n);
9077 if (node < 0) {
9078 node = n;
9079 }
9080 }
9081 if (node >= 0 && nodes.size() == 1 && failed.empty()) {
9082 *final_node = node;
9083 }
9084 if (out_nodes) {
9085 *out_nodes = nodes;
9086 }
9087 if (out_failed) {
9088 *out_failed = failed;
9089 }
9090 return 0;
9091}
9092
9093int BlueStore::get_devices(set<string> *ls)
9094{
9095 if (bdev) {
9096 bdev->get_devices(ls);
9097 if (bluefs) {
9098 bluefs->get_devices(ls);
9099 }
9100 return 0;
9101 }
9102
9103 // grumble, we haven't started up yet.
9104 int r = _open_path();
9105 if (r < 0)
9106 goto out;
9107 r = _open_fsid(false);
9108 if (r < 0)
9109 goto out_path;
9110 r = _read_fsid(&fsid);
9111 if (r < 0)
9112 goto out_fsid;
9113 r = _lock_fsid();
9114 if (r < 0)
9115 goto out_fsid;
9116 r = _open_bdev(false);
9117 if (r < 0)
9118 goto out_fsid;
9119 r = _minimal_open_bluefs(false);
9120 if (r < 0)
9121 goto out_bdev;
9122 bdev->get_devices(ls);
9123 if (bluefs) {
9124 bluefs->get_devices(ls);
9125 }
9126 r = 0;
9127 _minimal_close_bluefs();
9128 out_bdev:
9129 _close_bdev();
9130 out_fsid:
9131 _close_fsid();
9132 out_path:
9133 _close_path();
9134 out:
9135 return r;
7c673cae
FG
9136}
9137
11fdf7f2 9138void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
7c673cae
FG
9139{
9140 buf->reset();
11fdf7f2 9141
9f95a23c
TL
9142 buf->omap_allocated =
9143 db->estimate_prefix_size(PREFIX_OMAP, string()) +
9144 db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
11fdf7f2
TL
9145
9146 uint64_t bfree = alloc->get_free();
7c673cae
FG
9147
9148 if (bluefs) {
9f95a23c
TL
9149 int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
9150 int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
94b18763
FG
9151 // part of our shared device is "free" according to BlueFS, but we
9152 // can't touch bluestore_bluefs_min of it.
9153 int64_t shared_available = std::min(
11fdf7f2
TL
9154 bluefs_free,
9155 int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
9156 buf->internally_reserved = bluefs_total - shared_available;
94b18763 9157 if (shared_available > 0) {
11fdf7f2
TL
9158 bfree += shared_available;
9159 }
9160 // include dedicated db, too, if that isn't the shared device.
9f95a23c 9161 if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
11fdf7f2 9162 buf->total += bluefs->get_total(BlueFS::BDEV_DB);
7c673cae 9163 }
11fdf7f2
TL
9164 // call any non-omap bluefs space "internal metadata"
9165 buf->internal_metadata =
9166 std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
9167 - buf->omap_allocated;
7c673cae
FG
9168 }
9169
11fdf7f2
TL
9170 uint64_t thin_total, thin_avail;
9171 if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
9172 buf->total += thin_total;
9173
9174 // we are limited by both the size of the virtual device and the
9175 // underlying physical device.
9176 bfree = std::min(bfree, thin_avail);
9177
9178 buf->allocated = thin_total - thin_avail;
9179 } else {
9180 buf->total += bdev->get_size();
9181 }
9182 buf->available = bfree;
9183}
9184
9185int BlueStore::statfs(struct store_statfs_t *buf,
9186 osd_alert_list_t* alerts)
9187{
9188 if (alerts) {
9189 alerts->clear();
9190 _log_alerts(*alerts);
9191 }
9192 _get_statfs_overall(buf);
31f18b77 9193 {
11fdf7f2 9194 std::lock_guard l(vstatfs_lock);
31f18b77 9195 buf->allocated = vstatfs.allocated();
11fdf7f2
TL
9196 buf->data_stored = vstatfs.stored();
9197 buf->data_compressed = vstatfs.compressed();
9198 buf->data_compressed_original = vstatfs.compressed_original();
9199 buf->data_compressed_allocated = vstatfs.compressed_allocated();
9200 }
9201
9202 dout(20) << __func__ << " " << *buf << dendl;
9203 return 0;
9204}
9205
9f95a23c
TL
9206int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
9207 bool *out_per_pool_omap)
11fdf7f2
TL
9208{
9209 dout(20) << __func__ << " pool " << pool_id<< dendl;
81eedcae 9210
11fdf7f2
TL
9211 if (!per_pool_stat_collection) {
9212 dout(20) << __func__ << " not supported in legacy mode " << dendl;
9213 return -ENOTSUP;
7c673cae 9214 }
11fdf7f2 9215 buf->reset();
7c673cae 9216
11fdf7f2
TL
9217 {
9218 std::lock_guard l(vstatfs_lock);
9219 osd_pools[pool_id].publish(buf);
9220 }
9f95a23c
TL
9221
9222 string key_prefix;
9223 _key_encode_u64(pool_id, &key_prefix);
9224 buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
9225 key_prefix);
9226 *out_per_pool_omap = per_pool_omap;
9227
11fdf7f2 9228 dout(10) << __func__ << *buf << dendl;
7c673cae
FG
9229 return 0;
9230}
9231
81eedcae
TL
9232void BlueStore::_check_legacy_statfs_alert()
9233{
9234 string s;
9235 if (!per_pool_stat_collection &&
eafe8130 9236 cct->_conf->bluestore_warn_on_legacy_statfs) {
81eedcae
TL
9237 s = "legacy statfs reporting detected, "
9238 "suggest to run store repair to get consistent statistic reports";
9239 }
9240 std::lock_guard l(qlock);
9241 legacy_statfs_alert = s;
9242}
9243
9f95a23c
TL
9244void BlueStore::_check_no_per_pool_omap_alert()
9245{
9246 string s;
9247 if (!per_pool_omap &&
9248 cct->_conf->bluestore_warn_on_no_per_pool_omap) {
9249 s = "legacy (not per-pool) omap detected, "
9250 "suggest to run store repair to measure per-pool omap usage";
9251 }
9252 std::lock_guard l(qlock);
9253 no_per_pool_omap_alert = s;
9254}
9255
7c673cae
FG
9256// ---------------
9257// cache
9258
9259BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
9260{
9f95a23c 9261 std::shared_lock l(coll_lock);
7c673cae
FG
9262 ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
9263 if (cp == coll_map.end())
9264 return CollectionRef();
9265 return cp->second;
9266}
9267
9268void BlueStore::_queue_reap_collection(CollectionRef& c)
9269{
9270 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
94b18763
FG
9271 // _reap_collections and this in the same thread,
9272 // so no need a lock.
7c673cae
FG
9273 removed_collections.push_back(c);
9274}
9275
9276void BlueStore::_reap_collections()
9277{
94b18763 9278
7c673cae
FG
9279 list<CollectionRef> removed_colls;
9280 {
94b18763
FG
9281 // _queue_reap_collection and this in the same thread.
9282 // So no need a lock.
9283 if (!removed_collections.empty())
9284 removed_colls.swap(removed_collections);
9285 else
9286 return;
7c673cae
FG
9287 }
9288
94b18763
FG
9289 list<CollectionRef>::iterator p = removed_colls.begin();
9290 while (p != removed_colls.end()) {
7c673cae
FG
9291 CollectionRef c = *p;
9292 dout(10) << __func__ << " " << c << " " << c->cid << dendl;
9293 if (c->onode_map.map_any([&](OnodeRef o) {
11fdf7f2 9294 ceph_assert(!o->exists);
7c673cae
FG
9295 if (o->flushing_count.load()) {
9296 dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
9297 << " flush_txns " << o->flushing_count << dendl;
94b18763 9298 return true;
7c673cae 9299 }
94b18763 9300 return false;
7c673cae 9301 })) {
94b18763 9302 ++p;
7c673cae
FG
9303 continue;
9304 }
9305 c->onode_map.clear();
94b18763 9306 p = removed_colls.erase(p);
7c673cae
FG
9307 dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
9308 }
94b18763 9309 if (removed_colls.empty()) {
7c673cae 9310 dout(10) << __func__ << " all reaped" << dendl;
94b18763
FG
9311 } else {
9312 removed_collections.splice(removed_collections.begin(), removed_colls);
7c673cae
FG
9313 }
9314}
9315
9316void BlueStore::_update_cache_logger()
9317{
9318 uint64_t num_onodes = 0;
9f95a23c 9319 uint64_t num_pinned_onodes = 0;
7c673cae
FG
9320 uint64_t num_extents = 0;
9321 uint64_t num_blobs = 0;
9322 uint64_t num_buffers = 0;
9323 uint64_t num_buffer_bytes = 0;
9f95a23c
TL
9324 for (auto c : onode_cache_shards) {
9325 c->add_stats(&num_onodes, &num_pinned_onodes);
9326 }
9327 for (auto c : buffer_cache_shards) {
9328 c->add_stats(&num_extents, &num_blobs,
9329 &num_buffers, &num_buffer_bytes);
7c673cae
FG
9330 }
9331 logger->set(l_bluestore_onodes, num_onodes);
9f95a23c 9332 logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
7c673cae
FG
9333 logger->set(l_bluestore_extents, num_extents);
9334 logger->set(l_bluestore_blobs, num_blobs);
9335 logger->set(l_bluestore_buffers, num_buffers);
9336 logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
9337}
9338
9339// ---------------
9340// read operations
9341
9342ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
9343{
9344 return _get_collection(cid);
9345}
9346
11fdf7f2
TL
9347ObjectStore::CollectionHandle BlueStore::create_new_collection(
9348 const coll_t& cid)
7c673cae 9349{
9f95a23c
TL
9350 std::unique_lock l{coll_lock};
9351 auto c = ceph::make_ref<Collection>(
11fdf7f2 9352 this,
9f95a23c
TL
9353 onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
9354 buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
11fdf7f2
TL
9355 cid);
9356 new_coll_map[cid] = c;
9f95a23c 9357 _osr_attach(c.get());
11fdf7f2
TL
9358 return c;
9359}
9360
9361void BlueStore::set_collection_commit_queue(
9362 const coll_t& cid,
9363 ContextQueue *commit_queue)
9364{
9365 if (commit_queue) {
9f95a23c 9366 std::shared_lock l(coll_lock);
11fdf7f2
TL
9367 if (coll_map.count(cid)) {
9368 coll_map[cid]->commit_queue = commit_queue;
9369 } else if (new_coll_map.count(cid)) {
9370 new_coll_map[cid]->commit_queue = commit_queue;
9371 }
9372 }
7c673cae
FG
9373}
9374
11fdf7f2 9375
7c673cae
FG
9376bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
9377{
9378 Collection *c = static_cast<Collection *>(c_.get());
9379 dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
9380 if (!c->exists)
9381 return false;
9382
9383 bool r = true;
9384
9385 {
9f95a23c 9386 std::shared_lock l(c->lock);
7c673cae
FG
9387 OnodeRef o = c->get_onode(oid, false);
9388 if (!o || !o->exists)
9389 r = false;
9390 }
9391
7c673cae
FG
9392 return r;
9393}
9394
7c673cae
FG
9395int BlueStore::stat(
9396 CollectionHandle &c_,
9397 const ghobject_t& oid,
9398 struct stat *st,
9399 bool allow_eio)
9400{
9401 Collection *c = static_cast<Collection *>(c_.get());
9402 if (!c->exists)
9403 return -ENOENT;
9404 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
9405
9406 {
9f95a23c 9407 std::shared_lock l(c->lock);
7c673cae
FG
9408 OnodeRef o = c->get_onode(oid, false);
9409 if (!o || !o->exists)
9410 return -ENOENT;
9411 st->st_size = o->onode.size;
9412 st->st_blksize = 4096;
9413 st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
9414 st->st_nlink = 1;
9415 }
9416
7c673cae
FG
9417 int r = 0;
9418 if (_debug_mdata_eio(oid)) {
9419 r = -EIO;
9420 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
9421 }
9422 return r;
9423}
9424int BlueStore::set_collection_opts(
11fdf7f2 9425 CollectionHandle& ch,
7c673cae
FG
9426 const pool_opts_t& opts)
9427{
7c673cae 9428 Collection *c = static_cast<Collection *>(ch.get());
11fdf7f2 9429 dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
7c673cae
FG
9430 if (!c->exists)
9431 return -ENOENT;
9f95a23c 9432 std::unique_lock l{c->lock};
7c673cae
FG
9433 c->pool_opts = opts;
9434 return 0;
9435}
9436
7c673cae
FG
9437int BlueStore::read(
9438 CollectionHandle &c_,
9439 const ghobject_t& oid,
9440 uint64_t offset,
9441 size_t length,
9442 bufferlist& bl,
224ce89b 9443 uint32_t op_flags)
7c673cae 9444{
11fdf7f2 9445 auto start = mono_clock::now();
7c673cae
FG
9446 Collection *c = static_cast<Collection *>(c_.get());
9447 const coll_t &cid = c->get_cid();
9448 dout(15) << __func__ << " " << cid << " " << oid
9449 << " 0x" << std::hex << offset << "~" << length << std::dec
9450 << dendl;
9451 if (!c->exists)
9452 return -ENOENT;
9453
9454 bl.clear();
9455 int r;
9456 {
9f95a23c 9457 std::shared_lock l(c->lock);
11fdf7f2 9458 auto start1 = mono_clock::now();
7c673cae 9459 OnodeRef o = c->get_onode(oid, false);
494da23a
TL
9460 log_latency("get_onode@read",
9461 l_bluestore_read_onode_meta_lat,
9462 mono_clock::now() - start1,
9463 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9464 if (!o || !o->exists) {
9465 r = -ENOENT;
9466 goto out;
9467 }
9468
9469 if (offset == length && offset == 0)
9470 length = o->onode.size;
9471
9472 r = _do_read(c, o, offset, length, bl, op_flags);
b32b8144
FG
9473 if (r == -EIO) {
9474 logger->inc(l_bluestore_read_eio);
9475 }
7c673cae
FG
9476 }
9477
9478 out:
28e407b8 9479 if (r >= 0 && _debug_data_eio(oid)) {
7c673cae
FG
9480 r = -EIO;
9481 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
11fdf7f2
TL
9482 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
9483 cct->_conf->bluestore_debug_random_read_err &&
9484 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
9485 100.0)) == 0) {
224ce89b
WB
9486 dout(0) << __func__ << ": inject random EIO" << dendl;
9487 r = -EIO;
7c673cae
FG
9488 }
9489 dout(10) << __func__ << " " << cid << " " << oid
9490 << " 0x" << std::hex << offset << "~" << length << std::dec
9491 << " = " << r << dendl;
494da23a
TL
9492 log_latency(__func__,
9493 l_bluestore_read_lat,
9494 mono_clock::now() - start,
9495 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9496 return r;
9497}
9498
9f95a23c 9499void BlueStore::_read_cache(
7c673cae
FG
9500 OnodeRef o,
9501 uint64_t offset,
9502 size_t length,
9f95a23c
TL
9503 int read_cache_policy,
9504 ready_regions_t& ready_regions,
9505 blobs2read_t& blobs2read)
7c673cae 9506{
7c673cae 9507 // build blob-wise list to of stuff read (that isn't cached)
7c673cae
FG
9508 unsigned left = length;
9509 uint64_t pos = offset;
7c673cae
FG
9510 auto lp = o->extent_map.seek_lextent(offset);
9511 while (left > 0 && lp != o->extent_map.extent_map.end()) {
9512 if (pos < lp->logical_offset) {
9513 unsigned hole = lp->logical_offset - pos;
9514 if (hole >= left) {
9f95a23c 9515 break;
7c673cae
FG
9516 }
9517 dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
9f95a23c 9518 << std::dec << dendl;
7c673cae
FG
9519 pos += hole;
9520 left -= hole;
9521 }
94b18763 9522 BlobRef& bptr = lp->blob;
7c673cae
FG
9523 unsigned l_off = pos - lp->logical_offset;
9524 unsigned b_off = l_off + lp->blob_offset;
9525 unsigned b_len = std::min(left, lp->length - l_off);
9526
9527 ready_regions_t cache_res;
9528 interval_set<uint32_t> cache_interval;
9529 bptr->shared_blob->bc.read(
91327a77
AA
9530 bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
9531 read_cache_policy);
7c673cae 9532 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c
TL
9533 << " need 0x" << b_off << "~" << b_len
9534 << " cache has 0x" << cache_interval
9535 << std::dec << dendl;
7c673cae
FG
9536
9537 auto pc = cache_res.begin();
11fdf7f2 9538 uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
7c673cae
FG
9539 while (b_len > 0) {
9540 unsigned l;
9541 if (pc != cache_res.end() &&
9f95a23c
TL
9542 pc->first == b_off) {
9543 l = pc->second.length();
9544 ready_regions[pos].claim(pc->second);
9545 dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
9546 << b_off << "~" << l << std::dec << dendl;
9547 ++pc;
7c673cae 9548 } else {
9f95a23c
TL
9549 l = b_len;
9550 if (pc != cache_res.end()) {
9551 ceph_assert(pc->first > b_off);
9552 l = pc->first - b_off;
9553 }
9554 dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
9555 << b_off << "~" << l << std::dec << dendl;
9556 // merge regions
9557 {
9558 uint64_t r_off = b_off;
9559 uint64_t r_len = l;
9560 uint64_t front = r_off % chunk_size;
9561 if (front) {
9562 r_off -= front;
9563 r_len += front;
9564 }
9565 unsigned tail = r_len % chunk_size;
9566 if (tail) {
9567 r_len += chunk_size - tail;
9568 }
9569 bool merged = false;
9570 regions2read_t& r2r = blobs2read[bptr];
9571 if (r2r.size()) {
9572 read_req_t& pre = r2r.back();
9573 if (r_off <= (pre.r_off + pre.r_len)) {
9574 front += (r_off - pre.r_off);
9575 pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
9576 pre.regs.emplace_back(region_t(pos, b_off, l, front));
9577 merged = true;
9578 }
9579 }
9580 if (!merged) {
9581 read_req_t req(r_off, r_len);
9582 req.regs.emplace_back(region_t(pos, b_off, l, front));
9583 r2r.emplace_back(std::move(req));
9584 }
9585 }
7c673cae
FG
9586 }
9587 pos += l;
9588 b_off += l;
9589 left -= l;
9590 b_len -= l;
9591 }
9592 ++lp;
9593 }
9f95a23c 9594}
7c673cae 9595
9f95a23c
TL
9596int BlueStore::_prepare_read_ioc(
9597 blobs2read_t& blobs2read,
9598 vector<bufferlist>* compressed_blob_bls,
9599 IOContext* ioc)
9600{
7c673cae 9601 for (auto& p : blobs2read) {
94b18763 9602 const BlobRef& bptr = p.first;
11fdf7f2 9603 regions2read_t& r2r = p.second;
7c673cae 9604 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9605 << " need " << r2r << std::dec << dendl;
7c673cae
FG
9606 if (bptr->get_blob().is_compressed()) {
9607 // read the whole thing
9f95a23c
TL
9608 if (compressed_blob_bls->empty()) {
9609 // ensure we avoid any reallocation on subsequent blobs
9610 compressed_blob_bls->reserve(blobs2read.size());
9611 }
9612 compressed_blob_bls->push_back(bufferlist());
9613 bufferlist& bl = compressed_blob_bls->back();
9614 auto r = bptr->get_blob().map(
9615 0, bptr->get_blob().get_ondisk_length(),
9616 [&](uint64_t offset, uint64_t length) {
9617 int r = bdev->aio_read(offset, length, &bl, ioc);
9618 if (r < 0)
7c673cae
FG
9619 return r;
9620 return 0;
9f95a23c 9621 });
b32b8144
FG
9622 if (r < 0) {
9623 derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
9624 if (r == -EIO) {
9625 // propagate EIO to caller
9626 return r;
9627 }
11fdf7f2 9628 ceph_assert(r == 0);
b32b8144 9629 }
7c673cae
FG
9630 } else {
9631 // read the pieces
11fdf7f2 9632 for (auto& req : r2r) {
9f95a23c
TL
9633 dout(20) << __func__ << " region 0x" << std::hex
9634 << req.regs.front().logical_offset
9635 << ": 0x" << req.regs.front().blob_xoffset
9636 << " reading 0x" << req.r_off
9637 << "~" << req.r_len << std::dec
9638 << dendl;
7c673cae 9639
9f95a23c
TL
9640 // read it
9641 auto r = bptr->get_blob().map(
9642 req.r_off, req.r_len,
9643 [&](uint64_t offset, uint64_t length) {
9644 int r = bdev->aio_read(offset, length, &req.bl, ioc);
9645 if (r < 0)
7c673cae
FG
9646 return r;
9647 return 0;
9f95a23c 9648 });
b32b8144
FG
9649 if (r < 0) {
9650 derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
9651 << dendl;
9652 if (r == -EIO) {
9653 // propagate EIO to caller
9654 return r;
9655 }
11fdf7f2 9656 ceph_assert(r == 0);
b32b8144 9657 }
9f95a23c 9658 ceph_assert(req.bl.length() == req.r_len);
7c673cae
FG
9659 }
9660 }
9661 }
9f95a23c
TL
9662 return 0;
9663}
11fdf7f2 9664
9f95a23c
TL
9665int BlueStore::_generate_read_result_bl(
9666 OnodeRef o,
9667 uint64_t offset,
9668 size_t length,
9669 ready_regions_t& ready_regions,
9670 vector<bufferlist>& compressed_blob_bls,
9671 blobs2read_t& blobs2read,
9672 bool buffered,
9673 bool* csum_error,
9674 bufferlist& bl)
9675{
9676 // enumerate and decompress desired blobs
7c673cae
FG
9677 auto p = compressed_blob_bls.begin();
9678 blobs2read_t::iterator b2r_it = blobs2read.begin();
9679 while (b2r_it != blobs2read.end()) {
94b18763 9680 const BlobRef& bptr = b2r_it->first;
11fdf7f2 9681 regions2read_t& r2r = b2r_it->second;
7c673cae 9682 dout(20) << __func__ << " blob " << *bptr << std::hex
9f95a23c 9683 << " need 0x" << r2r << std::dec << dendl;
7c673cae 9684 if (bptr->get_blob().is_compressed()) {
11fdf7f2 9685 ceph_assert(p != compressed_blob_bls.end());
7c673cae
FG
9686 bufferlist& compressed_bl = *p++;
9687 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
9f95a23c
TL
9688 r2r.front().regs.front().logical_offset) < 0) {
9689 *csum_error = true;
9690 return -EIO;
7c673cae
FG
9691 }
9692 bufferlist raw_bl;
9f95a23c 9693 auto r = _decompress(compressed_bl, &raw_bl);
7c673cae 9694 if (r < 0)
9f95a23c 9695 return r;
7c673cae 9696 if (buffered) {
9f95a23c
TL
9697 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
9698 raw_bl);
7c673cae 9699 }
11fdf7f2
TL
9700 for (auto& req : r2r) {
9701 for (auto& r : req.regs) {
9702 ready_regions[r.logical_offset].substr_of(
9703 raw_bl, r.blob_xoffset, r.length);
9704 }
7c673cae
FG
9705 }
9706 } else {
11fdf7f2 9707 for (auto& req : r2r) {
9f95a23c
TL
9708 if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
9709 req.regs.front().logical_offset) < 0) {
9710 *csum_error = true;
9711 return -EIO;
9712 }
9713 if (buffered) {
9714 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
9715 req.r_off, req.bl);
9716 }
7c673cae 9717
9f95a23c
TL
9718 // prune and keep result
9719 for (const auto& r : req.regs) {
9720 ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
11fdf7f2 9721 }
7c673cae
FG
9722 }
9723 }
9724 ++b2r_it;
9725 }
9726
9727 // generate a resulting buffer
9728 auto pr = ready_regions.begin();
9729 auto pr_end = ready_regions.end();
9f95a23c 9730 uint64_t pos = 0;
7c673cae
FG
9731 while (pos < length) {
9732 if (pr != pr_end && pr->first == pos + offset) {
9733 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9734 << ": data from 0x" << pr->first << "~" << pr->second.length()
9735 << std::dec << dendl;
7c673cae
FG
9736 pos += pr->second.length();
9737 bl.claim_append(pr->second);
9738 ++pr;
9739 } else {
9740 uint64_t l = length - pos;
9741 if (pr != pr_end) {
11fdf7f2 9742 ceph_assert(pr->first > pos + offset);
9f95a23c 9743 l = pr->first - (pos + offset);
7c673cae
FG
9744 }
9745 dout(30) << __func__ << " assemble 0x" << std::hex << pos
9f95a23c
TL
9746 << ": zeros for 0x" << (pos + offset) << "~" << l
9747 << std::dec << dendl;
7c673cae
FG
9748 bl.append_zero(l);
9749 pos += l;
9750 }
9751 }
11fdf7f2
TL
9752 ceph_assert(bl.length() == length);
9753 ceph_assert(pos == length);
9754 ceph_assert(pr == pr_end);
9f95a23c
TL
9755 return 0;
9756}
9757
9758int BlueStore::_do_read(
9759 Collection *c,
9760 OnodeRef o,
9761 uint64_t offset,
9762 size_t length,
9763 bufferlist& bl,
9764 uint32_t op_flags,
9765 uint64_t retry_count)
9766{
9767 FUNCTRACE(cct);
9768 int r = 0;
9769 int read_cache_policy = 0; // do not bypass clean or dirty cache
9770
9771 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9772 << " size 0x" << o->onode.size << " (" << std::dec
9773 << o->onode.size << ")" << dendl;
9774 bl.clear();
9775
9776 if (offset >= o->onode.size) {
9777 return r;
9778 }
9779
9780 // generally, don't buffer anything, unless the client explicitly requests
9781 // it.
9782 bool buffered = false;
9783 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
9784 dout(20) << __func__ << " will do buffered read" << dendl;
9785 buffered = true;
9786 } else if (cct->_conf->bluestore_default_buffered_read &&
9787 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
9788 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
9789 dout(20) << __func__ << " defaulting to buffered read" << dendl;
9790 buffered = true;
9791 }
9792
9793 if (offset + length > o->onode.size) {
9794 length = o->onode.size - offset;
9795 }
9796
9797 auto start = mono_clock::now();
9798 o->extent_map.fault_range(db, offset, length);
9799 log_latency(__func__,
9800 l_bluestore_read_onode_meta_lat,
9801 mono_clock::now() - start,
9802 cct->_conf->bluestore_log_op_age);
9803 _dump_onode<30>(cct, *o);
9804
9805 // for deep-scrub, we only read dirty cache and bypass clean cache in
9806 // order to read underlying block device in case there are silent disk errors.
9807 if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
9808 dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
9809 read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
9810 }
9811
9812 // build blob-wise list to of stuff read (that isn't cached)
9813 ready_regions_t ready_regions;
9814 blobs2read_t blobs2read;
9815 _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
9816
9817
9818 // read raw blob data.
9819 start = mono_clock::now(); // for the sake of simplicity
9820 // measure the whole block below.
9821 // The error isn't that much...
9822 vector<bufferlist> compressed_blob_bls;
9823 IOContext ioc(cct, NULL, true); // allow EIO
9824 r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
9825 // we always issue aio for reading, so errors other than EIO are not allowed
9826 if (r < 0)
9827 return r;
9828
9829 int64_t num_ios = length;
9830 if (ioc.has_pending_aios()) {
9831 num_ios = -ioc.get_num_ios();
9832 bdev->aio_submit(&ioc);
9833 dout(20) << __func__ << " waiting for aio" << dendl;
9834 ioc.aio_wait();
9835 r = ioc.get_return_value();
9836 if (r < 0) {
9837 ceph_assert(r == -EIO); // no other errors allowed
9838 return -EIO;
9839 }
9840 }
9841 log_latency_fn(__func__,
9842 l_bluestore_read_wait_aio_lat,
9843 mono_clock::now() - start,
9844 cct->_conf->bluestore_log_op_age,
9845 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
9846 );
9847
9848 bool csum_error = false;
9849 r = _generate_read_result_bl(o, offset, length, ready_regions,
9850 compressed_blob_bls, blobs2read,
9851 buffered, &csum_error, bl);
9852 if (csum_error) {
9853 // Handles spurious read errors caused by a kernel bug.
9854 // We sometimes get all-zero pages as a result of the read under
9855 // high memory pressure. Retrying the failing read succeeds in most
9856 // cases.
9857 // See also: http://tracker.ceph.com/issues/22464
9858 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
9859 return -EIO;
9860 }
9861 return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
9862 }
7c673cae 9863 r = bl.length();
f64942e4
AA
9864 if (retry_count) {
9865 logger->inc(l_bluestore_reads_with_retries);
9866 dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
9867 << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
9868 }
7c673cae
FG
9869 return r;
9870}
9871
9872int BlueStore::_verify_csum(OnodeRef& o,
9873 const bluestore_blob_t* blob, uint64_t blob_xoffset,
9874 const bufferlist& bl,
9875 uint64_t logical_offset) const
9876{
9877 int bad;
9878 uint64_t bad_csum;
11fdf7f2 9879 auto start = mono_clock::now();
7c673cae 9880 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
f64942e4
AA
9881 if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
9882 (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
9883 derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
9884 bad = blob_xoffset;
9885 r = -1;
9886 bad_csum = 0xDEADBEEF;
9887 }
7c673cae
FG
9888 if (r < 0) {
9889 if (r == -1) {
9890 PExtentVector pex;
9891 blob->map(
9892 bad,
9893 blob->get_csum_chunk_size(),
9894 [&](uint64_t offset, uint64_t length) {
9895 pex.emplace_back(bluestore_pextent_t(offset, length));
9896 return 0;
9897 });
9898 derr << __func__ << " bad "
9899 << Checksummer::get_csum_type_string(blob->csum_type)
9900 << "/0x" << std::hex << blob->get_csum_chunk_size()
9901 << " checksum at blob offset 0x" << bad
9902 << ", got 0x" << bad_csum << ", expected 0x"
9903 << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
9904 << ", device location " << pex
9905 << ", logical extent 0x" << std::hex
9906 << (logical_offset + bad - blob_xoffset) << "~"
9907 << blob->get_csum_chunk_size() << std::dec
9908 << ", object " << o->oid
9909 << dendl;
9910 } else {
9911 derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
9912 }
9913 }
494da23a
TL
9914 log_latency(__func__,
9915 l_bluestore_csum_lat,
9916 mono_clock::now() - start,
9917 cct->_conf->bluestore_log_op_age);
11fdf7f2
TL
9918 if (cct->_conf->bluestore_ignore_data_csum) {
9919 return 0;
9920 }
7c673cae
FG
9921 return r;
9922}
9923
9924int BlueStore::_decompress(bufferlist& source, bufferlist* result)
9925{
9926 int r = 0;
11fdf7f2
TL
9927 auto start = mono_clock::now();
9928 auto i = source.cbegin();
7c673cae 9929 bluestore_compression_header_t chdr;
11fdf7f2 9930 decode(chdr, i);
7c673cae
FG
9931 int alg = int(chdr.type);
9932 CompressorRef cp = compressor;
9933 if (!cp || (int)cp->get_type() != alg) {
9934 cp = Compressor::create(cct, alg);
9935 }
9936
9937 if (!cp.get()) {
9938 // if compressor isn't available - error, because cannot return
9939 // decompressed data?
11fdf7f2
TL
9940
9941 const char* alg_name = Compressor::get_comp_alg_name(alg);
9942 derr << __func__ << " can't load decompressor " << alg_name << dendl;
9943 _set_compression_alert(false, alg_name);
7c673cae
FG
9944 r = -EIO;
9945 } else {
9946 r = cp->decompress(i, chdr.length, *result);
9947 if (r < 0) {
9948 derr << __func__ << " decompression failed with exit code " << r << dendl;
9949 r = -EIO;
9950 }
9951 }
494da23a
TL
9952 log_latency(__func__,
9953 l_bluestore_decompress_lat,
9954 mono_clock::now() - start,
9955 cct->_conf->bluestore_log_op_age);
7c673cae
FG
9956 return r;
9957}
9958
9959// this stores fiemap into interval_set, other variations
9960// use it internally
9961int BlueStore::_fiemap(
9962 CollectionHandle &c_,
9963 const ghobject_t& oid,
9964 uint64_t offset,
9965 size_t length,
9966 interval_set<uint64_t>& destset)
9967{
9968 Collection *c = static_cast<Collection *>(c_.get());
9969 if (!c->exists)
9970 return -ENOENT;
9971 {
9f95a23c 9972 std::shared_lock l(c->lock);
7c673cae
FG
9973
9974 OnodeRef o = c->get_onode(oid, false);
9975 if (!o || !o->exists) {
9976 return -ENOENT;
9977 }
81eedcae 9978 _dump_onode<30>(cct, *o);
7c673cae
FG
9979
9980 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
9981 << " size 0x" << o->onode.size << std::dec << dendl;
9982
9983 boost::intrusive::set<Extent>::iterator ep, eend;
9984 if (offset >= o->onode.size)
9985 goto out;
9986
9987 if (offset + length > o->onode.size) {
9988 length = o->onode.size - offset;
9989 }
9990
9991 o->extent_map.fault_range(db, offset, length);
9992 eend = o->extent_map.extent_map.end();
9993 ep = o->extent_map.seek_lextent(offset);
9994 while (length > 0) {
9995 dout(20) << __func__ << " offset " << offset << dendl;
9996 if (ep != eend && ep->logical_offset + ep->length <= offset) {
9997 ++ep;
9998 continue;
9999 }
10000
10001 uint64_t x_len = length;
10002 if (ep != eend && ep->logical_offset <= offset) {
10003 uint64_t x_off = offset - ep->logical_offset;
11fdf7f2 10004 x_len = std::min(x_len, ep->length - x_off);
7c673cae
FG
10005 dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
10006 << x_len << std::dec << " blob " << ep->blob << dendl;
10007 destset.insert(offset, x_len);
10008 length -= x_len;
10009 offset += x_len;
10010 if (x_off + x_len == ep->length)
10011 ++ep;
10012 continue;
10013 }
10014 if (ep != eend &&
10015 ep->logical_offset > offset &&
10016 ep->logical_offset - offset < x_len) {
10017 x_len = ep->logical_offset - offset;
10018 }
10019 offset += x_len;
10020 length -= x_len;
10021 }
10022 }
9f95a23c
TL
10023
10024 out:
10025 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
10026 << " size = 0x(" << destset << ")" << std::dec << dendl;
10027 return 0;
10028}
10029
10030int BlueStore::fiemap(
10031 CollectionHandle &c_,
10032 const ghobject_t& oid,
10033 uint64_t offset,
10034 size_t length,
10035 bufferlist& bl)
10036{
10037 interval_set<uint64_t> m;
10038 int r = _fiemap(c_, oid, offset, length, m);
10039 if (r >= 0) {
10040 encode(m, bl);
10041 }
10042 return r;
10043}
10044
10045int BlueStore::fiemap(
10046 CollectionHandle &c_,
10047 const ghobject_t& oid,
10048 uint64_t offset,
10049 size_t length,
10050 map<uint64_t, uint64_t>& destmap)
10051{
10052 interval_set<uint64_t> m;
10053 int r = _fiemap(c_, oid, offset, length, m);
10054 if (r >= 0) {
10055 destmap = std::move(m).detach();
10056 }
10057 return r;
10058}
10059
10060int BlueStore::readv(
10061 CollectionHandle &c_,
10062 const ghobject_t& oid,
10063 interval_set<uint64_t>& m,
10064 bufferlist& bl,
10065 uint32_t op_flags)
10066{
10067 auto start = mono_clock::now();
10068 Collection *c = static_cast<Collection *>(c_.get());
10069 const coll_t &cid = c->get_cid();
10070 dout(15) << __func__ << " " << cid << " " << oid
10071 << " fiemap " << m
10072 << dendl;
10073 if (!c->exists)
10074 return -ENOENT;
10075
10076 bl.clear();
10077 int r;
10078 {
10079 std::shared_lock l(c->lock);
10080 auto start1 = mono_clock::now();
10081 OnodeRef o = c->get_onode(oid, false);
10082 log_latency("get_onode@read",
10083 l_bluestore_read_onode_meta_lat,
10084 mono_clock::now() - start1,
10085 cct->_conf->bluestore_log_op_age);
10086 if (!o || !o->exists) {
10087 r = -ENOENT;
10088 goto out;
10089 }
10090
10091 if (m.empty()) {
10092 r = 0;
10093 goto out;
10094 }
10095
10096 r = _do_readv(c, o, m, bl, op_flags);
10097 if (r == -EIO) {
10098 logger->inc(l_bluestore_read_eio);
10099 }
10100 }
10101
10102 out:
10103 if (r >= 0 && _debug_data_eio(oid)) {
10104 r = -EIO;
10105 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10106 } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
10107 cct->_conf->bluestore_debug_random_read_err &&
10108 (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
10109 100.0)) == 0) {
10110 dout(0) << __func__ << ": inject random EIO" << dendl;
10111 r = -EIO;
10112 }
10113 dout(10) << __func__ << " " << cid << " " << oid
10114 << " fiemap " << m << std::dec
10115 << " = " << r << dendl;
10116 log_latency(__func__,
10117 l_bluestore_read_lat,
10118 mono_clock::now() - start,
10119 cct->_conf->bluestore_log_op_age);
10120 return r;
10121}
10122
10123int BlueStore::_do_readv(
10124 Collection *c,
10125 OnodeRef o,
10126 const interval_set<uint64_t>& m,
10127 bufferlist& bl,
10128 uint32_t op_flags,
10129 uint64_t retry_count)
10130{
10131 FUNCTRACE(cct);
10132 int r = 0;
10133 int read_cache_policy = 0; // do not bypass clean or dirty cache
10134
10135 dout(20) << __func__ << " fiemap " << m << std::hex
10136 << " size 0x" << o->onode.size << " (" << std::dec
10137 << o->onode.size << ")" << dendl;
10138
10139 // generally, don't buffer anything, unless the client explicitly requests
10140 // it.
10141 bool buffered = false;
10142 if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
10143 dout(20) << __func__ << " will do buffered read" << dendl;
10144 buffered = true;
10145 } else if (cct->_conf->bluestore_default_buffered_read &&
10146 (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
10147 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
10148 dout(20) << __func__ << " defaulting to buffered read" << dendl;
10149 buffered = true;
10150 }
10151 // this method must be idempotent since we may call it several times
10152 // before we finally read the expected result.
10153 bl.clear();
10154
10155 // call fiemap first!
10156 ceph_assert(m.range_start() <= o->onode.size);
10157 ceph_assert(m.range_end() <= o->onode.size);
10158 auto start = mono_clock::now();
10159 o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
10160 log_latency(__func__,
10161 l_bluestore_read_onode_meta_lat,
10162 mono_clock::now() - start,
10163 cct->_conf->bluestore_log_op_age);
10164 _dump_onode<30>(cct, *o);
10165
10166 IOContext ioc(cct, NULL, true); // allow EIO
10167 vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
10168 raw_results.reserve(m.num_intervals());
10169 int i = 0;
10170 for (auto p = m.begin(); p != m.end(); p++, i++) {
10171 raw_results.push_back({});
10172 _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
10173 std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
10174 r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
10175 // we always issue aio for reading, so errors other than EIO are not allowed
10176 if (r < 0)
10177 return r;
10178 }
10179
10180 auto num_ios = m.size();
10181 if (ioc.has_pending_aios()) {
10182 num_ios = ioc.get_num_ios();
10183 bdev->aio_submit(&ioc);
10184 dout(20) << __func__ << " waiting for aio" << dendl;
10185 ioc.aio_wait();
10186 r = ioc.get_return_value();
10187 if (r < 0) {
10188 ceph_assert(r == -EIO); // no other errors allowed
10189 return -EIO;
10190 }
10191 }
10192 log_latency_fn(__func__,
10193 l_bluestore_read_wait_aio_lat,
10194 mono_clock::now() - start,
10195 cct->_conf->bluestore_log_op_age,
10196 [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
10197 );
10198
10199 ceph_assert(raw_results.size() == (size_t)m.num_intervals());
10200 i = 0;
10201 for (auto p = m.begin(); p != m.end(); p++, i++) {
10202 bool csum_error = false;
10203 bufferlist t;
10204 r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
10205 std::get<0>(raw_results[i]),
10206 std::get<1>(raw_results[i]),
10207 std::get<2>(raw_results[i]),
10208 buffered, &csum_error, t);
10209 if (csum_error) {
10210 // Handles spurious read errors caused by a kernel bug.
10211 // We sometimes get all-zero pages as a result of the read under
10212 // high memory pressure. Retrying the failing read succeeds in most
10213 // cases.
10214 // See also: http://tracker.ceph.com/issues/22464
10215 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
10216 return -EIO;
10217 }
10218 return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
10219 }
10220 bl.claim_append(t);
10221 }
10222 if (retry_count) {
10223 logger->inc(l_bluestore_reads_with_retries);
10224 dout(5) << __func__ << " read fiemap " << m
10225 << " failed " << retry_count << " times before succeeding"
10226 << dendl;
10227 }
10228 return bl.length();
7c673cae
FG
10229}
10230
9f95a23c 10231int BlueStore::dump_onode(CollectionHandle &c_,
7c673cae 10232 const ghobject_t& oid,
9f95a23c
TL
10233 const string& section_name,
10234 Formatter *f)
7c673cae 10235{
9f95a23c
TL
10236 Collection *c = static_cast<Collection *>(c_.get());
10237 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10238 if (!c->exists)
10239 return -ENOENT;
7c673cae 10240
9f95a23c
TL
10241 int r;
10242 {
10243 std::shared_lock l(c->lock);
10244
10245 OnodeRef o = c->get_onode(oid, false);
10246 if (!o || !o->exists) {
10247 r = -ENOENT;
10248 goto out;
10249 }
10250 // FIXME minor: actually the next line isn't enough to
10251 // load shared blobs. Leaving as is for now..
10252 //
10253 o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
10254
10255 _dump_onode<0>(cct, *o);
10256 f->open_object_section(section_name.c_str());
10257 o->dump(f);
10258 f->close_section();
10259 r = 0;
7c673cae 10260 }
9f95a23c
TL
10261 out:
10262 dout(10) << __func__ << " " << c->cid << " " << oid
10263 << " = " << r << dendl;
7c673cae
FG
10264 return r;
10265}
10266
7c673cae
FG
10267int BlueStore::getattr(
10268 CollectionHandle &c_,
10269 const ghobject_t& oid,
10270 const char *name,
10271 bufferptr& value)
10272{
10273 Collection *c = static_cast<Collection *>(c_.get());
10274 dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
10275 if (!c->exists)
10276 return -ENOENT;
10277
10278 int r;
10279 {
9f95a23c 10280 std::shared_lock l(c->lock);
31f18b77 10281 mempool::bluestore_cache_other::string k(name);
7c673cae
FG
10282
10283 OnodeRef o = c->get_onode(oid, false);
10284 if (!o || !o->exists) {
10285 r = -ENOENT;
10286 goto out;
10287 }
10288
10289 if (!o->onode.attrs.count(k)) {
10290 r = -ENODATA;
10291 goto out;
10292 }
10293 value = o->onode.attrs[k];
10294 r = 0;
10295 }
10296 out:
7c673cae
FG
10297 if (r == 0 && _debug_mdata_eio(oid)) {
10298 r = -EIO;
10299 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10300 }
10301 dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
10302 << " = " << r << dendl;
10303 return r;
10304}
10305
7c673cae
FG
10306int BlueStore::getattrs(
10307 CollectionHandle &c_,
10308 const ghobject_t& oid,
10309 map<string,bufferptr>& aset)
10310{
10311 Collection *c = static_cast<Collection *>(c_.get());
10312 dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
10313 if (!c->exists)
10314 return -ENOENT;
10315
10316 int r;
10317 {
9f95a23c 10318 std::shared_lock l(c->lock);
7c673cae
FG
10319
10320 OnodeRef o = c->get_onode(oid, false);
10321 if (!o || !o->exists) {
10322 r = -ENOENT;
10323 goto out;
10324 }
10325 for (auto& i : o->onode.attrs) {
10326 aset.emplace(i.first.c_str(), i.second);
10327 }
10328 r = 0;
10329 }
10330
10331 out:
7c673cae
FG
10332 if (r == 0 && _debug_mdata_eio(oid)) {
10333 r = -EIO;
10334 derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
10335 }
10336 dout(10) << __func__ << " " << c->cid << " " << oid
10337 << " = " << r << dendl;
10338 return r;
10339}
10340
10341int BlueStore::list_collections(vector<coll_t>& ls)
10342{
9f95a23c 10343 std::shared_lock l(coll_lock);
11fdf7f2 10344 ls.reserve(coll_map.size());
7c673cae
FG
10345 for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
10346 p != coll_map.end();
10347 ++p)
10348 ls.push_back(p->first);
10349 return 0;
10350}
10351
10352bool BlueStore::collection_exists(const coll_t& c)
10353{
9f95a23c 10354 std::shared_lock l(coll_lock);
7c673cae
FG
10355 return coll_map.count(c);
10356}
10357
11fdf7f2 10358int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
7c673cae 10359{
11fdf7f2 10360 dout(15) << __func__ << " " << ch->cid << dendl;
7c673cae
FG
10361 vector<ghobject_t> ls;
10362 ghobject_t next;
11fdf7f2 10363 int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
7c673cae
FG
10364 &ls, &next);
10365 if (r < 0) {
10366 derr << __func__ << " collection_list returned: " << cpp_strerror(r)
10367 << dendl;
10368 return r;
10369 }
10370 *empty = ls.empty();
11fdf7f2 10371 dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
7c673cae
FG
10372 return 0;
10373}
10374
11fdf7f2 10375int BlueStore::collection_bits(CollectionHandle& ch)
7c673cae 10376{
11fdf7f2
TL
10377 dout(15) << __func__ << " " << ch->cid << dendl;
10378 Collection *c = static_cast<Collection*>(ch.get());
9f95a23c 10379 std::shared_lock l(c->lock);
11fdf7f2 10380 dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
7c673cae
FG
10381 return c->cnode.bits;
10382}
10383
7c673cae
FG
10384int BlueStore::collection_list(
10385 CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
10386 vector<ghobject_t> *ls, ghobject_t *pnext)
10387{
10388 Collection *c = static_cast<Collection *>(c_.get());
11fdf7f2 10389 c->flush();
7c673cae
FG
10390 dout(15) << __func__ << " " << c->cid
10391 << " start " << start << " end " << end << " max " << max << dendl;
10392 int r;
10393 {
9f95a23c 10394 std::shared_lock l(c->lock);
7c673cae
FG
10395 r = _collection_list(c, start, end, max, ls, pnext);
10396 }
10397
7c673cae
FG
10398 dout(10) << __func__ << " " << c->cid
10399 << " start " << start << " end " << end << " max " << max
10400 << " = " << r << ", ls.size() = " << ls->size()
10401 << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
10402 return r;
10403}
10404
10405int BlueStore::_collection_list(
10406 Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
10407 vector<ghobject_t> *ls, ghobject_t *pnext)
10408{
10409
10410 if (!c->exists)
10411 return -ENOENT;
10412
494da23a 10413 auto start_time = mono_clock::now();
7c673cae
FG
10414 int r = 0;
10415 ghobject_t static_next;
10416 KeyValueDB::Iterator it;
10417 string temp_start_key, temp_end_key;
10418 string start_key, end_key;
10419 bool set_next = false;
10420 string pend;
10421 bool temp;
10422
10423 if (!pnext)
10424 pnext = &static_next;
10425
11fdf7f2 10426 if (start.is_max() || start.hobj.is_max()) {
7c673cae
FG
10427 goto out;
10428 }
10429 get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
10430 &start_key, &end_key);
10431 dout(20) << __func__
10432 << " range " << pretty_binary_string(temp_start_key)
10433 << " to " << pretty_binary_string(temp_end_key)
10434 << " and " << pretty_binary_string(start_key)
10435 << " to " << pretty_binary_string(end_key)
10436 << " start " << start << dendl;
10437 it = db->get_iterator(PREFIX_OBJ);
10438 if (start == ghobject_t() ||
10439 start.hobj == hobject_t() ||
10440 start == c->cid.get_min_hobj()) {
10441 it->upper_bound(temp_start_key);
10442 temp = true;
10443 } else {
10444 string k;
10445 get_object_key(cct, start, &k);
10446 if (start.hobj.is_temp()) {
10447 temp = true;
11fdf7f2 10448 ceph_assert(k >= temp_start_key && k < temp_end_key);
7c673cae
FG
10449 } else {
10450 temp = false;
11fdf7f2 10451 ceph_assert(k >= start_key && k < end_key);
7c673cae 10452 }
11fdf7f2 10453 dout(20) << __func__ << " start from " << pretty_binary_string(k)
7c673cae
FG
10454 << " temp=" << (int)temp << dendl;
10455 it->lower_bound(k);
10456 }
10457 if (end.hobj.is_max()) {
10458 pend = temp ? temp_end_key : end_key;
10459 } else {
10460 get_object_key(cct, end, &end_key);
10461 if (end.hobj.is_temp()) {
10462 if (temp)
10463 pend = end_key;
10464 else
10465 goto out;
10466 } else {
10467 pend = temp ? temp_end_key : end_key;
10468 }
10469 }
10470 dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
10471 while (true) {
10472 if (!it->valid() || it->key() >= pend) {
10473 if (!it->valid())
10474 dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
10475 else
10476 dout(20) << __func__ << " key " << pretty_binary_string(it->key())
10477 << " >= " << end << dendl;
10478 if (temp) {
10479 if (end.hobj.is_temp()) {
10480 break;
10481 }
10482 dout(30) << __func__ << " switch to non-temp namespace" << dendl;
10483 temp = false;
10484 it->upper_bound(start_key);
10485 pend = end_key;
10486 dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
10487 continue;
10488 }
10489 break;
10490 }
10491 dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
10492 if (is_extent_shard_key(it->key())) {
10493 it->next();
10494 continue;
10495 }
10496 ghobject_t oid;
10497 int r = get_key_object(it->key(), &oid);
11fdf7f2 10498 ceph_assert(r == 0);
7c673cae
FG
10499 dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
10500 if (ls->size() >= (unsigned)max) {
10501 dout(20) << __func__ << " reached max " << max << dendl;
10502 *pnext = oid;
10503 set_next = true;
10504 break;
10505 }
10506 ls->push_back(oid);
10507 it->next();
10508 }
10509out:
10510 if (!set_next) {
10511 *pnext = ghobject_t::get_max();
10512 }
494da23a
TL
10513 log_latency_fn(
10514 __func__,
10515 l_bluestore_clist_lat,
10516 mono_clock::now() - start_time,
10517 cct->_conf->bluestore_log_collection_list_age,
10518 [&] (const ceph::timespan& lat) {
10519 ostringstream ostr;
10520 ostr << ", lat = " << timespan_str(lat)
10521 << " cid =" << c->cid
10522 << " start " << start << " end " << end
10523 << " max " << max;
10524 return ostr.str();
10525 }
10526 );
7c673cae
FG
10527 return r;
10528}
10529
7c673cae
FG
10530int BlueStore::omap_get(
10531 CollectionHandle &c_, ///< [in] Collection containing oid
10532 const ghobject_t &oid, ///< [in] Object containing omap
10533 bufferlist *header, ///< [out] omap header
10534 map<string, bufferlist> *out /// < [out] Key to value map
10535 )
10536{
10537 Collection *c = static_cast<Collection *>(c_.get());
9f95a23c
TL
10538 return _omap_get(c, oid, header, out);
10539}
10540
10541int BlueStore::_omap_get(
10542 Collection *c, ///< [in] Collection containing oid
10543 const ghobject_t &oid, ///< [in] Object containing omap
10544 bufferlist *header, ///< [out] omap header
10545 map<string, bufferlist> *out /// < [out] Key to value map
10546 )
10547{
7c673cae
FG
10548 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10549 if (!c->exists)
10550 return -ENOENT;
9f95a23c 10551 std::shared_lock l(c->lock);
7c673cae
FG
10552 int r = 0;
10553 OnodeRef o = c->get_onode(oid, false);
10554 if (!o || !o->exists) {
10555 r = -ENOENT;
10556 goto out;
10557 }
9f95a23c
TL
10558 r = _onode_omap_get(o, header, out);
10559 out:
10560 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10561 << dendl;
10562 return r;
10563}
10564
10565int BlueStore::_onode_omap_get(
10566 const OnodeRef &o, ///< [in] Object containing omap
10567 bufferlist *header, ///< [out] omap header
10568 map<string, bufferlist> *out /// < [out] Key to value map
10569)
10570{
10571 int r = 0;
10572 if (!o || !o->exists) {
10573 r = -ENOENT;
10574 goto out;
10575 }
7c673cae
FG
10576 if (!o->onode.has_omap())
10577 goto out;
10578 o->flush();
10579 {
9f95a23c 10580 const string& prefix = o->get_omap_prefix();
11fdf7f2 10581 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10582 string head, tail;
9f95a23c
TL
10583 o->get_omap_header(&head);
10584 o->get_omap_tail(&tail);
7c673cae
FG
10585 it->lower_bound(head);
10586 while (it->valid()) {
10587 if (it->key() == head) {
9f95a23c
TL
10588 dout(30) << __func__ << " got header" << dendl;
10589 *header = it->value();
7c673cae 10590 } else if (it->key() >= tail) {
9f95a23c
TL
10591 dout(30) << __func__ << " reached tail" << dendl;
10592 break;
7c673cae 10593 } else {
9f95a23c
TL
10594 string user_key;
10595 o->decode_omap_key(it->key(), &user_key);
10596 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
10597 << " -> " << user_key << dendl;
10598 (*out)[user_key] = it->value();
7c673cae
FG
10599 }
10600 it->next();
10601 }
10602 }
9f95a23c 10603out:
7c673cae
FG
10604 return r;
10605}
10606
7c673cae
FG
10607int BlueStore::omap_get_header(
10608 CollectionHandle &c_, ///< [in] Collection containing oid
10609 const ghobject_t &oid, ///< [in] Object containing omap
10610 bufferlist *header, ///< [out] omap header
10611 bool allow_eio ///< [in] don't assert on eio
10612 )
10613{
10614 Collection *c = static_cast<Collection *>(c_.get());
10615 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10616 if (!c->exists)
10617 return -ENOENT;
9f95a23c 10618 std::shared_lock l(c->lock);
7c673cae
FG
10619 int r = 0;
10620 OnodeRef o = c->get_onode(oid, false);
10621 if (!o || !o->exists) {
10622 r = -ENOENT;
10623 goto out;
10624 }
10625 if (!o->onode.has_omap())
10626 goto out;
10627 o->flush();
10628 {
10629 string head;
9f95a23c
TL
10630 o->get_omap_header(&head);
10631 if (db->get(o->get_omap_prefix(), head, header) >= 0) {
7c673cae
FG
10632 dout(30) << __func__ << " got header" << dendl;
10633 } else {
10634 dout(30) << __func__ << " no header" << dendl;
10635 }
10636 }
10637 out:
10638 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10639 << dendl;
10640 return r;
10641}
10642
7c673cae
FG
10643int BlueStore::omap_get_keys(
10644 CollectionHandle &c_, ///< [in] Collection containing oid
10645 const ghobject_t &oid, ///< [in] Object containing omap
10646 set<string> *keys ///< [out] Keys defined on oid
10647 )
10648{
10649 Collection *c = static_cast<Collection *>(c_.get());
10650 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10651 if (!c->exists)
10652 return -ENOENT;
9f95a23c 10653 std::shared_lock l(c->lock);
7c673cae
FG
10654 int r = 0;
10655 OnodeRef o = c->get_onode(oid, false);
10656 if (!o || !o->exists) {
10657 r = -ENOENT;
10658 goto out;
10659 }
10660 if (!o->onode.has_omap())
10661 goto out;
10662 o->flush();
10663 {
9f95a23c 10664 const string& prefix = o->get_omap_prefix();
11fdf7f2 10665 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 10666 string head, tail;
9f95a23c
TL
10667 o->get_omap_key(string(), &head);
10668 o->get_omap_tail(&tail);
7c673cae
FG
10669 it->lower_bound(head);
10670 while (it->valid()) {
10671 if (it->key() >= tail) {
10672 dout(30) << __func__ << " reached tail" << dendl;
10673 break;
10674 }
10675 string user_key;
9f95a23c 10676 o->decode_omap_key(it->key(), &user_key);
11fdf7f2 10677 dout(20) << __func__ << " got " << pretty_binary_string(it->key())
7c673cae
FG
10678 << " -> " << user_key << dendl;
10679 keys->insert(user_key);
10680 it->next();
11fdf7f2
TL
10681 }
10682 }
10683 out:
10684 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10685 << dendl;
10686 return r;
7c673cae
FG
10687}
10688
10689int BlueStore::omap_get_values(
10690 CollectionHandle &c_, ///< [in] Collection containing oid
10691 const ghobject_t &oid, ///< [in] Object containing omap
10692 const set<string> &keys, ///< [in] Keys to get
10693 map<string, bufferlist> *out ///< [out] Returned keys and values
10694 )
10695{
10696 Collection *c = static_cast<Collection *>(c_.get());
10697 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10698 if (!c->exists)
10699 return -ENOENT;
9f95a23c 10700 std::shared_lock l(c->lock);
7c673cae
FG
10701 int r = 0;
10702 string final_key;
10703 OnodeRef o = c->get_onode(oid, false);
10704 if (!o || !o->exists) {
10705 r = -ENOENT;
10706 goto out;
10707 }
9f95a23c 10708 if (!o->onode.has_omap()) {
7c673cae 10709 goto out;
9f95a23c
TL
10710 }
10711 o->flush();
11fdf7f2 10712 {
9f95a23c
TL
10713 const string& prefix = o->get_omap_prefix();
10714 o->get_omap_key(string(), &final_key);
10715 size_t base_key_len = final_key.size();
11fdf7f2 10716 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10717 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
10718 final_key += *p;
10719 bufferlist val;
10720 if (db->get(prefix, final_key, &val) >= 0) {
10721 dout(30) << __func__ << " got " << pretty_binary_string(final_key)
10722 << " -> " << *p << dendl;
10723 out->insert(make_pair(*p, val));
10724 }
7c673cae
FG
10725 }
10726 }
10727 out:
10728 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10729 << dendl;
10730 return r;
10731}
10732
9f95a23c
TL
10733#ifdef WITH_SEASTAR
10734int BlueStore::omap_get_values(
10735 CollectionHandle &c_, ///< [in] Collection containing oid
10736 const ghobject_t &oid, ///< [in] Object containing omap
10737 const std::optional<string> &start_after, ///< [in] Keys to get
10738 map<string, bufferlist> *output ///< [out] Returned keys and values
10739 )
10740{
10741 Collection *c = static_cast<Collection *>(c_.get());
10742 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10743 if (!c->exists)
10744 return -ENOENT;
10745 std::shared_lock l(c->lock);
10746 int r = 0;
10747 OnodeRef o = c->get_onode(oid, false);
10748 if (!o || !o->exists) {
10749 r = -ENOENT;
10750 goto out;
10751 }
10752 if (!o->onode.has_omap()) {
10753 goto out;
10754 }
10755 o->flush();
10756 {
10757 ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
10758 if (!iter) {
10759 r = -ENOENT;
10760 goto out;
10761 }
10762 iter->upper_bound(*start_after);
10763 for (; iter->valid(); iter->next()) {
10764 output->insert(make_pair(iter->key(), iter->value()));
10765 }
10766 }
10767
10768out:
10769 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10770 << dendl;
10771 return r;
10772}
10773#endif
10774
7c673cae
FG
10775int BlueStore::omap_check_keys(
10776 CollectionHandle &c_, ///< [in] Collection containing oid
10777 const ghobject_t &oid, ///< [in] Object containing omap
10778 const set<string> &keys, ///< [in] Keys to check
10779 set<string> *out ///< [out] Subset of keys defined on oid
10780 )
10781{
10782 Collection *c = static_cast<Collection *>(c_.get());
10783 dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
10784 if (!c->exists)
10785 return -ENOENT;
9f95a23c 10786 std::shared_lock l(c->lock);
7c673cae
FG
10787 int r = 0;
10788 string final_key;
10789 OnodeRef o = c->get_onode(oid, false);
10790 if (!o || !o->exists) {
10791 r = -ENOENT;
10792 goto out;
10793 }
9f95a23c 10794 if (!o->onode.has_omap()) {
7c673cae 10795 goto out;
9f95a23c
TL
10796 }
10797 o->flush();
11fdf7f2 10798 {
9f95a23c
TL
10799 const string& prefix = o->get_omap_prefix();
10800 o->get_omap_key(string(), &final_key);
10801 size_t base_key_len = final_key.size();
11fdf7f2 10802 for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
9f95a23c 10803 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
10804 final_key += *p;
10805 bufferlist val;
10806 if (db->get(prefix, final_key, &val) >= 0) {
10807 dout(30) << __func__ << " have " << pretty_binary_string(final_key)
10808 << " -> " << *p << dendl;
10809 out->insert(*p);
10810 } else {
10811 dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
10812 << " -> " << *p << dendl;
10813 }
7c673cae
FG
10814 }
10815 }
10816 out:
10817 dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
10818 << dendl;
10819 return r;
10820}
10821
7c673cae
FG
10822ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
10823 CollectionHandle &c_, ///< [in] collection
10824 const ghobject_t &oid ///< [in] object
10825 )
10826{
10827 Collection *c = static_cast<Collection *>(c_.get());
10828 dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
10829 if (!c->exists) {
10830 return ObjectMap::ObjectMapIterator();
10831 }
9f95a23c 10832 std::shared_lock l(c->lock);
7c673cae
FG
10833 OnodeRef o = c->get_onode(oid, false);
10834 if (!o || !o->exists) {
10835 dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
10836 return ObjectMap::ObjectMapIterator();
10837 }
10838 o->flush();
10839 dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
9f95a23c 10840 KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
7c673cae
FG
10841 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
10842}
10843
10844// -----------------
10845// write helpers
10846
11fdf7f2
TL
10847uint64_t BlueStore::_get_ondisk_reserved() const {
10848 return round_up_to(
10849 std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
10850}
10851
7c673cae
FG
10852void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
10853{
10854 dout(10) << __func__ << " ondisk_format " << ondisk_format
10855 << " min_compat_ondisk_format " << min_compat_ondisk_format
10856 << dendl;
11fdf7f2 10857 ceph_assert(ondisk_format == latest_ondisk_format);
7c673cae
FG
10858 {
10859 bufferlist bl;
11fdf7f2 10860 encode(ondisk_format, bl);
7c673cae
FG
10861 t->set(PREFIX_SUPER, "ondisk_format", bl);
10862 }
10863 {
10864 bufferlist bl;
11fdf7f2 10865 encode(min_compat_ondisk_format, bl);
7c673cae
FG
10866 t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
10867 }
10868}
10869
10870int BlueStore::_open_super_meta()
10871{
10872 // nid
10873 {
10874 nid_max = 0;
10875 bufferlist bl;
10876 db->get(PREFIX_SUPER, "nid_max", &bl);
11fdf7f2 10877 auto p = bl.cbegin();
7c673cae
FG
10878 try {
10879 uint64_t v;
11fdf7f2 10880 decode(v, p);
7c673cae
FG
10881 nid_max = v;
10882 } catch (buffer::error& e) {
10883 derr << __func__ << " unable to read nid_max" << dendl;
10884 return -EIO;
10885 }
10886 dout(10) << __func__ << " old nid_max " << nid_max << dendl;
10887 nid_last = nid_max.load();
10888 }
10889
10890 // blobid
10891 {
10892 blobid_max = 0;
10893 bufferlist bl;
10894 db->get(PREFIX_SUPER, "blobid_max", &bl);
11fdf7f2 10895 auto p = bl.cbegin();
7c673cae
FG
10896 try {
10897 uint64_t v;
11fdf7f2 10898 decode(v, p);
7c673cae
FG
10899 blobid_max = v;
10900 } catch (buffer::error& e) {
10901 derr << __func__ << " unable to read blobid_max" << dendl;
10902 return -EIO;
10903 }
10904 dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
10905 blobid_last = blobid_max.load();
10906 }
10907
10908 // freelist
10909 {
10910 bufferlist bl;
10911 db->get(PREFIX_SUPER, "freelist_type", &bl);
10912 if (bl.length()) {
10913 freelist_type = std::string(bl.c_str(), bl.length());
10914 dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
10915 } else {
11fdf7f2 10916 ceph_abort_msg("Not Support extent freelist manager");
7c673cae 10917 }
7c673cae
FG
10918 }
10919
10920 // ondisk format
10921 int32_t compat_ondisk_format = 0;
10922 {
10923 bufferlist bl;
10924 int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
10925 if (r < 0) {
10926 // base case: kraken bluestore is v1 and readable by v1
10927 dout(20) << __func__ << " missing ondisk_format; assuming kraken"
10928 << dendl;
10929 ondisk_format = 1;
10930 compat_ondisk_format = 1;
10931 } else {
11fdf7f2 10932 auto p = bl.cbegin();
7c673cae 10933 try {
11fdf7f2 10934 decode(ondisk_format, p);
7c673cae
FG
10935 } catch (buffer::error& e) {
10936 derr << __func__ << " unable to read ondisk_format" << dendl;
10937 return -EIO;
10938 }
10939 bl.clear();
10940 {
10941 r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
11fdf7f2
TL
10942 ceph_assert(!r);
10943 auto p = bl.cbegin();
7c673cae 10944 try {
11fdf7f2 10945 decode(compat_ondisk_format, p);
7c673cae
FG
10946 } catch (buffer::error& e) {
10947 derr << __func__ << " unable to read compat_ondisk_format" << dendl;
10948 return -EIO;
10949 }
10950 }
10951 }
10952 dout(10) << __func__ << " ondisk_format " << ondisk_format
10953 << " compat_ondisk_format " << compat_ondisk_format
10954 << dendl;
10955 }
10956
10957 if (latest_ondisk_format < compat_ondisk_format) {
10958 derr << __func__ << " compat_ondisk_format is "
10959 << compat_ondisk_format << " but we only understand version "
10960 << latest_ondisk_format << dendl;
10961 return -EPERM;
10962 }
7c673cae
FG
10963
10964 {
10965 bufferlist bl;
10966 db->get(PREFIX_SUPER, "min_alloc_size", &bl);
11fdf7f2 10967 auto p = bl.cbegin();
7c673cae
FG
10968 try {
10969 uint64_t val;
11fdf7f2 10970 decode(val, p);
7c673cae 10971 min_alloc_size = val;
224ce89b 10972 min_alloc_size_order = ctz(val);
11fdf7f2 10973 ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
7c673cae
FG
10974 } catch (buffer::error& e) {
10975 derr << __func__ << " unable to read min_alloc_size" << dendl;
10976 return -EIO;
10977 }
10978 dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
10979 << std::dec << dendl;
10980 }
9f95a23c
TL
10981
10982 _set_per_pool_omap();
10983
224ce89b 10984 _open_statfs();
7c673cae
FG
10985 _set_alloc_sizes();
10986 _set_throttle_params();
10987
10988 _set_csum();
10989 _set_compression();
10990 _set_blob_size();
10991
11fdf7f2 10992 _validate_bdev();
7c673cae
FG
10993 return 0;
10994}
10995
10996int BlueStore::_upgrade_super()
10997{
10998 dout(1) << __func__ << " from " << ondisk_format << ", latest "
10999 << latest_ondisk_format << dendl;
11fdf7f2
TL
11000 if (ondisk_format < latest_ondisk_format) {
11001 ceph_assert(ondisk_format > 0);
11002 ceph_assert(ondisk_format < latest_ondisk_format);
11003
11004 if (ondisk_format == 1) {
11005 // changes:
11006 // - super: added ondisk_format
11007 // - super: added min_readable_ondisk_format
11008 // - super: added min_compat_ondisk_format
11009 // - super: added min_alloc_size
11010 // - super: removed min_min_alloc_size
11011 KeyValueDB::Transaction t = db->get_transaction();
11012 {
11013 bufferlist bl;
11014 db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
11015 auto p = bl.cbegin();
11016 try {
11017 uint64_t val;
11018 decode(val, p);
11019 min_alloc_size = val;
11020 } catch (buffer::error& e) {
11021 derr << __func__ << " failed to read min_min_alloc_size" << dendl;
11022 return -EIO;
11023 }
11024 t->set(PREFIX_SUPER, "min_alloc_size", bl);
11025 t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
7c673cae 11026 }
11fdf7f2
TL
11027 ondisk_format = 2;
11028 _prepare_ondisk_format_super(t);
11029 int r = db->submit_transaction_sync(t);
11030 ceph_assert(r == 0);
7c673cae 11031 }
9f95a23c
TL
11032 if (ondisk_format == 2) {
11033 // changes:
11034 // - onode has FLAG_PER_POOL_OMAP. Note that we do not know that *all*
11035 // ondes are using the per-pool prefix until a repair is run; at that
11036 // point the per_pool_omap=1 key will be set.
11037 // - super: added per_pool_omap key, which indicates that *all* objects
11038 // are using the new prefix and key format
11039 ondisk_format = 3;
11040 KeyValueDB::Transaction t = db->get_transaction();
11041 _prepare_ondisk_format_super(t);
11042 int r = db->submit_transaction_sync(t);
11043 ceph_assert(r == 0);
11044 }
7c673cae 11045 }
7c673cae
FG
11046 // done
11047 dout(1) << __func__ << " done" << dendl;
11048 return 0;
11049}
11050
11051void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
11052{
224ce89b 11053 if (o->onode.nid) {
11fdf7f2 11054 ceph_assert(o->exists);
7c673cae 11055 return;
224ce89b 11056 }
7c673cae
FG
11057 uint64_t nid = ++nid_last;
11058 dout(20) << __func__ << " " << nid << dendl;
11059 o->onode.nid = nid;
11060 txc->last_nid = nid;
224ce89b 11061 o->exists = true;
7c673cae
FG
11062}
11063
11064uint64_t BlueStore::_assign_blobid(TransContext *txc)
11065{
11066 uint64_t bid = ++blobid_last;
11067 dout(20) << __func__ << " " << bid << dendl;
11068 txc->last_blobid = bid;
11069 return bid;
11070}
11071
11072void BlueStore::get_db_statistics(Formatter *f)
11073{
11074 db->get_statistics(f);
11075}
11076
11fdf7f2
TL
11077BlueStore::TransContext *BlueStore::_txc_create(
11078 Collection *c, OpSequencer *osr,
11079 list<Context*> *on_commits)
7c673cae 11080{
11fdf7f2 11081 TransContext *txc = new TransContext(cct, c, osr, on_commits);
7c673cae
FG
11082 txc->t = db->get_transaction();
11083 osr->queue_new(txc);
11084 dout(20) << __func__ << " osr " << osr << " = " << txc
11085 << " seq " << txc->seq << dendl;
11086 return txc;
11087}
11088
11089void BlueStore::_txc_calc_cost(TransContext *txc)
11090{
11fdf7f2
TL
11091 // one "io" for the kv commit
11092 auto ios = 1 + txc->ioc.get_num_ios();
7c673cae
FG
11093 auto cost = throttle_cost_per_io.load();
11094 txc->cost = ios * cost + txc->bytes;
9f95a23c 11095 txc->ios = ios;
7c673cae
FG
11096 dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
11097 << ios << " ios * " << cost << " + " << txc->bytes
11098 << " bytes)" << dendl;
11099}
11100
11101void BlueStore::_txc_update_store_statfs(TransContext *txc)
11102{
11103 if (txc->statfs_delta.is_empty())
11104 return;
11105
11106 logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
11107 logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
11108 logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
11109 logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
11110 logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
11111
11112 bufferlist bl;
11113 txc->statfs_delta.encode(bl);
11fdf7f2
TL
11114 if (per_pool_stat_collection) {
11115 string key;
11116 get_pool_stat_key(txc->osd_pool_id, &key);
11117 txc->t->merge(PREFIX_STAT, key, bl);
11118
11119 std::lock_guard l(vstatfs_lock);
11120 auto& stats = osd_pools[txc->osd_pool_id];
11121 stats += txc->statfs_delta;
11122
11123 vstatfs += txc->statfs_delta; //non-persistent in this mode
11124
11125 } else {
11126 txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
7c673cae 11127
11fdf7f2
TL
11128 std::lock_guard l(vstatfs_lock);
11129 vstatfs += txc->statfs_delta;
11130 }
7c673cae
FG
11131 txc->statfs_delta.reset();
11132}
11133
11134void BlueStore::_txc_state_proc(TransContext *txc)
11135{
11136 while (true) {
11137 dout(10) << __func__ << " txc " << txc
11138 << " " << txc->get_state_name() << dendl;
11139 switch (txc->state) {
11140 case TransContext::STATE_PREPARE:
9f95a23c 11141 throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
7c673cae
FG
11142 if (txc->ioc.has_pending_aios()) {
11143 txc->state = TransContext::STATE_AIO_WAIT;
11144 txc->had_ios = true;
11145 _txc_aio_submit(txc);
11146 return;
11147 }
11148 // ** fall-thru **
11149
11150 case TransContext::STATE_AIO_WAIT:
11fdf7f2 11151 {
9f95a23c
TL
11152 mono_clock::duration lat = throttle.log_state_latency(
11153 *txc, logger, l_bluestore_state_aio_wait_lat);
11154 if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
11fdf7f2
TL
11155 dout(0) << __func__ << " slow aio_wait, txc = " << txc
11156 << ", latency = " << lat
11157 << dendl;
11158 }
11159 }
11160
7c673cae
FG
11161 _txc_finish_io(txc); // may trigger blocked txc's too
11162 return;
11163
11164 case TransContext::STATE_IO_DONE:
11fdf7f2 11165 ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
7c673cae
FG
11166 if (txc->had_ios) {
11167 ++txc->osr->txc_with_unstable_io;
11168 }
9f95a23c 11169 throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
7c673cae
FG
11170 txc->state = TransContext::STATE_KV_QUEUED;
11171 if (cct->_conf->bluestore_sync_submit_transaction) {
11172 if (txc->last_nid >= nid_max ||
11173 txc->last_blobid >= blobid_max) {
11174 dout(20) << __func__
11175 << " last_{nid,blobid} exceeds max, submit via kv thread"
11176 << dendl;
11177 } else if (txc->osr->kv_committing_serially) {
11178 dout(20) << __func__ << " prior txc submitted via kv thread, us too"
11179 << dendl;
11180 // note: this is starvation-prone. once we have a txc in a busy
11181 // sequencer that is committing serially it is possible to keep
11182 // submitting new transactions fast enough that we get stuck doing
11183 // so. the alternative is to block here... fixme?
11184 } else if (txc->osr->txc_with_unstable_io) {
11185 dout(20) << __func__ << " prior txc(s) with unstable ios "
11186 << txc->osr->txc_with_unstable_io.load() << dendl;
11187 } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
11188 rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
11189 == 0) {
11190 dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
11191 << dendl;
11192 } else {
9f95a23c 11193 _txc_apply_kv(txc, true);
7c673cae
FG
11194 }
11195 }
11196 {
11fdf7f2 11197 std::lock_guard l(kv_lock);
7c673cae 11198 kv_queue.push_back(txc);
9f95a23c
TL
11199 if (!kv_sync_in_progress) {
11200 kv_sync_in_progress = true;
11201 kv_cond.notify_one();
11202 }
7c673cae
FG
11203 if (txc->state != TransContext::STATE_KV_SUBMITTED) {
11204 kv_queue_unsubmitted.push_back(txc);
11205 ++txc->osr->kv_committing_serially;
11206 }
31f18b77
FG
11207 if (txc->had_ios)
11208 kv_ios++;
11209 kv_throttle_costs += txc->cost;
7c673cae
FG
11210 }
11211 return;
11212 case TransContext::STATE_KV_SUBMITTED:
7c673cae
FG
11213 _txc_committed_kv(txc);
11214 // ** fall-thru **
11215
11216 case TransContext::STATE_KV_DONE:
9f95a23c 11217 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
7c673cae
FG
11218 if (txc->deferred_txn) {
11219 txc->state = TransContext::STATE_DEFERRED_QUEUED;
11220 _deferred_queue(txc);
11221 return;
11222 }
11223 txc->state = TransContext::STATE_FINISHING;
11224 break;
11225
11226 case TransContext::STATE_DEFERRED_CLEANUP:
9f95a23c 11227 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
7c673cae
FG
11228 txc->state = TransContext::STATE_FINISHING;
11229 // ** fall-thru **
11230
11231 case TransContext::STATE_FINISHING:
9f95a23c 11232 throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
7c673cae
FG
11233 _txc_finish(txc);
11234 return;
11235
11236 default:
11237 derr << __func__ << " unexpected txc " << txc
11238 << " state " << txc->get_state_name() << dendl;
11fdf7f2 11239 ceph_abort_msg("unexpected txc state");
7c673cae
FG
11240 return;
11241 }
11242 }
11243}
11244
11245void BlueStore::_txc_finish_io(TransContext *txc)
11246{
11247 dout(20) << __func__ << " " << txc << dendl;
11248
11249 /*
11250 * we need to preserve the order of kv transactions,
11251 * even though aio will complete in any order.
11252 */
11253
11254 OpSequencer *osr = txc->osr.get();
11fdf7f2 11255 std::lock_guard l(osr->qlock);
7c673cae 11256 txc->state = TransContext::STATE_IO_DONE;
11fdf7f2 11257 txc->ioc.release_running_aios();
7c673cae
FG
11258 OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
11259 while (p != osr->q.begin()) {
11260 --p;
11261 if (p->state < TransContext::STATE_IO_DONE) {
11262 dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
11263 << p->get_state_name() << dendl;
11264 return;
11265 }
11266 if (p->state > TransContext::STATE_IO_DONE) {
11267 ++p;
11268 break;
11269 }
11270 }
11271 do {
11272 _txc_state_proc(&*p++);
11273 } while (p != osr->q.end() &&
11274 p->state == TransContext::STATE_IO_DONE);
11275
11fdf7f2 11276 if (osr->kv_submitted_waiters) {
7c673cae
FG
11277 osr->qcond.notify_all();
11278 }
11279}
11280
11281void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
11282{
11283 dout(20) << __func__ << " txc " << txc
11284 << " onodes " << txc->onodes
11285 << " shared_blobs " << txc->shared_blobs
11286 << dendl;
11287
11288 // finalize onodes
11289 for (auto o : txc->onodes) {
11fdf7f2 11290 _record_onode(o, t);
7c673cae
FG
11291 o->flushing_count++;
11292 }
11293
11294 // objects we modified but didn't affect the onode
11295 auto p = txc->modified_objects.begin();
11296 while (p != txc->modified_objects.end()) {
11297 if (txc->onodes.count(*p) == 0) {
11298 (*p)->flushing_count++;
11299 ++p;
11300 } else {
11301 // remove dups with onodes list to avoid problems in _txc_finish
11302 p = txc->modified_objects.erase(p);
11303 }
11304 }
11305
11306 // finalize shared_blobs
11307 for (auto sb : txc->shared_blobs) {
11308 string key;
11309 auto sbid = sb->get_sbid();
11310 get_shared_blob_key(sbid, &key);
11311 if (sb->persistent->empty()) {
11fdf7f2
TL
11312 dout(20) << __func__ << " shared_blob 0x"
11313 << std::hex << sbid << std::dec
7c673cae
FG
11314 << " is empty" << dendl;
11315 t->rmkey(PREFIX_SHARED_BLOB, key);
11316 } else {
11317 bufferlist bl;
11fdf7f2
TL
11318 encode(*(sb->persistent), bl);
11319 dout(20) << __func__ << " shared_blob 0x"
11320 << std::hex << sbid << std::dec
31f18b77 11321 << " is " << bl.length() << " " << *sb << dendl;
7c673cae
FG
11322 t->set(PREFIX_SHARED_BLOB, key, bl);
11323 }
11324 }
11325}
11326
11327void BlueStore::BSPerfTracker::update_from_perfcounters(
11328 PerfCounters &logger)
11329{
11fdf7f2
TL
11330 os_commit_latency_ns.consume_next(
11331 logger.get_tavg_ns(
7c673cae 11332 l_bluestore_commit_lat));
11fdf7f2
TL
11333 os_apply_latency_ns.consume_next(
11334 logger.get_tavg_ns(
7c673cae
FG
11335 l_bluestore_commit_lat));
11336}
11337
11338void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
11339{
11340 dout(20) << __func__ << " txc " << txc << std::hex
11341 << " allocated 0x" << txc->allocated
11342 << " released 0x" << txc->released
11343 << std::dec << dendl;
11344
11345 // We have to handle the case where we allocate *and* deallocate the
11346 // same region in this transaction. The freelist doesn't like that.
11347 // (Actually, the only thing that cares is the BitmapFreelistManager
11348 // debug check. But that's important.)
11349 interval_set<uint64_t> tmp_allocated, tmp_released;
11350 interval_set<uint64_t> *pallocated = &txc->allocated;
11351 interval_set<uint64_t> *preleased = &txc->released;
11352 if (!txc->allocated.empty() && !txc->released.empty()) {
11353 interval_set<uint64_t> overlap;
11354 overlap.intersection_of(txc->allocated, txc->released);
11355 if (!overlap.empty()) {
11356 tmp_allocated = txc->allocated;
11357 tmp_allocated.subtract(overlap);
11358 tmp_released = txc->released;
11359 tmp_released.subtract(overlap);
11360 dout(20) << __func__ << " overlap 0x" << std::hex << overlap
11361 << ", new allocated 0x" << tmp_allocated
11362 << " released 0x" << tmp_released << std::dec
11363 << dendl;
11364 pallocated = &tmp_allocated;
11365 preleased = &tmp_released;
11366 }
11367 }
11368
11369 // update freelist with non-overlap sets
11370 for (interval_set<uint64_t>::iterator p = pallocated->begin();
11371 p != pallocated->end();
11372 ++p) {
11373 fm->allocate(p.get_start(), p.get_len(), t);
11374 }
11375 for (interval_set<uint64_t>::iterator p = preleased->begin();
11376 p != preleased->end();
11377 ++p) {
11378 dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
11379 << "~" << p.get_len() << std::dec << dendl;
11380 fm->release(p.get_start(), p.get_len(), t);
11381 }
11382
11383 _txc_update_store_statfs(txc);
11384}
11385
9f95a23c 11386void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
7c673cae 11387{
9f95a23c
TL
11388 ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
11389 {
11390#if defined(WITH_LTTNG)
11391 auto start = mono_clock::now();
11392#endif
11393
11394 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
11395 ceph_assert(r == 0);
11396 txc->state = TransContext::STATE_KV_SUBMITTED;
11397 if (txc->osr->kv_submitted_waiters) {
11398 std::lock_guard l(txc->osr->qlock);
11399 txc->osr->qcond.notify_all();
11400 }
11401
11402#if defined(WITH_LTTNG)
11403 if (txc->tracing) {
11404 tracepoint(
11405 bluestore,
11406 transaction_kv_submit_latency,
11407 txc->osr->get_sequencer_id(),
11408 txc->seq,
11409 sync_submit_transaction,
11410 ceph::to_seconds<double>(mono_clock::now() - start));
11411 }
11412#endif
11413 }
11414
7c673cae
FG
11415 for (auto ls : { &txc->onodes, &txc->modified_objects }) {
11416 for (auto& o : *ls) {
11417 dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
11418 << dendl;
9f95a23c 11419 if (--o->flushing_count == 0 && o->waiting_count.load()) {
11fdf7f2 11420 std::lock_guard l(o->flush_lock);
7c673cae
FG
11421 o->flush_cond.notify_all();
11422 }
11423 }
11424 }
11425}
11426
11427void BlueStore::_txc_committed_kv(TransContext *txc)
11428{
11429 dout(20) << __func__ << " txc " << txc << dendl;
9f95a23c 11430 throttle.complete_kv(*txc);
1adf2230 11431 {
11fdf7f2 11432 std::lock_guard l(txc->osr->qlock);
1adf2230 11433 txc->state = TransContext::STATE_KV_DONE;
11fdf7f2
TL
11434 if (txc->ch->commit_queue) {
11435 txc->ch->commit_queue->queue(txc->oncommits);
11436 } else {
11437 finisher.queue(txc->oncommits);
1adf2230 11438 }
7c673cae 11439 }
9f95a23c 11440 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
494da23a
TL
11441 log_latency_fn(
11442 __func__,
11443 l_bluestore_commit_lat,
9f95a23c 11444 mono_clock::now() - txc->start,
494da23a
TL
11445 cct->_conf->bluestore_log_op_age,
11446 [&](auto lat) {
11447 return ", txc = " + stringify(txc);
11448 }
11fdf7f2 11449 );
7c673cae
FG
11450}
11451
11452void BlueStore::_txc_finish(TransContext *txc)
11453{
11454 dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
11fdf7f2 11455 ceph_assert(txc->state == TransContext::STATE_FINISHING);
7c673cae
FG
11456
11457 for (auto& sb : txc->shared_blobs_written) {
f64942e4 11458 sb->finish_write(txc->seq);
7c673cae
FG
11459 }
11460 txc->shared_blobs_written.clear();
11461
11462 while (!txc->removed_collections.empty()) {
11463 _queue_reap_collection(txc->removed_collections.front());
11464 txc->removed_collections.pop_front();
11465 }
11466
11467 OpSequencerRef osr = txc->osr;
7c673cae 11468 bool empty = false;
31f18b77 11469 bool submit_deferred = false;
7c673cae
FG
11470 OpSequencer::q_list_t releasing_txc;
11471 {
11fdf7f2 11472 std::lock_guard l(osr->qlock);
7c673cae
FG
11473 txc->state = TransContext::STATE_DONE;
11474 bool notify = false;
11475 while (!osr->q.empty()) {
11476 TransContext *txc = &osr->q.front();
11477 dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
11478 << dendl;
11479 if (txc->state != TransContext::STATE_DONE) {
11480 if (txc->state == TransContext::STATE_PREPARE &&
11481 deferred_aggressive) {
11482 // for _osr_drain_preceding()
11483 notify = true;
11484 }
31f18b77 11485 if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
11fdf7f2 11486 osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
31f18b77
FG
11487 submit_deferred = true;
11488 }
7c673cae
FG
11489 break;
11490 }
11491
7c673cae
FG
11492 osr->q.pop_front();
11493 releasing_txc.push_back(*txc);
7c673cae 11494 }
9f95a23c 11495
7c673cae
FG
11496 if (osr->q.empty()) {
11497 dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
11498 empty = true;
11499 }
9f95a23c
TL
11500
11501 // only drain()/drain_preceding() need wakeup,
11502 // other cases use kv_submitted_waiters
11503 if (notify || empty) {
11504 osr->qcond.notify_all();
11505 }
7c673cae 11506 }
9f95a23c 11507
7c673cae
FG
11508 while (!releasing_txc.empty()) {
11509 // release to allocator only after all preceding txc's have also
11510 // finished any deferred writes that potentially land in these
11511 // blocks
11512 auto txc = &releasing_txc.front();
11513 _txc_release_alloc(txc);
11514 releasing_txc.pop_front();
9f95a23c
TL
11515 throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
11516 throttle.complete(*txc);
7c673cae
FG
11517 delete txc;
11518 }
11519
31f18b77
FG
11520 if (submit_deferred) {
11521 // we're pinning memory; flush! we could be more fine-grained here but
11522 // i'm not sure it's worth the bother.
11523 deferred_try_submit();
7c673cae
FG
11524 }
11525
7c673cae 11526 if (empty && osr->zombie) {
11fdf7f2
TL
11527 std::lock_guard l(zombie_osr_lock);
11528 if (zombie_osr_set.erase(osr->cid)) {
11529 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11530 } else {
11531 dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
11532 << dendl;
11533 }
7c673cae 11534 }
9f95a23c 11535}
7c673cae
FG
11536
11537void BlueStore::_txc_release_alloc(TransContext *txc)
11538{
a8e16298 11539 // it's expected we're called with lazy_release_lock already taken!
11fdf7f2
TL
11540 if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
11541 int r = 0;
11542 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
11543 r = bdev->queue_discard(txc->released);
11544 if (r == 0) {
11545 dout(10) << __func__ << "(queued) " << txc << " " << std::hex
11546 << txc->released << std::dec << dendl;
11547 goto out;
11548 }
11549 } else if (cct->_conf->bdev_enable_discard) {
11550 for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
11551 bdev->discard(p.get_start(), p.get_len());
11552 }
11553 }
11554 dout(10) << __func__ << "(sync) " << txc << " " << std::hex
94b18763 11555 << txc->released << std::dec << dendl;
11fdf7f2 11556 alloc->release(txc->released);
7c673cae
FG
11557 }
11558
11fdf7f2 11559out:
7c673cae
FG
11560 txc->allocated.clear();
11561 txc->released.clear();
11562}
11563
11fdf7f2
TL
11564void BlueStore::_osr_attach(Collection *c)
11565{
11566 // note: caller has RWLock on coll_map
11567 auto q = coll_map.find(c->cid);
11568 if (q != coll_map.end()) {
11569 c->osr = q->second->osr;
11570 ldout(cct, 10) << __func__ << " " << c->cid
11571 << " reusing osr " << c->osr << " from existing coll "
11572 << q->second << dendl;
11573 } else {
11574 std::lock_guard l(zombie_osr_lock);
11575 auto p = zombie_osr_set.find(c->cid);
11576 if (p == zombie_osr_set.end()) {
9f95a23c 11577 c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
11fdf7f2
TL
11578 ldout(cct, 10) << __func__ << " " << c->cid
11579 << " fresh osr " << c->osr << dendl;
11580 } else {
11581 c->osr = p->second;
11582 zombie_osr_set.erase(p);
11583 ldout(cct, 10) << __func__ << " " << c->cid
11584 << " resurrecting zombie osr " << c->osr << dendl;
11585 c->osr->zombie = false;
11586 }
11587 }
11588}
11589
11590void BlueStore::_osr_register_zombie(OpSequencer *osr)
11591{
11592 std::lock_guard l(zombie_osr_lock);
11593 dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
11594 osr->zombie = true;
11595 auto i = zombie_osr_set.emplace(osr->cid, osr);
11596 // this is either a new insertion or the same osr is already there
11597 ceph_assert(i.second || i.first->second == osr);
11598}
11599
7c673cae
FG
11600void BlueStore::_osr_drain_preceding(TransContext *txc)
11601{
11602 OpSequencer *osr = txc->osr.get();
11603 dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
11604 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11605 {
11606 // submit anything pending
224ce89b 11607 deferred_lock.lock();
11fdf7f2 11608 if (osr->deferred_pending && !osr->deferred_running) {
224ce89b
WB
11609 _deferred_submit_unlock(osr);
11610 } else {
11611 deferred_lock.unlock();
7c673cae
FG
11612 }
11613 }
11614 {
11615 // wake up any previously finished deferred events
11fdf7f2 11616 std::lock_guard l(kv_lock);
9f95a23c
TL
11617 if (!kv_sync_in_progress) {
11618 kv_sync_in_progress = true;
11619 kv_cond.notify_one();
11620 }
7c673cae
FG
11621 }
11622 osr->drain_preceding(txc);
11623 --deferred_aggressive;
11624 dout(10) << __func__ << " " << osr << " done" << dendl;
11625}
11626
11fdf7f2
TL
11627void BlueStore::_osr_drain(OpSequencer *osr)
11628{
11629 dout(10) << __func__ << " " << osr << dendl;
11630 ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
11631 {
11632 // submit anything pending
11633 deferred_lock.lock();
11634 if (osr->deferred_pending && !osr->deferred_running) {
11635 _deferred_submit_unlock(osr);
11636 } else {
11637 deferred_lock.unlock();
11638 }
11639 }
11640 {
11641 // wake up any previously finished deferred events
11642 std::lock_guard l(kv_lock);
9f95a23c
TL
11643 if (!kv_sync_in_progress) {
11644 kv_sync_in_progress = true;
11645 kv_cond.notify_one();
11646 }
11fdf7f2
TL
11647 }
11648 osr->drain();
11649 --deferred_aggressive;
11650 dout(10) << __func__ << " " << osr << " done" << dendl;
11651}
11652
7c673cae
FG
11653void BlueStore::_osr_drain_all()
11654{
11655 dout(10) << __func__ << dendl;
11656
11657 set<OpSequencerRef> s;
11fdf7f2
TL
11658 vector<OpSequencerRef> zombies;
11659 {
9f95a23c 11660 std::shared_lock l(coll_lock);
11fdf7f2
TL
11661 for (auto& i : coll_map) {
11662 s.insert(i.second->osr);
11663 }
11664 }
7c673cae 11665 {
11fdf7f2
TL
11666 std::lock_guard l(zombie_osr_lock);
11667 for (auto& i : zombie_osr_set) {
11668 s.insert(i.second);
11669 zombies.push_back(i.second);
11670 }
7c673cae
FG
11671 }
11672 dout(20) << __func__ << " osr_set " << s << dendl;
11673
11674 ++deferred_aggressive;
11675 {
11676 // submit anything pending
224ce89b 11677 deferred_try_submit();
7c673cae
FG
11678 }
11679 {
11680 // wake up any previously finished deferred events
11fdf7f2 11681 std::lock_guard l(kv_lock);
7c673cae
FG
11682 kv_cond.notify_one();
11683 }
31f18b77 11684 {
11fdf7f2 11685 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
11686 kv_finalize_cond.notify_one();
11687 }
7c673cae
FG
11688 for (auto osr : s) {
11689 dout(20) << __func__ << " drain " << osr << dendl;
11690 osr->drain();
11691 }
11692 --deferred_aggressive;
11693
7c673cae 11694 {
11fdf7f2
TL
11695 std::lock_guard l(zombie_osr_lock);
11696 for (auto& osr : zombies) {
11697 if (zombie_osr_set.erase(osr->cid)) {
11698 dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
11699 ceph_assert(osr->q.empty());
11700 } else if (osr->zombie) {
11701 dout(10) << __func__ << " empty zombie osr " << osr
11702 << " already reaped" << dendl;
11703 ceph_assert(osr->q.empty());
11704 } else {
11705 dout(10) << __func__ << " empty zombie osr " << osr
11706 << " resurrected" << dendl;
11707 }
7c673cae
FG
11708 }
11709 }
11fdf7f2
TL
11710
11711 dout(10) << __func__ << " done" << dendl;
7c673cae
FG
11712}
11713
11fdf7f2 11714
31f18b77
FG
11715void BlueStore::_kv_start()
11716{
11717 dout(10) << __func__ << dendl;
11718
11fdf7f2 11719 finisher.start();
31f18b77
FG
11720 kv_sync_thread.create("bstore_kv_sync");
11721 kv_finalize_thread.create("bstore_kv_final");
11722}
11723
11724void BlueStore::_kv_stop()
11725{
11726 dout(10) << __func__ << dendl;
11727 {
9f95a23c 11728 std::unique_lock l{kv_lock};
31f18b77
FG
11729 while (!kv_sync_started) {
11730 kv_cond.wait(l);
11731 }
11732 kv_stop = true;
11733 kv_cond.notify_all();
11734 }
11735 {
9f95a23c 11736 std::unique_lock l{kv_finalize_lock};
31f18b77
FG
11737 while (!kv_finalize_started) {
11738 kv_finalize_cond.wait(l);
11739 }
11740 kv_finalize_stop = true;
11741 kv_finalize_cond.notify_all();
11742 }
11743 kv_sync_thread.join();
11744 kv_finalize_thread.join();
11fdf7f2 11745 ceph_assert(removed_collections.empty());
31f18b77 11746 {
11fdf7f2 11747 std::lock_guard l(kv_lock);
31f18b77
FG
11748 kv_stop = false;
11749 }
11750 {
11fdf7f2 11751 std::lock_guard l(kv_finalize_lock);
31f18b77
FG
11752 kv_finalize_stop = false;
11753 }
11754 dout(10) << __func__ << " stopping finishers" << dendl;
11fdf7f2
TL
11755 finisher.wait_for_empty();
11756 finisher.stop();
31f18b77
FG
11757 dout(10) << __func__ << " stopped" << dendl;
11758}
11759
7c673cae
FG
11760void BlueStore::_kv_sync_thread()
11761{
11762 dout(10) << __func__ << " start" << dendl;
11fdf7f2 11763 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
9f95a23c 11764 std::unique_lock l{kv_lock};
11fdf7f2 11765 ceph_assert(!kv_sync_started);
31f18b77
FG
11766 kv_sync_started = true;
11767 kv_cond.notify_all();
7c673cae 11768 while (true) {
11fdf7f2 11769 ceph_assert(kv_committing.empty());
7c673cae
FG
11770 if (kv_queue.empty() &&
11771 ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
11fdf7f2 11772 !deferred_aggressive)) {
7c673cae
FG
11773 if (kv_stop)
11774 break;
11775 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 11776 kv_sync_in_progress = false;
11fdf7f2 11777 kv_cond.wait(l);
7c673cae
FG
11778 dout(20) << __func__ << " wake" << dendl;
11779 } else {
11780 deque<TransContext*> kv_submitting;
11781 deque<DeferredBatch*> deferred_done, deferred_stable;
31f18b77
FG
11782 uint64_t aios = 0, costs = 0;
11783
7c673cae
FG
11784 dout(20) << __func__ << " committing " << kv_queue.size()
11785 << " submitting " << kv_queue_unsubmitted.size()
11786 << " deferred done " << deferred_done_queue.size()
11787 << " stable " << deferred_stable_queue.size()
11788 << dendl;
11789 kv_committing.swap(kv_queue);
11790 kv_submitting.swap(kv_queue_unsubmitted);
11791 deferred_done.swap(deferred_done_queue);
11792 deferred_stable.swap(deferred_stable_queue);
31f18b77
FG
11793 aios = kv_ios;
11794 costs = kv_throttle_costs;
11795 kv_ios = 0;
11796 kv_throttle_costs = 0;
7c673cae
FG
11797 l.unlock();
11798
11799 dout(30) << __func__ << " committing " << kv_committing << dendl;
11800 dout(30) << __func__ << " submitting " << kv_submitting << dendl;
11801 dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
11802 dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
11803
11fdf7f2
TL
11804 auto start = mono_clock::now();
11805
7c673cae
FG
11806 bool force_flush = false;
11807 // if bluefs is sharing the same device as data (only), then we
11808 // can rely on the bluefs commit to flush the device and make
11809 // deferred aios stable. that means that if we do have done deferred
11810 // txcs AND we are not on a single device, we need to force a flush.
9f95a23c 11811 if (bluefs && bluefs_layout.single_shared_device()) {
31f18b77 11812 if (aios) {
7c673cae 11813 force_flush = true;
11fdf7f2 11814 } else if (kv_committing.empty() && deferred_stable.empty()) {
7c673cae
FG
11815 force_flush = true; // there's nothing else to commit!
11816 } else if (deferred_aggressive) {
11817 force_flush = true;
11818 }
11fdf7f2
TL
11819 } else {
11820 if (aios || !deferred_done.empty()) {
11821 force_flush = true;
11822 } else {
11823 dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
11824 }
11825 }
7c673cae
FG
11826
11827 if (force_flush) {
31f18b77 11828 dout(20) << __func__ << " num_aios=" << aios
7c673cae
FG
11829 << " force_flush=" << (int)force_flush
11830 << ", flushing, deferred done->stable" << dendl;
11831 // flush/barrier on block device
11832 bdev->flush();
11833
11834 // if we flush then deferred done are now deferred stable
11835 deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
11836 deferred_done.end());
11837 deferred_done.clear();
11838 }
11fdf7f2 11839 auto after_flush = mono_clock::now();
7c673cae
FG
11840
11841 // we will use one final transaction to force a sync
11842 KeyValueDB::Transaction synct = db->get_transaction();
11843
11844 // increase {nid,blobid}_max? note that this covers both the
11845 // case where we are approaching the max and the case we passed
11846 // it. in either case, we increase the max in the earlier txn
11847 // we submit.
11848 uint64_t new_nid_max = 0, new_blobid_max = 0;
11849 if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
11850 KeyValueDB::Transaction t =
11851 kv_submitting.empty() ? synct : kv_submitting.front()->t;
11852 new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
11853 bufferlist bl;
11fdf7f2 11854 encode(new_nid_max, bl);
7c673cae
FG
11855 t->set(PREFIX_SUPER, "nid_max", bl);
11856 dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
11857 }
11858 if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
11859 KeyValueDB::Transaction t =
11860 kv_submitting.empty() ? synct : kv_submitting.front()->t;
11861 new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
11862 bufferlist bl;
11fdf7f2 11863 encode(new_blobid_max, bl);
7c673cae
FG
11864 t->set(PREFIX_SUPER, "blobid_max", bl);
11865 dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
11866 }
c07f9fc5
FG
11867
11868 for (auto txc : kv_committing) {
9f95a23c 11869 throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
c07f9fc5 11870 if (txc->state == TransContext::STATE_KV_QUEUED) {
9f95a23c 11871 _txc_apply_kv(txc, false);
c07f9fc5 11872 --txc->osr->kv_committing_serially;
c07f9fc5 11873 } else {
11fdf7f2 11874 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 11875 }
7c673cae
FG
11876 if (txc->had_ios) {
11877 --txc->osr->txc_with_unstable_io;
11878 }
7c673cae
FG
11879 }
11880
31f18b77
FG
11881 // release throttle *before* we commit. this allows new ops
11882 // to be prepared and enter pipeline while we are waiting on
11883 // the kv commit sync/flush. then hopefully on the next
11884 // iteration there will already be ops awake. otherwise, we
11885 // end up going to sleep, and then wake up when the very first
11886 // transaction is ready for commit.
9f95a23c 11887 throttle.release_kv_throttle(costs);
31f18b77 11888
7c673cae
FG
11889 if (bluefs &&
11890 after_flush - bluefs_last_balance >
11fdf7f2 11891 ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
7c673cae 11892 bluefs_last_balance = after_flush;
11fdf7f2
TL
11893 int r = _balance_bluefs_freespace();
11894 ceph_assert(r >= 0);
7c673cae
FG
11895 }
11896
11897 // cleanup sync deferred keys
11898 for (auto b : deferred_stable) {
11899 for (auto& txc : b->txcs) {
11900 bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
11fdf7f2 11901 ceph_assert(wt.released.empty()); // only kraken did this
7c673cae
FG
11902 string key;
11903 get_deferred_key(wt.seq, &key);
11904 synct->rm_single_key(PREFIX_DEFERRED, key);
11905 }
11906 }
11907
9f95a23c
TL
11908#if defined(WITH_LTTNG)
11909 auto sync_start = mono_clock::now();
11910#endif
7c673cae 11911 // submit synct synchronously (block and wait for it to commit)
31f18b77 11912 int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
11fdf7f2
TL
11913 ceph_assert(r == 0);
11914
9f95a23c
TL
11915 int committing_size = kv_committing.size();
11916 int deferred_size = deferred_stable.size();
11917
11918#if defined(WITH_LTTNG)
11919 double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
11920 for (auto txc: kv_committing) {
11921 if (txc->tracing) {
11922 tracepoint(
11923 bluestore,
11924 transaction_kv_sync_latency,
11925 txc->osr->get_sequencer_id(),
11926 txc->seq,
11927 kv_committing.size(),
11928 deferred_done.size(),
11929 deferred_stable.size(),
11930 sync_latency);
11931 }
11932 }
11933#endif
11934
11fdf7f2 11935 {
9f95a23c 11936 std::unique_lock m{kv_finalize_lock};
11fdf7f2
TL
11937 if (kv_committing_to_finalize.empty()) {
11938 kv_committing_to_finalize.swap(kv_committing);
11939 } else {
11940 kv_committing_to_finalize.insert(
11941 kv_committing_to_finalize.end(),
11942 kv_committing.begin(),
11943 kv_committing.end());
11944 kv_committing.clear();
11945 }
11946 if (deferred_stable_to_finalize.empty()) {
11947 deferred_stable_to_finalize.swap(deferred_stable);
11948 } else {
11949 deferred_stable_to_finalize.insert(
11950 deferred_stable_to_finalize.end(),
11951 deferred_stable.begin(),
11952 deferred_stable.end());
11953 deferred_stable.clear();
11954 }
9f95a23c
TL
11955 if (!kv_finalize_in_progress) {
11956 kv_finalize_in_progress = true;
11957 kv_finalize_cond.notify_one();
11958 }
11fdf7f2 11959 }
7c673cae
FG
11960
11961 if (new_nid_max) {
11962 nid_max = new_nid_max;
11963 dout(10) << __func__ << " nid_max now " << nid_max << dendl;
11964 }
11965 if (new_blobid_max) {
11966 blobid_max = new_blobid_max;
11967 dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
11968 }
11969
224ce89b 11970 {
11fdf7f2
TL
11971 auto finish = mono_clock::now();
11972 ceph::timespan dur_flush = after_flush - start;
11973 ceph::timespan dur_kv = finish - after_flush;
11974 ceph::timespan dur = finish - start;
9f95a23c
TL
11975 dout(20) << __func__ << " committed " << committing_size
11976 << " cleaned " << deferred_size
224ce89b
WB
11977 << " in " << dur
11978 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
11979 << dendl;
494da23a
TL
11980 log_latency("kv_flush",
11981 l_bluestore_kv_flush_lat,
11982 dur_flush,
11983 cct->_conf->bluestore_log_op_age);
11984 log_latency("kv_commit",
11985 l_bluestore_kv_commit_lat,
11986 dur_kv,
11987 cct->_conf->bluestore_log_op_age);
11988 log_latency("kv_sync",
11989 l_bluestore_kv_sync_lat,
11990 dur,
11991 cct->_conf->bluestore_log_op_age);
7c673cae 11992 }
31f18b77
FG
11993
11994 if (bluefs) {
11fdf7f2
TL
11995 if (!bluefs_extents_reclaiming.empty()) {
11996 dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
11997 << bluefs_extents_reclaiming << std::dec << dendl;
81eedcae
TL
11998 int r = 0;
11999 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
12000 r = bdev->queue_discard(bluefs_extents_reclaiming);
12001 if (r == 0) {
12002 goto clear;
12003 }
12004 } else if (cct->_conf->bdev_enable_discard) {
12005 for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
12006 bdev->discard(p.get_start(), p.get_len());
12007 }
12008 }
12009
11fdf7f2 12010 alloc->release(bluefs_extents_reclaiming);
81eedcae 12011clear:
11fdf7f2 12012 bluefs_extents_reclaiming.clear();
31f18b77 12013 }
31f18b77
FG
12014 }
12015
12016 l.lock();
12017 // previously deferred "done" are now "stable" by virtue of this
12018 // commit cycle.
12019 deferred_stable_queue.swap(deferred_done);
12020 }
12021 }
12022 dout(10) << __func__ << " finish" << dendl;
12023 kv_sync_started = false;
12024}
12025
12026void BlueStore::_kv_finalize_thread()
12027{
12028 deque<TransContext*> kv_committed;
12029 deque<DeferredBatch*> deferred_stable;
12030 dout(10) << __func__ << " start" << dendl;
11fdf7f2
TL
12031 std::unique_lock l(kv_finalize_lock);
12032 ceph_assert(!kv_finalize_started);
31f18b77
FG
12033 kv_finalize_started = true;
12034 kv_finalize_cond.notify_all();
12035 while (true) {
11fdf7f2
TL
12036 ceph_assert(kv_committed.empty());
12037 ceph_assert(deferred_stable.empty());
31f18b77
FG
12038 if (kv_committing_to_finalize.empty() &&
12039 deferred_stable_to_finalize.empty()) {
12040 if (kv_finalize_stop)
12041 break;
12042 dout(20) << __func__ << " sleep" << dendl;
9f95a23c 12043 kv_finalize_in_progress = false;
31f18b77
FG
12044 kv_finalize_cond.wait(l);
12045 dout(20) << __func__ << " wake" << dendl;
12046 } else {
12047 kv_committed.swap(kv_committing_to_finalize);
12048 deferred_stable.swap(deferred_stable_to_finalize);
12049 l.unlock();
12050 dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
12051 dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
12052
11fdf7f2
TL
12053 auto start = mono_clock::now();
12054
31f18b77
FG
12055 while (!kv_committed.empty()) {
12056 TransContext *txc = kv_committed.front();
11fdf7f2 12057 ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
7c673cae 12058 _txc_state_proc(txc);
31f18b77 12059 kv_committed.pop_front();
7c673cae 12060 }
31f18b77 12061
7c673cae
FG
12062 for (auto b : deferred_stable) {
12063 auto p = b->txcs.begin();
12064 while (p != b->txcs.end()) {
12065 TransContext *txc = &*p;
12066 p = b->txcs.erase(p); // unlink here because
12067 _txc_state_proc(txc); // this may destroy txc
12068 }
12069 delete b;
12070 }
31f18b77 12071 deferred_stable.clear();
7c673cae
FG
12072
12073 if (!deferred_aggressive) {
31f18b77 12074 if (deferred_queue_size >= deferred_batch_ops.load() ||
9f95a23c 12075 throttle.should_submit_deferred()) {
224ce89b 12076 deferred_try_submit();
7c673cae
FG
12077 }
12078 }
12079
12080 // this is as good a place as any ...
12081 _reap_collections();
12082
11fdf7f2 12083 logger->set(l_bluestore_fragmentation,
9f95a23c 12084 (uint64_t)(alloc->get_fragmentation() * 1000));
11fdf7f2 12085
494da23a
TL
12086 log_latency("kv_final",
12087 l_bluestore_kv_final_lat,
12088 mono_clock::now() - start,
12089 cct->_conf->bluestore_log_op_age);
11fdf7f2 12090
7c673cae 12091 l.lock();
7c673cae
FG
12092 }
12093 }
12094 dout(10) << __func__ << " finish" << dendl;
31f18b77 12095 kv_finalize_started = false;
7c673cae
FG
12096}
12097
12098bluestore_deferred_op_t *BlueStore::_get_deferred_op(
9f95a23c 12099 TransContext *txc)
7c673cae
FG
12100{
12101 if (!txc->deferred_txn) {
12102 txc->deferred_txn = new bluestore_deferred_transaction_t;
12103 }
12104 txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
12105 return &txc->deferred_txn->ops.back();
12106}
12107
12108void BlueStore::_deferred_queue(TransContext *txc)
12109{
12110 dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
224ce89b 12111 deferred_lock.lock();
7c673cae
FG
12112 if (!txc->osr->deferred_pending &&
12113 !txc->osr->deferred_running) {
12114 deferred_queue.push_back(*txc->osr);
12115 }
12116 if (!txc->osr->deferred_pending) {
12117 txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
12118 }
12119 ++deferred_queue_size;
12120 txc->osr->deferred_pending->txcs.push_back(*txc);
12121 bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
12122 for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
12123 const auto& op = *opi;
11fdf7f2 12124 ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
7c673cae
FG
12125 bufferlist::const_iterator p = op.data.begin();
12126 for (auto e : op.extents) {
12127 txc->osr->deferred_pending->prepare_write(
12128 cct, wt.seq, e.offset, e.length, p);
12129 }
12130 }
12131 if (deferred_aggressive &&
12132 !txc->osr->deferred_running) {
224ce89b
WB
12133 _deferred_submit_unlock(txc->osr.get());
12134 } else {
12135 deferred_lock.unlock();
7c673cae
FG
12136 }
12137}
12138
224ce89b 12139void BlueStore::deferred_try_submit()
7c673cae
FG
12140{
12141 dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
12142 << deferred_queue_size << " txcs" << dendl;
11fdf7f2 12143 std::lock_guard l(deferred_lock);
224ce89b
WB
12144 vector<OpSequencerRef> osrs;
12145 osrs.reserve(deferred_queue.size());
7c673cae 12146 for (auto& osr : deferred_queue) {
224ce89b
WB
12147 osrs.push_back(&osr);
12148 }
12149 for (auto& osr : osrs) {
181888fb
FG
12150 if (osr->deferred_pending) {
12151 if (!osr->deferred_running) {
12152 _deferred_submit_unlock(osr.get());
12153 deferred_lock.lock();
12154 } else {
12155 dout(20) << __func__ << " osr " << osr << " already has running"
12156 << dendl;
12157 }
12158 } else {
12159 dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
7c673cae
FG
12160 }
12161 }
9f95a23c
TL
12162
12163 deferred_last_submitted = ceph_clock_now();
7c673cae
FG
12164}
12165
224ce89b 12166void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
7c673cae
FG
12167{
12168 dout(10) << __func__ << " osr " << osr
12169 << " " << osr->deferred_pending->iomap.size() << " ios pending "
12170 << dendl;
11fdf7f2
TL
12171 ceph_assert(osr->deferred_pending);
12172 ceph_assert(!osr->deferred_running);
7c673cae
FG
12173
12174 auto b = osr->deferred_pending;
12175 deferred_queue_size -= b->seq_bytes.size();
11fdf7f2 12176 ceph_assert(deferred_queue_size >= 0);
7c673cae
FG
12177
12178 osr->deferred_running = osr->deferred_pending;
12179 osr->deferred_pending = nullptr;
12180
11fdf7f2
TL
12181 deferred_lock.unlock();
12182
12183 for (auto& txc : b->txcs) {
9f95a23c 12184 throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
11fdf7f2 12185 }
7c673cae
FG
12186 uint64_t start = 0, pos = 0;
12187 bufferlist bl;
12188 auto i = b->iomap.begin();
12189 while (true) {
12190 if (i == b->iomap.end() || i->first != pos) {
12191 if (bl.length()) {
12192 dout(20) << __func__ << " write 0x" << std::hex
12193 << start << "~" << bl.length()
12194 << " crc " << bl.crc32c(-1) << std::dec << dendl;
11fdf7f2 12195 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
12196 logger->inc(l_bluestore_deferred_write_ops);
12197 logger->inc(l_bluestore_deferred_write_bytes, bl.length());
12198 int r = bdev->aio_write(start, bl, &b->ioc, false);
11fdf7f2 12199 ceph_assert(r == 0);
7c673cae
FG
12200 }
12201 }
12202 if (i == b->iomap.end()) {
12203 break;
12204 }
12205 start = 0;
12206 pos = i->first;
12207 bl.clear();
12208 }
12209 dout(20) << __func__ << " seq " << i->second.seq << " 0x"
12210 << std::hex << pos << "~" << i->second.bl.length() << std::dec
12211 << dendl;
12212 if (!bl.length()) {
12213 start = pos;
12214 }
12215 pos += i->second.bl.length();
12216 bl.claim_append(i->second.bl);
12217 ++i;
12218 }
224ce89b 12219
7c673cae
FG
12220 bdev->aio_submit(&b->ioc);
12221}
12222
3efd9988
FG
12223struct C_DeferredTrySubmit : public Context {
12224 BlueStore *store;
12225 C_DeferredTrySubmit(BlueStore *s) : store(s) {}
12226 void finish(int r) {
12227 store->deferred_try_submit();
12228 }
12229};
12230
7c673cae
FG
12231void BlueStore::_deferred_aio_finish(OpSequencer *osr)
12232{
12233 dout(10) << __func__ << " osr " << osr << dendl;
11fdf7f2 12234 ceph_assert(osr->deferred_running);
7c673cae
FG
12235 DeferredBatch *b = osr->deferred_running;
12236
12237 {
9f95a23c 12238 deferred_lock.lock();
11fdf7f2 12239 ceph_assert(osr->deferred_running == b);
7c673cae
FG
12240 osr->deferred_running = nullptr;
12241 if (!osr->deferred_pending) {
181888fb 12242 dout(20) << __func__ << " dequeueing" << dendl;
7c673cae
FG
12243 auto q = deferred_queue.iterator_to(*osr);
12244 deferred_queue.erase(q);
9f95a23c 12245 deferred_lock.unlock();
181888fb 12246 } else {
9f95a23c
TL
12247 deferred_lock.unlock();
12248 if (deferred_aggressive) {
12249 dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
12250 finisher.queue(new C_DeferredTrySubmit(this));
12251 } else {
12252 dout(20) << __func__ << " leaving queued, more pending" << dendl;
12253 }
7c673cae
FG
12254 }
12255 }
12256
12257 {
31f18b77 12258 uint64_t costs = 0;
11fdf7f2 12259 {
11fdf7f2
TL
12260 for (auto& i : b->txcs) {
12261 TransContext *txc = &i;
9f95a23c 12262 throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
11fdf7f2
TL
12263 txc->state = TransContext::STATE_DEFERRED_CLEANUP;
12264 costs += txc->cost;
12265 }
7c673cae 12266 }
9f95a23c 12267 throttle.release_deferred_throttle(costs);
7c673cae
FG
12268 }
12269
9f95a23c 12270 {
11fdf7f2 12271 std::lock_guard l(kv_lock);
9f95a23c
TL
12272 deferred_done_queue.emplace_back(b);
12273
12274 // in the normal case, do not bother waking up the kv thread; it will
12275 // catch us on the next commit anyway.
12276 if (deferred_aggressive && !kv_sync_in_progress) {
12277 kv_sync_in_progress = true;
12278 kv_cond.notify_one();
12279 }
7c673cae
FG
12280 }
12281}
12282
12283int BlueStore::_deferred_replay()
12284{
12285 dout(10) << __func__ << " start" << dendl;
7c673cae
FG
12286 int count = 0;
12287 int r = 0;
11fdf7f2
TL
12288 CollectionRef ch = _get_collection(coll_t::meta());
12289 bool fake_ch = false;
12290 if (!ch) {
12291 // hmm, replaying initial mkfs?
12292 ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
12293 fake_ch = true;
12294 }
12295 OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
7c673cae
FG
12296 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
12297 for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
12298 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
12299 << dendl;
12300 bluestore_deferred_transaction_t *deferred_txn =
12301 new bluestore_deferred_transaction_t;
12302 bufferlist bl = it->value();
11fdf7f2 12303 auto p = bl.cbegin();
7c673cae 12304 try {
11fdf7f2 12305 decode(*deferred_txn, p);
7c673cae
FG
12306 } catch (buffer::error& e) {
12307 derr << __func__ << " failed to decode deferred txn "
12308 << pretty_binary_string(it->key()) << dendl;
12309 delete deferred_txn;
12310 r = -EIO;
12311 goto out;
12312 }
11fdf7f2 12313 TransContext *txc = _txc_create(ch.get(), osr, nullptr);
7c673cae
FG
12314 txc->deferred_txn = deferred_txn;
12315 txc->state = TransContext::STATE_KV_DONE;
12316 _txc_state_proc(txc);
12317 }
12318 out:
12319 dout(20) << __func__ << " draining osr" << dendl;
11fdf7f2 12320 _osr_register_zombie(osr);
7c673cae 12321 _osr_drain_all();
11fdf7f2
TL
12322 if (fake_ch) {
12323 new_coll_map.clear();
12324 }
7c673cae
FG
12325 dout(10) << __func__ << " completed " << count << " events" << dendl;
12326 return r;
12327}
12328
12329// ---------------------------
12330// transactions
12331
12332int BlueStore::queue_transactions(
11fdf7f2
TL
12333 CollectionHandle& ch,
12334 vector<Transaction>& tls,
12335 TrackedOpRef op,
12336 ThreadPool::TPHandle *handle)
12337{
12338 FUNCTRACE(cct);
12339 list<Context *> on_applied, on_commit, on_applied_sync;
7c673cae 12340 ObjectStore::Transaction::collect_contexts(
11fdf7f2 12341 tls, &on_applied, &on_commit, &on_applied_sync);
7c673cae 12342
11fdf7f2
TL
12343 auto start = mono_clock::now();
12344
12345 Collection *c = static_cast<Collection*>(ch.get());
12346 OpSequencer *osr = c->osr.get();
12347 dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
7c673cae
FG
12348
12349 // prepare
11fdf7f2
TL
12350 TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
12351 &on_commit);
7c673cae
FG
12352
12353 for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
7c673cae
FG
12354 txc->bytes += (*p).get_num_bytes();
12355 _txc_add_transaction(txc, &(*p));
12356 }
12357 _txc_calc_cost(txc);
12358
12359 _txc_write_nodes(txc, txc->t);
12360
12361 // journal deferred items
12362 if (txc->deferred_txn) {
12363 txc->deferred_txn->seq = ++deferred_seq;
12364 bufferlist bl;
11fdf7f2 12365 encode(*txc->deferred_txn, bl);
7c673cae
FG
12366 string key;
12367 get_deferred_key(txc->deferred_txn->seq, &key);
12368 txc->t->set(PREFIX_DEFERRED, key, bl);
12369 }
12370
12371 _txc_finalize_kv(txc, txc->t);
12372 if (handle)
12373 handle->suspend_tp_timeout();
12374
11fdf7f2 12375 auto tstart = mono_clock::now();
9f95a23c
TL
12376
12377 if (!throttle.try_start_transaction(
12378 *db,
12379 *txc,
12380 tstart)) {
7c673cae 12381 // ensure we do not block here because of deferred writes
9f95a23c
TL
12382 dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
12383 << dendl;
12384 ++deferred_aggressive;
12385 deferred_try_submit();
12386 {
12387 // wake up any previously finished deferred events
12388 std::lock_guard l(kv_lock);
12389 if (!kv_sync_in_progress) {
12390 kv_sync_in_progress = true;
3efd9988
FG
12391 kv_cond.notify_one();
12392 }
9f95a23c
TL
12393 }
12394 throttle.finish_start_transaction(*db, *txc, tstart);
12395 --deferred_aggressive;
7c673cae 12396 }
11fdf7f2 12397 auto tend = mono_clock::now();
7c673cae
FG
12398
12399 if (handle)
12400 handle->reset_tp_timeout();
12401
12402 logger->inc(l_bluestore_txc);
12403
12404 // execute (start)
12405 _txc_state_proc(txc);
12406
11fdf7f2
TL
12407 // we're immediately readable (unlike FileStore)
12408 for (auto c : on_applied_sync) {
12409 c->complete(0);
12410 }
12411 if (!on_applied.empty()) {
12412 if (c->commit_queue) {
12413 c->commit_queue->queue(on_applied);
12414 } else {
12415 finisher.queue(on_applied);
12416 }
12417 }
12418
494da23a
TL
12419 log_latency("submit_transact",
12420 l_bluestore_submit_lat,
12421 mono_clock::now() - start,
12422 cct->_conf->bluestore_log_op_age);
12423 log_latency("throttle_transact",
12424 l_bluestore_throttle_lat,
12425 tend - tstart,
12426 cct->_conf->bluestore_log_op_age);
7c673cae
FG
12427 return 0;
12428}
12429
12430void BlueStore::_txc_aio_submit(TransContext *txc)
12431{
12432 dout(10) << __func__ << " txc " << txc << dendl;
12433 bdev->aio_submit(&txc->ioc);
12434}
12435
12436void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
12437{
12438 Transaction::iterator i = t->begin();
12439
81eedcae 12440 _dump_transaction<30>(cct, t);
7c673cae
FG
12441
12442 vector<CollectionRef> cvec(i.colls.size());
12443 unsigned j = 0;
12444 for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
12445 ++p, ++j) {
12446 cvec[j] = _get_collection(*p);
7c673cae 12447 }
11fdf7f2 12448
7c673cae
FG
12449 vector<OnodeRef> ovec(i.objects.size());
12450
12451 for (int pos = 0; i.have_op(); ++pos) {
12452 Transaction::Op *op = i.decode_op();
12453 int r = 0;
12454
12455 // no coll or obj
12456 if (op->op == Transaction::OP_NOP)
12457 continue;
12458
11fdf7f2 12459
7c673cae
FG
12460 // collection operations
12461 CollectionRef &c = cvec[op->cid];
11fdf7f2
TL
12462
12463 // initialize osd_pool_id and do a smoke test that all collections belong
12464 // to the same pool
12465 spg_t pgid;
12466 if (!!c ? c->cid.is_pg(&pgid) : false) {
12467 ceph_assert(txc->osd_pool_id == META_POOL_ID ||
12468 txc->osd_pool_id == pgid.pool());
12469 txc->osd_pool_id = pgid.pool();
12470 }
12471
7c673cae
FG
12472 switch (op->op) {
12473 case Transaction::OP_RMCOLL:
12474 {
12475 const coll_t &cid = i.get_cid(op->cid);
12476 r = _remove_collection(txc, cid, &c);
12477 if (!r)
12478 continue;
12479 }
12480 break;
12481
12482 case Transaction::OP_MKCOLL:
12483 {
11fdf7f2 12484 ceph_assert(!c);
7c673cae
FG
12485 const coll_t &cid = i.get_cid(op->cid);
12486 r = _create_collection(txc, cid, op->split_bits, &c);
12487 if (!r)
12488 continue;
12489 }
12490 break;
12491
12492 case Transaction::OP_SPLIT_COLLECTION:
11fdf7f2 12493 ceph_abort_msg("deprecated");
7c673cae
FG
12494 break;
12495
12496 case Transaction::OP_SPLIT_COLLECTION2:
12497 {
12498 uint32_t bits = op->split_bits;
12499 uint32_t rem = op->split_rem;
12500 r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
12501 if (!r)
12502 continue;
12503 }
12504 break;
12505
11fdf7f2
TL
12506 case Transaction::OP_MERGE_COLLECTION:
12507 {
12508 uint32_t bits = op->split_bits;
12509 r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
12510 if (!r)
12511 continue;
12512 }
12513 break;
12514
7c673cae
FG
12515 case Transaction::OP_COLL_HINT:
12516 {
12517 uint32_t type = op->hint_type;
12518 bufferlist hint;
12519 i.decode_bl(hint);
11fdf7f2 12520 auto hiter = hint.cbegin();
7c673cae
FG
12521 if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
12522 uint32_t pg_num;
12523 uint64_t num_objs;
11fdf7f2
TL
12524 decode(pg_num, hiter);
12525 decode(num_objs, hiter);
7c673cae
FG
12526 dout(10) << __func__ << " collection hint objects is a no-op, "
12527 << " pg_num " << pg_num << " num_objects " << num_objs
12528 << dendl;
12529 } else {
12530 // Ignore the hint
12531 dout(10) << __func__ << " unknown collection hint " << type << dendl;
12532 }
12533 continue;
12534 }
12535 break;
12536
12537 case Transaction::OP_COLL_SETATTR:
12538 r = -EOPNOTSUPP;
12539 break;
12540
12541 case Transaction::OP_COLL_RMATTR:
12542 r = -EOPNOTSUPP;
12543 break;
12544
12545 case Transaction::OP_COLL_RENAME:
11fdf7f2 12546 ceph_abort_msg("not implemented");
7c673cae
FG
12547 break;
12548 }
12549 if (r < 0) {
12550 derr << __func__ << " error " << cpp_strerror(r)
12551 << " not handled on operation " << op->op
12552 << " (op " << pos << ", counting from 0)" << dendl;
81eedcae 12553 _dump_transaction<0>(cct, t);
11fdf7f2 12554 ceph_abort_msg("unexpected error");
7c673cae
FG
12555 }
12556
12557 // these operations implicity create the object
12558 bool create = false;
12559 if (op->op == Transaction::OP_TOUCH ||
9f95a23c 12560 op->op == Transaction::OP_CREATE ||
7c673cae
FG
12561 op->op == Transaction::OP_WRITE ||
12562 op->op == Transaction::OP_ZERO) {
12563 create = true;
12564 }
12565
12566 // object operations
9f95a23c 12567 std::unique_lock l(c->lock);
7c673cae
FG
12568 OnodeRef &o = ovec[op->oid];
12569 if (!o) {
12570 ghobject_t oid = i.get_oid(op->oid);
9f95a23c 12571 o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
7c673cae
FG
12572 }
12573 if (!create && (!o || !o->exists)) {
12574 dout(10) << __func__ << " op " << op->op << " got ENOENT on "
12575 << i.get_oid(op->oid) << dendl;
12576 r = -ENOENT;
12577 goto endop;
12578 }
12579
12580 switch (op->op) {
9f95a23c 12581 case Transaction::OP_CREATE:
7c673cae
FG
12582 case Transaction::OP_TOUCH:
12583 r = _touch(txc, c, o);
12584 break;
12585
12586 case Transaction::OP_WRITE:
12587 {
12588 uint64_t off = op->off;
12589 uint64_t len = op->len;
12590 uint32_t fadvise_flags = i.get_fadvise_flags();
12591 bufferlist bl;
12592 i.decode_bl(bl);
12593 r = _write(txc, c, o, off, len, bl, fadvise_flags);
12594 }
12595 break;
12596
12597 case Transaction::OP_ZERO:
12598 {
12599 uint64_t off = op->off;
12600 uint64_t len = op->len;
12601 r = _zero(txc, c, o, off, len);
12602 }
12603 break;
12604
12605 case Transaction::OP_TRIMCACHE:
12606 {
12607 // deprecated, no-op
12608 }
12609 break;
12610
12611 case Transaction::OP_TRUNCATE:
12612 {
12613 uint64_t off = op->off;
35e4c445 12614 r = _truncate(txc, c, o, off);
7c673cae
FG
12615 }
12616 break;
12617
12618 case Transaction::OP_REMOVE:
12619 {
12620 r = _remove(txc, c, o);
12621 }
12622 break;
12623
12624 case Transaction::OP_SETATTR:
12625 {
12626 string name = i.decode_string();
12627 bufferptr bp;
12628 i.decode_bp(bp);
12629 r = _setattr(txc, c, o, name, bp);
12630 }
12631 break;
12632
12633 case Transaction::OP_SETATTRS:
12634 {
12635 map<string, bufferptr> aset;
12636 i.decode_attrset(aset);
12637 r = _setattrs(txc, c, o, aset);
12638 }
12639 break;
12640
12641 case Transaction::OP_RMATTR:
12642 {
12643 string name = i.decode_string();
12644 r = _rmattr(txc, c, o, name);
12645 }
12646 break;
12647
12648 case Transaction::OP_RMATTRS:
12649 {
12650 r = _rmattrs(txc, c, o);
12651 }
12652 break;
12653
12654 case Transaction::OP_CLONE:
12655 {
12656 OnodeRef& no = ovec[op->dest_oid];
12657 if (!no) {
12658 const ghobject_t& noid = i.get_oid(op->dest_oid);
12659 no = c->get_onode(noid, true);
12660 }
12661 r = _clone(txc, c, o, no);
12662 }
12663 break;
12664
12665 case Transaction::OP_CLONERANGE:
11fdf7f2 12666 ceph_abort_msg("deprecated");
7c673cae
FG
12667 break;
12668
12669 case Transaction::OP_CLONERANGE2:
12670 {
12671 OnodeRef& no = ovec[op->dest_oid];
12672 if (!no) {
12673 const ghobject_t& noid = i.get_oid(op->dest_oid);
12674 no = c->get_onode(noid, true);
12675 }
12676 uint64_t srcoff = op->off;
12677 uint64_t len = op->len;
12678 uint64_t dstoff = op->dest_off;
12679 r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
12680 }
12681 break;
12682
12683 case Transaction::OP_COLL_ADD:
11fdf7f2 12684 ceph_abort_msg("not implemented");
7c673cae
FG
12685 break;
12686
12687 case Transaction::OP_COLL_REMOVE:
11fdf7f2 12688 ceph_abort_msg("not implemented");
7c673cae
FG
12689 break;
12690
12691 case Transaction::OP_COLL_MOVE:
11fdf7f2 12692 ceph_abort_msg("deprecated");
7c673cae
FG
12693 break;
12694
12695 case Transaction::OP_COLL_MOVE_RENAME:
12696 case Transaction::OP_TRY_RENAME:
12697 {
11fdf7f2 12698 ceph_assert(op->cid == op->dest_cid);
7c673cae
FG
12699 const ghobject_t& noid = i.get_oid(op->dest_oid);
12700 OnodeRef& no = ovec[op->dest_oid];
12701 if (!no) {
12702 no = c->get_onode(noid, false);
12703 }
12704 r = _rename(txc, c, o, no, noid);
12705 }
12706 break;
12707
12708 case Transaction::OP_OMAP_CLEAR:
12709 {
12710 r = _omap_clear(txc, c, o);
12711 }
12712 break;
12713 case Transaction::OP_OMAP_SETKEYS:
12714 {
12715 bufferlist aset_bl;
12716 i.decode_attrset_bl(&aset_bl);
12717 r = _omap_setkeys(txc, c, o, aset_bl);
12718 }
12719 break;
12720 case Transaction::OP_OMAP_RMKEYS:
12721 {
12722 bufferlist keys_bl;
12723 i.decode_keyset_bl(&keys_bl);
12724 r = _omap_rmkeys(txc, c, o, keys_bl);
12725 }
12726 break;
12727 case Transaction::OP_OMAP_RMKEYRANGE:
12728 {
12729 string first, last;
12730 first = i.decode_string();
12731 last = i.decode_string();
12732 r = _omap_rmkey_range(txc, c, o, first, last);
12733 }
12734 break;
12735 case Transaction::OP_OMAP_SETHEADER:
12736 {
12737 bufferlist bl;
12738 i.decode_bl(bl);
12739 r = _omap_setheader(txc, c, o, bl);
12740 }
12741 break;
12742
12743 case Transaction::OP_SETALLOCHINT:
12744 {
12745 r = _set_alloc_hint(txc, c, o,
12746 op->expected_object_size,
12747 op->expected_write_size,
12748 op->alloc_hint_flags);
12749 }
12750 break;
12751
12752 default:
11fdf7f2 12753 derr << __func__ << " bad op " << op->op << dendl;
7c673cae
FG
12754 ceph_abort();
12755 }
12756
12757 endop:
12758 if (r < 0) {
12759 bool ok = false;
12760
12761 if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
12762 op->op == Transaction::OP_CLONE ||
12763 op->op == Transaction::OP_CLONERANGE2 ||
12764 op->op == Transaction::OP_COLL_ADD ||
12765 op->op == Transaction::OP_SETATTR ||
12766 op->op == Transaction::OP_SETATTRS ||
12767 op->op == Transaction::OP_RMATTR ||
12768 op->op == Transaction::OP_OMAP_SETKEYS ||
12769 op->op == Transaction::OP_OMAP_RMKEYS ||
12770 op->op == Transaction::OP_OMAP_RMKEYRANGE ||
12771 op->op == Transaction::OP_OMAP_SETHEADER))
12772 // -ENOENT is usually okay
12773 ok = true;
12774 if (r == -ENODATA)
12775 ok = true;
12776
12777 if (!ok) {
12778 const char *msg = "unexpected error code";
12779
12780 if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
12781 op->op == Transaction::OP_CLONE ||
12782 op->op == Transaction::OP_CLONERANGE2))
12783 msg = "ENOENT on clone suggests osd bug";
12784
12785 if (r == -ENOSPC)
12786 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
12787 // by partially applying transactions.
12788 msg = "ENOSPC from bluestore, misconfigured cluster";
12789
12790 if (r == -ENOTEMPTY) {
12791 msg = "ENOTEMPTY suggests garbage data in osd data dir";
12792 }
12793
12794 derr << __func__ << " error " << cpp_strerror(r)
12795 << " not handled on operation " << op->op
12796 << " (op " << pos << ", counting from 0)"
12797 << dendl;
12798 derr << msg << dendl;
81eedcae 12799 _dump_transaction<0>(cct, t);
11fdf7f2 12800 ceph_abort_msg("unexpected error");
7c673cae
FG
12801 }
12802 }
12803 }
12804}
12805
12806
12807
12808// -----------------
12809// write operations
12810
12811int BlueStore::_touch(TransContext *txc,
12812 CollectionRef& c,
12813 OnodeRef &o)
12814{
12815 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
12816 int r = 0;
7c673cae
FG
12817 _assign_nid(txc, o);
12818 txc->write_onode(o);
12819 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
12820 return r;
12821}
12822
7c673cae
FG
12823void BlueStore::_pad_zeros(
12824 bufferlist *bl, uint64_t *offset,
12825 uint64_t chunk_size)
12826{
12827 auto length = bl->length();
12828 dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
12829 << " chunk_size 0x" << chunk_size << std::dec << dendl;
12830 dout(40) << "before:\n";
12831 bl->hexdump(*_dout);
12832 *_dout << dendl;
12833 // front
12834 size_t front_pad = *offset % chunk_size;
12835 size_t back_pad = 0;
12836 size_t pad_count = 0;
12837 if (front_pad) {
11fdf7f2
TL
12838 size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
12839 bufferptr z = buffer::create_small_page_aligned(chunk_size);
224ce89b 12840 z.zero(0, front_pad, false);
7c673cae 12841 pad_count += front_pad;
9f95a23c 12842 bl->begin().copy(front_copy, z.c_str() + front_pad);
7c673cae
FG
12843 if (front_copy + front_pad < chunk_size) {
12844 back_pad = chunk_size - (length + front_pad);
224ce89b 12845 z.zero(front_pad + length, back_pad, false);
7c673cae
FG
12846 pad_count += back_pad;
12847 }
12848 bufferlist old, t;
12849 old.swap(*bl);
12850 t.substr_of(old, front_copy, length - front_copy);
12851 bl->append(z);
12852 bl->claim_append(t);
12853 *offset -= front_pad;
224ce89b 12854 length += pad_count;
7c673cae
FG
12855 }
12856
12857 // back
12858 uint64_t end = *offset + length;
12859 unsigned back_copy = end % chunk_size;
12860 if (back_copy) {
11fdf7f2 12861 ceph_assert(back_pad == 0);
7c673cae 12862 back_pad = chunk_size - back_copy;
11fdf7f2 12863 ceph_assert(back_copy <= length);
7c673cae 12864 bufferptr tail(chunk_size);
9f95a23c 12865 bl->begin(length - back_copy).copy(back_copy, tail.c_str());
224ce89b 12866 tail.zero(back_copy, back_pad, false);
7c673cae
FG
12867 bufferlist old;
12868 old.swap(*bl);
12869 bl->substr_of(old, 0, length - back_copy);
12870 bl->append(tail);
12871 length += back_pad;
12872 pad_count += back_pad;
12873 }
12874 dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
12875 << back_pad << " on front/back, now 0x" << *offset << "~"
12876 << length << std::dec << dendl;
12877 dout(40) << "after:\n";
12878 bl->hexdump(*_dout);
12879 *_dout << dendl;
12880 if (pad_count)
12881 logger->inc(l_bluestore_write_pad_bytes, pad_count);
11fdf7f2 12882 ceph_assert(bl->length() == length);
7c673cae
FG
12883}
12884
12885void BlueStore::_do_write_small(
12886 TransContext *txc,
12887 CollectionRef &c,
12888 OnodeRef o,
12889 uint64_t offset, uint64_t length,
12890 bufferlist::iterator& blp,
12891 WriteContext *wctx)
12892{
12893 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
12894 << std::dec << dendl;
11fdf7f2 12895 ceph_assert(length < min_alloc_size);
7c673cae
FG
12896 uint64_t end_offs = offset + length;
12897
12898 logger->inc(l_bluestore_write_small);
12899 logger->inc(l_bluestore_write_small_bytes, length);
12900
12901 bufferlist bl;
12902 blp.copy(length, bl);
12903
81eedcae
TL
12904 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
12905 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
12906 uint32_t alloc_len = min_alloc_size;
12907 auto offset0 = p2align<uint64_t>(offset, alloc_len);
12908
12909 bool any_change;
12910
12911 // search suitable extent in both forward and reverse direction in
12912 // [offset - target_max_blob_size, offset + target_max_blob_size] range
12913 // then check if blob can be reused via can_reuse_blob func or apply
12914 // direct/deferred write (the latter for extents including or higher
12915 // than 'offset' only).
12916 o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
12917
7c673cae
FG
12918 // Look for an existing mutable blob we can use.
12919 auto begin = o->extent_map.extent_map.begin();
12920 auto end = o->extent_map.extent_map.end();
12921 auto ep = o->extent_map.seek_lextent(offset);
12922 if (ep != begin) {
12923 --ep;
12924 if (ep->blob_end() <= offset) {
12925 ++ep;
12926 }
12927 }
12928 auto prev_ep = ep;
12929 if (prev_ep != begin) {
12930 --prev_ep;
12931 } else {
12932 prev_ep = end; // to avoid this extent check as it's a duplicate
12933 }
12934
eafe8130
TL
12935 boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
12936 // We don't want to have more blobs than min alloc units fit
12937 // into 2 max blobs
12938 size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
12939 bool above_blob_threshold = false;
12940
12941 inspected_blobs.reserve(blob_threshold);
12942
12943 uint64_t max_off = 0;
12944 auto start_ep = ep;
12945 auto end_ep = ep; // exclusively
7c673cae
FG
12946 do {
12947 any_change = false;
12948
12949 if (ep != end && ep->logical_offset < offset + max_bsize) {
12950 BlobRef b = ep->blob;
eafe8130
TL
12951 if (!above_blob_threshold) {
12952 inspected_blobs.insert(&b->get_blob());
12953 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
12954 }
12955 max_off = ep->logical_end();
7c673cae 12956 auto bstart = ep->blob_start();
eafe8130 12957
7c673cae
FG
12958 dout(20) << __func__ << " considering " << *b
12959 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
12960 if (bstart >= end_offs) {
12961 dout(20) << __func__ << " ignoring distant " << *b << dendl;
12962 } else if (!b->get_blob().is_mutable()) {
12963 dout(20) << __func__ << " ignoring immutable " << *b << dendl;
12964 } else if (ep->logical_offset % min_alloc_size !=
12965 ep->blob_offset % min_alloc_size) {
12966 dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
12967 } else {
12968 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
12969 // can we pad our head/tail out with zeros?
12970 uint64_t head_pad, tail_pad;
11fdf7f2
TL
12971 head_pad = p2phase(offset, chunk_size);
12972 tail_pad = p2nphase(end_offs, chunk_size);
7c673cae
FG
12973 if (head_pad || tail_pad) {
12974 o->extent_map.fault_range(db, offset - head_pad,
12975 end_offs - offset + head_pad + tail_pad);
12976 }
12977 if (head_pad &&
12978 o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
12979 head_pad = 0;
12980 }
12981 if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
12982 tail_pad = 0;
12983 }
12984
12985 uint64_t b_off = offset - head_pad - bstart;
12986 uint64_t b_len = length + head_pad + tail_pad;
12987
12988 // direct write into unused blocks of an existing mutable blob?
12989 if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
12990 b->get_blob().get_ondisk_length() >= b_off + b_len &&
12991 b->get_blob().is_unused(b_off, b_len) &&
12992 b->get_blob().is_allocated(b_off, b_len)) {
224ce89b 12993 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
12994
12995 dout(20) << __func__ << " write to unused 0x" << std::hex
12996 << b_off << "~" << b_len
12997 << " pad 0x" << head_pad << " + 0x" << tail_pad
12998 << std::dec << " of mutable " << *b << dendl;
224ce89b 12999 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
13000 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13001
11fdf7f2 13002 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
13003 if (b_len <= prefer_deferred_size) {
13004 dout(20) << __func__ << " deferring small 0x" << std::hex
13005 << b_len << std::dec << " unused write via deferred" << dendl;
9f95a23c 13006 bluestore_deferred_op_t *op = _get_deferred_op(txc);
7c673cae
FG
13007 op->op = bluestore_deferred_op_t::OP_WRITE;
13008 b->get_blob().map(
13009 b_off, b_len,
13010 [&](uint64_t offset, uint64_t length) {
13011 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13012 return 0;
13013 });
224ce89b 13014 op->data = bl;
7c673cae
FG
13015 } else {
13016 b->get_blob().map_bl(
224ce89b 13017 b_off, bl,
7c673cae
FG
13018 [&](uint64_t offset, bufferlist& t) {
13019 bdev->aio_write(offset, t,
13020 &txc->ioc, wctx->buffered);
13021 });
13022 }
13023 }
224ce89b 13024 b->dirty_blob().calc_csum(b_off, bl);
7c673cae
FG
13025 dout(20) << __func__ << " lex old " << *ep << dendl;
13026 Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
13027 b,
13028 &wctx->old_extents);
13029 b->dirty_blob().mark_used(le->blob_offset, le->length);
13030 txc->statfs_delta.stored() += le->length;
13031 dout(20) << __func__ << " lex " << *le << dendl;
13032 logger->inc(l_bluestore_write_small_unused);
13033 return;
13034 }
13035 // read some data to fill out the chunk?
11fdf7f2
TL
13036 uint64_t head_read = p2phase(b_off, chunk_size);
13037 uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
7c673cae
FG
13038 if ((head_read || tail_read) &&
13039 (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
13040 head_read + tail_read < min_alloc_size) {
13041 b_off -= head_read;
13042 b_len += head_read + tail_read;
13043
13044 } else {
13045 head_read = tail_read = 0;
13046 }
13047
13048 // chunk-aligned deferred overwrite?
13049 if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
13050 b_off % chunk_size == 0 &&
13051 b_len % chunk_size == 0 &&
13052 b->get_blob().is_allocated(b_off, b_len)) {
13053
224ce89b 13054 _apply_padding(head_pad, tail_pad, bl);
7c673cae
FG
13055
13056 dout(20) << __func__ << " reading head 0x" << std::hex << head_read
13057 << " and tail 0x" << tail_read << std::dec << dendl;
13058 if (head_read) {
13059 bufferlist head_bl;
13060 int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
13061 head_bl, 0);
11fdf7f2 13062 ceph_assert(r >= 0 && r <= (int)head_read);
7c673cae
FG
13063 size_t zlen = head_read - r;
13064 if (zlen) {
13065 head_bl.append_zero(zlen);
13066 logger->inc(l_bluestore_write_pad_bytes, zlen);
13067 }
11fdf7f2
TL
13068 head_bl.claim_append(bl);
13069 bl.swap(head_bl);
7c673cae
FG
13070 logger->inc(l_bluestore_write_penalty_read_ops);
13071 }
13072 if (tail_read) {
13073 bufferlist tail_bl;
13074 int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
13075 tail_bl, 0);
11fdf7f2 13076 ceph_assert(r >= 0 && r <= (int)tail_read);
7c673cae
FG
13077 size_t zlen = tail_read - r;
13078 if (zlen) {
13079 tail_bl.append_zero(zlen);
13080 logger->inc(l_bluestore_write_pad_bytes, zlen);
13081 }
224ce89b 13082 bl.claim_append(tail_bl);
7c673cae
FG
13083 logger->inc(l_bluestore_write_penalty_read_ops);
13084 }
13085 logger->inc(l_bluestore_write_small_pre_read);
13086
224ce89b 13087 _buffer_cache_write(txc, b, b_off, bl,
7c673cae
FG
13088 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13089
7c673cae 13090 if (b->get_blob().csum_type) {
224ce89b 13091 b->dirty_blob().calc_csum(b_off, bl);
7c673cae 13092 }
11fdf7f2
TL
13093
13094 if (!g_conf()->bluestore_debug_omit_block_device_write) {
9f95a23c 13095 bluestore_deferred_op_t *op = _get_deferred_op(txc);
11fdf7f2
TL
13096 op->op = bluestore_deferred_op_t::OP_WRITE;
13097 int r = b->get_blob().map(
13098 b_off, b_len,
13099 [&](uint64_t offset, uint64_t length) {
13100 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13101 return 0;
13102 });
13103 ceph_assert(r == 0);
13104 op->data.claim(bl);
13105 dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
13106 << b_len << std::dec << " of mutable " << *b
13107 << " at " << op->extents << dendl;
13108 }
13109
7c673cae
FG
13110 Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
13111 b, &wctx->old_extents);
13112 b->dirty_blob().mark_used(le->blob_offset, le->length);
13113 txc->statfs_delta.stored() += le->length;
13114 dout(20) << __func__ << " lex " << *le << dendl;
13115 logger->inc(l_bluestore_write_small_deferred);
13116 return;
13117 }
224ce89b
WB
13118 // try to reuse blob if we can
13119 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
13120 max_bsize,
13121 offset0 - bstart,
13122 &alloc_len)) {
11fdf7f2 13123 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
13124 // fit into reused blob
13125 // Need to check for pending writes desiring to
13126 // reuse the same pextent. The rationale is that during GC two chunks
13127 // from garbage blobs(compressed?) can share logical space within the same
13128 // AU. That's in turn might be caused by unaligned len in clone_range2.
13129 // Hence the second write will fail in an attempt to reuse blob at
13130 // do_alloc_write().
13131 if (!wctx->has_conflict(b,
13132 offset0,
13133 offset0 + alloc_len,
13134 min_alloc_size)) {
13135
13136 // we can't reuse pad_head/pad_tail since they might be truncated
13137 // due to existent extents
13138 uint64_t b_off = offset - bstart;
13139 uint64_t b_off0 = b_off;
13140 _pad_zeros(&bl, &b_off0, chunk_size);
13141
13142 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
13143 << " (0x" << b_off0 << "~" << bl.length() << ")"
13144 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
13145 << std::dec << dendl;
13146
13147 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13148 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13149 false, false);
13150 logger->inc(l_bluestore_write_small_unused);
13151 return;
13152 }
13153 }
13154 }
13155 ++ep;
eafe8130 13156 end_ep = ep;
7c673cae
FG
13157 any_change = true;
13158 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13159
13160 // check extent for reuse in reverse order
13161 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
13162 BlobRef b = prev_ep->blob;
eafe8130
TL
13163 if (!above_blob_threshold) {
13164 inspected_blobs.insert(&b->get_blob());
13165 above_blob_threshold = inspected_blobs.size() >= blob_threshold;
13166 }
13167 start_ep = prev_ep;
7c673cae
FG
13168 auto bstart = prev_ep->blob_start();
13169 dout(20) << __func__ << " considering " << *b
13170 << " bstart 0x" << std::hex << bstart << std::dec << dendl;
224ce89b 13171 if (b->can_reuse_blob(min_alloc_size,
7c673cae
FG
13172 max_bsize,
13173 offset0 - bstart,
13174 &alloc_len)) {
11fdf7f2 13175 ceph_assert(alloc_len == min_alloc_size); // expecting data always
7c673cae
FG
13176 // fit into reused blob
13177 // Need to check for pending writes desiring to
13178 // reuse the same pextent. The rationale is that during GC two chunks
13179 // from garbage blobs(compressed?) can share logical space within the same
13180 // AU. That's in turn might be caused by unaligned len in clone_range2.
13181 // Hence the second write will fail in an attempt to reuse blob at
13182 // do_alloc_write().
13183 if (!wctx->has_conflict(b,
13184 offset0,
13185 offset0 + alloc_len,
13186 min_alloc_size)) {
13187
13188 uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
13189 uint64_t b_off = offset - bstart;
13190 uint64_t b_off0 = b_off;
13191 _pad_zeros(&bl, &b_off0, chunk_size);
13192
13193 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b
WB
13194 << " (0x" << b_off0 << "~" << bl.length() << ")"
13195 << " (0x" << b_off << "~" << length << ")"
7c673cae
FG
13196 << std::dec << dendl;
13197
13198 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13199 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
13200 false, false);
13201 logger->inc(l_bluestore_write_small_unused);
13202 return;
13203 }
13204 }
13205 if (prev_ep != begin) {
13206 --prev_ep;
13207 any_change = true;
13208 } else {
13209 prev_ep = end; // to avoid useless first extent re-check
13210 }
13211 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13212 } while (any_change);
13213
eafe8130
TL
13214 if (above_blob_threshold) {
13215 dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
13216 << " " << std::hex << min_off << "~" << max_off << std::dec
13217 << dendl;
13218 ceph_assert(start_ep != end_ep);
13219 for (auto ep = start_ep; ep != end_ep; ++ep) {
13220 dout(20) << __func__ << " inserting for GC "
13221 << std::hex << ep->logical_offset << "~" << ep->length
13222 << std::dec << dendl;
13223
13224 wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
13225 }
13226 // insert newly written extent to GC
13227 wctx->extents_to_gc.union_insert(offset, length);
13228 dout(20) << __func__ << " inserting (last) for GC "
13229 << std::hex << offset << "~" << length
13230 << std::dec << dendl;
13231 }
7c673cae 13232 // new blob.
7c673cae 13233 BlobRef b = c->new_blob();
11fdf7f2 13234 uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
7c673cae
FG
13235 uint64_t b_off0 = b_off;
13236 _pad_zeros(&bl, &b_off0, block_size);
13237 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
13238 wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
7c673cae
FG
13239
13240 return;
13241}
13242
13243void BlueStore::_do_write_big(
13244 TransContext *txc,
13245 CollectionRef &c,
13246 OnodeRef o,
13247 uint64_t offset, uint64_t length,
13248 bufferlist::iterator& blp,
13249 WriteContext *wctx)
13250{
13251 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
13252 << " target_blob_size 0x" << wctx->target_blob_size << std::dec
13253 << " compress " << (int)wctx->compress
13254 << dendl;
13255 logger->inc(l_bluestore_write_big);
13256 logger->inc(l_bluestore_write_big_bytes, length);
13257 o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
11fdf7f2 13258 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae
FG
13259 while (length > 0) {
13260 bool new_blob = false;
11fdf7f2 13261 uint32_t l = std::min(max_bsize, length);
7c673cae
FG
13262 BlobRef b;
13263 uint32_t b_off = 0;
13264
13265 //attempting to reuse existing blob
13266 if (!wctx->compress) {
13267 // look for an existing mutable blob we can reuse
13268 auto begin = o->extent_map.extent_map.begin();
13269 auto end = o->extent_map.extent_map.end();
13270 auto ep = o->extent_map.seek_lextent(offset);
13271 auto prev_ep = ep;
13272 if (prev_ep != begin) {
13273 --prev_ep;
13274 } else {
13275 prev_ep = end; // to avoid this extent check as it's a duplicate
13276 }
13277 auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
13278 // search suitable extent in both forward and reverse direction in
13279 // [offset - target_max_blob_size, offset + target_max_blob_size] range
224ce89b 13280 // then check if blob can be reused via can_reuse_blob func.
7c673cae
FG
13281 bool any_change;
13282 do {
13283 any_change = false;
13284 if (ep != end && ep->logical_offset < offset + max_bsize) {
13285 if (offset >= ep->blob_start() &&
224ce89b 13286 ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13287 offset - ep->blob_start(),
13288 &l)) {
13289 b = ep->blob;
13290 b_off = offset - ep->blob_start();
13291 prev_ep = end; // to avoid check below
13292 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13293 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13294 } else {
13295 ++ep;
13296 any_change = true;
13297 }
13298 }
13299
13300 if (prev_ep != end && prev_ep->logical_offset >= min_off) {
224ce89b 13301 if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
7c673cae
FG
13302 offset - prev_ep->blob_start(),
13303 &l)) {
13304 b = prev_ep->blob;
13305 b_off = offset - prev_ep->blob_start();
13306 dout(20) << __func__ << " reuse blob " << *b << std::hex
224ce89b 13307 << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
7c673cae
FG
13308 } else if (prev_ep != begin) {
13309 --prev_ep;
13310 any_change = true;
13311 } else {
13312 prev_ep = end; // to avoid useless first extent re-check
13313 }
13314 }
13315 } while (b == nullptr && any_change);
13316 }
13317 if (b == nullptr) {
13318 b = c->new_blob();
13319 b_off = 0;
13320 new_blob = true;
13321 }
13322
13323 bufferlist t;
13324 blp.copy(l, t);
13325 wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
13326 offset += l;
13327 length -= l;
13328 logger->inc(l_bluestore_write_big_blobs);
13329 }
13330}
13331
13332int BlueStore::_do_alloc_write(
13333 TransContext *txc,
13334 CollectionRef coll,
13335 OnodeRef o,
13336 WriteContext *wctx)
13337{
13338 dout(20) << __func__ << " txc " << txc
13339 << " " << wctx->writes.size() << " blobs"
13340 << dendl;
3efd9988
FG
13341 if (wctx->writes.empty()) {
13342 return 0;
7c673cae
FG
13343 }
13344
7c673cae
FG
13345 CompressorRef c;
13346 double crr = 0;
13347 if (wctx->compress) {
13348 c = select_option(
13349 "compression_algorithm",
13350 compressor,
13351 [&]() {
13352 string val;
13353 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
13354 CompressorRef cp = compressor;
13355 if (!cp || cp->get_type_name() != val) {
13356 cp = Compressor::create(cct, val);
11fdf7f2
TL
13357 if (!cp) {
13358 if (_set_compression_alert(false, val.c_str())) {
13359 derr << __func__ << " unable to initialize " << val.c_str()
13360 << " compressor" << dendl;
13361 }
13362 }
7c673cae
FG
13363 }
13364 return boost::optional<CompressorRef>(cp);
13365 }
13366 return boost::optional<CompressorRef>();
13367 }
13368 );
13369
13370 crr = select_option(
13371 "compression_required_ratio",
13372 cct->_conf->bluestore_compression_required_ratio,
13373 [&]() {
13374 double val;
3efd9988 13375 if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
7c673cae
FG
13376 return boost::optional<double>(val);
13377 }
13378 return boost::optional<double>();
13379 }
13380 );
13381 }
13382
13383 // checksum
11fdf7f2 13384 int64_t csum = csum_type.load();
7c673cae
FG
13385 csum = select_option(
13386 "csum_type",
13387 csum,
13388 [&]() {
11fdf7f2 13389 int64_t val;
3efd9988 13390 if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
11fdf7f2 13391 return boost::optional<int64_t>(val);
7c673cae 13392 }
11fdf7f2 13393 return boost::optional<int64_t>();
7c673cae
FG
13394 }
13395 );
13396
3efd9988
FG
13397 // compress (as needed) and calc needed space
13398 uint64_t need = 0;
11fdf7f2 13399 auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
7c673cae 13400 for (auto& wi : wctx->writes) {
3efd9988 13401 if (c && wi.blob_length > min_alloc_size) {
11fdf7f2 13402 auto start = mono_clock::now();
7c673cae
FG
13403
13404 // compress
11fdf7f2
TL
13405 ceph_assert(wi.b_off == 0);
13406 ceph_assert(wi.blob_length == wi.bl.length());
3efd9988 13407
7c673cae
FG
13408 // FIXME: memory alignment here is bad
13409 bufferlist t;
3efd9988 13410 int r = c->compress(wi.bl, t);
3efd9988 13411 uint64_t want_len_raw = wi.blob_length * crr;
11fdf7f2 13412 uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
a8e16298
TL
13413 bool rejected = false;
13414 uint64_t compressed_len = t.length();
13415 // do an approximate (fast) estimation for resulting blob size
13416 // that doesn't take header overhead into account
11fdf7f2 13417 uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
13418 if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
13419 bluestore_compression_header_t chdr;
13420 chdr.type = c->get_type();
13421 chdr.length = t.length();
13422 encode(chdr, wi.compressed_bl);
13423 wi.compressed_bl.claim_append(t);
13424
13425 compressed_len = wi.compressed_bl.length();
11fdf7f2 13426 result_len = p2roundup(compressed_len, min_alloc_size);
a8e16298
TL
13427 if (result_len <= want_len && result_len < wi.blob_length) {
13428 // Cool. We compressed at least as much as we were hoping to.
13429 // pad out to min_alloc_size
13430 wi.compressed_bl.append_zero(result_len - compressed_len);
13431 wi.compressed_len = compressed_len;
13432 wi.compressed = true;
13433 logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
13434 dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
13435 << " -> 0x" << compressed_len << " => 0x" << result_len
13436 << " with " << c->get_type()
13437 << std::dec << dendl;
13438 txc->statfs_delta.compressed() += compressed_len;
13439 txc->statfs_delta.compressed_original() += wi.blob_length;
13440 txc->statfs_delta.compressed_allocated() += result_len;
13441 logger->inc(l_bluestore_compress_success_count);
13442 need += result_len;
13443 } else {
13444 rejected = true;
13445 }
13446 } else if (r != 0) {
13447 dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
13448 << " bytes compressed using " << c->get_type_name()
13449 << std::dec
13450 << " failed with errcode = " << r
13451 << ", leaving uncompressed"
13452 << dendl;
13453 logger->inc(l_bluestore_compress_rejected_count);
13454 need += wi.blob_length;
7c673cae 13455 } else {
a8e16298
TL
13456 rejected = true;
13457 }
13458
13459 if (rejected) {
3efd9988 13460 dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
a8e16298 13461 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
3efd9988
FG
13462 << " with " << c->get_type()
13463 << ", which is more than required 0x" << want_len_raw
7c673cae 13464 << " -> 0x" << want_len
3efd9988
FG
13465 << ", leaving uncompressed"
13466 << std::dec << dendl;
13467 logger->inc(l_bluestore_compress_rejected_count);
13468 need += wi.blob_length;
7c673cae 13469 }
494da23a
TL
13470 log_latency("compress@_do_alloc_write",
13471 l_bluestore_compress_lat,
13472 mono_clock::now() - start,
13473 cct->_conf->bluestore_log_op_age );
3efd9988
FG
13474 } else {
13475 need += wi.blob_length;
7c673cae 13476 }
3efd9988 13477 }
a8e16298 13478 PExtentVector prealloc;
3efd9988 13479 prealloc.reserve(2 * wctx->writes.size());;
11fdf7f2 13480 int64_t prealloc_left = 0;
3efd9988
FG
13481 prealloc_left = alloc->allocate(
13482 need, min_alloc_size, need,
13483 0, &prealloc);
eafe8130 13484 if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
11fdf7f2 13485 derr << __func__ << " failed to allocate 0x" << std::hex << need
eafe8130 13486 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
11fdf7f2
TL
13487 << " min_alloc_size 0x" << min_alloc_size
13488 << " available 0x " << alloc->get_free()
13489 << std::dec << dendl;
13490 if (prealloc.size()) {
13491 alloc->release(prealloc);
13492 }
a8e16298
TL
13493 return -ENOSPC;
13494 }
9f95a23c 13495 _collect_allocation_stats(need, min_alloc_size, prealloc.size());
a8e16298 13496
3efd9988
FG
13497 dout(20) << __func__ << " prealloc " << prealloc << dendl;
13498 auto prealloc_pos = prealloc.begin();
13499
13500 for (auto& wi : wctx->writes) {
13501 BlobRef b = wi.b;
13502 bluestore_blob_t& dblob = b->dirty_blob();
13503 uint64_t b_off = wi.b_off;
13504 bufferlist *l = &wi.bl;
13505 uint64_t final_length = wi.blob_length;
13506 uint64_t csum_length = wi.blob_length;
3efd9988
FG
13507 if (wi.compressed) {
13508 final_length = wi.compressed_bl.length();
13509 csum_length = final_length;
3efd9988
FG
13510 l = &wi.compressed_bl;
13511 dblob.set_compressed(wi.blob_length, wi.compressed_len);
13512 } else if (wi.new_blob) {
7c673cae 13513 // initialize newly created blob only
11fdf7f2
TL
13514 ceph_assert(dblob.is_mutable());
13515 unsigned csum_order;
7c673cae
FG
13516 if (l->length() != wi.blob_length) {
13517 // hrm, maybe we could do better here, but let's not bother.
13518 dout(20) << __func__ << " forcing csum_order to block_size_order "
13519 << block_size_order << dendl;
31f18b77 13520 csum_order = block_size_order;
7c673cae
FG
13521 } else {
13522 csum_order = std::min(wctx->csum_order, ctz(l->length()));
13523 }
13524 // try to align blob with max_blob_size to improve
13525 // its reuse ratio, e.g. in case of reverse write
13526 uint32_t suggested_boff =
13527 (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
13528 if ((suggested_boff % (1 << csum_order)) == 0 &&
13529 suggested_boff + final_length <= max_bsize &&
13530 suggested_boff > b_off) {
181888fb 13531 dout(20) << __func__ << " forcing blob_offset to 0x"
7c673cae 13532 << std::hex << suggested_boff << std::dec << dendl;
11fdf7f2 13533 ceph_assert(suggested_boff >= b_off);
7c673cae
FG
13534 csum_length += suggested_boff - b_off;
13535 b_off = suggested_boff;
13536 }
181888fb
FG
13537 if (csum != Checksummer::CSUM_NONE) {
13538 dout(20) << __func__ << " initialize csum setting for new blob " << *b
13539 << " csum_type " << Checksummer::get_csum_type_string(csum)
13540 << " csum_order " << csum_order
13541 << " csum_length 0x" << std::hex << csum_length << std::dec
13542 << dendl;
13543 dblob.init_csum(csum, csum_order, csum_length);
13544 }
7c673cae
FG
13545 }
13546
a8e16298 13547 PExtentVector extents;
3efd9988
FG
13548 int64_t left = final_length;
13549 while (left > 0) {
11fdf7f2 13550 ceph_assert(prealloc_left > 0);
3efd9988
FG
13551 if (prealloc_pos->length <= left) {
13552 prealloc_left -= prealloc_pos->length;
13553 left -= prealloc_pos->length;
13554 txc->statfs_delta.allocated() += prealloc_pos->length;
13555 extents.push_back(*prealloc_pos);
13556 ++prealloc_pos;
13557 } else {
13558 extents.emplace_back(prealloc_pos->offset, left);
13559 prealloc_pos->offset += left;
13560 prealloc_pos->length -= left;
13561 prealloc_left -= left;
13562 txc->statfs_delta.allocated() += left;
13563 left = 0;
13564 break;
13565 }
13566 }
7c673cae 13567 for (auto& p : extents) {
3efd9988 13568 txc->allocated.insert(p.offset, p.length);
7c673cae 13569 }
11fdf7f2 13570 dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
7c673cae 13571
181888fb
FG
13572 dout(20) << __func__ << " blob " << *b << dendl;
13573 if (dblob.has_csum()) {
7c673cae
FG
13574 dblob.calc_csum(b_off, *l);
13575 }
181888fb 13576
7c673cae
FG
13577 if (wi.mark_unused) {
13578 auto b_end = b_off + wi.bl.length();
13579 if (b_off) {
13580 dblob.add_unused(0, b_off);
13581 }
13582 if (b_end < wi.blob_length) {
13583 dblob.add_unused(b_end, wi.blob_length - b_end);
13584 }
13585 }
13586
13587 Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
13588 b_off + (wi.b_off0 - wi.b_off),
13589 wi.length0,
13590 wi.b,
13591 nullptr);
13592 wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
13593 txc->statfs_delta.stored() += le->length;
13594 dout(20) << __func__ << " lex " << *le << dendl;
13595 _buffer_cache_write(txc, wi.b, b_off, wi.bl,
13596 wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
13597
13598 // queue io
11fdf7f2 13599 if (!g_conf()->bluestore_debug_omit_block_device_write) {
7c673cae
FG
13600 if (l->length() <= prefer_deferred_size.load()) {
13601 dout(20) << __func__ << " deferring small 0x" << std::hex
13602 << l->length() << std::dec << " write via deferred" << dendl;
9f95a23c 13603 bluestore_deferred_op_t *op = _get_deferred_op(txc);
7c673cae
FG
13604 op->op = bluestore_deferred_op_t::OP_WRITE;
13605 int r = b->get_blob().map(
13606 b_off, l->length(),
13607 [&](uint64_t offset, uint64_t length) {
13608 op->extents.emplace_back(bluestore_pextent_t(offset, length));
13609 return 0;
13610 });
11fdf7f2 13611 ceph_assert(r == 0);
7c673cae 13612 op->data = *l;
81eedcae 13613 logger->inc(l_bluestore_write_small_deferred);
7c673cae
FG
13614 } else {
13615 b->get_blob().map_bl(
13616 b_off, *l,
13617 [&](uint64_t offset, bufferlist& t) {
13618 bdev->aio_write(offset, t, &txc->ioc, false);
13619 });
81eedcae 13620 logger->inc(l_bluestore_write_small_new);
7c673cae
FG
13621 }
13622 }
13623 }
11fdf7f2
TL
13624 ceph_assert(prealloc_pos == prealloc.end());
13625 ceph_assert(prealloc_left == 0);
7c673cae
FG
13626 return 0;
13627}
13628
13629void BlueStore::_wctx_finish(
13630 TransContext *txc,
13631 CollectionRef& c,
13632 OnodeRef o,
31f18b77
FG
13633 WriteContext *wctx,
13634 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
13635{
13636 auto oep = wctx->old_extents.begin();
13637 while (oep != wctx->old_extents.end()) {
13638 auto &lo = *oep;
13639 oep = wctx->old_extents.erase(oep);
13640 dout(20) << __func__ << " lex_old " << lo.e << dendl;
13641 BlobRef b = lo.e.blob;
13642 const bluestore_blob_t& blob = b->get_blob();
13643 if (blob.is_compressed()) {
13644 if (lo.blob_empty) {
13645 txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
13646 }
13647 txc->statfs_delta.compressed_original() -= lo.e.length;
13648 }
13649 auto& r = lo.r;
13650 txc->statfs_delta.stored() -= lo.e.length;
13651 if (!r.empty()) {
13652 dout(20) << __func__ << " blob release " << r << dendl;
13653 if (blob.is_shared()) {
13654 PExtentVector final;
13655 c->load_shared_blob(b->shared_blob);
11fdf7f2
TL
13656 bool unshare = false;
13657 bool* unshare_ptr =
13658 !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
7c673cae 13659 for (auto e : r) {
31f18b77
FG
13660 b->shared_blob->put_ref(
13661 e.offset, e.length, &final,
11fdf7f2
TL
13662 unshare_ptr);
13663 }
13664 if (unshare) {
13665 ceph_assert(maybe_unshared_blobs);
13666 maybe_unshared_blobs->insert(b->shared_blob.get());
7c673cae
FG
13667 }
13668 dout(20) << __func__ << " shared_blob release " << final
13669 << " from " << *b->shared_blob << dendl;
13670 txc->write_shared_blob(b->shared_blob);
13671 r.clear();
13672 r.swap(final);
13673 }
13674 }
13675 // we can't invalidate our logical extents as we drop them because
13676 // other lextents (either in our onode or others) may still
13677 // reference them. but we can throw out anything that is no
13678 // longer allocated. Note that this will leave behind edge bits
13679 // that are no longer referenced but not deallocated (until they
13680 // age out of the cache naturally).
13681 b->discard_unallocated(c.get());
13682 for (auto e : r) {
13683 dout(20) << __func__ << " release " << e << dendl;
13684 txc->released.insert(e.offset, e.length);
13685 txc->statfs_delta.allocated() -= e.length;
13686 if (blob.is_compressed()) {
13687 txc->statfs_delta.compressed_allocated() -= e.length;
13688 }
13689 }
9f95a23c
TL
13690
13691 if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
7c673cae
FG
13692 dout(20) << __func__ << " spanning_blob_map removing empty " << *b
13693 << dendl;
13694 o->extent_map.spanning_blob_map.erase(b->id);
13695 }
9f95a23c 13696 delete &lo;
7c673cae
FG
13697 }
13698}
13699
13700void BlueStore::_do_write_data(
13701 TransContext *txc,
13702 CollectionRef& c,
13703 OnodeRef o,
13704 uint64_t offset,
13705 uint64_t length,
13706 bufferlist& bl,
13707 WriteContext *wctx)
13708{
13709 uint64_t end = offset + length;
13710 bufferlist::iterator p = bl.begin();
13711
13712 if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
13713 (length != min_alloc_size)) {
13714 // we fall within the same block
13715 _do_write_small(txc, c, o, offset, length, p, wctx);
13716 } else {
13717 uint64_t head_offset, head_length;
13718 uint64_t middle_offset, middle_length;
13719 uint64_t tail_offset, tail_length;
13720
13721 head_offset = offset;
11fdf7f2 13722 head_length = p2nphase(offset, min_alloc_size);
7c673cae 13723
11fdf7f2
TL
13724 tail_offset = p2align(end, min_alloc_size);
13725 tail_length = p2phase(end, min_alloc_size);
7c673cae
FG
13726
13727 middle_offset = head_offset + head_length;
13728 middle_length = length - head_length - tail_length;
13729
13730 if (head_length) {
13731 _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
13732 }
13733
13734 if (middle_length) {
13735 _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
13736 }
13737
13738 if (tail_length) {
13739 _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
13740 }
13741 }
13742}
13743
31f18b77
FG
13744void BlueStore::_choose_write_options(
13745 CollectionRef& c,
13746 OnodeRef o,
13747 uint32_t fadvise_flags,
13748 WriteContext *wctx)
7c673cae 13749{
7c673cae
FG
13750 if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
13751 dout(20) << __func__ << " will do buffered write" << dendl;
31f18b77 13752 wctx->buffered = true;
7c673cae
FG
13753 } else if (cct->_conf->bluestore_default_buffered_write &&
13754 (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
13755 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
13756 dout(20) << __func__ << " defaulting to buffered write" << dendl;
31f18b77 13757 wctx->buffered = true;
7c673cae
FG
13758 }
13759
31f18b77
FG
13760 // apply basic csum block size
13761 wctx->csum_order = block_size_order;
7c673cae
FG
13762
13763 // compression parameters
13764 unsigned alloc_hints = o->onode.alloc_hint_flags;
13765 auto cm = select_option(
13766 "compression_mode",
31f18b77 13767 comp_mode.load(),
7c673cae
FG
13768 [&]() {
13769 string val;
11fdf7f2 13770 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
31f18b77
FG
13771 return boost::optional<Compressor::CompressionMode>(
13772 Compressor::get_comp_mode_type(val));
7c673cae
FG
13773 }
13774 return boost::optional<Compressor::CompressionMode>();
13775 }
13776 );
31f18b77
FG
13777
13778 wctx->compress = (cm != Compressor::COMP_NONE) &&
7c673cae
FG
13779 ((cm == Compressor::COMP_FORCE) ||
13780 (cm == Compressor::COMP_AGGRESSIVE &&
13781 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
13782 (cm == Compressor::COMP_PASSIVE &&
13783 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
13784
13785 if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
13786 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
31f18b77
FG
13787 (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
13788 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
7c673cae 13789 (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
31f18b77 13790
7c673cae 13791 dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
31f18b77 13792
7c673cae 13793 if (o->onode.expected_write_size) {
224ce89b 13794 wctx->csum_order = std::max(min_alloc_size_order,
31f18b77 13795 (uint8_t)ctz(o->onode.expected_write_size));
7c673cae 13796 } else {
224ce89b 13797 wctx->csum_order = min_alloc_size_order;
7c673cae
FG
13798 }
13799
31f18b77
FG
13800 if (wctx->compress) {
13801 wctx->target_blob_size = select_option(
7c673cae 13802 "compression_max_blob_size",
31f18b77 13803 comp_max_blob_size.load(),
7c673cae 13804 [&]() {
11fdf7f2
TL
13805 int64_t val;
13806 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
7c673cae
FG
13807 return boost::optional<uint64_t>((uint64_t)val);
13808 }
13809 return boost::optional<uint64_t>();
13810 }
13811 );
13812 }
13813 } else {
31f18b77
FG
13814 if (wctx->compress) {
13815 wctx->target_blob_size = select_option(
7c673cae 13816 "compression_min_blob_size",
31f18b77 13817 comp_min_blob_size.load(),
7c673cae 13818 [&]() {
11fdf7f2
TL
13819 int64_t val;
13820 if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
7c673cae
FG
13821 return boost::optional<uint64_t>((uint64_t)val);
13822 }
13823 return boost::optional<uint64_t>();
13824 }
13825 );
13826 }
13827 }
31f18b77 13828
7c673cae 13829 uint64_t max_bsize = max_blob_size.load();
31f18b77
FG
13830 if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
13831 wctx->target_blob_size = max_bsize;
7c673cae 13832 }
31f18b77 13833
7c673cae
FG
13834 // set the min blob size floor at 2x the min_alloc_size, or else we
13835 // won't be able to allocate a smaller extent for the compressed
13836 // data.
31f18b77
FG
13837 if (wctx->compress &&
13838 wctx->target_blob_size < min_alloc_size * 2) {
13839 wctx->target_blob_size = min_alloc_size * 2;
7c673cae 13840 }
31f18b77
FG
13841
13842 dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
13843 << " target_blob_size 0x" << std::hex << wctx->target_blob_size
f64942e4
AA
13844 << " compress=" << (int)wctx->compress
13845 << " buffered=" << (int)wctx->buffered
31f18b77
FG
13846 << std::dec << dendl;
13847}
13848
13849int BlueStore::_do_gc(
13850 TransContext *txc,
13851 CollectionRef& c,
13852 OnodeRef o,
31f18b77
FG
13853 const WriteContext& wctx,
13854 uint64_t *dirty_start,
13855 uint64_t *dirty_end)
13856{
31f18b77 13857
1adf2230 13858 bool dirty_range_updated = false;
31f18b77 13859 WriteContext wctx_gc;
7c673cae 13860 wctx_gc.fork(wctx); // make a clone for garbage collection
7c673cae 13861
eafe8130 13862 auto & extents_to_collect = wctx.extents_to_gc;
31f18b77
FG
13863 for (auto it = extents_to_collect.begin();
13864 it != extents_to_collect.end();
13865 ++it) {
13866 bufferlist bl;
eafe8130
TL
13867 auto offset = (*it).first;
13868 auto length = (*it).second;
13869 dout(20) << __func__ << " processing " << std::hex
13870 << offset << "~" << length << std::dec
13871 << dendl;
13872 int r = _do_read(c.get(), o, offset, length, bl, 0);
13873 ceph_assert(r == (int)length);
31f18b77 13874
eafe8130
TL
13875 _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
13876 logger->inc(l_bluestore_gc_merged, length);
31f18b77 13877
eafe8130
TL
13878 if (*dirty_start > offset) {
13879 *dirty_start = offset;
1adf2230 13880 dirty_range_updated = true;
31f18b77
FG
13881 }
13882
eafe8130
TL
13883 if (*dirty_end < offset + length) {
13884 *dirty_end = offset + length;
1adf2230 13885 dirty_range_updated = true;
31f18b77
FG
13886 }
13887 }
1adf2230
AA
13888 if (dirty_range_updated) {
13889 o->extent_map.fault_range(db, *dirty_start, *dirty_end);
13890 }
31f18b77
FG
13891
13892 dout(30) << __func__ << " alloc write" << dendl;
13893 int r = _do_alloc_write(txc, c, o, &wctx_gc);
13894 if (r < 0) {
13895 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
13896 << dendl;
13897 return r;
13898 }
13899
13900 _wctx_finish(txc, c, o, &wctx_gc);
13901 return 0;
13902}
13903
13904int BlueStore::_do_write(
13905 TransContext *txc,
13906 CollectionRef& c,
13907 OnodeRef o,
13908 uint64_t offset,
13909 uint64_t length,
13910 bufferlist& bl,
13911 uint32_t fadvise_flags)
13912{
13913 int r = 0;
13914
13915 dout(20) << __func__
13916 << " " << o->oid
13917 << " 0x" << std::hex << offset << "~" << length
13918 << " - have 0x" << o->onode.size
13919 << " (" << std::dec << o->onode.size << ")"
13920 << " bytes"
13921 << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
13922 << dendl;
81eedcae 13923 _dump_onode<30>(cct, *o);
31f18b77
FG
13924
13925 if (length == 0) {
13926 return 0;
13927 }
13928
13929 uint64_t end = offset + length;
13930
13931 GarbageCollector gc(c->store->cct);
eafe8130 13932 int64_t benefit = 0;
31f18b77
FG
13933 auto dirty_start = offset;
13934 auto dirty_end = end;
13935
13936 WriteContext wctx;
13937 _choose_write_options(c, o, fadvise_flags, &wctx);
7c673cae
FG
13938 o->extent_map.fault_range(db, offset, length);
13939 _do_write_data(txc, c, o, offset, length, bl, &wctx);
7c673cae
FG
13940 r = _do_alloc_write(txc, c, o, &wctx);
13941 if (r < 0) {
13942 derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
13943 << dendl;
13944 goto out;
13945 }
13946
eafe8130
TL
13947 if (wctx.extents_to_gc.empty() ||
13948 wctx.extents_to_gc.range_start() > offset ||
13949 wctx.extents_to_gc.range_end() < offset + length) {
13950 benefit = gc.estimate(offset,
13951 length,
13952 o->extent_map,
13953 wctx.old_extents,
13954 min_alloc_size);
13955 }
13956
31f18b77
FG
13957 // NB: _wctx_finish() will empty old_extents
13958 // so we must do gc estimation before that
7c673cae
FG
13959 _wctx_finish(txc, c, o, &wctx);
13960 if (end > o->onode.size) {
13961 dout(20) << __func__ << " extending size to 0x" << std::hex << end
31f18b77 13962 << std::dec << dendl;
7c673cae
FG
13963 o->onode.size = end;
13964 }
13965
11fdf7f2 13966 if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
eafe8130
TL
13967 wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
13968 dout(20) << __func__
13969 << " perform garbage collection for compressed extents, "
13970 << "expected benefit = " << benefit << " AUs" << dendl;
13971 }
13972 if (!wctx.extents_to_gc.empty()) {
13973 dout(20) << __func__ << " perform garbage collection" << dendl;
13974
13975 r = _do_gc(txc, c, o,
13976 wctx,
13977 &dirty_start, &dirty_end);
13978 if (r < 0) {
13979 derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
13980 << dendl;
13981 goto out;
7c673cae 13982 }
eafe8130
TL
13983 dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
13984 << "~" << dirty_end - dirty_start << std::dec << dendl;
7c673cae 13985 }
7c673cae 13986 o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
31f18b77
FG
13987 o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
13988
7c673cae
FG
13989 r = 0;
13990
13991 out:
13992 return r;
13993}
13994
13995int BlueStore::_write(TransContext *txc,
13996 CollectionRef& c,
13997 OnodeRef& o,
31f18b77
FG
13998 uint64_t offset, size_t length,
13999 bufferlist& bl,
14000 uint32_t fadvise_flags)
7c673cae
FG
14001{
14002 dout(15) << __func__ << " " << c->cid << " " << o->oid
14003 << " 0x" << std::hex << offset << "~" << length << std::dec
14004 << dendl;
35e4c445
FG
14005 int r = 0;
14006 if (offset + length >= OBJECT_MAX_SIZE) {
14007 r = -E2BIG;
14008 } else {
14009 _assign_nid(txc, o);
14010 r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
14011 txc->write_onode(o);
14012 }
7c673cae
FG
14013 dout(10) << __func__ << " " << c->cid << " " << o->oid
14014 << " 0x" << std::hex << offset << "~" << length << std::dec
14015 << " = " << r << dendl;
14016 return r;
14017}
14018
14019int BlueStore::_zero(TransContext *txc,
14020 CollectionRef& c,
14021 OnodeRef& o,
14022 uint64_t offset, size_t length)
14023{
14024 dout(15) << __func__ << " " << c->cid << " " << o->oid
14025 << " 0x" << std::hex << offset << "~" << length << std::dec
14026 << dendl;
35e4c445
FG
14027 int r = 0;
14028 if (offset + length >= OBJECT_MAX_SIZE) {
14029 r = -E2BIG;
14030 } else {
14031 _assign_nid(txc, o);
14032 r = _do_zero(txc, c, o, offset, length);
14033 }
7c673cae
FG
14034 dout(10) << __func__ << " " << c->cid << " " << o->oid
14035 << " 0x" << std::hex << offset << "~" << length << std::dec
14036 << " = " << r << dendl;
14037 return r;
14038}
14039
14040int BlueStore::_do_zero(TransContext *txc,
14041 CollectionRef& c,
14042 OnodeRef& o,
14043 uint64_t offset, size_t length)
14044{
14045 dout(15) << __func__ << " " << c->cid << " " << o->oid
14046 << " 0x" << std::hex << offset << "~" << length << std::dec
14047 << dendl;
14048 int r = 0;
14049
81eedcae 14050 _dump_onode<30>(cct, *o);
7c673cae
FG
14051
14052 WriteContext wctx;
14053 o->extent_map.fault_range(db, offset, length);
14054 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77 14055 o->extent_map.dirty_range(offset, length);
7c673cae
FG
14056 _wctx_finish(txc, c, o, &wctx);
14057
b32b8144 14058 if (length > 0 && offset + length > o->onode.size) {
7c673cae
FG
14059 o->onode.size = offset + length;
14060 dout(20) << __func__ << " extending size to " << offset + length
14061 << dendl;
14062 }
14063 txc->write_onode(o);
14064
14065 dout(10) << __func__ << " " << c->cid << " " << o->oid
14066 << " 0x" << std::hex << offset << "~" << length << std::dec
14067 << " = " << r << dendl;
14068 return r;
14069}
14070
14071void BlueStore::_do_truncate(
31f18b77
FG
14072 TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
14073 set<SharedBlob*> *maybe_unshared_blobs)
7c673cae
FG
14074{
14075 dout(15) << __func__ << " " << c->cid << " " << o->oid
14076 << " 0x" << std::hex << offset << std::dec << dendl;
14077
81eedcae 14078 _dump_onode<30>(cct, *o);
7c673cae
FG
14079
14080 if (offset == o->onode.size)
31f18b77 14081 return;
7c673cae
FG
14082
14083 if (offset < o->onode.size) {
14084 WriteContext wctx;
14085 uint64_t length = o->onode.size - offset;
14086 o->extent_map.fault_range(db, offset, length);
14087 o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
31f18b77
FG
14088 o->extent_map.dirty_range(offset, length);
14089 _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
7c673cae
FG
14090
14091 // if we have shards past EOF, ask for a reshard
14092 if (!o->onode.extent_map_shards.empty() &&
14093 o->onode.extent_map_shards.back().offset >= offset) {
14094 dout(10) << __func__ << " request reshard past EOF" << dendl;
14095 if (offset) {
14096 o->extent_map.request_reshard(offset - 1, offset + length);
14097 } else {
14098 o->extent_map.request_reshard(0, length);
14099 }
14100 }
14101 }
14102
14103 o->onode.size = offset;
14104
14105 txc->write_onode(o);
14106}
14107
35e4c445 14108int BlueStore::_truncate(TransContext *txc,
7c673cae
FG
14109 CollectionRef& c,
14110 OnodeRef& o,
14111 uint64_t offset)
14112{
14113 dout(15) << __func__ << " " << c->cid << " " << o->oid
14114 << " 0x" << std::hex << offset << std::dec
14115 << dendl;
35e4c445
FG
14116 int r = 0;
14117 if (offset >= OBJECT_MAX_SIZE) {
14118 r = -E2BIG;
14119 } else {
14120 _do_truncate(txc, c, o, offset);
14121 }
14122 dout(10) << __func__ << " " << c->cid << " " << o->oid
14123 << " 0x" << std::hex << offset << std::dec
14124 << " = " << r << dendl;
14125 return r;
7c673cae
FG
14126}
14127
14128int BlueStore::_do_remove(
14129 TransContext *txc,
14130 CollectionRef& c,
14131 OnodeRef o)
14132{
31f18b77 14133 set<SharedBlob*> maybe_unshared_blobs;
224ce89b
WB
14134 bool is_gen = !o->oid.is_no_gen();
14135 _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
7c673cae
FG
14136 if (o->onode.has_omap()) {
14137 o->flush();
9f95a23c 14138 _do_omap_clear(txc, o);
7c673cae
FG
14139 }
14140 o->exists = false;
14141 string key;
14142 for (auto &s : o->extent_map.shards) {
14143 dout(20) << __func__ << " removing shard 0x" << std::hex
14144 << s.shard_info->offset << std::dec << dendl;
14145 generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
14146 [&](const string& final_key) {
14147 txc->t->rmkey(PREFIX_OBJ, final_key);
14148 }
14149 );
14150 }
14151 txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
a8e16298 14152 txc->note_removed_object(o);
7c673cae
FG
14153 o->extent_map.clear();
14154 o->onode = bluestore_onode_t();
14155 _debug_obj_on_delete(o->oid);
31f18b77 14156
224ce89b
WB
14157 if (!is_gen || maybe_unshared_blobs.empty()) {
14158 return 0;
14159 }
31f18b77 14160
224ce89b
WB
14161 // see if we can unshare blobs still referenced by the head
14162 dout(10) << __func__ << " gen and maybe_unshared_blobs "
14163 << maybe_unshared_blobs << dendl;
14164 ghobject_t nogen = o->oid;
14165 nogen.generation = ghobject_t::NO_GEN;
14166 OnodeRef h = c->onode_map.lookup(nogen);
14167
14168 if (!h || !h->exists) {
14169 return 0;
14170 }
14171
14172 dout(20) << __func__ << " checking for unshareable blobs on " << h
14173 << " " << h->oid << dendl;
14174 map<SharedBlob*,bluestore_extent_ref_map_t> expect;
14175 for (auto& e : h->extent_map.extent_map) {
14176 const bluestore_blob_t& b = e.blob->get_blob();
14177 SharedBlob *sb = e.blob->shared_blob.get();
14178 if (b.is_shared() &&
14179 sb->loaded &&
14180 maybe_unshared_blobs.count(sb)) {
3efd9988
FG
14181 if (b.is_compressed()) {
14182 expect[sb].get(0, b.get_ondisk_length());
14183 } else {
14184 b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
14185 expect[sb].get(off, len);
14186 return 0;
14187 });
14188 }
224ce89b
WB
14189 }
14190 }
31f18b77 14191
224ce89b
WB
14192 vector<SharedBlob*> unshared_blobs;
14193 unshared_blobs.reserve(maybe_unshared_blobs.size());
14194 for (auto& p : expect) {
14195 dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
14196 if (p.first->persistent->ref_map == p.second) {
14197 SharedBlob *sb = p.first;
14198 dout(20) << __func__ << " unsharing " << *sb << dendl;
14199 unshared_blobs.push_back(sb);
14200 txc->unshare_blob(sb);
14201 uint64_t sbid = c->make_blob_unshared(sb);
14202 string key;
14203 get_shared_blob_key(sbid, &key);
14204 txc->t->rmkey(PREFIX_SHARED_BLOB, key);
14205 }
14206 }
14207
14208 if (unshared_blobs.empty()) {
14209 return 0;
14210 }
14211
224ce89b
WB
14212 for (auto& e : h->extent_map.extent_map) {
14213 const bluestore_blob_t& b = e.blob->get_blob();
14214 SharedBlob *sb = e.blob->shared_blob.get();
14215 if (b.is_shared() &&
14216 std::find(unshared_blobs.begin(), unshared_blobs.end(),
14217 sb) != unshared_blobs.end()) {
14218 dout(20) << __func__ << " unsharing " << e << dendl;
14219 bluestore_blob_t& blob = e.blob->dirty_blob();
14220 blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
d2e6a577 14221 h->extent_map.dirty_range(e.logical_offset, 1);
31f18b77
FG
14222 }
14223 }
224ce89b
WB
14224 txc->write_onode(h);
14225
7c673cae
FG
14226 return 0;
14227}
14228
14229int BlueStore::_remove(TransContext *txc,
14230 CollectionRef& c,
14231 OnodeRef &o)
14232{
11fdf7f2
TL
14233 dout(15) << __func__ << " " << c->cid << " " << o->oid
14234 << " onode " << o.get()
14235 << " txc "<< txc << dendl;
7c673cae
FG
14236 int r = _do_remove(txc, c, o);
14237 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14238 return r;
14239}
14240
14241int BlueStore::_setattr(TransContext *txc,
14242 CollectionRef& c,
14243 OnodeRef& o,
14244 const string& name,
14245 bufferptr& val)
14246{
14247 dout(15) << __func__ << " " << c->cid << " " << o->oid
14248 << " " << name << " (" << val.length() << " bytes)"
14249 << dendl;
14250 int r = 0;
3efd9988
FG
14251 if (val.is_partial()) {
14252 auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
14253 val.length());
14254 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14255 } else {
14256 auto& b = o->onode.attrs[name.c_str()] = val;
14257 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14258 }
7c673cae
FG
14259 txc->write_onode(o);
14260 dout(10) << __func__ << " " << c->cid << " " << o->oid
14261 << " " << name << " (" << val.length() << " bytes)"
14262 << " = " << r << dendl;
14263 return r;
14264}
14265
14266int BlueStore::_setattrs(TransContext *txc,
14267 CollectionRef& c,
14268 OnodeRef& o,
14269 const map<string,bufferptr>& aset)
14270{
14271 dout(15) << __func__ << " " << c->cid << " " << o->oid
14272 << " " << aset.size() << " keys"
14273 << dendl;
14274 int r = 0;
14275 for (map<string,bufferptr>::const_iterator p = aset.begin();
14276 p != aset.end(); ++p) {
3efd9988
FG
14277 if (p->second.is_partial()) {
14278 auto& b = o->onode.attrs[p->first.c_str()] =
7c673cae 14279 bufferptr(p->second.c_str(), p->second.length());
3efd9988
FG
14280 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14281 } else {
14282 auto& b = o->onode.attrs[p->first.c_str()] = p->second;
14283 b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
14284 }
7c673cae
FG
14285 }
14286 txc->write_onode(o);
14287 dout(10) << __func__ << " " << c->cid << " " << o->oid
14288 << " " << aset.size() << " keys"
14289 << " = " << r << dendl;
14290 return r;
14291}
14292
14293
14294int BlueStore::_rmattr(TransContext *txc,
14295 CollectionRef& c,
14296 OnodeRef& o,
14297 const string& name)
14298{
14299 dout(15) << __func__ << " " << c->cid << " " << o->oid
14300 << " " << name << dendl;
14301 int r = 0;
14302 auto it = o->onode.attrs.find(name.c_str());
14303 if (it == o->onode.attrs.end())
14304 goto out;
14305
14306 o->onode.attrs.erase(it);
14307 txc->write_onode(o);
14308
14309 out:
14310 dout(10) << __func__ << " " << c->cid << " " << o->oid
14311 << " " << name << " = " << r << dendl;
14312 return r;
14313}
14314
14315int BlueStore::_rmattrs(TransContext *txc,
14316 CollectionRef& c,
14317 OnodeRef& o)
14318{
14319 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14320 int r = 0;
14321
14322 if (o->onode.attrs.empty())
14323 goto out;
14324
14325 o->onode.attrs.clear();
14326 txc->write_onode(o);
14327
14328 out:
14329 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14330 return r;
14331}
14332
9f95a23c 14333void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
7c673cae 14334{
9f95a23c 14335 const string& omap_prefix = o->get_omap_prefix();
7c673cae 14336 string prefix, tail;
9f95a23c
TL
14337 o->get_omap_header(&prefix);
14338 o->get_omap_tail(&tail);
11fdf7f2 14339 txc->t->rm_range_keys(omap_prefix, prefix, tail);
494da23a 14340 txc->t->rmkey(omap_prefix, tail);
11fdf7f2
TL
14341 dout(20) << __func__ << " remove range start: "
14342 << pretty_binary_string(prefix) << " end: "
14343 << pretty_binary_string(tail) << dendl;
7c673cae
FG
14344}
14345
14346int BlueStore::_omap_clear(TransContext *txc,
14347 CollectionRef& c,
14348 OnodeRef& o)
14349{
14350 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14351 int r = 0;
14352 if (o->onode.has_omap()) {
14353 o->flush();
9f95a23c 14354 _do_omap_clear(txc, o);
7c673cae
FG
14355 o->onode.clear_omap_flag();
14356 txc->write_onode(o);
14357 }
14358 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14359 return r;
14360}
14361
14362int BlueStore::_omap_setkeys(TransContext *txc,
14363 CollectionRef& c,
14364 OnodeRef& o,
14365 bufferlist &bl)
14366{
14367 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14368 int r;
11fdf7f2 14369 auto p = bl.cbegin();
7c673cae
FG
14370 __u32 num;
14371 if (!o->onode.has_omap()) {
11fdf7f2 14372 if (o->oid.is_pgmeta()) {
9f95a23c
TL
14373 o->onode.set_omap_flags_pgmeta();
14374 } else {
14375 o->onode.set_omap_flags();
11fdf7f2 14376 }
7c673cae 14377 txc->write_onode(o);
494da23a 14378
9f95a23c 14379 const string& prefix = o->get_omap_prefix();
494da23a
TL
14380 string key_tail;
14381 bufferlist tail;
9f95a23c 14382 o->get_omap_tail(&key_tail);
494da23a 14383 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
14384 } else {
14385 txc->note_modified_object(o);
14386 }
9f95a23c 14387 const string& prefix = o->get_omap_prefix();
7c673cae 14388 string final_key;
9f95a23c
TL
14389 o->get_omap_key(string(), &final_key);
14390 size_t base_key_len = final_key.size();
11fdf7f2 14391 decode(num, p);
7c673cae
FG
14392 while (num--) {
14393 string key;
14394 bufferlist value;
11fdf7f2
TL
14395 decode(key, p);
14396 decode(value, p);
9f95a23c 14397 final_key.resize(base_key_len); // keep prefix
7c673cae 14398 final_key += key;
11fdf7f2 14399 dout(20) << __func__ << " " << pretty_binary_string(final_key)
7c673cae 14400 << " <- " << key << dendl;
11fdf7f2 14401 txc->t->set(prefix, final_key, value);
7c673cae
FG
14402 }
14403 r = 0;
14404 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14405 return r;
14406}
14407
14408int BlueStore::_omap_setheader(TransContext *txc,
14409 CollectionRef& c,
14410 OnodeRef &o,
14411 bufferlist& bl)
14412{
14413 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14414 int r;
14415 string key;
14416 if (!o->onode.has_omap()) {
11fdf7f2 14417 if (o->oid.is_pgmeta()) {
9f95a23c
TL
14418 o->onode.set_omap_flags_pgmeta();
14419 } else {
14420 o->onode.set_omap_flags();
11fdf7f2 14421 }
7c673cae 14422 txc->write_onode(o);
494da23a 14423
9f95a23c 14424 const string& prefix = o->get_omap_prefix();
494da23a
TL
14425 string key_tail;
14426 bufferlist tail;
9f95a23c 14427 o->get_omap_tail(&key_tail);
494da23a 14428 txc->t->set(prefix, key_tail, tail);
7c673cae
FG
14429 } else {
14430 txc->note_modified_object(o);
14431 }
9f95a23c
TL
14432 const string& prefix = o->get_omap_prefix();
14433 o->get_omap_header(&key);
11fdf7f2 14434 txc->t->set(prefix, key, bl);
7c673cae
FG
14435 r = 0;
14436 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14437 return r;
14438}
14439
14440int BlueStore::_omap_rmkeys(TransContext *txc,
14441 CollectionRef& c,
14442 OnodeRef& o,
14443 bufferlist& bl)
14444{
14445 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
14446 int r = 0;
11fdf7f2 14447 auto p = bl.cbegin();
7c673cae
FG
14448 __u32 num;
14449 string final_key;
14450
14451 if (!o->onode.has_omap()) {
14452 goto out;
14453 }
11fdf7f2 14454 {
9f95a23c
TL
14455 const string& prefix = o->get_omap_prefix();
14456 o->get_omap_key(string(), &final_key);
14457 size_t base_key_len = final_key.size();
11fdf7f2
TL
14458 decode(num, p);
14459 while (num--) {
14460 string key;
14461 decode(key, p);
9f95a23c 14462 final_key.resize(base_key_len); // keep prefix
11fdf7f2
TL
14463 final_key += key;
14464 dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
14465 << " <- " << key << dendl;
14466 txc->t->rmkey(prefix, final_key);
14467 }
7c673cae
FG
14468 }
14469 txc->note_modified_object(o);
14470
14471 out:
14472 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14473 return r;
14474}
14475
14476int BlueStore::_omap_rmkey_range(TransContext *txc,
14477 CollectionRef& c,
14478 OnodeRef& o,
14479 const string& first, const string& last)
14480{
14481 dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
7c673cae
FG
14482 string key_first, key_last;
14483 int r = 0;
14484 if (!o->onode.has_omap()) {
14485 goto out;
14486 }
11fdf7f2 14487 {
9f95a23c 14488 const string& prefix = o->get_omap_prefix();
11fdf7f2 14489 o->flush();
9f95a23c
TL
14490 o->get_omap_key(first, &key_first);
14491 o->get_omap_key(last, &key_last);
11fdf7f2
TL
14492 txc->t->rm_range_keys(prefix, key_first, key_last);
14493 dout(20) << __func__ << " remove range start: "
14494 << pretty_binary_string(key_first) << " end: "
14495 << pretty_binary_string(key_last) << dendl;
7c673cae
FG
14496 }
14497 txc->note_modified_object(o);
14498
14499 out:
14500 dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
14501 return r;
14502}
14503
14504int BlueStore::_set_alloc_hint(
14505 TransContext *txc,
14506 CollectionRef& c,
14507 OnodeRef& o,
14508 uint64_t expected_object_size,
14509 uint64_t expected_write_size,
14510 uint32_t flags)
14511{
14512 dout(15) << __func__ << " " << c->cid << " " << o->oid
14513 << " object_size " << expected_object_size
14514 << " write_size " << expected_write_size
14515 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14516 << dendl;
14517 int r = 0;
14518 o->onode.expected_object_size = expected_object_size;
14519 o->onode.expected_write_size = expected_write_size;
14520 o->onode.alloc_hint_flags = flags;
14521 txc->write_onode(o);
14522 dout(10) << __func__ << " " << c->cid << " " << o->oid
14523 << " object_size " << expected_object_size
14524 << " write_size " << expected_write_size
14525 << " flags " << ceph_osd_alloc_hint_flag_string(flags)
14526 << " = " << r << dendl;
14527 return r;
14528}
14529
14530int BlueStore::_clone(TransContext *txc,
14531 CollectionRef& c,
14532 OnodeRef& oldo,
14533 OnodeRef& newo)
14534{
14535 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14536 << newo->oid << dendl;
14537 int r = 0;
14538 if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
14539 derr << __func__ << " mismatched hash on " << oldo->oid
14540 << " and " << newo->oid << dendl;
14541 return -EINVAL;
14542 }
14543
7c673cae
FG
14544 _assign_nid(txc, newo);
14545
14546 // clone data
14547 oldo->flush();
14548 _do_truncate(txc, c, newo, 0);
14549 if (cct->_conf->bluestore_clone_cow) {
14550 _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
14551 } else {
14552 bufferlist bl;
14553 r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
14554 if (r < 0)
14555 goto out;
14556 r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
14557 if (r < 0)
14558 goto out;
14559 }
14560
14561 // clone attrs
14562 newo->onode.attrs = oldo->onode.attrs;
14563
14564 // clone omap
14565 if (newo->onode.has_omap()) {
14566 dout(20) << __func__ << " clearing old omap data" << dendl;
14567 newo->flush();
9f95a23c 14568 _do_omap_clear(txc, newo);
494da23a 14569 newo->onode.clear_omap_flag();
7c673cae
FG
14570 }
14571 if (oldo->onode.has_omap()) {
14572 dout(20) << __func__ << " copying omap data" << dendl;
494da23a 14573 if (newo->oid.is_pgmeta()) {
9f95a23c
TL
14574 newo->onode.set_omap_flags_pgmeta();
14575 } else {
14576 newo->onode.set_omap_flags();
7c673cae 14577 }
9f95a23c 14578 const string& prefix = newo->get_omap_prefix();
11fdf7f2 14579 KeyValueDB::Iterator it = db->get_iterator(prefix);
7c673cae 14580 string head, tail;
9f95a23c
TL
14581 oldo->get_omap_header(&head);
14582 oldo->get_omap_tail(&tail);
7c673cae
FG
14583 it->lower_bound(head);
14584 while (it->valid()) {
14585 if (it->key() >= tail) {
14586 dout(30) << __func__ << " reached tail" << dendl;
14587 break;
14588 } else {
14589 dout(30) << __func__ << " got header/data "
14590 << pretty_binary_string(it->key()) << dendl;
14591 string key;
9f95a23c 14592 newo->rewrite_omap_key(it->key(), &key);
11fdf7f2 14593 txc->t->set(prefix, key, it->value());
7c673cae
FG
14594 }
14595 it->next();
14596 }
494da23a
TL
14597 string new_tail;
14598 bufferlist new_tail_value;
9f95a23c 14599 newo->get_omap_tail(&new_tail);
494da23a 14600 txc->t->set(prefix, new_tail, new_tail_value);
7c673cae
FG
14601 }
14602
14603 txc->write_onode(newo);
14604 r = 0;
14605
14606 out:
14607 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14608 << newo->oid << " = " << r << dendl;
14609 return r;
14610}
14611
14612int BlueStore::_do_clone_range(
14613 TransContext *txc,
14614 CollectionRef& c,
14615 OnodeRef& oldo,
14616 OnodeRef& newo,
224ce89b
WB
14617 uint64_t srcoff,
14618 uint64_t length,
14619 uint64_t dstoff)
7c673cae
FG
14620{
14621 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14622 << newo->oid
14623 << " 0x" << std::hex << srcoff << "~" << length << " -> "
14624 << " 0x" << dstoff << "~" << length << std::dec << dendl;
14625 oldo->extent_map.fault_range(db, srcoff, length);
14626 newo->extent_map.fault_range(db, dstoff, length);
81eedcae
TL
14627 _dump_onode<30>(cct, *oldo);
14628 _dump_onode<30>(cct, *newo);
7c673cae 14629
11fdf7f2 14630 oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
81eedcae
TL
14631 _dump_onode<30>(cct, *oldo);
14632 _dump_onode<30>(cct, *newo);
7c673cae
FG
14633 return 0;
14634}
14635
14636int BlueStore::_clone_range(TransContext *txc,
14637 CollectionRef& c,
14638 OnodeRef& oldo,
14639 OnodeRef& newo,
14640 uint64_t srcoff, uint64_t length, uint64_t dstoff)
14641{
14642 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14643 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14644 << " to offset 0x" << dstoff << std::dec << dendl;
14645 int r = 0;
14646
35e4c445
FG
14647 if (srcoff + length >= OBJECT_MAX_SIZE ||
14648 dstoff + length >= OBJECT_MAX_SIZE) {
14649 r = -E2BIG;
14650 goto out;
14651 }
7c673cae
FG
14652 if (srcoff + length > oldo->onode.size) {
14653 r = -EINVAL;
14654 goto out;
14655 }
14656
7c673cae
FG
14657 _assign_nid(txc, newo);
14658
14659 if (length > 0) {
14660 if (cct->_conf->bluestore_clone_cow) {
14661 _do_zero(txc, c, newo, dstoff, length);
14662 _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
14663 } else {
14664 bufferlist bl;
14665 r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
14666 if (r < 0)
14667 goto out;
14668 r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
14669 if (r < 0)
14670 goto out;
14671 }
14672 }
14673
14674 txc->write_onode(newo);
14675 r = 0;
14676
14677 out:
14678 dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14679 << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
14680 << " to offset 0x" << dstoff << std::dec
14681 << " = " << r << dendl;
14682 return r;
14683}
14684
14685int BlueStore::_rename(TransContext *txc,
14686 CollectionRef& c,
14687 OnodeRef& oldo,
14688 OnodeRef& newo,
14689 const ghobject_t& new_oid)
14690{
14691 dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
14692 << new_oid << dendl;
14693 int r;
14694 ghobject_t old_oid = oldo->oid;
31f18b77 14695 mempool::bluestore_cache_other::string new_okey;
7c673cae
FG
14696
14697 if (newo) {
14698 if (newo->exists) {
14699 r = -EEXIST;
14700 goto out;
14701 }
11fdf7f2 14702 ceph_assert(txc->onodes.count(newo) == 0);
7c673cae
FG
14703 }
14704
14705 txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
14706
14707 // rewrite shards
14708 {
14709 oldo->extent_map.fault_range(db, 0, oldo->onode.size);
14710 get_object_key(cct, new_oid, &new_okey);
14711 string key;
14712 for (auto &s : oldo->extent_map.shards) {
14713 generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
14714 [&](const string& final_key) {
14715 txc->t->rmkey(PREFIX_OBJ, final_key);
14716 }
14717 );
14718 s.dirty = true;
14719 }
14720 }
14721
14722 newo = oldo;
14723 txc->write_onode(newo);
14724
14725 // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
14726 // Onode in the old slot
14727 c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
14728 r = 0;
14729
f64942e4
AA
14730 // hold a ref to new Onode in old name position, to ensure we don't drop
14731 // it from the cache before this txc commits (or else someone may come along
14732 // and read newo's metadata via the old name).
14733 txc->note_modified_object(oldo);
14734
7c673cae
FG
14735 out:
14736 dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
14737 << new_oid << " = " << r << dendl;
14738 return r;
14739}
14740
14741// collections
14742
14743int BlueStore::_create_collection(
14744 TransContext *txc,
14745 const coll_t &cid,
14746 unsigned bits,
14747 CollectionRef *c)
14748{
14749 dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
14750 int r;
14751 bufferlist bl;
14752
14753 {
9f95a23c 14754 std::unique_lock l(coll_lock);
7c673cae
FG
14755 if (*c) {
14756 r = -EEXIST;
14757 goto out;
14758 }
11fdf7f2
TL
14759 auto p = new_coll_map.find(cid);
14760 ceph_assert(p != new_coll_map.end());
14761 *c = p->second;
7c673cae
FG
14762 (*c)->cnode.bits = bits;
14763 coll_map[cid] = *c;
11fdf7f2 14764 new_coll_map.erase(p);
7c673cae 14765 }
11fdf7f2 14766 encode((*c)->cnode, bl);
7c673cae
FG
14767 txc->t->set(PREFIX_COLL, stringify(cid), bl);
14768 r = 0;
14769
14770 out:
14771 dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
14772 return r;
14773}
14774
14775int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
14776 CollectionRef *c)
14777{
14778 dout(15) << __func__ << " " << cid << dendl;
14779 int r;
14780
11fdf7f2 14781 (*c)->flush_all_but_last();
7c673cae 14782 {
9f95a23c 14783 std::unique_lock l(coll_lock);
7c673cae
FG
14784 if (!*c) {
14785 r = -ENOENT;
14786 goto out;
14787 }
14788 size_t nonexistent_count = 0;
11fdf7f2 14789 ceph_assert((*c)->exists);
7c673cae
FG
14790 if ((*c)->onode_map.map_any([&](OnodeRef o) {
14791 if (o->exists) {
494da23a
TL
14792 dout(1) << __func__ << " " << o->oid << " " << o
14793 << " exists in onode_map" << dendl;
7c673cae
FG
14794 return true;
14795 }
14796 ++nonexistent_count;
14797 return false;
14798 })) {
14799 r = -ENOTEMPTY;
14800 goto out;
14801 }
14802
14803 vector<ghobject_t> ls;
14804 ghobject_t next;
14805 // Enumerate onodes in db, up to nonexistent_count + 1
14806 // then check if all of them are marked as non-existent.
11fdf7f2 14807 // Bypass the check if (next != ghobject_t::get_max())
7c673cae
FG
14808 r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
14809 nonexistent_count + 1, &ls, &next);
14810 if (r >= 0) {
11fdf7f2
TL
14811 // If true mean collecton has more objects than nonexistent_count,
14812 // so bypass check.
14813 bool exists = (!next.is_max());
7c673cae
FG
14814 for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
14815 dout(10) << __func__ << " oid " << *it << dendl;
14816 auto onode = (*c)->onode_map.lookup(*it);
14817 exists = !onode || onode->exists;
14818 if (exists) {
494da23a
TL
14819 dout(1) << __func__ << " " << *it
14820 << " exists in db, "
14821 << (!onode ? "not present in ram" : "present in ram")
14822 << dendl;
7c673cae
FG
14823 }
14824 }
14825 if (!exists) {
11fdf7f2 14826 _do_remove_collection(txc, c);
7c673cae
FG
14827 r = 0;
14828 } else {
14829 dout(10) << __func__ << " " << cid
14830 << " is non-empty" << dendl;
14831 r = -ENOTEMPTY;
14832 }
14833 }
14834 }
14835
14836 out:
14837 dout(10) << __func__ << " " << cid << " = " << r << dendl;
14838 return r;
14839}
14840
11fdf7f2
TL
14841void BlueStore::_do_remove_collection(TransContext *txc,
14842 CollectionRef *c)
14843{
14844 coll_map.erase((*c)->cid);
14845 txc->removed_collections.push_back(*c);
14846 (*c)->exists = false;
14847 _osr_register_zombie((*c)->osr.get());
14848 txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
14849 c->reset();
14850}
14851
7c673cae
FG
14852int BlueStore::_split_collection(TransContext *txc,
14853 CollectionRef& c,
14854 CollectionRef& d,
14855 unsigned bits, int rem)
14856{
14857 dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
14858 << " bits " << bits << dendl;
9f95a23c
TL
14859 std::unique_lock l(c->lock);
14860 std::unique_lock l2(d->lock);
7c673cae
FG
14861 int r;
14862
14863 // flush all previous deferred writes on this sequencer. this is a bit
14864 // heavyweight, but we need to make sure all deferred writes complete
14865 // before we split as the new collection's sequencer may need to order
14866 // this after those writes, and we don't bother with the complexity of
14867 // moving those TransContexts over to the new osr.
14868 _osr_drain_preceding(txc);
14869
14870 // move any cached items (onodes and referenced shared blobs) that will
14871 // belong to the child collection post-split. leave everything else behind.
14872 // this may include things that don't strictly belong to the now-smaller
14873 // parent split, but the OSD will always send us a split for every new
14874 // child.
14875
14876 spg_t pgid, dest_pgid;
14877 bool is_pg = c->cid.is_pg(&pgid);
11fdf7f2 14878 ceph_assert(is_pg);
7c673cae 14879 is_pg = d->cid.is_pg(&dest_pgid);
11fdf7f2 14880 ceph_assert(is_pg);
7c673cae
FG
14881
14882 // the destination should initially be empty.
11fdf7f2
TL
14883 ceph_assert(d->onode_map.empty());
14884 ceph_assert(d->shared_blob_set.empty());
14885 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
14886
14887 c->split_cache(d.get());
14888
14889 // adjust bits. note that this will be redundant for all but the first
14890 // split call for this parent (first child).
14891 c->cnode.bits = bits;
11fdf7f2 14892 ceph_assert(d->cnode.bits == bits);
7c673cae
FG
14893 r = 0;
14894
14895 bufferlist bl;
11fdf7f2 14896 encode(c->cnode, bl);
7c673cae
FG
14897 txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
14898
14899 dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
14900 << " bits " << bits << " = " << r << dendl;
14901 return r;
14902}
14903
11fdf7f2
TL
14904int BlueStore::_merge_collection(
14905 TransContext *txc,
14906 CollectionRef *c,
14907 CollectionRef& d,
14908 unsigned bits)
14909{
14910 dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
14911 << " bits " << bits << dendl;
9f95a23c
TL
14912 std::unique_lock l((*c)->lock);
14913 std::unique_lock l2(d->lock);
11fdf7f2
TL
14914 int r;
14915
14916 coll_t cid = (*c)->cid;
14917
14918 // flush all previous deferred writes on the source collection to ensure
14919 // that all deferred writes complete before we merge as the target collection's
14920 // sequencer may need to order new ops after those writes.
14921
14922 _osr_drain((*c)->osr.get());
14923
14924 // move any cached items (onodes and referenced shared blobs) that will
14925 // belong to the child collection post-split. leave everything else behind.
14926 // this may include things that don't strictly belong to the now-smaller
14927 // parent split, but the OSD will always send us a split for every new
14928 // child.
14929
14930 spg_t pgid, dest_pgid;
14931 bool is_pg = cid.is_pg(&pgid);
14932 ceph_assert(is_pg);
14933 is_pg = d->cid.is_pg(&dest_pgid);
14934 ceph_assert(is_pg);
14935
14936 // adjust bits. note that this will be redundant for all but the first
14937 // merge call for the parent/target.
14938 d->cnode.bits = bits;
14939
14940 // behavior depends on target (d) bits, so this after that is updated.
14941 (*c)->split_cache(d.get());
14942
14943 // remove source collection
14944 {
9f95a23c 14945 std::unique_lock l3(coll_lock);
11fdf7f2
TL
14946 _do_remove_collection(txc, c);
14947 }
14948
14949 r = 0;
14950
14951 bufferlist bl;
14952 encode(d->cnode, bl);
14953 txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
14954
14955 dout(10) << __func__ << " " << cid << " to " << d->cid << " "
14956 << " bits " << bits << " = " << r << dendl;
14957 return r;
14958}
14959
494da23a
TL
14960void BlueStore::log_latency(
14961 const char* name,
14962 int idx,
14963 const ceph::timespan& l,
14964 double lat_threshold,
14965 const char* info) const
14966{
14967 logger->tinc(idx, l);
14968 if (lat_threshold > 0.0 &&
14969 l >= make_timespan(lat_threshold)) {
14970 dout(0) << __func__ << " slow operation observed for " << name
14971 << ", latency = " << l
14972 << info
14973 << dendl;
14974 }
14975}
14976
11fdf7f2 14977void BlueStore::log_latency_fn(
494da23a 14978 const char* name,
11fdf7f2
TL
14979 int idx,
14980 const ceph::timespan& l,
494da23a
TL
14981 double lat_threshold,
14982 std::function<string (const ceph::timespan& lat)> fn) const
11fdf7f2 14983{
494da23a
TL
14984 logger->tinc(idx, l);
14985 if (lat_threshold > 0.0 &&
14986 l >= make_timespan(lat_threshold)) {
14987 dout(0) << __func__ << " slow operation observed for " << name
14988 << ", latency = " << l
14989 << fn(l)
14990 << dendl;
14991 }
11fdf7f2
TL
14992}
14993
9f95a23c
TL
14994#if defined(WITH_LTTNG)
14995void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
14996 KeyValueDB &db,
14997 TransContext &txc,
14998 mono_clock::time_point start_throttle_acquire)
14999{
15000 pending_kv_ios += txc.ios;
15001 if (txc.deferred_txn) {
15002 pending_deferred_ios += txc.ios;
15003 }
15004
15005 uint64_t started = 0;
15006 uint64_t completed = 0;
15007 if (should_trace(&started, &completed)) {
15008 txc.tracing = true;
15009 uint64_t rocksdb_base_level,
15010 rocksdb_estimate_pending_compaction_bytes,
15011 rocksdb_cur_size_all_mem_tables,
15012 rocksdb_compaction_pending,
15013 rocksdb_mem_table_flush_pending,
15014 rocksdb_num_running_compactions,
15015 rocksdb_num_running_flushes,
15016 rocksdb_actual_delayed_write_rate;
15017 db.get_property(
15018 "rocksdb.base-level",
15019 &rocksdb_base_level);
15020 db.get_property(
15021 "rocksdb.estimate-pending-compaction-bytes",
15022 &rocksdb_estimate_pending_compaction_bytes);
15023 db.get_property(
15024 "rocksdb.cur-size-all-mem-tables",
15025 &rocksdb_cur_size_all_mem_tables);
15026 db.get_property(
15027 "rocksdb.compaction-pending",
15028 &rocksdb_compaction_pending);
15029 db.get_property(
15030 "rocksdb.mem-table-flush-pending",
15031 &rocksdb_mem_table_flush_pending);
15032 db.get_property(
15033 "rocksdb.num-running-compactions",
15034 &rocksdb_num_running_compactions);
15035 db.get_property(
15036 "rocksdb.num-running-flushes",
15037 &rocksdb_num_running_flushes);
15038 db.get_property(
15039 "rocksdb.actual-delayed-write-rate",
15040 &rocksdb_actual_delayed_write_rate);
15041
15042
15043 tracepoint(
15044 bluestore,
15045 transaction_initial_state,
15046 txc.osr->get_sequencer_id(),
15047 txc.seq,
15048 throttle_bytes.get_current(),
15049 throttle_deferred_bytes.get_current(),
15050 pending_kv_ios,
15051 pending_deferred_ios,
15052 started,
15053 completed,
15054 ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
15055
15056 tracepoint(
15057 bluestore,
15058 transaction_initial_state_rocksdb,
15059 txc.osr->get_sequencer_id(),
15060 txc.seq,
15061 rocksdb_base_level,
15062 rocksdb_estimate_pending_compaction_bytes,
15063 rocksdb_cur_size_all_mem_tables,
15064 rocksdb_compaction_pending,
15065 rocksdb_mem_table_flush_pending,
15066 rocksdb_num_running_compactions,
15067 rocksdb_num_running_flushes,
15068 rocksdb_actual_delayed_write_rate);
15069 }
15070}
15071#endif
15072
15073mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
15074 TransContext &txc, PerfCounters *logger, int state)
15075{
15076 mono_clock::time_point now = mono_clock::now();
15077 mono_clock::duration lat = now - txc.last_stamp;
15078 logger->tinc(state, lat);
15079#if defined(WITH_LTTNG)
15080 if (txc.tracing &&
15081 state >= l_bluestore_state_prepare_lat &&
15082 state <= l_bluestore_state_done_lat) {
15083 OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
15084 tracepoint(
15085 bluestore,
15086 transaction_state_duration,
15087 txc.osr->get_sequencer_id(),
15088 txc.seq,
15089 state,
15090 ceph::to_seconds<double>(lat));
15091 }
15092#endif
15093 txc.last_stamp = now;
15094 return lat;
15095}
15096
15097bool BlueStore::BlueStoreThrottle::try_start_transaction(
15098 KeyValueDB &db,
15099 TransContext &txc,
15100 mono_clock::time_point start_throttle_acquire)
15101{
15102 throttle_bytes.get(txc.cost);
15103
15104 if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
15105 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15106 return true;
15107 } else {
15108 return false;
15109 }
15110}
15111
15112void BlueStore::BlueStoreThrottle::finish_start_transaction(
15113 KeyValueDB &db,
15114 TransContext &txc,
15115 mono_clock::time_point start_throttle_acquire)
15116{
15117 ceph_assert(txc.deferred_txn);
15118 throttle_deferred_bytes.get(txc.cost);
15119 emit_initial_tracepoint(db, txc, start_throttle_acquire);
15120}
15121
15122#if defined(WITH_LTTNG)
15123void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
15124{
15125 pending_kv_ios -= 1;
15126 ios_completed_since_last_traced++;
15127 if (txc.tracing) {
15128 tracepoint(
15129 bluestore,
15130 transaction_commit_latency,
15131 txc.osr->get_sequencer_id(),
15132 txc.seq,
15133 ceph::to_seconds<double>(mono_clock::now() - txc.start));
15134 }
15135}
15136#endif
15137
15138#if defined(WITH_LTTNG)
15139void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
15140{
15141 if (txc.deferred_txn) {
15142 pending_deferred_ios -= 1;
15143 }
15144 if (txc.tracing) {
15145 mono_clock::time_point now = mono_clock::now();
15146 mono_clock::duration lat = now - txc.start;
15147 tracepoint(
15148 bluestore,
15149 transaction_total_duration,
15150 txc.osr->get_sequencer_id(),
15151 txc.seq,
15152 ceph::to_seconds<double>(lat));
15153 }
15154}
15155#endif
11fdf7f2 15156
7c673cae
FG
15157// DB key value Histogram
15158#define KEY_SLAB 32
15159#define VALUE_SLAB 64
15160
15161const string prefix_onode = "o";
15162const string prefix_onode_shard = "x";
15163const string prefix_other = "Z";
15164
15165int BlueStore::DBHistogram::get_key_slab(size_t sz)
15166{
15167 return (sz/KEY_SLAB);
15168}
15169
15170string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
15171{
15172 int lower_bound = slab * KEY_SLAB;
15173 int upper_bound = (slab + 1) * KEY_SLAB;
15174 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15175 return ret;
15176}
15177
15178int BlueStore::DBHistogram::get_value_slab(size_t sz)
15179{
15180 return (sz/VALUE_SLAB);
15181}
15182
15183string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
15184{
15185 int lower_bound = slab * VALUE_SLAB;
15186 int upper_bound = (slab + 1) * VALUE_SLAB;
15187 string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
15188 return ret;
15189}
15190
15191void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
15192 const string &prefix, size_t key_size, size_t value_size)
15193{
15194 uint32_t key_slab = get_key_slab(key_size);
15195 uint32_t value_slab = get_value_slab(value_size);
15196 key_hist[prefix][key_slab].count++;
11fdf7f2
TL
15197 key_hist[prefix][key_slab].max_len =
15198 std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
7c673cae
FG
15199 key_hist[prefix][key_slab].val_map[value_slab].count++;
15200 key_hist[prefix][key_slab].val_map[value_slab].max_len =
11fdf7f2
TL
15201 std::max<size_t>(value_size,
15202 key_hist[prefix][key_slab].val_map[value_slab].max_len);
7c673cae
FG
15203}
15204
15205void BlueStore::DBHistogram::dump(Formatter *f)
15206{
15207 f->open_object_section("rocksdb_value_distribution");
15208 for (auto i : value_hist) {
15209 f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
15210 }
15211 f->close_section();
15212
15213 f->open_object_section("rocksdb_key_value_histogram");
15214 for (auto i : key_hist) {
15215 f->dump_string("prefix", i.first);
15216 f->open_object_section("key_hist");
15217 for ( auto k : i.second) {
15218 f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
15219 f->dump_unsigned("max_len", k.second.max_len);
15220 f->open_object_section("value_hist");
15221 for ( auto j : k.second.val_map) {
15222 f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
15223 f->dump_unsigned("max_len", j.second.max_len);
15224 }
15225 f->close_section();
15226 }
15227 f->close_section();
15228 }
15229 f->close_section();
15230}
15231
15232//Itrerates through the db and collects the stats
15233void BlueStore::generate_db_histogram(Formatter *f)
15234{
15235 //globals
15236 uint64_t num_onodes = 0;
15237 uint64_t num_shards = 0;
15238 uint64_t num_super = 0;
15239 uint64_t num_coll = 0;
15240 uint64_t num_omap = 0;
11fdf7f2 15241 uint64_t num_pgmeta_omap = 0;
7c673cae
FG
15242 uint64_t num_deferred = 0;
15243 uint64_t num_alloc = 0;
15244 uint64_t num_stat = 0;
15245 uint64_t num_others = 0;
15246 uint64_t num_shared_shards = 0;
15247 size_t max_key_size =0, max_value_size = 0;
15248 uint64_t total_key_size = 0, total_value_size = 0;
15249 size_t key_size = 0, value_size = 0;
15250 DBHistogram hist;
15251
11fdf7f2 15252 auto start = coarse_mono_clock::now();
7c673cae 15253
11fdf7f2 15254 KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
7c673cae
FG
15255 iter->seek_to_first();
15256 while (iter->valid()) {
15257 dout(30) << __func__ << " Key: " << iter->key() << dendl;
15258 key_size = iter->key_size();
15259 value_size = iter->value_size();
15260 hist.value_hist[hist.get_value_slab(value_size)]++;
11fdf7f2
TL
15261 max_key_size = std::max(max_key_size, key_size);
15262 max_value_size = std::max(max_value_size, value_size);
7c673cae
FG
15263 total_key_size += key_size;
15264 total_value_size += value_size;
15265
15266 pair<string,string> key(iter->raw_key());
15267
15268 if (key.first == PREFIX_SUPER) {
15269 hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
15270 num_super++;
15271 } else if (key.first == PREFIX_STAT) {
15272 hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
15273 num_stat++;
15274 } else if (key.first == PREFIX_COLL) {
15275 hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
15276 num_coll++;
15277 } else if (key.first == PREFIX_OBJ) {
15278 if (key.second.back() == ONODE_KEY_SUFFIX) {
15279 hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
15280 num_onodes++;
15281 } else {
15282 hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
15283 num_shards++;
15284 }
15285 } else if (key.first == PREFIX_OMAP) {
15286 hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
15287 num_omap++;
11fdf7f2
TL
15288 } else if (key.first == PREFIX_PGMETA_OMAP) {
15289 hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
15290 num_pgmeta_omap++;
7c673cae
FG
15291 } else if (key.first == PREFIX_DEFERRED) {
15292 hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
15293 num_deferred++;
11fdf7f2 15294 } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
7c673cae
FG
15295 hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
15296 num_alloc++;
15297 } else if (key.first == PREFIX_SHARED_BLOB) {
15298 hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
15299 num_shared_shards++;
15300 } else {
15301 hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
15302 num_others++;
15303 }
15304 iter->next();
15305 }
15306
11fdf7f2 15307 ceph::timespan duration = coarse_mono_clock::now() - start;
7c673cae
FG
15308 f->open_object_section("rocksdb_key_value_stats");
15309 f->dump_unsigned("num_onodes", num_onodes);
15310 f->dump_unsigned("num_shards", num_shards);
15311 f->dump_unsigned("num_super", num_super);
15312 f->dump_unsigned("num_coll", num_coll);
15313 f->dump_unsigned("num_omap", num_omap);
11fdf7f2 15314 f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
7c673cae
FG
15315 f->dump_unsigned("num_deferred", num_deferred);
15316 f->dump_unsigned("num_alloc", num_alloc);
15317 f->dump_unsigned("num_stat", num_stat);
15318 f->dump_unsigned("num_shared_shards", num_shared_shards);
15319 f->dump_unsigned("num_others", num_others);
15320 f->dump_unsigned("max_key_size", max_key_size);
15321 f->dump_unsigned("max_value_size", max_value_size);
15322 f->dump_unsigned("total_key_size", total_key_size);
15323 f->dump_unsigned("total_value_size", total_value_size);
15324 f->close_section();
15325
15326 hist.dump(f);
15327
15328 dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
15329
15330}
15331
31f18b77 15332void BlueStore::_flush_cache()
7c673cae
FG
15333{
15334 dout(10) << __func__ << dendl;
9f95a23c
TL
15335 for (auto i : onode_cache_shards) {
15336 i->flush();
15337 ceph_assert(i->empty());
15338 }
15339 for (auto i : buffer_cache_shards) {
15340 i->flush();
11fdf7f2 15341 ceph_assert(i->empty());
7c673cae
FG
15342 }
15343 for (auto& p : coll_map) {
3efd9988 15344 if (!p.second->onode_map.empty()) {
11fdf7f2
TL
15345 derr << __func__ << " stray onodes on " << p.first << dendl;
15346 p.second->onode_map.dump<0>(cct);
3efd9988
FG
15347 }
15348 if (!p.second->shared_blob_set.empty()) {
15349 derr << __func__ << " stray shared blobs on " << p.first << dendl;
11fdf7f2 15350 p.second->shared_blob_set.dump<0>(cct);
3efd9988 15351 }
11fdf7f2
TL
15352 ceph_assert(p.second->onode_map.empty());
15353 ceph_assert(p.second->shared_blob_set.empty());
7c673cae
FG
15354 }
15355 coll_map.clear();
15356}
15357
31f18b77
FG
15358// For external caller.
15359// We use a best-effort policy instead, e.g.,
15360// we don't care if there are still some pinned onodes/data in the cache
15361// after this command is completed.
11fdf7f2 15362int BlueStore::flush_cache(ostream *os)
31f18b77
FG
15363{
15364 dout(10) << __func__ << dendl;
9f95a23c
TL
15365 for (auto i : onode_cache_shards) {
15366 i->flush();
15367 }
15368 for (auto i : buffer_cache_shards) {
15369 i->flush();
31f18b77 15370 }
11fdf7f2
TL
15371
15372 return 0;
31f18b77
FG
15373}
15374
7c673cae
FG
15375void BlueStore::_apply_padding(uint64_t head_pad,
15376 uint64_t tail_pad,
7c673cae
FG
15377 bufferlist& padded)
15378{
7c673cae 15379 if (head_pad) {
224ce89b 15380 padded.prepend_zero(head_pad);
7c673cae
FG
15381 }
15382 if (tail_pad) {
15383 padded.append_zero(tail_pad);
15384 }
15385 if (head_pad || tail_pad) {
15386 dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
15387 << " tail 0x" << tail_pad << std::dec << dendl;
15388 logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
15389 }
15390}
15391
11fdf7f2
TL
15392void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
15393{
15394 // finalize extent_map shards
15395 o->extent_map.update(txn, false);
15396 if (o->extent_map.needs_reshard()) {
15397 o->extent_map.reshard(db, txn);
15398 o->extent_map.update(txn, true);
15399 if (o->extent_map.needs_reshard()) {
15400 dout(20) << __func__ << " warning: still wants reshard, check options?"
15401 << dendl;
15402 o->extent_map.clear_needs_reshard();
15403 }
15404 logger->inc(l_bluestore_onode_reshard);
15405 }
15406
15407 // bound encode
15408 size_t bound = 0;
15409 denc(o->onode, bound);
15410 o->extent_map.bound_encode_spanning_blobs(bound);
15411 if (o->onode.extent_map_shards.empty()) {
15412 denc(o->extent_map.inline_bl, bound);
15413 }
15414
15415 // encode
15416 bufferlist bl;
15417 unsigned onode_part, blob_part, extent_part;
15418 {
15419 auto p = bl.get_contiguous_appender(bound, true);
15420 denc(o->onode, p);
15421 onode_part = p.get_logical_offset();
15422 o->extent_map.encode_spanning_blobs(p);
15423 blob_part = p.get_logical_offset() - onode_part;
15424 if (o->onode.extent_map_shards.empty()) {
15425 denc(o->extent_map.inline_bl, p);
15426 }
15427 extent_part = p.get_logical_offset() - onode_part - blob_part;
15428 }
15429
15430 dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
15431 << " (" << onode_part << " bytes onode + "
15432 << blob_part << " bytes spanning blobs + "
15433 << extent_part << " bytes inline extents)"
15434 << dendl;
15435
15436
15437 txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
15438}
15439
15440void BlueStore::_log_alerts(osd_alert_list_t& alerts)
15441{
15442 std::lock_guard l(qlock);
15443
81eedcae
TL
15444 if (!disk_size_mismatch_alert.empty()) {
15445 alerts.emplace(
15446 "BLUESTORE_DISK_SIZE_MISMATCH",
15447 disk_size_mismatch_alert);
15448 }
15449 if (!legacy_statfs_alert.empty()) {
15450 alerts.emplace(
15451 "BLUESTORE_LEGACY_STATFS",
15452 legacy_statfs_alert);
15453 }
11fdf7f2
TL
15454 if (!spillover_alert.empty() &&
15455 cct->_conf->bluestore_warn_on_bluefs_spillover) {
15456 alerts.emplace(
15457 "BLUEFS_SPILLOVER",
15458 spillover_alert);
15459 }
9f95a23c
TL
15460 if (!no_per_pool_omap_alert.empty()) {
15461 alerts.emplace(
15462 "BLUESTORE_NO_PER_POOL_OMAP",
15463 no_per_pool_omap_alert);
15464 }
11fdf7f2
TL
15465 string s0(failed_cmode);
15466
15467 if (!failed_compressors.empty()) {
15468 if (!s0.empty()) {
15469 s0 += ", ";
15470 }
15471 s0 += "unable to load:";
15472 bool first = true;
15473 for (auto& s : failed_compressors) {
15474 if (first) {
15475 first = false;
15476 } else {
15477 s0 += ", ";
15478 }
15479 s0 += s;
15480 }
15481 alerts.emplace(
15482 "BLUESTORE_NO_COMPRESSION",
15483 s0);
15484 }
15485}
15486
9f95a23c
TL
15487void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
15488 size_t extents)
15489{
15490 alloc_stats_count++;
15491 alloc_stats_fragments += extents;
15492 alloc_stats_size += need;
15493}
15494
15495void BlueStore::_record_allocation_stats()
15496{
15497 // don't care about data consistency,
15498 // fields can be partially modified while making the tuple
15499 auto t0 = std::make_tuple(
15500 alloc_stats_count.exchange(0),
15501 alloc_stats_fragments.exchange(0),
15502 alloc_stats_size.exchange(0));
15503
15504 dout(0) << " allocation stats probe "
15505 << probe_count << ":"
15506 << " cnt: " << std::get<0>(t0)
15507 << " frags: " << std::get<1>(t0)
15508 << " size: " << std::get<2>(t0)
15509 << dendl;
15510
15511
15512 //
15513 // Keep the history for probes from the power-of-two sequence:
15514 // -1, -2, -4, -8, -16
15515 //
15516 size_t base = 1;
15517 for (auto& t : alloc_stats_history) {
15518 dout(0) << " probe -"
15519 << base + (probe_count % base) << ": "
15520 << std::get<0>(t)
15521 << ", " << std::get<1>(t)
15522 << ", " << std::get<2>(t)
15523 << dendl;
15524 base <<= 1;
15525 }
15526 dout(0) << "------------" << dendl;
15527
15528 auto prev = probe_count++;
15529 auto mask = (1 << alloc_stats_history.size()) - 1;
15530 probe_count &= mask;
15531
15532 for (size_t i = cbits(prev ^ probe_count) - 1; i > 0 ; --i) {
15533 alloc_stats_history[i] = alloc_stats_history[i - 1];
15534 }
15535 alloc_stats_history[0].swap(t0);
15536}
15537
7c673cae 15538// ===========================================
11fdf7f2
TL
15539// BlueStoreRepairer
15540
15541size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
15542 const interval_set<uint64_t>& extents)
15543{
15544 ceph_assert(granularity); // initialized
15545 // can't call for the second time
15546 ceph_assert(!was_filtered_out);
15547 ceph_assert(collections_bfs.size() == objects_bfs.size());
15548
15549 uint64_t prev_pos = 0;
15550 uint64_t npos = collections_bfs.size();
15551
15552 bloom_vector collections_reduced;
15553 bloom_vector objects_reduced;
15554
15555 for (auto e : extents) {
15556 if (e.second == 0) {
15557 continue;
15558 }
15559 uint64_t pos = max(e.first / granularity, prev_pos);
15560 uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
15561 while (pos != npos && pos < end_pos) {
15562 ceph_assert( collections_bfs[pos].element_count() ==
15563 objects_bfs[pos].element_count());
15564 if (collections_bfs[pos].element_count()) {
15565 collections_reduced.push_back(std::move(collections_bfs[pos]));
15566 objects_reduced.push_back(std::move(objects_bfs[pos]));
15567 }
15568 ++pos;
15569 }
15570 prev_pos = end_pos;
15571 }
15572 collections_reduced.swap(collections_bfs);
15573 objects_reduced.swap(objects_bfs);
15574 was_filtered_out = true;
15575 return collections_bfs.size();
15576}
15577
15578bool BlueStoreRepairer::remove_key(KeyValueDB *db,
15579 const string& prefix,
15580 const string& key)
15581{
15582 if (!remove_key_txn) {
15583 remove_key_txn = db->get_transaction();
15584 }
15585 ++to_repair_cnt;
15586 remove_key_txn->rmkey(prefix, key);
15587
15588 return true;
15589}
15590
9f95a23c
TL
15591void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
15592{
15593 fix_per_pool_omap_txn = db->get_transaction();
15594 ++to_repair_cnt;
15595 bufferlist bl;
15596 bl.append("1");
15597 fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
15598}
15599
11fdf7f2
TL
15600bool BlueStoreRepairer::fix_shared_blob(
15601 KeyValueDB *db,
15602 uint64_t sbid,
15603 const bufferlist* bl)
15604{
15605 KeyValueDB::Transaction txn;
15606 if (fix_misreferences_txn) { // reuse this txn
15607 txn = fix_misreferences_txn;
15608 } else {
15609 if (!fix_shared_blob_txn) {
15610 fix_shared_blob_txn = db->get_transaction();
15611 }
15612 txn = fix_shared_blob_txn;
15613 }
15614 string key;
15615 get_shared_blob_key(sbid, &key);
15616
15617 ++to_repair_cnt;
15618 if (bl) {
15619 txn->set(PREFIX_SHARED_BLOB, key, *bl);
15620 } else {
15621 txn->rmkey(PREFIX_SHARED_BLOB, key);
15622 }
15623 return true;
15624}
15625
15626bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
15627 const string& key,
15628 const store_statfs_t& new_statfs)
15629{
15630 if (!fix_statfs_txn) {
15631 fix_statfs_txn = db->get_transaction();
15632 }
15633 BlueStore::volatile_statfs vstatfs;
15634 vstatfs = new_statfs;
15635 bufferlist bl;
15636 vstatfs.encode(bl);
15637 ++to_repair_cnt;
15638 fix_statfs_txn->set(PREFIX_STAT, key, bl);
15639 return true;
15640}
15641
15642bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
15643 FreelistManager* fm,
15644 uint64_t offset, uint64_t len)
15645{
15646 if (!fix_fm_leaked_txn) {
15647 fix_fm_leaked_txn = db->get_transaction();
15648 }
15649 ++to_repair_cnt;
15650 fm->release(offset, len, fix_fm_leaked_txn);
15651 return true;
15652}
15653bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
15654 FreelistManager* fm,
15655 uint64_t offset, uint64_t len)
15656{
15657 if (!fix_fm_false_free_txn) {
15658 fix_fm_false_free_txn = db->get_transaction();
15659 }
15660 ++to_repair_cnt;
15661 fm->allocate(offset, len, fix_fm_false_free_txn);
15662 return true;
15663}
15664
15665bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
15666{
15667 // this is just a stub to count num of repairs properly,
15668 // actual repair happens in BlueStore::_close_db_and_around()
15669 // while doing _sync_bluefs_and_fm
15670 ++out_of_sync_flag;
15671 ++to_repair_cnt;
15672 return true;
15673}
15674
15675bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
15676{
15677 if (misreferenced_extents.size()) {
15678 size_t n = space_usage_tracker.filter_out(misreferenced_extents);
15679 ceph_assert(n > 0);
15680 if (!fix_misreferences_txn) {
15681 fix_misreferences_txn = db->get_transaction();
15682 }
15683 return true;
15684 }
15685 return false;
15686}
15687
15688unsigned BlueStoreRepairer::apply(KeyValueDB* db)
15689{
9f95a23c
TL
15690 if (fix_per_pool_omap_txn) {
15691 db->submit_transaction_sync(fix_per_pool_omap_txn);
15692 fix_per_pool_omap_txn = nullptr;
15693 }
11fdf7f2
TL
15694 if (fix_fm_leaked_txn) {
15695 db->submit_transaction_sync(fix_fm_leaked_txn);
15696 fix_fm_leaked_txn = nullptr;
15697 }
15698 if (fix_fm_false_free_txn) {
15699 db->submit_transaction_sync(fix_fm_false_free_txn);
15700 fix_fm_false_free_txn = nullptr;
15701 }
15702 if (remove_key_txn) {
15703 db->submit_transaction_sync(remove_key_txn);
15704 remove_key_txn = nullptr;
15705 }
15706 if (fix_misreferences_txn) {
15707 db->submit_transaction_sync(fix_misreferences_txn);
15708 fix_misreferences_txn = nullptr;
15709 }
15710 if (fix_shared_blob_txn) {
15711 db->submit_transaction_sync(fix_shared_blob_txn);
15712 fix_shared_blob_txn = nullptr;
15713 }
15714
15715 if (fix_statfs_txn) {
15716 db->submit_transaction_sync(fix_statfs_txn);
15717 fix_statfs_txn = nullptr;
15718 }
15719 unsigned repaired = to_repair_cnt;
15720 to_repair_cnt = 0;
15721 return repaired;
15722}
15723
15724// =======================================================
9f95a23c
TL
15725// RocksDBBlueFSVolumeSelector
15726
15727uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
15728 ceph_assert(h != nullptr);
15729 uint64_t hint = reinterpret_cast<uint64_t>(h);
15730 uint8_t res;
15731 switch (hint) {
15732 case LEVEL_SLOW:
15733 res = BlueFS::BDEV_SLOW;
15734 if (db_avail4slow > 0) {
15735 // considering statically available db space vs.
15736 // - observed maximums on DB dev for DB/WAL/UNSORTED data
15737 // - observed maximum spillovers
15738 uint64_t max_db_use = 0; // max db usage we potentially observed
15739 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
15740 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
15741 // this could go to db hence using it in the estimation
15742 max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
15743
15744 auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
15745 uint64_t avail = min(
15746 db_avail4slow,
15747 max_db_use < db_total ? db_total - max_db_use : 0);
15748
15749 // considering current DB dev usage for SLOW data
15750 if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
15751 res = BlueFS::BDEV_DB;
15752 }
15753 }
15754 break;
15755 case LEVEL_WAL:
15756 res = BlueFS::BDEV_WAL;
15757 break;
15758 case LEVEL_DB:
15759 default:
15760 res = BlueFS::BDEV_DB;
15761 break;
15762 }
15763 return res;
15764}
15765
15766void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
15767{
15768 res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
15769 res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
15770}
15771
15772void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
15773 uint8_t res = LEVEL_DB;
15774 if (dirname.length() > 5) {
15775 // the "db.slow" and "db.wal" directory names are hard-coded at
15776 // match up with bluestore. the slow device is always the second
15777 // one (when a dedicated block.db device is present and used at
15778 // bdev 0). the wal device is always last.
15779 if (boost::algorithm::ends_with(dirname, ".slow")) {
15780 res = LEVEL_SLOW;
15781 }
15782 else if (boost::algorithm::ends_with(dirname, ".wal")) {
15783 res = LEVEL_WAL;
15784 }
15785 }
15786 return reinterpret_cast<void*>(res);
15787}
15788
15789void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
15790 auto max_x = per_level_per_dev_usage.get_max_x();
15791 auto max_y = per_level_per_dev_usage.get_max_y();
15792 sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
15793 << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
15794 << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
15795 << ", db_avail:" << db_avail4slow << std::endl
15796 << "Usage matrix:" << std::endl;
15797 constexpr std::array<const char*, 7> names{ {
15798 "DEV/LEV",
15799 "WAL",
15800 "DB",
15801 "SLOW",
15802 "*",
15803 "*",
15804 "REAL"
15805 } };
15806 const size_t width = 12;
15807 for (size_t i = 0; i < names.size(); ++i) {
15808 sout.setf(std::ios::left, std::ios::adjustfield);
15809 sout.width(width);
15810 sout << names[i];
15811 }
15812 sout << std::endl;
15813 for (size_t l = 0; l < max_y; l++) {
15814 sout.setf(std::ios::left, std::ios::adjustfield);
15815 sout.width(width);
15816 switch (l + LEVEL_FIRST) {
15817 case LEVEL_WAL:
15818 sout << "WAL"; break;
15819 case LEVEL_DB:
15820 sout << "DB"; break;
15821 case LEVEL_SLOW:
15822 sout << "SLOW"; break;
15823 case LEVEL_MAX:
15824 sout << "TOTALS"; break;
15825 }
15826 for (size_t d = 0; d < max_x - 1; d++) {
15827 sout.setf(std::ios::left, std::ios::adjustfield);
15828 sout.width(width);
15829 sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
15830 }
15831 sout.setf(std::ios::left, std::ios::adjustfield);
15832 sout.width(width);
15833 sout << stringify(byte_u_t(per_level_per_dev_usage.at(max_x - 1, l)))
15834 << std::endl;
15835 }
15836 ceph_assert(max_x == per_level_per_dev_max.get_max_x());
15837 ceph_assert(max_y == per_level_per_dev_max.get_max_y());
15838 sout << "MAXIMUMS:" << std::endl;
15839 for (size_t l = 0; l < max_y; l++) {
15840 sout.setf(std::ios::left, std::ios::adjustfield);
15841 sout.width(width);
15842 switch (l + LEVEL_FIRST) {
15843 case LEVEL_WAL:
15844 sout << "WAL"; break;
15845 case LEVEL_DB:
15846 sout << "DB"; break;
15847 case LEVEL_SLOW:
15848 sout << "SLOW"; break;
15849 case LEVEL_MAX:
15850 sout << "TOTALS"; break;
15851 }
15852 for (size_t d = 0; d < max_x - 1; d++) {
15853 sout.setf(std::ios::left, std::ios::adjustfield);
15854 sout.width(width);
15855 sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
15856 }
15857 sout.setf(std::ios::left, std::ios::adjustfield);
15858 sout.width(width);
15859 sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
15860 if (l < max_y - 1) {
15861 sout << std::endl;
15862 }
15863 }
15864}
11fdf7f2 15865
9f95a23c 15866// =======================================================